def test_net_sync(net, criterion, sync, nDevices):
    """Smoke-test data-parallel wrappers on ``nDevices`` CPU contexts.

    Wraps ``net``/``criterion`` in DataParallelModel/DataParallelCriterion,
    runs a few training iterations (forward + loss + backward) and a few
    inference iterations, then blocks until all async work has completed.
    """
    devices = [mx.cpu(0) for _ in range(nDevices)]
    parallel_net = DataParallelModel(net, devices, sync=sync)
    parallel_criterion = DataParallelCriterion(criterion, devices, sync=sync)
    num_iters = 10
    batch = 2
    # training-mode passes
    for _ in range(num_iters):
        data = mx.random.uniform(shape=(batch, 1, 28, 28))
        target = nd.ones(shape=(batch))
        with autograd.record():
            outputs = parallel_net(data)
            loss = parallel_criterion(outputs, target)
            autograd.backward(loss)
    # evaluation-mode passes
    for _ in range(num_iters):
        data = mx.random.uniform(shape=(batch, 1, 28, 28))
        parallel_net(data)
    nd.waitall()
def train(num_gpus, batch_size, lr):
    """Train LeNet on Fashion-MNIST with data parallelism over ``num_gpus`` GPUs.

    Fix: the original built contexts with ``gpu[i]`` — ``gpu`` is neither
    defined nor callable here; the MXNet API for selecting a device is
    ``mx.gpu(i)`` (matching the other trainers in this file).
    Also fixes the misspelled "Validataion" in the accuracy report.
    """
    train_data, test_data = utils.load_data_fashion_mnist(batch_size)
    # one context per requested device (was the buggy `gpu[i]`)
    ctx = [mx.gpu(i) for i in range(num_gpus)]
    print('running on', ctx)
    # copy the shared parameters onto every device
    dev_params = [get_params(params, c) for c in ctx]
    for epoch in range(5):
        start = time()
        for data, label in train_data:
            train_batch(data, label, dev_params, ctx, lr)
        nd.waitall()  # block until all async device work finishes before timing
        print('Epoch: %d, training time = %.1f sec' % (epoch, time() - start))
        # validating on GPU 0
        net = lambda data: lenet(data, dev_params[0])
        test_acc = utils.evaluate_accuracy(test_data, net, ctx[0])
        print('Validation Accuracy = %.4f' % (test_acc))
def main():
    """Load a YOLOv3 checkpoint, bind it on GPU 0, warm up cuDNN, run detection."""
    # let cuDNN autotune pick the fastest conv algorithms (~2x speed up)
    os.environ['MXNET_CUDNN_AUTOTUNE_DEFAULT'] = '1'
    # restore symbol and weights from the checkpoint
    sym, arg_params, aux_params = mx.model.load_checkpoint(
        'models/mobilenet0.25_yolo3_final', 0)
    executor = sym.simple_bind(ctx=mx.gpu(0),
                               data=(1, 3, 320, 320),
                               grad_req='null',
                               force_rebind=True)
    executor.copy_params_from(arg_params, aux_params)
    # one dummy forward pass so cuDNN searches for its best config up front
    print("warm up for cudnn config......")
    executor.forward(is_train=False, data=mx.nd.zeros((1, 3, 320, 320)))
    nd.waitall()
    print("start!")
    detector = MyDetector(executor)
    detector.Run()
def __call__(self, param):
    """Callback to Show speed

    Invoked per update with `param` (carries num_update, num_epoch, loss).
    Every `self.frequent` updates it logs throughput and writes loss/speed
    scalars to the summary writer. The first interval after (re)start only
    arms the timer, so no bogus speed is reported.
    """
    count = param.num_update
    # update counter went backwards -> training restarted; re-arm the timer
    if self.last_count > count:
        self.init = False
    self.last_count = count
    # fold the current batch loss into the running metric
    # (assumes param.loss is a sequence with the scalar loss first — TODO confirm)
    self.loss_metric.update(param.loss[0])
    if self.init:
        if count % self.frequent == 0:
            # drain pending async ops so the elapsed-time measurement is honest
            nd.waitall()
            try:
                speed = self.frequent * self.batch_size / (time.time() - self.tic)
                # presumably self.size is the worker count, scaling to cluster
                # throughput — verify against the constructor
                speed_total = speed * self.size
            except ZeroDivisionError:
                speed = float('inf')
                speed_total = float('inf')
            # summary loss
            loss_scalar = self.loss_metric.get()
            self.summary_writer.add_scalar(tag="loss",
                                           value=loss_scalar,
                                           global_step=param.num_update)
            loss_str_format = "[%d][%s]:%.2f " % (param.num_epoch, "loss",
                                                  loss_scalar)
            self.loss_metric.reset()
            # summary speed
            self.summary_writer.add_scalar(tag="speed",
                                           value=speed,
                                           global_step=param.num_update)
            self.summary_writer.flush()
            # only rank 0 logs, to avoid duplicated lines in distributed runs
            if self.rank == 0:
                logging.info("Iter:%d Rank:%.2f it/sec Total:%.2f it/sec %s",
                             param.num_update, speed, speed_total,
                             loss_str_format)
            self.tic = time.time()
    else:
        # first call of a (re)started run: start the clock, report next interval
        self.init = True
        self.tic = time.time()
def train(num_gpus, batch_size, lr):
    """Train LeNet on Fashion-MNIST with data parallelism over ``num_gpus`` GPUs."""
    train_iter, test_iter = gb.load_data_fashion_mnist(
        batch_size, root="../data/fashion-mnist")
    devices = [mx.gpu(i) for i in range(num_gpus)]
    print("running on:", devices)
    # one private parameter copy per device
    gpu_params = [get_params(params, dev) for dev in devices]
    for epoch in range(4):
        tic = time.time()
        for X, y in train_iter:
            train_batch(X, y, gpu_params, devices, lr)
            nd.waitall()  # sync so the epoch timing is meaningful
        train_time = time.time() - tic
        net = lambda x: lenet(x, gpu_params[0])  # validate on device 0
        test_acc = gb.evaluate_accuracy(test_iter, net, devices[0])
        print("epoch %d, time: %.1f sec, test acc: %.2f"
              % (epoch + 1, train_time, test_acc))
def speed(net, ctx, data_size=(1024, 1024), iterations=1000, warm_up=500):
    """Benchmark inference latency and FPS of ``net`` on ``ctx[0]``."""
    net.hybridize(static_alloc=True)
    sample = EvalFactory._sample(data_size, ctx[0])

    def _forward(count):
        # prediction-only passes, syncing after each so timing is per-pass
        for _ in range(count):
            with autograd.record(False):
                net.predict(sample)
            nd.waitall()

    logger.info(f'Warm-up starts for {warm_up} forward passes...')
    _forward(warm_up)
    logger.info(f'Evaluate inference speed for {iterations} forward passes...')
    start = time.time()
    _forward(iterations)
    time_cost = time.time() - start
    logger.info('Total time: %.2fs, latency: %.2fms, FPS: %.1f' %
                (time_cost, time_cost / iterations * 1000,
                 iterations / time_cost))
def process(self, images, return_time=False):
    """Forward ``images`` through the 3D-detection head and decode detections.

    Returns (output, dets) or, when return_time is True,
    (output, dets, forward_time).
    """
    output = self.model(images)[-1]
    # heatmap -> probabilities; depth head -> metric depth
    output['hm'] = output['hm'].sigmoid()
    output['dep'] = 1. / (output['dep'].sigmoid() + 1e-6) - 1.
    wh = output['wh'] if self.opt.reg_bbox else None
    reg = output['reg'] if self.opt.reg_offset else None
    nd.waitall()  # sync before timestamping the forward pass
    forward_time = time.time()
    dets = decode_centernet_3dod(output['hm'], output['rot'], output['dep'],
                                 output['dim'], wh=wh, reg=reg, K=self.opt.K)
    if return_time:
        return output, dets, forward_time
    return output, dets
def train(num_gpus, batch_size, lr):
    """Multi-GPU LeNet training on Fashion-MNIST (d2l example)."""
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    devices = [mx.gpu(i) for i in range(num_gpus)]
    print('running on:', devices)
    # replicate the model parameters into each GPU's memory
    gpu_params = [get_params(params, dev) for dev in devices]
    for epoch in range(4):
        tic = time.time()
        for X, y in train_iter:
            # multi-GPU training on a single minibatch
            train_batch(X, y, gpu_params, devices, lr)
            nd.waitall()
        train_time = time.time() - tic
        # validate on gpu(0)
        net = lambda x: lenet(x, gpu_params[0])
        test_acc = d2l.evaluate_accuracy(test_iter, net, devices[0])
        print('epoch %d, time %.1f sec, test acc %.2f'
              % (epoch + 1, train_time, test_acc))
def train(self, need_test=False):
    """Main training loop, resuming from ``self.resume_epoch``.

    Per batch: forward, sum all configured loss functions, backward, step.
    Per ``epochs_per_val`` epochs: optionally run ``self.test()`` and save a
    checkpoint (tracking the best accuracy).

    Args:
        need_test: when True, run validation on checkpointing epochs.
    """
    print("Training process starts from epoch {}...".format(
        self.resume_epoch))
    for epoch in range(self.resume_epoch, self.epochs):
        self.current_epoch = epoch
        for _, item in enumerate(self.train_loader):
            inputs, labels = item
            inputs = inputs.as_in_context(self.ctx)
            labels = labels.as_in_context(self.ctx)
            cls = 0.0  # accumulated loss across all loss functions
            with autograd.record():
                # Gradient
                outputs = self.model(inputs)
                for _, loss_type in self.loss_functions.items():
                    cls += loss_type(outputs=outputs,
                                     labels=labels,
                                     train_total=self.train_total)
            cls.backward()
            self.train_total += inputs.shape[0]
            self.trainer.step(batch_size=inputs.shape[0])
            # pull the loss to the host as a Python float for logging
            cls = nd.array(cls).asscalar()
            if self.train_total % self.args.steps_per_log == 0:
                self.trainer_log.print_batch_log(
                    current_lr=self.lr_scheduler.base_lr,
                    current_epoch=self.current_epoch,
                    epochs=self.epochs,
                    train_total=self.train_total,
                    loss=cls,
                )
        nd.waitall()  # drain async ops before validation/checkpointing
        if (epoch + 1) % self.args.epochs_per_val == 0:
            if need_test is True:
                self.test()
            self.best_accuracy = self.check_point.save_checkpoint_parameters(
                epoch=self.current_epoch,
                model=self.model,
                current_accuracy=self.current_accuracy,
                best_accuracy=self.best_accuracy)
    self.trainer_log.log_close()
def speed(self, iterations=1000, warm_up=500):
    """speed test with hybridized HybridBlock

    Returns (total seconds, per-sample latency in ms, samples/sec).
    """
    self.net.hybridize(static_alloc=True)

    def _run(count):
        # prediction-only passes with a sync after each forward
        for _ in range(count):
            with autograd.record(False):
                self.net.predict(self.sample)
            nd.waitall()

    # warm-up to obtain stable speed
    print("Warm-up for %d forward passes..." % warm_up)
    _run(warm_up)
    # speed test
    print("Speed test for %d forward passes..." % iterations)
    t_start = time.time()
    _run(iterations)
    time_cost = time.time() - t_start
    latency_ms = time_cost / iterations * 1000 / self.bs
    fps = iterations * self.bs / time_cost
    return time_cost, latency_ms, fps
def train(X, contents_Y, styles_Y, ctx, lr, max_epochs, lr_decay_epoch):
    """Neural-style-transfer optimization loop over the generated image X."""
    X, styles_Y_gram, trainer = get_inits(X, ctx, lr, styles_Y)
    for epoch in range(max_epochs):
        tic = time.time()
        with autograd.record():
            contents_Y_hat, styles_Y_hat = extract_features(
                X, content_layers, style_layers)
            contents_l, styles_l, tv_l, l = compute_loss(
                X, contents_Y_hat, styles_Y_hat, contents_Y, styles_Y_gram)
        l.backward()
        trainer.step(1)
        nd.waitall()
        if epoch != 0 and epoch % 50 == 0:
            print('epoch %3d, content loss %.2f, style loss %.2f, '
                  'TV loss %.2f, %.2f sec'
                  % (epoch, nd.add_n(*contents_l).asscalar(),
                     nd.add_n(*styles_l).asscalar(), tv_l.asscalar(),
                     time.time() - tic))
        if epoch != 0 and epoch % lr_decay_epoch == 0:
            # decay the learning rate by 10x
            trainer.set_learning_rate(trainer.learning_rate * 0.1)
            print('change lr to %.1e' % trainer.learning_rate)
    return X
def process(self, images, return_time=False):
    """Forward ``images`` through the pose-estimation head and decode keypoints.

    Applies sigmoid to the center (and optionally keypoint) heatmaps, then
    optionally merges a flipped copy of the batch (flip-test augmentation:
    assumes the batch contains [original, flipped] — TODO confirm against
    the caller) before decoding.

    Returns (output, dets) or (output, dets, forward_time).
    """
    output = self.model(images)[-1]
    output['hm'] = output['hm'].sigmoid()
    # keypoint heatmap only passes through sigmoid when trained with focal loss
    if self.opt.hm_hp and not self.opt.mse_loss:
        output['hm_hp'] = output['hm_hp'].sigmoid()
    reg = output['reg'] if self.opt.reg_offset else None
    hm_hp = output['hm_hp'] if self.opt.hm_hp else None
    hp_offset = output['hp_offset'] if self.opt.reg_hp_offset else None
    nd.waitall()  # sync so forward_time measures the completed forward pass
    forward_time = time.time()
    if self.opt.flip_test:
        # average the original (index 0) with the un-flipped flipped copy (index 1)
        output['hm'] = (output['hm'][0:1] +
                        flip_tensor(output['hm'][1:2])) / 2
        output['wh'] = (output['wh'][0:1] +
                        flip_tensor(output['wh'][1:2])) / 2
        output['hps'] = (output['hps'][0:1] + flip_lr_off(
            output['hps'][1:2], self.flip_idx)) / 2
        hm_hp = (hm_hp[0:1] + flip_lr(hm_hp[1:2], self.flip_idx)
                 ) / 2 if hm_hp is not None else None
        # offsets are not averaged, only the original half is kept
        reg = reg[0:1] if reg is not None else None
        hp_offset = hp_offset[0:1] if hp_offset is not None else None
    dets = decode_centernet_pose(output['hm'], output['wh'], output['hps'],
                                 reg=reg, hm_hp=hm_hp,
                                 hp_offset=hp_offset, K=self.opt.K)
    if return_time:
        return output, dets, forward_time
    else:
        return output, dets
def train_gpu(self):
    """Train self.model() on the GPU context and report validation accuracy."""
    train_data, valid_data = self.load_data()  # training / validation sets
    ctx = self.model_ctx_gpu
    print('Running on {}'.format(ctx))
    net = self.model()  # build the network
    net.collect_params().initialize(init=mx.init.Normal(sigma=.1), ctx=ctx)
    smoothing_constant = .01  # also used as the SGD learning rate
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': smoothing_constant})
    for epoch in range(10):
        tic = time()
        for batch in train_data:
            self.train_batch(batch, ctx, net, trainer)
        nd.waitall()  # wait until every async task has finished
        print('Epoch %d, training time = %.1f sec' % (epoch, time() - tic))
        correct = num = 0.0
        for batch in valid_data:
            correct += self.valid_batch(batch, ctx, net)
            num += batch[0].shape[0]
        print('\tvalidation accuracy = %.4f' % (correct / num))
def train(self, video_dataset):
    """Train the visual-representation model on ``video_dataset``.

    Iterates the dataset batch-by-batch (the dataset signals the last batch
    via the ``finish`` flag), optimizing an MSE loss with Adam, and applies
    LR scheduling and periodic parameter saving per epoch.
    """
    # Get logger
    train_logger = get_logger(self.cfg.LOG_DIR,
                              'train_vis_repr_model_%s' % video_dataset.dataset_info)
    params_file, params_select = ['vis_repr_model_%s' % video_dataset.dataset_info], ['vis_repr_model']
    # Init params
    self.load(params_file, params_select, train_logger, allow_init=True)
    # 1. Select params to train
    model_trainer = trainer.Trainer(self._collect_params(params_select), 'adam',
                                    {'wd': 5e-4, 'learning_rate': self.cfg.LR_INIT})
    # 2. Train each epoch
    for e in range(self.cfg.EPOCHS_TRAIN_MAIN):
        # Train each batch
        batch_index = 0
        while True:
            # 1. Load data (``finish`` is True on the dataset's final batch)
            (batch_images, batch_labels, _), finish = video_dataset.get_batch_data_cls(
                batch_index, self.cfg.BATCH_SIZE_TRAIN_MAIN)
            x_list, y = utils.io.split_and_load_gpu(self.cfg.CTX,
                                                    [batch_images], batch_labels)
            # 2. Record calculation
            with autograd.record():
                pred_y = self.feedforward(x_list)
                loss_value = utils.loss.loss_mse(pred_y, y)
            # 3. Backward & update (waitall syncs before the optimizer step)
            loss_value.backward()
            nd.waitall()
            model_trainer.step(batch_size=self.cfg.BATCH_SIZE_TRAIN_MAIN)
            # Show info
            train_logger.info(self.get_loss_info(
                'Train vis_repr_model - ', e, batch_index,
                video_dataset.num_data / self.cfg.BATCH_SIZE_TRAIN_MAIN,
                loss_value))
            # Move to next
            if finish:
                break
            else:
                batch_index += 1
        # Schedules
        self._step_update_learning_rate(e, model_trainer)
        self._step_save_params(e, params_file, params_select)
    # 3. Finish
    train_logger.info("Training accomplished. ")
def evaluate(model, g, features, labels, mask, ctx, batch_size, mini_batch=True):
    """Compute the F1 score of ``model`` over the nodes selected by ``mask``.

    Runs the model over sampled blocks of graph ``g`` (full-batch when
    ``mini_batch`` is False), concatenates the predictions, and scores only
    the masked nodes.
    """
    f1 = mx.metric.F1()
    preds = []
    # full-graph "batch" when mini-batching is disabled
    batch_size = batch_size if mini_batch else features.shape[0]
    dataloader = gluon.data.BatchSampler(
        gluon.data.SequentialSampler(features.shape[0]), batch_size, 'keep')
    for batch in dataloader:
        node_flow, batch_nids = g.sample_block(nd.array(batch).astype('int64'))
        preds.append(model(node_flow, features[batch_nids.as_in_context(ctx)]))
    nd.waitall()
    # preds = nd.concat(*preds, dim=0).argmax(axis=1)
    preds = nd.concat(*preds, dim=0)
    # indices of the masked nodes; assumes ``mask`` is a boolean/0-1 vector
    # (np.where on it yields a 1-tuple of indices) — TODO confirm with caller
    mask = nd.array(np.where(mask.asnumpy()), ctx=ctx)
    # reshape(-3, 0) merges the leading axes introduced by the tuple-indexing
    f1.update(preds=nd.softmax(preds[mask], axis=1).reshape(-3, 0),
              labels=labels[mask].reshape(-1, ))
    return f1.get()[1]
def train(params, max_epochs, lr, lr_decay_epoch=200):
    """Optimize the 'generated_image' parameter for style transfer via SGD."""
    tic = time()
    trainer = gluon.Trainer(params, 'sgd', {'learning_rate': lr})
    for epoch in range(max_epochs):
        image = params.get('generated_image')
        with autograd.record():
            content_py, style_py = extract_features(
                image.data(), content_layers, style_layers)
            content_L = sum_loss(content_loss, content_py, content_y,
                                 content_weights)
            style_L = sum_loss(style_loss, style_py, style_y, style_weights)
            # total-variation term intentionally disabled in this variant
            loss = content_L + 500 * style_L
        loss.backward()
        trainer.step(1)
        nd.waitall()  # sync each step to avoid large memory usage
        if epoch % 40 == 0:
            print('epoch %3d, content %.3f, style %.3f, time %.1f sec' %
                  (epoch, content_L.asscalar(), style_L.asscalar(),
                   time() - tic))
            tic = time()
        if epoch and epoch % lr_decay_epoch == 0:
            lr *= 0.5
            trainer.set_learning_rate(lr)
            print('change lr to ', lr)
    return params
def train(x, max_epochs, lr, lr_decay_epoch=200):
    """Optimize image tensor ``x`` in place with gradient-normalized SGD.

    Writes intermediate results every 50 steps and the final image to
    'result.jpg'; returns the optimized tensor.
    """
    tic = time()
    for step in range(max_epochs):
        with autograd.record():
            content_py, style_py = extract_features(x, content_layers,
                                                    style_layers)
            content_L = sum_loss(content_loss, content_py, content_y,
                                 content_weights)
            style_L = sum_loss(style_loss, style_py, style_y, style_weights)
            tv_L = tv_weight * tv_loss(x)
            loss = style_L + content_L + tv_L
        loss.backward()
        # normalize gradient magnitude, then take a plain SGD step in place
        x.grad[:] /= x.grad.abs().mean() + 1e-8
        x[:] -= lr * x.grad
        nd.waitall()  # sync to avoid large memory usage
        if step and step % 50 == 0:
            print('batch %3d, content %.2f, style %.2f, '
                  'TV %.2f, time %.1f sec'
                  % (step, content_L.asscalar(), style_L.asscalar(),
                     tv_L.asscalar(), time() - tic))
            tic = time()
            canvas, img = postprocess(x)
            cv2.imwrite('result_%d.jpg' % step, img)
        if step and step % lr_decay_epoch == 0:
            lr *= 0.1
            print('change lr to ', lr)
    canvas, img = postprocess(x)
    cv2.imwrite('result.jpg', img)
    return x
def train(x, max_epochs, lr, lr_decay_epoch=200):
    """Style-transfer loop: optimize image tensor ``x`` in place.

    Near-duplicate of the other direct-optimization ``train(x, ...)`` in this
    file. Gradients are normalized by their mean magnitude before a manual
    SGD step; snapshots are written every 50 steps and at the end.
    """
    tic = time()
    for i in range(max_epochs):
        with autograd.record():
            content_py, style_py = extract_features(
                x, content_layers, style_layers)
            content_L = sum_loss(
                content_loss, content_py, content_y, content_weights)
            style_L = sum_loss(
                style_loss, style_py, style_y, style_weights)
            tv_L = tv_weight * tv_loss(x)
            loss = style_L + content_L + tv_L
        loss.backward()
        # normalize gradient scale, then step the image directly (no Trainer)
        x.grad[:] /= x.grad.abs().mean()+1e-8
        x[:] -= lr * x.grad
        # add sync to avoid large mem usage
        nd.waitall()
        if i and i % 50 == 0:
            print('batch %3d, content %.2f, style %.2f, '
                  'TV %.2f, time %.1f sec' % (
                      i, content_L.asscalar(), style_L.asscalar(),
                      tv_L.asscalar(), time()-tic))
            tic = time()
            # periodic snapshot of the partially optimized image
            canvas,img = postprocess(x)
            cv2.imwrite('result_%d.jpg'%i,img)
        if i and i % lr_decay_epoch == 0:
            lr *= 0.1
            print('change lr to ', lr)
    canvas,img = postprocess(x)
    cv2.imwrite('result.jpg',img)
    #plt.imshow(canvas.asnumpy())
    #plt.show()
    return x
def train_one_epoch(model, data_loader, trainer, loss_function, ema=None):
    r"""
    One train loop.

    Runs one epoch of QANet-style training across the contexts in ``CTX``:
    periodic dev-set evaluation, LR warm-up, manual weight decay on gradients
    (embedding rows 0-1 only for the embedding weight), global gradient norm
    clipping, and EMA tracking of parameters. Loss/score curves are appended
    to the module-level lists (``dev_f1``, ``batch_train_ce``, ...).
    """
    total_batchs = data_loader.total_batchs
    total_loss = 0
    step = 0
    global global_step
    for batch_data in data_loader.next_batch():
        step += 1
        global_step += 1
        # add evaluate per EVALUATE_INTERVAL batchs
        if global_step % EVALUATE_INTERVAL == 0:
            print('global_step == %d' % (global_step))
            print('evaluating dev dataset...')
            f1_score, em_score = evaluate(model, dataset_type='dev', ema=ema)
            print('dev f1:' + str(f1_score) + 'em:' + str(em_score))
            dev_f1.append([global_step, f1_score])
            dev_em.append([global_step, em_score])
        # unpack one batch: assumes each item is
        # (context_ids, query_ids, context_chars, query_chars, begin, end)
        # — TODO confirm against the data loader
        context = nd.array([x[0] for x in batch_data])
        query = nd.array([x[1] for x in batch_data])
        c_mask = context > 0
        q_mask = query > 0
        context_char = nd.array([x[2] for x in batch_data])
        query_char = nd.array([x[3] for x in batch_data])
        begin = nd.array([x[4] for x in batch_data])
        end = nd.array([x[5] for x in batch_data])
        batch_sizes = context.shape[0]
        # shard every tensor across the available devices
        context = gluon.utils.split_and_load(data=context, ctx_list=CTX)
        c_mask = gluon.utils.split_and_load(data=c_mask, ctx_list=CTX)
        query = gluon.utils.split_and_load(data=query, ctx_list=CTX)
        q_mask = gluon.utils.split_and_load(data=q_mask, ctx_list=CTX)
        context_char = gluon.utils.split_and_load(data=context_char,
                                                  ctx_list=CTX)
        query_char = gluon.utils.split_and_load(data=query_char,
                                                ctx_list=CTX)
        begin = gluon.utils.split_and_load(data=begin, ctx_list=CTX)
        end = gluon.utils.split_and_load(data=end, ctx_list=CTX)
        with autograd.record():
            different_ctx_loss = [
                loss_function(*model(c, q, cc, qc, cm, qm, b, e))
                for c, q, cc, qc, cm, qm, b, e in
                zip(context, query, context_char, query_char,
                    c_mask, q_mask, begin, end)
            ]
            for loss in different_ctx_loss:
                loss.backward()
        # seed the EMA shadow weights on the very first update
        if global_step == 1:
            for name, param in model.collect_params().items():
                ema.add(name, param.data(CTX[0]))
        trainer.set_learning_rate(warm_up_lr(global_step))
        trainer.allreduce_grads()
        reset_embedding_grad(model)
        tmp = []
        # manual L2 weight decay applied to the raw gradients; for the word
        # embedding only the first two rows (padding/UNK) are decayed
        for name, paramater in model.collect_params().items():
            grad = paramater.grad(context[0].context)
            if name == 'qanet0_embedding0_weight':
                grad[0:2] += WEIGHT_DECAY * \
                    paramater.data(context[0].context)[0:2]
            else:
                grad += WEIGHT_DECAY * paramater.data(context[0].context)
            tmp.append(grad)
        gluon.utils.clip_global_norm(tmp, CLIP_GRADIENT)
        reset_embedding_grad(model)
        trainer.update(batch_sizes, ignore_stale_grad=True)
        # fold the updated weights into the EMA
        for name, param in model.collect_params().items():
            ema(name, param.data(CTX[0]))
        batch_loss = .0
        for loss in different_ctx_loss:
            batch_loss += loss.mean().asscalar()
        batch_loss /= len(different_ctx_loss)
        total_loss += batch_loss
        batch_train_ce.append([global_step, batch_loss])
        accum_avg_train_ce.append([global_step, total_loss / step])
        print('batch %d/%d, total_loss %.2f, batch_loss %.2f' %
              (step, total_batchs, total_loss / step, batch_loss),
              end='\r', flush=True)
    nd.waitall()
def train(self, batch_size=64, num_epoch=10, eval_metric='acc',
          eval_metric_params={}, eval_train=False,
          loss='softmax_cross_entropy', loss_params={},
          optimizer='adam', optimizer_params=(('learning_rate', 0.001),),
          load_checkpoint=True, checkpoint_period=5, load_pretrained=False,
          log_period=50, context='gpu', save_attention_image=False,
          use_teacher_forcing=False, normalize=True, shuffle_data=False,
          clip_global_grad_norm=None, preprocessing=False, onnx_export=False):
    """Generated trainer entry point (MontiAnna/EMADL-style).

    Builds the contexts, data iterators, network(s), optimizer and loss
    function from the given configuration, then runs ``num_epoch`` epochs of
    training with periodic logging, per-epoch train/test evaluation,
    checkpointing, and a final parameter/ONNX export.

    NOTE(review): generated code — left byte-identical; comments only.
    """
    # ---- context selection -------------------------------------------------
    num_pus = 1
    if context == 'gpu':
        num_pus = mx.context.num_gpus()
        if num_pus >= 1:
            if num_pus == 1:
                mx_context = [mx.gpu(0)]
            else:
                mx_context = [mx.gpu(i) for i in range(num_pus)]
        else:
            logging.error("Context argument is '" + context + "'. But no gpu is present in the system.")
    elif context == 'cpu':
        mx_context = [mx.cpu()]
    else:
        logging.error("Context argument is '" + context + "'. Only 'cpu' and 'gpu are valid arguments'.")

    # per-device batch size for data-parallel training
    single_pu_batch_size = int(batch_size/num_pus)

    # ---- data loading ------------------------------------------------------
    if preprocessing:
        preproc_lib = "CNNPreprocessor_ResNeXt50_executor"
        train_iter, test_iter, data_mean, data_std, train_images, test_images = self._data_loader.load_preprocessed_data(batch_size, preproc_lib, shuffle_data)
    else:
        train_iter, test_iter, data_mean, data_std, train_images, test_images = self._data_loader.load_data(batch_size, shuffle_data)

    # translate generator-level optimizer options to MXNet's names
    if 'weight_decay' in optimizer_params:
        optimizer_params['wd'] = optimizer_params['weight_decay']
        del optimizer_params['weight_decay']
    if 'learning_rate_decay' in optimizer_params:
        min_learning_rate = 1e-08
        if 'learning_rate_minimum' in optimizer_params:
            min_learning_rate = optimizer_params['learning_rate_minimum']
            del optimizer_params['learning_rate_minimum']
        optimizer_params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler(
            optimizer_params['step_size'],
            factor=optimizer_params['learning_rate_decay'],
            stop_factor_lr=min_learning_rate)
        del optimizer_params['step_size']
        del optimizer_params['learning_rate_decay']

    # ---- network construction / checkpoint restore -------------------------
    if normalize:
        self._net_creator.construct(context=mx_context,
                                    batch_size=batch_size,
                                    data_mean=data_mean,
                                    data_std=data_std)
    else:
        self._net_creator.construct(context=mx_context,
                                    batch_size=batch_size)

    begin_epoch = 0
    if load_checkpoint:
        begin_epoch = self._net_creator.load(mx_context)
    elif load_pretrained:
        self._net_creator.load_pretrained_weights(mx_context)
    else:
        # fresh run: wipe any previous model directory
        if os.path.isdir(self._net_creator._model_dir_):
            shutil.rmtree(self._net_creator._model_dir_)

    self._networks = self._net_creator.networks

    try:
        os.makedirs(self._net_creator._model_dir_)
    except OSError:
        if not os.path.isdir(self._net_creator._model_dir_):
            raise

    # one Trainer per network that actually has parameters
    if optimizer == "adamw":
        trainers = [mx.gluon.Trainer(network.collect_params(), AdamW.AdamW(**optimizer_params)) for network in self._networks.values() if len(network.collect_params().values()) != 0]
    else:
        trainers = [mx.gluon.Trainer(network.collect_params(), optimizer, optimizer_params) for network in self._networks.values() if len(network.collect_params().values()) != 0]

    # ---- loss selection ----------------------------------------------------
    margin = loss_params['margin'] if 'margin' in loss_params else 1.0
    sparseLabel = loss_params['sparse_label'] if 'sparse_label' in loss_params else True
    ignore_indices = [loss_params['ignore_indices']] if 'ignore_indices' in loss_params else []
    loss_axis = loss_params['loss_axis'] if 'loss_axis' in loss_params else -1
    batch_axis = loss_params['batch_axis'] if 'batch_axis' in loss_params else 0
    if loss == 'softmax_cross_entropy':
        fromLogits = loss_params['from_logits'] if 'from_logits' in loss_params else False
        loss_function = mx.gluon.loss.SoftmaxCrossEntropyLoss(axis=loss_axis, from_logits=fromLogits, sparse_label=sparseLabel, batch_axis=batch_axis)
    elif loss == 'softmax_cross_entropy_ignore_indices':
        fromLogits = loss_params['from_logits'] if 'from_logits' in loss_params else False
        loss_function = SoftmaxCrossEntropyLossIgnoreIndices(axis=loss_axis, ignore_indices=ignore_indices, from_logits=fromLogits, sparse_label=sparseLabel, batch_axis=batch_axis)
    elif loss == 'sigmoid_binary_cross_entropy':
        loss_function = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss()
    elif loss == 'cross_entropy':
        loss_function = CrossEntropyLoss(axis=loss_axis, sparse_label=sparseLabel, batch_axis=batch_axis)
    elif loss == 'dice_loss':
        loss_weight = loss_params['loss_weight'] if 'loss_weight' in loss_params else None
        loss_function = DiceLoss(axis=loss_axis, weight=loss_weight, sparse_label=sparseLabel, batch_axis=batch_axis)
    elif loss == 'softmax_cross_entropy_ignore_label':
        loss_weight = loss_params['loss_weight'] if 'loss_weight' in loss_params else None
        loss_ignore_label = loss_params['loss_ignore_label'] if 'loss_ignore_label' in loss_params else None
        loss_function = SoftmaxCrossEntropyLossIgnoreLabel(axis=loss_axis, ignore_label=loss_ignore_label, weight=loss_weight, batch_axis=batch_axis)
    elif loss == 'l2':
        loss_function = mx.gluon.loss.L2Loss()
    elif loss == 'l1':
        loss_function = mx.gluon.loss.L1Loss()
    elif loss == 'huber':
        rho = loss_params['rho'] if 'rho' in loss_params else 1
        loss_function = mx.gluon.loss.HuberLoss(rho=rho)
    elif loss == 'hinge':
        loss_function = mx.gluon.loss.HingeLoss(margin=margin)
    elif loss == 'squared_hinge':
        loss_function = mx.gluon.loss.SquaredHingeLoss(margin=margin)
    elif loss == 'logistic':
        labelFormat = loss_params['label_format'] if 'label_format' in loss_params else 'signed'
        loss_function = mx.gluon.loss.LogisticLoss(label_format=labelFormat)
    elif loss == 'kullback_leibler':
        fromLogits = loss_params['from_logits'] if 'from_logits' in loss_params else True
        loss_function = mx.gluon.loss.KLDivLoss(from_logits=fromLogits)
    elif loss == 'log_cosh':
        loss_function = LogCoshLoss()
    else:
        logging.error("Invalid loss parameter.")

    loss_function.hybridize()

    # ---- training loop -----------------------------------------------------
    tic = None
    avg_speed = 0
    n = 0
    for epoch in range(begin_epoch, begin_epoch + num_epoch):
        if shuffle_data:
            # reload (and reshuffle) the data each epoch
            if preprocessing:
                preproc_lib = "CNNPreprocessor_ResNeXt50_executor"
                train_iter, test_iter, data_mean, data_std, train_images, test_images = self._data_loader.load_preprocessed_data(batch_size, preproc_lib, shuffle_data)
            else:
                train_iter, test_iter, data_mean, data_std, train_images, test_images = self._data_loader.load_data(batch_size, shuffle_data)

        global_loss_train = 0.0
        train_batches = 0

        loss_total = 0
        train_iter.reset()
        for batch_i, batch in enumerate(train_iter):
            with autograd.record():
                labels = [gluon.utils.split_and_load(batch.label[i], ctx_list=mx_context, even_split=False) for i in range(1)]
                data_ = gluon.utils.split_and_load(batch.data[0], ctx_list=mx_context, even_split=False)
                predictions_ = [mx.nd.zeros((single_pu_batch_size, 1000,), ctx=context) for context in mx_context]
                nd.waitall()
                lossList = []
                for i in range(num_pus):
                    lossList.append([])
                # one forward pass per device shard
                net_ret = [self._networks[0](data_[i]) for i in range(num_pus)]
                predictions_ = [net_ret[i][0][0] for i in range(num_pus)]
                [lossList[i].append(loss_function(predictions_[i], labels[0][i])) for i in range(num_pus)]
                losses = [0]*num_pus
                for i in range(num_pus):
                    for element in lossList[i]:
                        losses[i] = losses[i] + element
            for loss in losses:
                loss.backward()
                loss_total += loss.sum().asscalar()
                global_loss_train += loss.sum().asscalar()
            train_batches += 1
            if clip_global_grad_norm:
                grads = []
                for network in self._networks.values():
                    grads.extend([param.grad(mx_context) for param in network.collect_params().values()])
                gluon.utils.clip_global_norm(grads, clip_global_grad_norm)
            for trainer in trainers:
                trainer.step(batch_size)
            # throughput logging every `log_period` batches
            if tic is None:
                tic = time.time()
            else:
                if batch_i % log_period == 0:
                    try:
                        speed = log_period * batch_size / (time.time() - tic)
                    except ZeroDivisionError:
                        speed = float("inf")
                    loss_avg = loss_total / (batch_size * log_period)
                    loss_total = 0
                    logging.info("Epoch[%d] Batch[%d] Speed: %.2f samples/sec Loss: %.5f" % (epoch, batch_i, speed, loss_avg))
                    avg_speed += speed
                    n += 1
                    tic = time.time()
        global_loss_train /= (train_batches * batch_size)
        tic = None

        # ---- optional evaluation on the training set -----------------------
        if eval_train:
            train_iter.batch_size = single_pu_batch_size
            train_iter.reset()
            metric = mx.metric.create(eval_metric, **eval_metric_params)
            for batch_i, batch in enumerate(train_iter):
                labels = [batch.label[i].as_in_context(mx_context[0]) for i in range(1)]
                data_ = batch.data[0].as_in_context(mx_context[0])
                predictions_ = mx.nd.zeros((single_pu_batch_size, 1000,), ctx=mx_context[0])
                nd.waitall()
                lossList = []
                outputs = []
                attentionList = []
                net_ret = self._networks[0](data_)
                predictions_ = net_ret[0][0]
                outputs.append(predictions_)
                lossList.append(loss_function(predictions_, labels[0]))

                # attention visualization (show-attend-tell style models)
                if save_attention_image == "True":
                    import matplotlib
                    matplotlib.use('Agg')
                    import matplotlib.pyplot as plt
                    logging.getLogger('matplotlib').setLevel(logging.ERROR)
                    if(os.path.isfile('src/test/resources/training_data/Show_attend_tell/dict.pkl')):
                        with open('src/test/resources/training_data/Show_attend_tell/dict.pkl', 'rb') as f:
                            dict = pickle.load(f)
                    plt.clf()
                    fig = plt.figure(figsize=(15,15))
                    max_length = len(labels)-1
                    ax = fig.add_subplot(max_length//3, max_length//4, 1)
                    ax.imshow(train_images[0+single_pu_batch_size*(batch_i)].transpose(1,2,0))
                    for l in range(max_length):
                        attention = attentionList[l]
                        attention = mx.nd.slice_axis(attention, axis=0, begin=0, end=1).squeeze()
                        attention_resized = np.resize(attention.asnumpy(), (8, 8))
                        ax = fig.add_subplot(max_length//3, max_length//4, l+2)
                        if int(labels[l+1][0].asscalar()) > len(dict):
                            ax.set_title("<unk>")
                        elif dict[int(labels[l+1][0].asscalar())] == "<end>":
                            ax.set_title(".")
                            img = ax.imshow(train_images[0+single_pu_batch_size*(batch_i)].transpose(1,2,0))
                            ax.imshow(attention_resized, cmap='gray', alpha=0.6, extent=img.get_extent())
                            break
                        else:
                            ax.set_title(dict[int(labels[l+1][0].asscalar())])
                            img = ax.imshow(train_images[0+single_pu_batch_size*(batch_i)].transpose(1,2,0))
                            ax.imshow(attention_resized, cmap='gray', alpha=0.6, extent=img.get_extent())
                    plt.tight_layout()
                    target_dir = 'target/attention_images'
                    if not os.path.exists(target_dir):
                        os.makedirs(target_dir)
                    plt.savefig(target_dir + '/attention_train.png')
                    plt.close()

                predictions = []
                for output_name in outputs:
                    # argmax only for multi-class outputs; scalars pass through
                    if mx.nd.shape_array(mx.nd.squeeze(output_name)).size > 1:
                        predictions.append(mx.nd.argmax(output_name, axis=1))
                    else:
                        predictions.append(output_name)
                metric.update(preds=predictions, labels=[labels[j] for j in range(len(labels))])
            train_metric_score = metric.get()[1]
        else:
            train_metric_score = 0

        # ---- evaluation on the test set ------------------------------------
        global_loss_test = 0.0
        test_batches = 0

        test_iter.batch_size = single_pu_batch_size
        test_iter.reset()
        metric = mx.metric.create(eval_metric, **eval_metric_params)
        for batch_i, batch in enumerate(test_iter):
            if True:
                labels = [batch.label[i].as_in_context(mx_context[0]) for i in range(1)]
                data_ = batch.data[0].as_in_context(mx_context[0])
                predictions_ = mx.nd.zeros((single_pu_batch_size, 1000,), ctx=mx_context[0])
                nd.waitall()
                lossList = []
                outputs = []
                attentionList = []
                net_ret = self._networks[0](data_)
                predictions_ = net_ret[0][0]
                outputs.append(predictions_)
                lossList.append(loss_function(predictions_, labels[0]))

            if save_attention_image == "True":
                # matplotlib may not have been imported yet if eval_train was off
                if not eval_train:
                    import matplotlib
                    matplotlib.use('Agg')
                    import matplotlib.pyplot as plt
                    logging.getLogger('matplotlib').setLevel(logging.ERROR)
                    if(os.path.isfile('src/test/resources/training_data/Show_attend_tell/dict.pkl')):
                        with open('src/test/resources/training_data/Show_attend_tell/dict.pkl', 'rb') as f:
                            dict = pickle.load(f)
                plt.clf()
                fig = plt.figure(figsize=(15,15))
                max_length = len(labels)-1
                ax = fig.add_subplot(max_length//3, max_length//4, 1)
                ax.imshow(test_images[0+single_pu_batch_size*(batch_i)].transpose(1,2,0))
                for l in range(max_length):
                    attention = attentionList[l]
                    attention = mx.nd.slice_axis(attention, axis=0, begin=0, end=1).squeeze()
                    attention_resized = np.resize(attention.asnumpy(), (8, 8))
                    ax = fig.add_subplot(max_length//3, max_length//4, l+2)
                    if int(mx.nd.slice_axis(outputs[l+1], axis=0, begin=0, end=1).squeeze().asscalar()) > len(dict):
                        ax.set_title("<unk>")
                    elif dict[int(mx.nd.slice_axis(outputs[l+1], axis=0, begin=0, end=1).squeeze().asscalar())] == "<end>":
                        ax.set_title(".")
                        img = ax.imshow(test_images[0+single_pu_batch_size*(batch_i)].transpose(1,2,0))
                        ax.imshow(attention_resized, cmap='gray', alpha=0.6, extent=img.get_extent())
                        break
                    else:
                        ax.set_title(dict[int(mx.nd.slice_axis(outputs[l+1], axis=0, begin=0, end=1).squeeze().asscalar())])
                        img = ax.imshow(test_images[0+single_pu_batch_size*(batch_i)].transpose(1,2,0))
                        ax.imshow(attention_resized, cmap='gray', alpha=0.6, extent=img.get_extent())
                plt.tight_layout()
                target_dir = 'target/attention_images'
                if not os.path.exists(target_dir):
                    os.makedirs(target_dir)
                plt.savefig(target_dir + '/attention_test.png')
                plt.close()

            loss = 0
            for element in lossList:
                loss = loss + element
            global_loss_test += loss.sum().asscalar()
            test_batches += 1

            predictions = []
            for output_name in outputs:
                if mx.nd.shape_array(mx.nd.squeeze(output_name)).size > 1:
                    predictions.append(mx.nd.argmax(output_name, axis=1))
                else:
                    predictions.append(output_name)
            metric.update(preds=predictions, labels=[labels[j] for j in range(len(labels))])
        global_loss_test /= (test_batches * single_pu_batch_size)

        test_metric_name = metric.get()[0]
        test_metric_score = metric.get()[1]

        # persist the latest test metric for the surrounding tooling
        metric_file = open(self._net_creator._model_dir_ + 'metric.txt', 'w')
        metric_file.write(test_metric_name + " " + str(test_metric_score))
        metric_file.close()

        logging.info("Epoch[%d] Train metric: %f, Test metric: %f, Train loss: %f, Test loss: %f" % (epoch, train_metric_score, test_metric_score, global_loss_train, global_loss_test))

        # periodic checkpoint
        if (epoch+1) % checkpoint_period == 0:
            for i, network in self._networks.items():
                network.save_parameters(self.parameter_path(i) + '-' + str(epoch).zfill(4) + '.params')

    # ---- final export ------------------------------------------------------
    for i, network in self._networks.items():
        network.save_parameters(self.parameter_path(i) + '-' + str((num_epoch-1) + begin_epoch).zfill(4) + '.params')
        network.export(self.parameter_path(i) + '_newest', epoch=0)
        if onnx_export:
            from mxnet.contrib import onnx as onnx_mxnet
            input_shapes = [(1,) + d.shape[1:] for _, d in test_iter.data]
            model_path = self.parameter_path(i) + '_newest'
            onnx_mxnet.export_model(model_path+'-symbol.json', model_path+'-0000.params', input_shapes, np.float32, model_path+'.onnx')
    loss_function.export(self.parameter_path(i) + '_newest_loss', epoch=0)
def train(cfg, ctx_lst, project_name, log_interval=5, no_val=False, lr=None, wd=None):
    """Train a semantic-segmentation model with wandb experiment tracking.

    Builds the model/data pipeline from the wandb config, runs
    ``wandb.config.epochs`` epochs of multi-GPU training, optionally
    validates each epoch (pixel accuracy / mIoU via ``SegmentationMetric``),
    and checkpoints both the best-mIoU and the final model.

    :param cfg: config dict handed to ``wandb.init``.
    :param ctx_lst: context spec passed to ``my_tools.get_contexts``.
    :param project_name: wandb project name.
    :param log_interval: batch interval for per-batch train-loss logging.
    :param no_val: skip the validation loop entirely when True.
    :param lr, wd: optional overrides for the configured learning rate /
        weight decay (only applied when both are truthy).
    """
    wandb.init(job_type='train', dir=my_tools.root_dir(), config=cfg,
               project=project_name)
    if lr and wd:
        # CLI overrides win over the values baked into cfg.
        wandb.config.lr = lr
        wandb.config.wd = wd

    ctx = my_tools.get_contexts(ctx_lst)
    wandb.config.ctx = ctx

    data_factory = DataFactory(wandb.config.data_name)
    model_factory = ModelFactory(wandb.config.model_name)

    norm_layer, norm_kwargs = my_tools.get_norm_layer(wandb.config.norm, len(ctx))
    model_kwargs = {
        'nclass': data_factory.num_class,
        'backbone': wandb.config.backbone,
        # pretrained classification weights only when backbone_init manner is 'cls'
        'pretrained_base': wandb.config.backbone_init.get('manner') == 'cls',
        'aux': wandb.config.aux,
        'crop_size': wandb.config.crop_size,
        'base_size': wandb.config.base_size,
        'dilate': wandb.config.dilate,
        'norm_layer': norm_layer,
        'norm_kwargs': norm_kwargs,
    }
    net = model_factory.get_model(
        model_kwargs,
        resume=wandb.config.resume,
        lr_mult=wandb.config.lr_mult,
        backbone_init_manner=wandb.config.backbone_init.get('manner'),
        backbone_ckpt=wandb.config.backbone_init.get('backbone_ckpt'),
        prior_classes=wandb.config.backbone_init.get('prior_classes'),
        ctx=ctx)

    if net.symbolize:
        net.hybridize()

    # DataLoader workers: Windows multiprocessing is fragile, so use 0 there.
    num_worker = 0 if platform.system() == 'Windows' else 16
    train_set = data_factory.seg_dataset(
        split='train',  # sometimes would be 'trainval'
        mode='train',
        transform=my_tools.image_transform(),
        base_size=wandb.config.base_size,
        crop_size=wandb.config.crop_size)
    train_iter = DataLoader(train_set, wandb.config.bs_train, shuffle=True,
                            last_batch='discard', num_workers=num_worker)
    val_set = data_factory.seg_dataset(split='val', mode='val',
                                       transform=my_tools.image_transform(),
                                       base_size=wandb.config.base_size,
                                       crop_size=wandb.config.crop_size)
    val_iter = DataLoader(val_set, wandb.config.bs_val, shuffle=False,
                          last_batch='keep', num_workers=num_worker)
    wandb.config.num_train = len(train_set)
    wandb.config.num_valid = len(val_set)

    criterion = _get_criterion(wandb.config.aux, wandb.config.aux_weight)
    criterion.initialize(ctx=ctx)
    wandb.config.criterion = type(criterion)

    if wandb.config.optimizer == 'adam':
        trainer = Trainer(net.collect_params(), 'adam',
                          optimizer_params={
                              'learning_rate': wandb.config.lr,
                              'wd': wandb.config.wd,
                              'beta1': wandb.config.adam.get('adam_beta1'),
                              'beta2': wandb.config.adam.get('adam_beta2')
                          })
    elif wandb.config.optimizer in ('sgd', 'nag'):
        # sgd/nag use an LR scheduler instead of a fixed learning rate.
        scheduler = _lr_scheduler(
            mode=wandb.config.lr_scheduler,
            base_lr=wandb.config.lr,
            target_lr=wandb.config.target_lr,
            nepochs=wandb.config.epochs,
            iters_per_epoch=len(train_iter),
            step_epoch=wandb.config.step.get('step_epoch'),
            step_factor=wandb.config.step.get('step_factor'),
            power=wandb.config.poly.get('power'))
        trainer = Trainer(net.collect_params(), wandb.config.optimizer,
                          optimizer_params={
                              'lr_scheduler': scheduler,
                              'wd': wandb.config.wd,
                              'momentum': wandb.config.momentum,
                              'multi_precision': True
                          })
    else:
        raise RuntimeError(f"Unknown optimizer: {wandb.config.optimizer}")

    metric = SegmentationMetric(data_factory.num_class)

    logger = get_logger(name='train', level=10)
    t_start = my_tools.get_strftime()
    logger.info(f'Training start: {t_start}')
    for k, v in wandb.config.items():
        logger.info(f'{k}: {v}')
    logger.info('-----> end hyper-parameters <-----')
    wandb.config.start_time = t_start

    best_score = .0
    best_epoch = 0
    for epoch in range(wandb.config.epochs):
        train_loss = .0
        tbar = tqdm(train_iter)
        for i, (data, target) in enumerate(tbar):
            gpu_datas = split_and_load(data, ctx_list=ctx)
            gpu_targets = split_and_load(target, ctx_list=ctx)
            with autograd.record():
                loss_gpus = [
                    criterion(*net(gpu_data), gpu_target)
                    for gpu_data, gpu_target in zip(gpu_datas, gpu_targets)
                ]
            for loss in loss_gpus:
                autograd.backward(loss)
            trainer.step(wandb.config.bs_train)
            nd.waitall()  # sync before reading scalars so errors surface here
            train_loss += sum([loss.mean().asscalar() for loss in loss_gpus]) / len(loss_gpus)
            tbar.set_description(
                'Epoch-%d [training], loss %.5f, %s' %
                (epoch, train_loss / (i + 1),
                 my_tools.get_strftime('%Y-%m-%d %H:%M:%S')))
            if (i % log_interval == 0) or (i + 1 == len(train_iter)):
                wandb.log({
                    f'train_loss_batch, interval={log_interval}': train_loss / (i + 1)
                })
        wandb.log({
            'train_loss_epoch': train_loss / (len(train_iter)),
            'custom_step': epoch
        })

        if not no_val:
            val_loss = .0
            vbar = tqdm(val_iter)
            for i, (data, target) in enumerate(vbar):
                # even_split=False: the last validation batch may be short.
                gpu_datas = split_and_load(data=data, ctx_list=ctx, even_split=False)
                gpu_targets = split_and_load(data=target, ctx_list=ctx, even_split=False)
                loss_gpus = []
                for gpu_data, gpu_target in zip(gpu_datas, gpu_targets):
                    gpu_output = net(gpu_data)
                    loss_gpus.append(criterion(*gpu_output, gpu_target))
                    # gpu_output[0] is presumably the main (non-aux) head — TODO confirm
                    metric.update(gpu_target, gpu_output[0])
                val_loss += sum([loss.mean().asscalar() for loss in loss_gpus]) / len(loss_gpus)
                vbar.set_description(
                    'Epoch-%d [validation], PA %.4f, mIoU %.4f' %
                    (epoch, metric.get()[0], metric.get()[1]))
            nd.waitall()
            pix_acc, mean_iou = metric.get()
            wandb.log({
                'val_PA': pix_acc,
                'val_mIoU': mean_iou,
                'val_loss': val_loss / len(val_iter),
                'custom_step': epoch
            })
            metric.reset()
            if mean_iou > best_score:
                # keep only the best-mIoU snapshot under the is_best name
                my_tools.save_checkpoint(
                    model=net,
                    model_name=wandb.config.model_name.lower(),
                    backbone=wandb.config.backbone.lower(),
                    data_name=wandb.config.data_name.lower(),
                    time_stamp=wandb.config.start_time,
                    is_best=True)
                best_score = mean_iou
                best_epoch = epoch

    logger.info(f'Best val mIoU={round(best_score * 100, 2)} at epoch: {best_epoch}')
    wandb.config.best_epoch = best_epoch
    # always save the final weights as well (is_best=False)
    my_tools.save_checkpoint(model=net,
                             model_name=wandb.config.model_name.lower(),
                             backbone=wandb.config.backbone.lower(),
                             data_name=wandb.config.data_name.lower(),
                             time_stamp=wandb.config.start_time,
                             is_best=False)
#8.3.1-CPU和GPU的并行计算 def run(x): return [nd.dot(x, x) for _ in range(10)] #分别在内存和显存上创建NDArray x_cpu = nd.random.uniform(shape=(2000, 2000)) print('x_gpu') x_gpu = nd.random.uniform(shape=(6000, 6000), ctx=mx.gpu(0)) print('dayin') #打印 run(x_cpu) #预热开始 run(x_gpu) nd.waitall() #预热结束 with d2l.Benchmark('Run on CPU.'): run(x_cpu) nd.waitall() with d2l.Benchmark('Then run on GPU.'): run(x_gpu) nd.waitall() #自动并行不同任务 with d2l.Benchmark('Run on both CPU and GPU in parallel.'): run(x_cpu) run(x_gpu) nd.waitall()
def train_net(net, train_iter, valid_iter, batch_size, trainer, ctx, num_epochs, lr_sch, save_prefix):
    """Train a classifier, logging to mxboard and checkpointing the best model.

    Fixes over the previous revision:
    - ``mx.metric.Accuracy.get()`` returns a ``(name, value)`` tuple; the
      tuple was being passed to ``sw.add_scalar`` (which expects a scalar),
      so the accuracy curve now logs ``cls_acc.get()[1]``.
    - The accuracy metric is reset at the start of each epoch; previously it
      accumulated across epochs while ``train_loss`` did not.

    :param net: Gluon block to train.
    :param train_iter: training iterator (``mx.io.MXDataIter`` or Gluon loader).
    :param valid_iter: validation iterator for ``test_net``.
    :param batch_size: samples per batch (used for ``trainer.step`` scaling).
    :param trainer: ``gluon.Trainer`` driving the update.
    :param ctx: single context the batches are moved to.
    :param num_epochs: number of epochs to run.
    :param lr_sch: callable epoch -> learning rate.
    :param save_prefix: path prefix for checkpoint files.
    """
    logger.info("===================START TRAINING====================")
    if use_mxboard:
        sw = SummaryWriter(logdir='logs', flush_secs=5)
    cls_loss = gluon.loss.SoftmaxCrossEntropyLoss()
    cls_acc = mx.metric.Accuracy(name="train acc")
    top_acc = 0
    iter_num = 0
    for epoch in range(num_epochs):
        train_loss = []
        cls_acc.reset()  # per-epoch accuracy, consistent with per-epoch loss
        t0 = time.time()
        if isinstance(train_iter, mx.io.MXDataIter):
            train_iter.reset()
        trainer.set_learning_rate(lr_sch(epoch))
        for batch in train_iter:
            iter_num += 1
            # support both MXDataIter batches and (X, Y) tuples
            if isinstance(batch, mx.io.DataBatch):
                X, Y = batch.data[0], batch.label[0]
            else:
                X, Y = batch
            X = X.as_in_context(ctx)
            Y = Y.as_in_context(ctx)
            with autograd.record(True):
                out = net(X)
                loss = cls_loss(out, Y)
            loss.backward()
            train_loss.append(loss.sum().asscalar())
            trainer.step(batch_size)
            cls_acc.update(Y, out)
            nd.waitall()
            if use_mxboard and iter_num % 100 == 0:
                sw.add_scalar(tag='train_loss', value=loss.mean().asscalar(),
                              global_step=iter_num)
                # .get() returns (name, value); log only the scalar value
                sw.add_scalar(tag='train_acc', value=cls_acc.get()[1],
                              global_step=iter_num)
                for name in net.collect_params():
                    param = net.collect_params()[name]
                    if param.grad_req != "null":
                        sw.add_histogram(tag=name, values=param.grad(),
                                         global_step=iter_num, bins=1000)
        logger.info("epoch {} lr {} {}sec".format(epoch, trainer.learning_rate,
                                                  time.time() - t0))
        train_loss, train_acc = np.mean(train_loss) / batch_size, cls_acc.get()
        logger.info("\ttrain loss {} {}".format(train_loss, train_acc))
        # validate every 10 epochs (skipping epoch 0)
        if epoch > 0 and (epoch % 10) == 0:
            test_acc, test_loss = test_net(net, valid_iter, ctx)
            if use_mxboard:
                sw.add_scalar(tag='test_acc', value=test_acc, global_step=epoch)
                sw.add_scalar(tag='test_loss', value=test_loss, global_step=epoch)
            if top_acc < test_acc:
                top_acc = test_acc
                logger.info('\ttop valid acc {}'.format(test_acc))
                # hybrid blocks can be exported as symbol+params; others only
                # support save_parameters
                if isinstance(net, mx.gluon.nn.HybridSequential) or isinstance(
                        net, mx.gluon.nn.HybridBlock):
                    pf = '{}_{:.3f}.params'.format(save_prefix, top_acc)
                    net.export(pf, epoch)
                else:
                    net_path = '{}top_acc_{}_{:.3f}.params'.format(
                        save_prefix, epoch, top_acc)
                    net.save_parameters(net_path)
    if use_mxboard:
        sw.close()
def test_copy(): a = nd.ones((SMALL_Y, LARGE_X)) b = a.copy() nd.waitall() assert b.shape == a.shape assert b.size == LARGE_SIZE
def train(train_data, net, loss, ctx, global_step, epoch_step, num_epochs, best_F1=0):
    """Train a binary segmentation net and evaluate F1 on a global test set.

    Learning-rate schedule is implemented by constructing a *new* trainer at
    the start of every epoch (adam 1e-3 / 1e-4 / 1e-5, then sgd 1e-6 with
    momentum). NOTE(review): rebuilding the trainer each epoch also resets
    the optimizer's internal state (adam moments) — presumably intentional,
    but worth confirming.

    Relies on module-level globals: ``sw`` (mxboard SummaryWriter),
    ``test_data`` (evaluation loader) and ``get_batch`` — TODO confirm these
    are defined by the enclosing script.

    :param train_data: iterable of training batches.
    :param net: Gluon block producing per-pixel logits.
    :param loss: loss function applied as ``loss(y_hat, y)``.
    :param ctx: context or list of contexts.
    :param global_step: starting step index for per-batch loss logging.
    :param epoch_step: starting step index for per-epoch metric logging.
    :param num_epochs: epochs to run.
    :param best_F1: best F1 seen so far; weights are saved when beaten.
    """
    print("Start training on ", ctx)
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    for epoch in range(num_epochs):
        # piecewise learning-rate schedule via a fresh Trainer per epoch
        if epoch < 50:
            trainer = gluon.Trainer(net.collect_params(), 'adam', {
                'learning_rate': 0.001,
                'wd': 1e-3
            })
        elif epoch < 90:
            trainer = gluon.Trainer(net.collect_params(), 'adam', {
                'learning_rate': 0.0001,
                'wd': 1e-3
            })
        elif epoch < 120:
            trainer = gluon.Trainer(net.collect_params(), 'adam', {
                'learning_rate': 0.00001,
                'wd': 1e-3
            })
        else:
            trainer = gluon.Trainer(net.collect_params(), 'sgd', {
                'learning_rate': 0.000001,
                'momentum': 0.9,
                'wd': 1e-3
            })
        train_loss, n, = 0.0, 0.0
        # confusion-matrix accumulators for the test pass
        TP, TN, FP, FN = 0, 0, 0, 0
        start = time()
        for i, batch in enumerate(train_data):
            data, label, batch_size = get_batch(batch, ctx)
            losses = []
            with autograd.record():
                outputs = [net(X) for X in data]
                losses = [loss(yhat, y) for yhat, y in zip(outputs, label)]
            for l in losses:
                l.backward()
                sw.add_scalar(tag='cross_entropy',
                              value=l.mean().asscalar(),
                              global_step=global_step)
                global_step += 1
            train_loss += sum([l.sum().asscalar() for l in losses])
            n += batch_size
            trainer.step(batch_size)
        # evaluation on the (global) test_data after every epoch
        for data, label in test_data:
            data = data.as_in_context(ctx[0])
            label = label.as_in_context(ctx[0])
            pred = net(data)
            nd.waitall()
            pred = nd.sigmoid(pred)
            # binarize at 0.5; assumes 256x256 masks — TODO confirm
            pred = (pred > 0.5).reshape(-1, 256, 256)
            TPt = nd.sum(pred * label).asscalar()
            FPt = nd.sum(pred - (pred * label)).asscalar()
            FNt = nd.sum(label - (pred * label)).asscalar()
            TNt = nd.sum((1 - pred) * (1 - label)).asscalar()
            TP = TP + TPt
            FP = FP + FPt
            FN = FN + FNt
            TN = TN + TNt
        # 1e-15 guards against division by zero on empty classes
        ACC = (TP + TN) / (TP + TN + FP + FN + 1e-15)
        TPR = TP / (TP + FN + 1e-15)
        TNR = TN / (FP + TN + 1e-15)
        PPV = TP / (TP + FP + 1e-15)
        F1 = 2 * PPV * TPR / (PPV + TPR + 1e-15)
        sw.add_scalar(tag='test_acc', value=ACC, global_step=epoch_step)
        sw.add_scalar(tag='test_TPR', value=TPR, global_step=epoch_step)
        sw.add_scalar(tag='test_TNR', value=TNR, global_step=epoch_step)
        sw.add_scalar(tag='test_PPV', value=PPV, global_step=epoch_step)
        sw.add_scalar(tag='F1', value=F1, global_step=epoch_step)
        epoch_step += 1
        print('EPOCH', epoch)
        print('test_acc=', ACC)
        print('test_TPR=', TPR)
        print('test_TNR=', TNR)
        print('test_PPV=', PPV)
        print('F1=', F1)
        if F1 > best_F1:
            # checkpoint only when F1 improves
            net.save_parameters('u_e1.params')
            best_F1 = F1
        if epoch == 0:
            sw.add_graph(net)
        print('train_loss=', train_loss / n)
        print('time:', time() - start)
    sw.close()
    net.export("mynet", epoch)
def train(epochs, ctx):
    """Train the module-level ``net`` on CIFAR-10 with optional mixup/AMP.

    Heavily coupled to module globals: ``net``, ``config``, ``sw``,
    ``lr_sch``, ``optimizer``, ``batch_size``, ``num_workers``, ``sig_*``
    Qt-style signals, ``csv_writer``/``csv_file``, ``plot_name``,
    ``model_name``, ``save_dir``, ``save_period``, ``classes``,
    ``label_transform``, ``check_flag`` and ``test`` — all defined elsewhere
    in the enclosing script.

    :param epochs: number of epochs to run.
    :param ctx: context or list of contexts for data-parallel training.
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    # either fresh-initialize or resume from a parameter file
    if config.train_cfg.param_init:
        init_func = getattr(mx.init, config.train_cfg.init)
        net.initialize(init_func(), ctx=ctx, force_reinit=True)
    else:
        net.load_parameters(config.train_cfg.param_file, ctx=ctx)
    summary(net, stat_name, nd.uniform(
        shape=(1, 3, imgsize, imgsize), ctx=ctx[0]))
    net.hybridize()
    root = config.dir_cfg.dataset
    train_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(
            root=root, train=True).transform_first(transform_train),
        batch_size=batch_size, shuffle=True, last_batch='discard',
        num_workers=num_workers)
    val_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(
            root=root, train=False).transform_first(transform_test),
        batch_size=batch_size, shuffle=False, num_workers=num_workers)
    trainer_arg = {'learning_rate': config.lr_cfg.lr,
                   'wd': config.lr_cfg.wd, 'lr_scheduler': lr_sch}
    # NOTE(review): eval() on a config string — fine for trusted config
    # files, dangerous if the config can come from an untrusted source.
    extra_arg = eval(config.lr_cfg.extra_arg)
    trainer_arg.update(extra_arg)
    trainer = gluon.Trainer(net.collect_params(), optimizer, trainer_arg)
    if config.train_cfg.amp:
        amp.init_trainer(trainer)
    metric = mx.metric.Accuracy()
    train_metric = mx.metric.RMSE()
    # mixup produces soft labels, so sparse_label must be False then
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(
        sparse_label=False if config.data_cfg.mixup else True)
    train_history = TrainingHistory(['training-error', 'validation-error'])
    loss_history = TrainingHistory(['training-loss', 'validation-loss'])
    iteration = 0
    best_val_score = 0
    sig_state.emit(1)
    sig_pgbar.emit(0)
    for epoch in range(epochs):
        tic = time.time()
        train_metric.reset()
        metric.reset()
        train_loss = 0
        num_batch = len(train_data)
        alpha = 1  # Beta(1, 1) = uniform mixup coefficient
        for i, batch in enumerate(train_data):
            # one-shot profiler / graph dump on the second iteration
            if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                profiler.set_state('run')
                is_profiler_run = True
            if epoch == 0 and iteration == 1 and config.save_cfg.tensorboard:
                sw.add_graph(net)
            lam = np.random.beta(alpha, alpha)
            # disable mixup for the last 20 epochs (or when turned off)
            if epoch >= epochs - 20 or not config.data_cfg.mixup:
                lam = 1
            data_1 = gluon.utils.split_and_load(
                batch[0], ctx_list=ctx, batch_axis=0)
            label_1 = gluon.utils.split_and_load(
                batch[1], ctx_list=ctx, batch_axis=0)
            if not config.data_cfg.mixup:
                data = data_1
                label = label_1
            else:
                # mix each sample with the batch reversed along axis 0
                data = [lam*X + (1-lam)*X[::-1] for X in data_1]
                label = []
                for Y in label_1:
                    y1 = label_transform(Y, classes)
                    y2 = label_transform(Y[::-1], classes)
                    label.append(lam*y1 + (1-lam)*y2)
            with ag.record():
                output = [net(X) for X in data]
                loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
            if config.train_cfg.amp:
                with ag.record():
                    with amp.scale_loss(loss, trainer) as scaled_loss:
                        ag.backward(scaled_loss)
            else:
                for l in loss:
                    l.backward()
            trainer.step(batch_size)
            train_loss += sum([l.sum().asscalar() for l in loss])
            output_softmax = [nd.SoftmaxActivation(out) for out in output]
            train_metric.update(label, output_softmax)      # RMSE vs (soft) labels
            metric.update(label_1, output_softmax)          # accuracy vs hard labels
            name, acc = train_metric.get()
            if config.save_cfg.tensorboard:
                sw.add_scalar(tag='lr', value=trainer.learning_rate,
                              global_step=iteration)
            if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                nd.waitall()
                profiler.set_state('stop')
                profiler.dump()
            iteration += 1
            sig_pgbar.emit(iteration)
            # cooperative pause/stop requested through check_flag()
            if check_flag()[0]:
                sig_state.emit(2)
                while(check_flag()[0] or check_flag()[1]):
                    if check_flag()[1]:
                        print('stop')
                        return
                    else:
                        time.sleep(5)
                        print('pausing')
        epoch_time = time.time() - tic
        train_loss /= batch_size * num_batch
        name, acc = train_metric.get()
        _, train_acc = metric.get()
        name, val_acc, _ = test(ctx, val_data)
        train_history.update([1-train_acc, 1-val_acc])
        plt.cla()
        train_history.plot(save_path='%s/%s_history.png' %
                           (plot_name, model_name))
        if val_acc > best_val_score:
            best_val_score = val_acc
            net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' %
                                (save_dir, best_val_score, model_name, epoch))
        current_lr = trainer.learning_rate
        # NOTE(review): test() runs twice per epoch (above for val_acc, here
        # for val_loss) — presumably for the loss value only; confirm.
        name, val_acc, val_loss = test(ctx, val_data)
        logging.info('[Epoch %d] loss=%f train_acc=%f train_RMSE=%f\n     val_acc=%f val_loss=%f lr=%f time: %f' %
                     (epoch, train_loss, train_acc, acc,
                      val_acc, val_loss, current_lr, epoch_time))
        loss_history.update([train_loss, val_loss])
        plt.cla()
        loss_history.plot(save_path='%s/%s_loss.png' % (plot_name, model_name),
                          y_lim=(0, 2), legend_loc='best')
        if config.save_cfg.tensorboard:
            sw._add_scalars(tag='Acc',
                            scalar_dict={'train_acc': train_acc,
                                         'test_acc': val_acc},
                            global_step=epoch)
            sw._add_scalars(tag='Loss',
                            scalar_dict={'train_loss': train_loss,
                                         'test_loss': val_loss},
                            global_step=epoch)
        sig_table.emit([epoch, train_loss, train_acc,
                        val_loss, val_acc, current_lr, epoch_time])
        csv_writer.writerow([epoch, train_loss, train_acc,
                             val_loss, val_acc, current_lr, epoch_time])
        csv_file.flush()
        if save_period and save_dir and (epoch + 1) % save_period == 0:
            net.save_parameters('%s/cifar10-%s-%d.params' %
                                (save_dir, model_name, epoch))
    if save_period and save_dir:
        net.save_parameters('%s/cifar10-%s-%d.params' %
                            (save_dir, model_name, epochs-1))
def benchmark(net, X): start = time.time() for i in range(1000): _ = net(X) nd.waitall() return time.time() - start
def _engine_cond(cond_type='scaffold',
                 file_name='datasets/ChEMBL_scaffold.txt',
                 num_scaffolds=734,
                 is_full=False,
                 ckpt_dir='ckpt/scaffold',
                 num_folds=5,
                 fold_id=0,
                 batch_size=50,
                 batch_size_test=100,
                 num_workers=2,
                 k=5,
                 p=0.8,
                 F_e=16,
                 F_h=(32, 64, 128, 128, 256, 256),
                 F_skip=256,
                 F_c=(512, ),
                 Fh_policy=128,
                 activation='relu',
                 N_rnn=3,
                 gpu_ids=(0, 1, 2, 3),
                 lr=1e-3,
                 decay=0.015,
                 decay_step=100,
                 clip_grad=3.0,
                 iterations=30000,
                 summary_step=200):
    """Train a conditional molecular RNN generator on multiple GPUs.

    Builds the conditional dataset/loader pair for ``cond_type``
    ('scaffold' | 'prop' | 'kinase'), constructs ``CVanillaMolGen_RNN``,
    and runs up to ``iterations`` Adam steps, periodically writing loss to
    ``log.out`` and checkpointing model+trainer state under ``ckpt_dir``.
    Training resumes automatically when log/params/status files all exist.

    Fix over the previous revision: the kinase-free k-fold branch built its
    test sampler with ``len(l.split('\\t'[0]))`` — i.e. the number of
    tab-separated fields — instead of ``len(l.split('\\t')[0])``, the length
    of the first field (the SMILES string), which is the cost used by every
    other ``BalancedSampler`` here. The misplaced bracket is corrected.

    :param is_full: train on the full dataset (no held-out test fold).
    :param num_folds, fold_id: k-fold split used when ``is_full`` is False.
    :param k, p: loader sub-sampling parameters forwarded to CMolRNNLoader.
    :param F_e..N_rnn: model hyper-parameters (persisted to configs.json).
    :param decay, decay_step: multiplicative LR decay schedule.
    :param summary_step: interval (steps) for logging and checkpointing.
    """
    # Resume only when all three checkpoint artifacts are present.
    is_continuous = all(
        os.path.isfile(os.path.join(ckpt_dir, _n))
        for _n in ['log.out', 'ckpt.params', 'trainer.status'])

    if is_full:
        # ---- full-dataset training: no test fold (except kinase CV) ----
        if cond_type != 'kinase':
            if cond_type == 'scaffold':
                cond = data.SparseFP(num_scaffolds)
                N_C = num_scaffolds
            elif cond_type == 'prop':
                cond = data.Delimited()
                N_C = 2
            else:
                raise ValueError
            with open(file_name) as f:
                dataset = data.Lambda(f.readlines(),
                                      lambda _x: _x.strip('\n').strip('\r'))
            # sampler cost = length of the first (SMILES) field
            sampler_train = data.BalancedSampler(
                cost=[len(l.split('\t')[0]) for l in dataset],
                batch_size=batch_size)
            loader_train = data.CMolRNNLoader(dataset,
                                              batch_sampler=sampler_train,
                                              num_workers=num_workers,
                                              k=k, p=p, conditional=cond)
            loader_test = []
        else:
            # kinase: fold column in the last field selects train/test rows
            cond = data.Delimited()
            N_C = 2
            with open(file_name) as f:
                dataset = data.Lambda(f.readlines(),
                                      lambda _x: _x.strip('\n').strip('\r'))

            def _filter(_line, _i):
                return int(_line.split('\t')[-1]) == _i

            # drop the trailing fold column (last two chars) from each line
            db_train = data.Lambda(
                data.Filter(dataset, fn=lambda _x: not _filter(_x, fold_id)),
                fn=lambda _x: _x[:-2])
            db_test = data.Lambda(
                data.Filter(dataset, fn=lambda _x: _filter(_x, fold_id)),
                fn=lambda _x: _x[:-2])
            loader_test = data.CMolRNNLoader(db_test, shuffle=True,
                                             num_workers=num_workers,
                                             k=k, p=p, conditional=cond,
                                             batch_size=batch_size_test)
            loader_train = data.CMolRNNLoader(db_train, shuffle=True,
                                              num_workers=num_workers,
                                              k=k, p=p, conditional=cond,
                                              batch_size=batch_size)
        it_train, it_test = iter(loader_train), iter(loader_test)
    else:
        # ---- k-fold training with a held-out test fold ----
        if cond_type != 'kinase':
            if cond_type == 'scaffold':
                cond = data.SparseFP(num_scaffolds)
                N_C = num_scaffolds
            elif cond_type == 'prop':
                cond = data.Delimited()
                N_C = 2
            else:
                raise ValueError
            with open(file_name) as f:
                dataset = data.Lambda(f.readlines(),
                                      lambda _x: _x.strip('\n').strip('\r'))
            db_train = data.KFold(dataset, k=num_folds, fold_id=fold_id,
                                  is_train=True)
            db_test = data.KFold(dataset, k=num_folds, fold_id=fold_id,
                                 is_train=False)
            sampler_train = data.BalancedSampler(
                cost=[len(l.split('\t')[0]) for l in db_train],
                batch_size=batch_size)
            loader_train = data.CMolRNNLoader(db_train,
                                              batch_sampler=sampler_train,
                                              num_workers=num_workers,
                                              k=k, p=p, conditional=cond)
            # FIX: was len(l.split('\t'[0])) — counted fields, not the
            # length of the first field like the samplers above.
            sampler_test = data.BalancedSampler(
                cost=[len(l.split('\t')[0]) for l in db_test],
                batch_size=batch_size_test)
            loader_test = data.CMolRNNLoader(db_test,
                                             batch_sampler=sampler_test,
                                             num_workers=num_workers,
                                             k=k, p=p, conditional=cond)
        else:
            cond = data.Delimited()
            N_C = 2
            with open(file_name) as f:
                dataset = data.Lambda(f.readlines(),
                                      lambda _x: _x.strip('\n').strip('\r'))

            def _filter(_line, _i):
                return int(_line.split('\t')[-1]) == _i

            db_train = data.Lambda(
                data.Filter(dataset, fn=lambda _x: not _filter(_x, fold_id)),
                fn=lambda _x: _x[:-2])
            db_test = data.Lambda(
                data.Filter(dataset, fn=lambda _x: _filter(_x, fold_id)),
                fn=lambda _x: _x[:-2])
            loader_train = data.CMolRNNLoader(db_train, shuffle=True,
                                              num_workers=num_workers,
                                              k=k, p=p, conditional=cond,
                                              batch_size=batch_size)
            loader_test = data.CMolRNNLoader(db_test, shuffle=True,
                                             num_workers=num_workers,
                                             k=k, p=p, conditional=cond,
                                             batch_size=batch_size_test)
        it_train, it_test = iter(loader_train), iter(loader_test)

    # ---- build model (persist configs on a fresh run, reload on resume) ----
    if not is_continuous:
        configs = {
            'N_C': N_C,
            'F_e': F_e,
            'F_h': F_h,
            'F_skip': F_skip,
            'F_c': F_c,
            'Fh_policy': Fh_policy,
            'activation': activation,
            'rename': True,
            'N_rnn': N_rnn
        }
        with open(os.path.join(ckpt_dir, 'configs.json'), 'w') as f:
            json.dump(configs, f)
    else:
        with open(os.path.join(ckpt_dir, 'configs.json')) as f:
            configs = json.load(f)

    model = models.CVanillaMolGen_RNN(get_mol_spec().num_atom_types,
                                      get_mol_spec().num_bond_types,
                                      D=2, **configs)
    ctx = [mx.gpu(i) for i in gpu_ids]
    model.collect_params().initialize(mx.init.Xavier(), force_reinit=True,
                                      ctx=ctx)
    if not is_continuous:
        # fresh kinase runs warm-start from a backup checkpoint
        if cond_type == 'kinase':
            model.load_params(os.path.join(ckpt_dir, 'ckpt.params.bk'),
                              ctx=ctx, allow_missing=True)
    else:
        model.load_params(os.path.join(ckpt_dir, 'ckpt.params'), ctx=ctx)

    # ---- optimizer ----
    opt = mx.optimizer.Adam(learning_rate=lr, clip_gradient=clip_grad)
    trainer = gluon.Trainer(model.collect_params(), opt)
    if is_continuous:
        trainer.load_states(os.path.join(ckpt_dir, 'trainer.status'))

    # step counter / elapsed-time origin (recovered from log.out on resume)
    if not is_continuous:
        t0 = time.time()
        global_counter = 0
    else:
        with open(os.path.join(ckpt_dir, 'log.out')) as f:
            records = f.readlines()
        if records[-1] != 'Training finished\n':
            final_record = records[-1]
        else:
            final_record = records[-2]
        count, t_final = int(final_record.split('\t')[0]), float(
            final_record.split('\t')[1])
        t0 = time.time() - t_final * 60
        global_counter = count

    with open(os.path.join(ckpt_dir, 'log.out'),
              mode='w' if not is_continuous else 'a') as f:
        if not is_continuous:
            f.write('step\ttime(h)\tloss\tlr\n')
        while True:
            global_counter += 1
            # cycle the training iterator indefinitely
            try:
                inputs = [next(it_train) for _ in range(len(gpu_ids))]
            except StopIteration:
                it_train = iter(loader_train)
                inputs = [next(it_train) for _ in range(len(gpu_ids))]
            # move each shard to its GPU
            inputs = [
                data.CMolRNNLoader.from_numpy_to_tensor(input_i, j)
                for j, input_i in zip(gpu_ids, inputs)
            ]
            with autograd.record():
                loss = [(model(*input_i)).as_in_context(mx.gpu(gpu_ids[0]))
                        for input_i in inputs]
                loss = sum(loss) / len(gpu_ids)
            loss.backward()
            nd.waitall()
            gc.collect()
            trainer.step(batch_size=1)
            if global_counter % decay_step == 0:
                trainer.set_learning_rate(trainer.learning_rate * (1.0 - decay))
            if global_counter % summary_step == 0:
                if is_full:
                    loss = np.asscalar((sum(loss) / len(gpu_ids)).asnumpy())
                else:
                    # report held-out loss instead of training loss
                    del loss, inputs
                    gc.collect()
                    try:
                        inputs = [next(it_test) for _ in range(len(gpu_ids))]
                    except StopIteration:
                        it_test = iter(loader_test)
                        inputs = [next(it_test) for _ in range(len(gpu_ids))]
                    with autograd.predict_mode():
                        inputs = [
                            data.CMolRNNLoader.from_numpy_to_tensor(input_i, j)
                            for j, input_i in zip(gpu_ids, inputs)
                        ]
                        loss = [
                            (model(*input_i)).as_in_context(mx.gpu(gpu_ids[0]))
                            for input_i in inputs
                        ]
                        loss = np.asscalar(
                            (sum(loss) / len(gpu_ids)).asnumpy())
                model.save_params(os.path.join(ckpt_dir, 'ckpt.params'))
                trainer.save_states(os.path.join(ckpt_dir, 'trainer.status'))
                f.write('{}\t{}\t{}\t{}\n'.format(
                    global_counter, float(time.time() - t0) / 60, loss,
                    trainer.learning_rate))
                f.flush()
                del loss, inputs
                gc.collect()
            if global_counter >= iterations:
                break
        # save before exit
        model.save_params(os.path.join(ckpt_dir, 'ckpt.params'))
        trainer.save_states(os.path.join(ckpt_dir, 'trainer.status'))
        f.write('Training finished\n')
target.reshape(-1, )) L = batch_L / data.size hiddens = h L.backward() grads = [p.grad() for p in net.collect_params().values()] gluon.utils.clip_global_norm(grads, grad_clip) trainer.step(1) if mpi_rank == 0: params_prev = [ param.data().copy() for param in net.collect_params().values() ] else: params_prev = None nd.waitall() # broadcast params_prev = mpi_comm.bcast(params_prev, root=0) for param, param_prev in zip(net.collect_params().values(), params_prev): param.set_data(param_prev) if mpi_rank == 0: worker_list = list(range(mpi_size)) training_file_index_list = [i for i in range(len(training_files))] alpha = args.alpha randperm_choice_list = [] randperm_list = [i for i in range(args.nsplit)]
def test_expand_dims(): a = nd.array(np.ones((SMALL_Y, LARGE_X))) b = nd.expand_dims(a, axis=1) nd.waitall() assert b.shape == (SMALL_Y, 1, LARGE_X)
def benchmark(net, x): start = time.time() for i in range(1000): _ = net(x) nd.waitall() # 等待所有计算完成方便计时 return time.time() - start