                    help='Enable built-in profiler (0=off, 1=on)')
opt = parser.parse_args()

# global variables
logger.info('Starting new image-classification task: %s', opt)
mx.random.seed(opt.seed)
model_name = opt.model
dataset_classes = {'mnist': 10, 'cifar10': 10, 'imagenet': 1000, 'dummy': 1000}
batch_size, dataset, classes = opt.batch_size, opt.dataset, dataset_classes[opt.dataset]
context = [mx.gpu(int(i)) for i in opt.gpus.split(',')] if opt.gpus.strip() else [mx.cpu()]
num_gpus = len(context)
batch_size *= max(1, num_gpus)
lr_steps = [int(x) for x in opt.lr_steps.split(',') if x.strip()]
metric = CompositeEvalMetric([Accuracy(), TopKAccuracy(5)])


def get_model(model, ctx, opt):
    """Model initialization."""
    kwargs = {'ctx': ctx, 'pretrained': opt.use_pretrained, 'classes': classes}
    if model.startswith('resnet'):
        kwargs['thumbnail'] = opt.use_thumbnail
    elif model.startswith('vgg'):
        kwargs['batch_norm'] = opt.batch_norm
    net = models.get_model(model, **kwargs)
    if opt.resume:
        net.load_params(opt.resume)
    elif not opt.use_pretrained:
        if model in ['alexnet']:
model_name = opt.model
dataset_classes = {
    'mnist': 10,
    'cifar10': 10,
    'imagenet': 1000,
    'dummy': 1000,
    'sampleimgnet': 200
}
batch_size, dataset, classes = opt.batch_size, opt.dataset, dataset_classes[opt.dataset]
context = [mx.gpu(int(i)) for i in opt.gpus.split(',')] if opt.gpus.strip() else [mx.cpu()]
num_gpus = len(context)
batch_size *= max(1, num_gpus)
lr_steps = [int(x) for x in opt.lr_steps.split(',') if x.strip()]
metric = CompositeEvalMetric([Accuracy(), TopKAccuracy(5), CrossEntropy()])


def get_model(model, ctx, opt):
    """Model initialization."""
    kwargs = {'ctx': ctx, 'pretrained': opt.use_pretrained, 'classes': classes}
    if model.startswith('resnet'):
        kwargs['thumbnail'] = opt.use_thumbnail
    elif model.startswith('vgg'):
        kwargs['batch_norm'] = opt.batch_norm
    net = models.get_model(model, **kwargs)
    if opt.resume:
        net.load_params(opt.resume)
    elif not opt.use_pretrained:
        if model in ['alexnet']:
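# A minimal, self-contained sketch (toy tensors assumed, not taken from the
# scripts above) of how such a composite metric is driven during evaluation:
# update() consumes batches of labels and per-class scores, and get() returns
# parallel name/value lists.
import mxnet as mx
from mxnet.metric import Accuracy, TopKAccuracy, CompositeEvalMetric

demo_metric = CompositeEvalMetric([Accuracy(), TopKAccuracy(5)])
labels = [mx.nd.array([1, 0, 2])]               # ground-truth class indices
preds = [mx.nd.random.uniform(shape=(3, 10))]   # one score per class
demo_metric.update(labels, preds)
for name, value in zip(*demo_metric.get()):
    print('{}={:.4f}'.format(name, value))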
def __init__(self):
    is_pair = True
    class_labels = ['0', '1']
    metric = Accuracy()
    super(LCQMCTask, self).__init__(class_labels, metric, is_pair)
def __init__(self):
    is_pair = False
    class_labels = ['0', '1']
    metric = Accuracy()
    super(ChnSentiCorpTask, self).__init__(class_labels, metric, is_pair)
def __init__(self):
    is_pair = True
    class_labels = ['not_entailment', 'entailment']
    metric = Accuracy()
    super(QNLITask, self).__init__(class_labels, metric, is_pair)
def __init__(self):
    is_pair = True
    class_labels = ['neutral', 'entailment', 'contradiction']
    metric = Accuracy()
    super(MNLITask, self).__init__(class_labels, metric, is_pair)
def get_metric():
    """Get the Accuracy metric."""
    return Accuracy()
def get_metric():
    """Get a composite metric of Accuracy and F1."""
    metric = CompositeEvalMetric()
    for child_metric in [Accuracy(), F1()]:
        metric.add(child_metric)
    return metric
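# Hedged toy example: mxnet.metric.F1 is a binary-classification metric, so
# update() expects two-column probability rows; Accuracy takes the argmax of
# the same rows. The printed values depend on the toy inputs.
import mxnet as mx
from mxnet.metric import Accuracy, F1, CompositeEvalMetric

m = CompositeEvalMetric()
for child in [Accuracy(), F1()]:
    m.add(child)
m.update(labels=[mx.nd.array([0, 1, 1])],
         preds=[mx.nd.array([[0.8, 0.2], [0.3, 0.7], [0.6, 0.4]])])
print(m.get())   # -> (['accuracy', 'f1'], [<acc>, <f1>])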
def __init__(self):
    is_pair = False
    class_labels = ['0', '1']
    self.metric = Accuracy()
    super(SSTTask, self).__init__(class_labels, self.metric, is_pair)
def main():
    epoches = 32
    gpu_id = 7
    ctx_list = [mx.gpu(x) for x in [7, 8]]
    log_interval = 100
    batch_size = 32
    start_epoch = 0
    # trainer_resume = resume + ".states" if resume is not None else None
    trainer_resume = None
    resume = None

    from mxnet.gluon.data.vision import transforms
    transform_fn = transforms.Compose([
        LeftTopPad(dest_shape=(256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406),
                             std=(0.229, 0.224, 0.225))
    ])
    dataset = CaptionDataSet(
        image_root="/data3/zyx/yks/coco2017/train2017",
        annotation_path="/data3/zyx/yks/coco2017/annotations/captions_train2017.json",
        transforms=transform_fn,
        feature_hdf5="output/train2017.h5")
    val_dataset = CaptionDataSet(
        image_root="/data3/zyx/yks/coco2017/val2017",
        annotation_path="/data3/zyx/yks/coco2017/annotations/captions_val2017.json",
        words2index=dataset.words2index,
        index2words=dataset.index2words,
        transforms=transform_fn,
        feature_hdf5="output/val2017.h5")
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=True, num_workers=1, pin_memory=True,
                            last_batch="discard")
    val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size,
                            shuffle=True, num_workers=1, pin_memory=True)
    num_words = dataset.words_count

    # set up logger
    save_prefix = "output/res50_"
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)

    net = EncoderDecoder(num_words=num_words,
                         test_max_len=val_dataset.max_len).cuda()
    for name, p in net.named_parameters():
        if "bias" in name:
            p.data.zero_()
        else:
            p.data.normal_(0, 0.01)
        print(name)
    net = torch.nn.DataParallel(net)
    if resume is not None:
        # strict=False tolerates missing/extra keys, mirroring the
        # allow_missing/ignore_extra flags of the original (MXNet-style) call
        net.module.load_state_dict(torch.load(resume), strict=False)
        logger.info("Resumed from checkpoint {}.".format(resume))
    trainer = torch.optim.Adam(
        params=filter(lambda p: p.requires_grad, net.parameters()), lr=4e-4)
    criterion = Criterion()
    accu_top3_metric = TopKAccuracy(top_k=3)
    accu_top1_metric = Accuracy(name="batch_accu")
    ctc_loss_metric = Loss(name="ctc_loss")
    alpha_metric = Loss(name="alpha_loss")
    batch_bleu = BleuMetric(name="batch_bleu",
                            pred_index2words=dataset.index2words,
                            label_index2words=dataset.index2words)
    epoch_bleu = BleuMetric(name="epoch_bleu",
                            pred_index2words=dataset.index2words,
                            label_index2words=dataset.index2words)
    btic = time.time()
    logger.info(batch_size)
    logger.info(num_words)
    logger.info(len(dataset.words2index))
    logger.info(len(dataset.index2words))
    logger.info(dataset.words2index["<PAD>"])
    logger.info(val_dataset.words2index["<PAD>"])
    logger.info(len(val_dataset.words2index))

    for nepoch in range(start_epoch, epoches):
        if nepoch > 15:
            # torch optimizers have no set_learning_rate(); update each group
            for group in trainer.param_groups:
                group["lr"] = 4e-5
        logger.info("Current lr: {}".format(trainer.param_groups[0]["lr"]))
        accu_top1_metric.reset()
        accu_top3_metric.reset()
        ctc_loss_metric.reset()
        alpha_metric.reset()
        epoch_bleu.reset()
        batch_bleu.reset()
        for nbatch, batch in enumerate(tqdm.tqdm(dataloader)):
            batch = [Variable(torch.from_numpy(x.asnumpy()).cuda())
                     for x in batch]
            data, label, label_len = batch
            label = label.long()
            label_len = label_len.long()
            max_len = label_len.max().data.cpu().numpy()
            net.train()
            outputs = net(data, label, max_len)
            predictions, alphas = outputs
            ctc_loss = criterion(predictions, label, label_len)
            loss2 = 1.0 * ((1. - alphas.sum(dim=1)) ** 2).mean()
            ((ctc_loss + loss2) / batch_size).backward()
            # clip gradients to [-5, 5] before the update step
            for group in trainer.param_groups:
                for param in group['params']:
                    if param.grad is not None:
                        param.grad.data.clamp_(-5, 5)
            trainer.step()
            if nbatch % 10 == 0:
                for n, l in enumerate(label_len):
                    l = int(l.data.cpu().numpy())
                    la = label[n, 1:l].data.cpu().numpy()
                    pred = predictions[n, :(l - 1)].data.cpu().numpy()
                    accu_top3_metric.update(mx.nd.array(la), mx.nd.array(pred))
                    accu_top1_metric.update(mx.nd.array(la), mx.nd.array(pred))
                    epoch_bleu.update(la, predictions[n, :].data.cpu().numpy())
                    batch_bleu.update(la, predictions[n, :].data.cpu().numpy())
                ctc_loss_metric.update(
                    None,
                    preds=mx.nd.array([ctc_loss.data.cpu().numpy()]) / batch_size)
                alpha_metric.update(
                    None, preds=mx.nd.array([loss2.data.cpu().numpy()]))
            if nbatch % log_interval == 0 and nbatch > 0:
                msg = ','.join(
                    '{}={:.3f}'.format(*metric.get()) for metric in [
                        epoch_bleu, batch_bleu, accu_top1_metric,
                        accu_top3_metric, ctc_loss_metric, alpha_metric])
                logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.format(
                        nepoch, nbatch,
                        log_interval * batch_size / (time.time() - btic), msg))
                btic = time.time()
                batch_bleu.reset()
                accu_top1_metric.reset()
                accu_top3_metric.reset()
                ctc_loss_metric.reset()
                alpha_metric.reset()
        net.eval()
        bleu, acc_top1 = validate(net, gpu_id=gpu_id, val_loader=val_loader,
                                  train_index2words=dataset.index2words,
                                  val_index2words=val_dataset.index2words)
        save_path = save_prefix + "_weights-%d-bleu-%.4f-%.4f.params" % (
            nepoch, bleu, acc_top1)
        torch.save(net.module.state_dict(), save_path)
        torch.save(trainer.state_dict(), save_path + ".states")
        logger.info("Saved checkpoint to {}.".format(save_path))
def fit(self, itr, ctx, epochs, batch_size, callbacks=None):
    # ADAM optimizer
    # opt_params = {'learning_rate': 0.001, 'beta1': 0.9, 'beta2': 0.999, 'epsilon': 1e-08}
    opt = mx.optimizer.create('adam')
    # SGD optimizer
    # opt = mx.optimizer.create('sgd')
    # AdaDelta optimizer
    # opt = mx.optimizer.create('adadelta')

    # initialize parameters
    # MXNet initializes weight matrices uniformly by drawing from [-0.07, 0.07];
    # bias parameters are all set to 0.
    # 'Xavier' is designed to keep the scale of gradients roughly the same in all layers.
    self._net.initialize(mx.init.Xavier(magnitude=2.3), ctx=ctx,
                         force_reinit=True)
    # fetch parameters
    params = self._net.collect_params()
    # trainer
    trainer = Trainer(params=params, optimizer=opt, kvstore='device')
    # loss function
    loss_fn = SoftmaxCrossEntropyLoss()
    # use accuracy as the evaluation metric
    metric = Accuracy()

    # train
    for e in range(epochs):
        if callbacks is not None:
            for cb in callbacks:
                cb.before_epoch(e)
        # reset evaluation result to initial state
        metric.reset()
        # reset the train data iterator
        itr.reset()
        # loop over the train data iterator
        for i, batch in enumerate(itr):
            # split train data into multiple slices along batch_axis
            # and copy each slice into a context
            data = split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0,
                                  even_split=False)
            # split train labels the same way
            label = split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0,
                                   even_split=False)
            outputs = []
            losses = []
            # inside training scope
            with ag.record():
                for x, y in zip(data, label):
                    z = self._net(x)
                    # compute softmax cross-entropy loss
                    l = loss_fn(z, y)
                    outputs.append(z)
                    losses.append(l)
            # backpropagate the error for one iteration
            for l in losses:
                l.backward()
            # make one step of parameter update; the trainer needs the batch
            # size to normalize the gradient by 1/batch_size
            trainer.step(batch_size)
            # update the internal evaluation metric
            metric.update(label, outputs)
            # invoke callbacks after batch
            if callbacks is not None:
                for cb in callbacks:
                    cb.after_batch(e, i, batch_size, metric)
        # invoke callbacks after epoch
        if callbacks is not None:
            for cb in callbacks:
                cb.after_epoch(e, i, batch_size, metric)
    return metric
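# fit() only assumes that each callback exposes before_epoch/after_batch/
# after_epoch with the signatures invoked in the loop above. A minimal,
# hypothetical callback satisfying that contract:
class PrintCallback(object):
    def before_epoch(self, epoch):
        print('starting epoch {}'.format(epoch))

    def after_batch(self, epoch, batch_idx, batch_size, metric):
        if batch_idx % 50 == 0:
            name, value = metric.get()
            print('[epoch {}][batch {}] {}={:.4f}'.format(
                epoch, batch_idx, name, value))

    def after_epoch(self, epoch, batch_idx, batch_size, metric):
        name, value = metric.get()
        print('epoch {} done: {}={:.4f}'.format(epoch, name, value))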
opt_params = {
    'beta2': 0.999,
    'epsilon': 1e-08
}
opt = mx.optimizer.create('adam', **opt_params)
# initialize parameters
model.initialize(force_reinit=True, ctx=ctx)
# fetch and broadcast parameters
params = model.collect_params()
if params is not None:
    hvd.broadcast_parameters(params, root_rank=0)
# create DistributedTrainer, a subclass of gluon.Trainer
trainer = hvd.DistributedTrainer(params, opt)
# loss function
loss_fn = SoftmaxCrossEntropyLoss()
# use accuracy as the evaluation metric
metric = Accuracy()

# train
start = time.perf_counter()
for epoch in range(1, EPOCHS + 1):
    # reset the train data iterator
    train_data.reset()
    for i, batch in enumerate(train_data):
        if i == 0:
            tick_0 = time.time()
        data = batch.data[0].as_in_context(ctx)
        label = batch.label[0].as_in_context(ctx)
        with ag.record():
            output = model(data.astype('float32', copy=False))
            loss = loss_fn(output, label)
        loss.backward()
        trainer.step(BATCH_SIZE)
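# The fragment above begins mid-script. A typical (assumed) preamble for
# Horovod-with-MXNet initializes Horovod and pins each worker to its local
# GPU before any parameters are created; hvd.init(), hvd.local_rank() and
# hvd.broadcast_parameters() are standard horovod.mxnet calls.
import mxnet as mx
import horovod.mxnet as hvd

hvd.init()
ctx = mx.gpu(hvd.local_rank()) if mx.context.num_gpus() else mx.cpu()
# a common convention (an assumption here, not from the fragment) scales the
# base learning rate by the worker count:
# opt_params['learning_rate'] = base_lr * hvd.size()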
def get_metric(cls):
    """Get a composite metric of Accuracy and micro-averaged F1."""
    metric = CompositeEvalMetric()
    for child_metric in [Accuracy(), F1(average='micro')]:
        metric.add(child_metric)
    return metric
def load_net(param_file="net.params", ctx=cpu(0)):
    net = SimpleNet()
    net.load_parameters(param_file, ctx=ctx)
    return net


def get_val_data(transformer, batch_size=128):
    mnist_valid = gluon.data.vision.FashionMNIST(train=False)
    valid_data = gluon.data.DataLoader(
        mnist_valid.transform_first(transformer),
        batch_size=batch_size,
        num_workers=4)
    return valid_data


if __name__ == "__main__":
    ctx = gpu(0) if context.num_gpus() else cpu(0)
    net = load_net("net.params", ctx=ctx)
    # transformer (assumed): ToTensor converts HWC uint8 images to CHW floats
    transformer = gluon.data.vision.transforms.ToTensor()
    valid_data = get_val_data(transformer)
    val_acc = Accuracy()
    for data, label in valid_data:
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        with autograd.predict_mode():
            out = net(data)
        val_acc.update(label, out)
    print("Accuracy:", val_acc.get()[1])
def __init__(self):
    is_pair = False
    class_labels = ['0', '1', '2', '3']
    metric = Accuracy()
    super(Weibo2Task, self).__init__(class_labels, metric, is_pair)
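# Each task __init__ above hands (class_labels, metric, is_pair) to a base
# class defined elsewhere in its project (GluonNLP's GlueTask follows this
# shape). A minimal, hypothetical base class matching that contract:
class BaseClassificationTask(object):
    def __init__(self, class_labels, metric, is_pair):
        self.class_labels = class_labels  # label vocabulary of the dataset
        self.metric = metric              # EvalMetric used during evaluation
        self.is_pair = is_pair            # True for sentence-pair inputs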