def test_mobilenetv2_quant():
    """End-to-end QAT smoke test: train MobileNetV2 with quantization-aware
    training on Ascend and assert per-step time and average loss stay under
    fixed thresholds."""
    set_seed(1)
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    config = config_ascend_quant
    print("training configure: {}".format(config))

    epoch_size = config.epoch_size

    # define network
    network = mobilenetV2(num_classes=config.num_classes)
    # define loss
    if config.label_smooth > 0:
        loss = CrossEntropyWithLabelSmooth(
            smooth_factor=config.label_smooth, num_classes=config.num_classes)
    else:
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    # define dataset
    # NOTE(review): `dataset_path` is not defined in this function — presumably
    # a module-level global set elsewhere; confirm it exists before running.
    dataset = create_dataset(dataset_path=dataset_path,
                             config=config,
                             repeat_num=1,
                             batch_size=config.batch_size)
    step_size = dataset.get_dataset_size()

    # convert fusion network to quantization aware network
    quantizer = QuantizationAwareTraining(bn_fold=True,
                                          per_channel=[True, False],
                                          symmetric=[True, False])
    network = quantizer.quantize(network)

    # get learning rate: warmup then decay, offset by start_epoch
    lr = Tensor(get_lr(global_step=config.start_epoch * step_size,
                       lr_init=0,
                       lr_end=0,
                       lr_max=config.lr,
                       warmup_epochs=config.warmup_epochs,
                       total_epochs=epoch_size + config.start_epoch,
                       steps_per_epoch=step_size))

    # define optimization: only trainable parameters go to the optimizer
    opt = nn.Momentum(filter(lambda x: x.requires_grad, network.get_parameters()),
                      lr, config.momentum, config.weight_decay)
    # define model
    model = Model(network, loss_fn=loss, optimizer=opt)

    print("============== Starting Training ==============")
    monitor = Monitor(lr_init=lr.asnumpy(),
                      step_threshold=config.step_threshold)
    callback = [monitor]
    model.train(epoch_size, dataset, callbacks=callback,
                dataset_sink_mode=False)
    print("============== End Training ==============")

    # Regression thresholds: per-step milliseconds and mean step loss.
    export_time_used = 650
    train_time = monitor.step_mseconds
    print('train_time_used:{}'.format(train_time))
    assert train_time < export_time_used
    expect_avg_step_loss = 2.32
    avg_step_loss = np.mean(np.array(monitor.losses))
    print("average step loss:{}".format(avg_step_loss))
    assert avg_step_loss < expect_avg_step_loss
def main():
    """Evaluate a pose-estimation checkpoint on the COCO keypoint validation set."""
    # init seed
    set_seed(1)
    # set context
    # NOTE(review): int(os.getenv('DEVICE_ID')) raises TypeError if DEVICE_ID
    # is not exported — the environment variable is required here.
    device_id = int(os.getenv('DEVICE_ID'))
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        save_graphs=False,
                        device_id=device_id)
    args = parse_args()
    # update config with CLI overrides
    reset_config(config, args)
    # init model (inference graph only)
    model = get_pose_net(config, is_train=False)
    # load parameters from the checkpoint named in the config
    ckpt_name = config.TEST.MODEL_FILE
    print('loading model ckpt from {}'.format(ckpt_name))
    load_param_into_net(model, load_checkpoint(ckpt_name))
    # Data loading code
    valid_dataset, _ = keypoint_dataset(
        config,
        bbox_file=config.TEST.COCO_BBOX_FILE,
        train_mode=False,
        num_parallel_workers=args.workers,
    )
    # evaluate on validation set; output prefix derives from the ckpt file name
    validate(config, valid_dataset, model, ckpt_name.split('.')[0])
def create_dataset(data_path, is_train=True, batch_size=32):
    """Build the CIFAR-10 pipeline for the train or verify split.

    Returns a batched, single-repeat MindSpore dataset; the last ragged
    batch is dropped only while training.
    """
    # Imports kept local so the module can be imported without MindSpore.
    import mindspore.common.dtype as mstype
    import mindspore.dataset.engine as de
    import mindspore.dataset.transforms.c_transforms as C2
    import mindspore.dataset.vision.c_transforms as C
    from mindspore.common import set_seed

    set_seed(1)

    # Shard across devices only when both RANK_SIZE and RANK_ID are exported.
    num_shards = shard_id = None
    size_env, id_env = os.getenv("RANK_SIZE"), os.getenv("RANK_ID")
    if size_env is not None and id_env is not None:
        num_shards, shard_id = int(size_env), int(id_env)

    # Pick the on-disk split directory.
    split_dir = "cifar-10-batches-bin" if is_train else "cifar-10-verify-bin"
    data_path = os.path.join(data_path, split_dir)
    ds = de.Cifar10Dataset(data_path,
                           shuffle=True,
                           num_shards=num_shards,
                           shard_id=shard_id,
                           num_parallel_workers=8,
                           num_samples=None)

    # Random augmentations apply to training only; the resize/rescale/
    # normalize/layout tail is shared by both splits.
    comps_ops = []
    if is_train:
        comps_ops += [C.RandomCrop((32, 32), (4, 4, 4, 4)),
                      C.RandomHorizontalFlip(prob=0.5)]
    comps_ops += [
        C.Resize((224, 224)),
        C.Rescale(1 / 255.0, 0.),
        C.Normalize(mean=[0.4914, 0.4822, 0.4465],
                    std=[0.2023, 0.1994, 0.2010]),
        C.HWC2CHW(),
    ]

    ds = ds.map(input_columns=["image"], operations=comps_ops,
                num_parallel_workers=8)
    ds = ds.map(input_columns=["label"], operations=C2.TypeCast(mstype.int32),
                num_parallel_workers=8)

    ds = ds.batch(batch_size=batch_size, drop_remainder=is_train)
    ds = ds.repeat(count=1)
    return ds
def train_and_eval(config):
    """ train_and_eval """
    set_seed(1000)
    data_path = config.data_path
    epochs = config.epochs
    print("epochs is {}".format(epochs))
    # Both loaders are sharded by rank; epochs=1 here because model.train
    # drives the epoch loop itself.
    ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                              batch_size=config.batch_size,
                              is_tf_dataset=config.is_tf_dataset,
                              rank_id=get_rank(),
                              rank_size=get_group_size())
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=config.batch_size,
                             is_tf_dataset=config.is_tf_dataset,
                             rank_id=get_rank(),
                             rank_size=get_group_size())
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config)
    # Only save the last checkpoint at the last epoch. For saving at each
    # epoch, reduce save_checkpoint_steps accordingly.
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size() * config.epochs,
        keep_checkpoint_max=10)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
                                 config=ckptconfig)
    callback_list = [
        TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback
    ]
    # Only rank 0 writes checkpoints to avoid concurrent writers.
    if int(get_rank()) == 0:
        callback_list.append(ckpoint_cb)
    model.train(epochs, ds_train,
                callbacks=callback_list,
                sink_size=ds_train.get_dataset_size())
def test_using_diffserent_seed_for_initializer():
    """Nets initialized under different global seeds must differ in every parameter."""
    initialized = []
    for seed in (0, 1):
        set_seed(seed)
        candidate = ParameterNet()
        candidate.init_parameters_data()
        initialized.append(candidate)
    first, second = initialized

    other_params = second.parameters_dict()
    for name, param in first.parameters_dict().items():
        # Every parameter must exist in both nets and hold different values.
        assert name in other_params
        assert not allclose(param.data.asnumpy(),
                            other_params[name].data.asnumpy())
def create_dataset(batch_size=32):
    """Build a one-sample GeneratorDataset pipeline for smoke testing."""
    # Local imports keep MindSpore optional at module-import time.
    import mindspore.dataset.engine as de
    import numpy as np
    from mindspore.common import set_seed

    set_seed(1)

    # Shard only when both RANK_SIZE and RANK_ID are present in the env.
    num_shards = shard_id = None
    size_env, id_env = os.getenv("RANK_SIZE"), os.getenv("RANK_ID")
    if size_env is not None and id_env is not None:
        num_shards, shard_id = int(size_env), int(id_env)

    class BaseDataset:
        """Random-access source holding a single (image, label) pair."""

        def __init__(self):
            self.samples = []
            self._load_samples()

        def _load_samples(self):
            # One 3x4x5 float image plus a scalar int label in [0, 10).
            self.samples.append([
                np.random.rand(3, 4, 5).astype(np.float32),
                np.random.randint(10, size=()).astype(np.int32),
            ])

        def __getitem__(self, index):
            image, label = self.samples[index]
            return image, label

        def __len__(self):
            return len(self.samples)

    ds = de.GeneratorDataset(source=BaseDataset(),
                             column_names=['image', 'label'],
                             num_shards=num_shards,
                             shard_id=shard_id)
    # Identity map keeps the column while exercising the map stage.
    ds = ds.map(input_columns=["image"], operations=lambda img: img,
                num_parallel_workers=8)
    ds = ds.batch(batch_size=batch_size, drop_remainder=False)
    ds = ds.repeat(count=1)
    return ds
def create_dataset(data_path, is_train=True, batch_size=32):
    """Build an ImageFolder (ImageNet-style) pipeline for train or val."""
    # Local imports keep MindSpore optional at module-import time.
    import mindspore.common.dtype as mstype
    import mindspore.dataset.engine as de
    import mindspore.dataset.transforms.c_transforms as C2
    import mindspore.dataset.vision.c_transforms as C
    from mindspore.common import set_seed

    set_seed(1)

    # Shard only when both RANK_SIZE and RANK_ID are present in the env.
    num_shards = shard_id = None
    size_env, id_env = os.getenv("RANK_SIZE"), os.getenv("RANK_ID")
    if size_env is not None and id_env is not None:
        num_shards, shard_id = int(size_env), int(id_env)

    data_path = os.path.join(data_path, "train" if is_train else "val")
    ds = de.ImageFolderDataset(data_path,
                               shuffle=True,
                               num_parallel_workers=8,
                               num_shards=num_shards,
                               shard_id=shard_id,
                               num_samples=None)

    # Train: fused decode+random-crop plus flips/color jitter.
    # Val: plain decode with deterministic resize and center crop.
    if is_train:
        comps_ops = [
            C.RandomCropDecodeResize(224, scale=(0.08, 1.0), ratio=(0.75, 1.333)),
            C.RandomHorizontalFlip(prob=0.5),
            C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4),
        ]
    else:
        comps_ops = [C.Decode(), C.Resize(224), C.CenterCrop(224)]
    # Shared tail: scale to [0, 1], ImageNet normalization, channels-first.
    comps_ops += [
        C.Rescale(1 / 255.0, 0.),
        C.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        C.HWC2CHW(),
    ]

    ds = ds.map(input_columns=["image"], operations=comps_ops,
                num_parallel_workers=8)
    ds = ds.map(input_columns=["label"], operations=C2.TypeCast(mstype.int32),
                num_parallel_workers=8)

    ds = ds.batch(batch_size=batch_size, drop_remainder=is_train)
    ds = ds.repeat(count=1)
    return ds
def csd_train(train_loader, net, opt):
    """Contrastive self-distillation (CSD) training loop.

    Manually iterates `train_loader` for `opt.epochs` epochs and saves a
    checkpoint every 10 epochs.
    """
    set_seed(1)
    device_id = int(os.getenv('DEVICE_ID', '0'))
    print("[CSD] Start Training...")
    step_size = train_loader.get_dataset_size()

    # Per-step LR schedule: halves every 200 epochs (integer division).
    lr = []
    for i in range(0, opt.epochs):
        cur_lr = opt.lr / (2 ** ((i + 1) // 200))
        lr.extend([cur_lr] * step_size)

    optim = nn.Adam(net.trainable_params(), learning_rate=lr,
                    loss_scale=opt.loss_scale)

    # net_with_loss = NetWithLossCell(net)
    # NOTE(review): contrastive hyper-parameters come from module-level `args`
    # rather than the `opt` argument — confirm this is intentional.
    net_with_loss = NetWithCSDLossCell(net, args.contra_lambda, args.neg_num)
    train_cell = TrainOneStepCell(net_with_loss, optim)
    net.set_train()
    eval_net = net

    # time_cb = TimeMonitor(data_size=step_size)
    # loss_cb = LossMonitor()
    # metrics = {
    #     "psnr": PSNR(rgb_range=opt.rgb_range, shave=True),
    # }
    # eval_cb = EvalCallBack(eval_net, eval_ds, args.test_every, step_size / opt.batch_size, metrics=metrics,
    #                        rank_id=rank_id)
    # cb = [time_cb, loss_cb]
    # config_ck = CheckpointConfig(save_checkpoint_steps=opt.ckpt_save_interval * step_size,
    #                              keep_checkpoint_max=opt.ckpt_save_max)
    # ckpt_cb = ModelCheckpoint(prefix=opt.filename, directory=opt.ckpt_save_path, config=config_ck)
    # if device_id == 0:
    #     cb += [ckpt_cb]

    for epoch in range(0, opt.epochs):
        epoch_loss = 0
        for iteration, batch in enumerate(train_loader.create_dict_iterator(), 1):
            # NOTE: `lr` is rebound here from the schedule list to the batch's
            # low-resolution tensor; the optimizer already holds the schedule,
            # so training is unaffected, but the shadowing is confusing.
            lr = batch["LR"]
            hr = batch["HR"]
            loss = train_cell(lr, hr, Tensor(opt.stu_width_mult), Tensor(1.0))
            epoch_loss += loss

        print(f"Epoch[{epoch}] loss: {epoch_loss.asnumpy()}")
        # with eval_net.set_train(False):
        #     do_eval(eval_ds, eval_net)

        # Save every 10 epochs (including epoch 0), overwriting the same file.
        if (epoch) % 10 == 0:
            print('===> Saving model...')
            save_checkpoint(net, f'./ckpt/{opt.filename}.ckpt')
def __init__(self, in_channels, out_channels, kernel_size, vocab_size, embedding_size,
             output_channels, target, sparse):
    """Build a small conv + embedding test network.

    Args mirror the wrapped layers: conv (in/out channels, kernel size),
    embedding lookup (vocab/embedding size, target device, sparse flag),
    and a ones-initialized bias of shape [output_channels].
    """
    super().__init__()
    # Fix the global seed so the 'normal' weight inits below are reproducible;
    # layer construction order matters for the draw sequence.
    set_seed(5)
    self.relu = ReLU()
    self.conv = Conv2d(in_channels=in_channels,
                       out_channels=out_channels,
                       kernel_size=kernel_size,
                       has_bias=True,
                       weight_init='normal')
    self.batchnorm = BatchNorm2d(num_features=out_channels)
    self.embedding_lookup = EmbeddingLookup(vocab_size=vocab_size,
                                            embedding_size=embedding_size,
                                            param_init='normal',
                                            target=target,
                                            sparse=sparse)
    self.flatten = Flatten()
    self.cast = op.Cast()
    self.bias = Parameter(Tensor(np.ones([output_channels]).astype(np.float32)), name='bias')
    self.biasadd = op.BiasAdd()
    # Cast target dtype used by the construct() graph.
    self.type = mindspore.int32
def train_and_eval(config):
    """ test_train_eval """
    set_seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    # Map configured dataset flavour onto the loader enum; default to H5.
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    print("epochs is {}".format(epochs))
    # epochs=1 per loader: model.train drives the epoch loop.
    ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                              batch_size=batch_size, rank_id=get_rank(),
                              rank_size=get_group_size(), data_type=dataset_type)
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=batch_size, rank_id=get_rank(),
                             rank_size=get_group_size(), data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(),
                                  keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
                                 config=ckptconfig)
    # Sanity-check evaluation of the untrained network before training starts.
    out = model.eval(ds_eval)
    print("=====" * 5 + "model.eval() initialized: {}".format(out))
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
    # Only rank 0 writes checkpoints to avoid concurrent writers.
    if get_rank() == 0:
        callback_list.append(ckpoint_cb)
    model.train(epochs, ds_train,
                callbacks=callback_list,
                sink_size=ds_train.get_dataset_size())
def create_dataset(data_path, is_train=True, batch_size=32):
    """Build the MNIST pipeline for the train or test split."""
    # Local imports keep MindSpore optional at module-import time.
    import mindspore.common.dtype as mstype
    import mindspore.dataset.engine as de
    import mindspore.dataset.transforms.c_transforms as C2
    import mindspore.dataset.vision.c_transforms as C
    from mindspore.common import set_seed
    from mindspore.dataset.vision import Inter

    set_seed(1)

    # Shard only when both RANK_SIZE and RANK_ID are present in the env.
    num_shards = shard_id = None
    size_env, id_env = os.getenv("RANK_SIZE"), os.getenv("RANK_ID")
    if size_env is not None and id_env is not None:
        num_shards, shard_id = int(size_env), int(id_env)

    data_path = os.path.join(data_path, "train" if is_train else "test")
    ds = de.MnistDataset(data_path, num_shards=num_shards, shard_id=shard_id)

    # Resize to 32x32, apply the two-stage rescale, then channels-first layout.
    comps_ops = [
        C.Resize((32, 32), interpolation=Inter.LINEAR),
        C.Rescale(1 / 0.3081, -1 * 0.1307 / 0.3081),
        C.Rescale(1 / 255., 0.),
        C.HWC2CHW(),
    ]

    ds = ds.map(input_columns=["image"], operations=comps_ops,
                num_parallel_workers=8)
    ds = ds.map(input_columns=["label"], operations=C2.TypeCast(mstype.int32),
                num_parallel_workers=8)

    ds = ds.shuffle(buffer_size=1000)
    ds = ds.batch(batch_size=batch_size, drop_remainder=is_train)
    ds = ds.repeat(count=1)
    return ds
def get_slice(rank):
    """Compile the parallel net as if running on `rank` and return w1's local slice.

    Temporarily overrides the global Hccl rank id so initializer sharding is
    computed for the requested rank, then restores the original rank id.
    """
    set_seed(1)
    hccl = Hccl()
    rank_save = hccl.rank_id
    hccl.rank_id = rank
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(device_num=8, global_rank=0)
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
    # Sharding strategies for the two ops inside Net.
    strategy1 = ((2, 1), (4, 1))
    strategy2 = ((2, 4), )
    context.set_context(mode=context.GRAPH_MODE)
    exe = me._executor
    x = Tensor(np.ones([32, 32]), dtype=ms.float32)
    # NOTE(review): `init_name` comes from the enclosing module scope.
    weight = initializer(init_name, [64, 32], ms.float32)
    net = Net(strategy1, strategy2, weight)
    net.set_auto_parallel()
    # Compilation triggers parameter slicing for the faked rank.
    exe.compile(net, x, auto_parallel_mode=True, phase='train')
    hccl.rank_id = rank_save
    return net.parameters_dict()['w1'].data.asnumpy()
def train_and_eval(config):
    """ test_train_eval """
    set_seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    # Map configured dataset flavour onto the loader enum; default to H5.
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    # Sink mode is only used for parameter-server training with a warm
    # embedding cache (see the model.train call below).
    parameter_server = bool(config.parameter_server)
    cache_enable = config.vocab_cache_size > 0
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                              batch_size=batch_size, data_type=dataset_type)
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=batch_size, data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(),
                                  keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path, config=ckptconfig)
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb]
    model.train(epochs, ds_train,
                callbacks=callback_list,
                dataset_sink_mode=(parameter_server and cache_enable))
def create_dataset(data_path, is_train=True, batch_size=32):
    """Build a MindRecord image pipeline: decode, normalize, resize, CHW."""
    # Local imports keep MindSpore optional at module-import time.
    import mindspore.dataset.engine as de
    import mindspore.dataset.vision.c_transforms as C
    from mindspore.common import set_seed

    set_seed(1)

    # Shard only when both RANK_SIZE and RANK_ID are present in the env.
    num_shards = shard_id = None
    size_env, id_env = os.getenv("RANK_SIZE"), os.getenv("RANK_ID")
    if size_env is not None and id_env is not None:
        num_shards, shard_id = int(size_env), int(id_env)

    ds = de.MindDataset(data_path,
                        columns_list=['data'],
                        shuffle=True,
                        num_shards=num_shards,
                        shard_id=shard_id,
                        num_parallel_workers=8,
                        num_samples=None)

    # One map call per op, preserving decode -> normalize -> resize -> CHW order.
    pipeline = (
        C.Decode(),
        C.Normalize(mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
                    std=[0.229 * 255, 0.224 * 255, 0.225 * 255]),
        C.Resize((224, 224)),
        C.HWC2CHW(),
    )
    for op in pipeline:
        ds = ds.map(input_columns=["data"], operations=op)

    ds = ds.batch(batch_size=batch_size, drop_remainder=is_train)
    ds = ds.repeat(count=1)
    return ds
def main():
    """Train the DFCNN + CTC speech model, optionally resuming from a checkpoint.

    All settings come from the module-level `config` mapping; when resuming,
    architecture and hyper-parameters are taken from the saved run's config.
    """
    set_seed(1)
    date = time.strftime("%Y%m%d%H%M%S", time.localtime())
    print(f'* Preparing to train model {date}')

    # ************** configuration ****************
    # - training setting
    resume = config['resume']
    if config['mode'] == 'PYNATIVE':
        mode = context.PYNATIVE_MODE
    else:
        mode = context.GRAPH_MODE
    device = config['device']
    device_id = config['device_id']
    dataset_sink_mode = config['dataset_sink_mode']

    # use in dataset
    div = 8

    # setting bias and padding: when resuming, the architecture flags must come
    # from the resumed run's config so the checkpoint shapes match.
    if resume:
        print('* Resuming model...')
        resume_config_log = config['resume_config_log']
        resume_config = get_eval_config(resume_config_log)
        if 'best_ckpt' in resume_config.keys():
            resume_model_path = resume_config['best_ckpt']
        else:
            resume_model_path = resume_config['latest_model']
            print('* [WARNING] Not using the best model, but latest saved model instead.')
        has_bias = resume_config['has_bias']
        use_dropout = resume_config['use_dropout']
        pad_mode = resume_config['pad_mode']
        if pad_mode == 'pad':
            padding = resume_config['padding']
        elif pad_mode == 'same':
            padding = 0
        else:
            raise ValueError(f"invalid pad mode: {pad_mode}!")
        best_acc = resume_config['best_acc']
        best_ckpt = resume_config['best_ckpt']
        print('* The best accuracy in dev dataset for the current resumed model is {:.2f}%'.format(
            best_acc * 100))
    else:
        has_bias = config['has_bias']
        use_dropout = config['use_dropout']
        pad_mode = config['pad_mode']
        if pad_mode == 'pad':
            padding = config['padding']
        elif pad_mode == 'same':
            padding = 0
        else:
            raise ValueError(f"invalid pad mode: {pad_mode}!")

    # hyper-parameters (same resume-vs-fresh split as above)
    if resume:
        batch_size = resume_config['batch_size']
        opt_type = resume_config['opt']
        use_dynamic_lr = resume_config['use_dynamic_lr']
        warmup_step = resume_config['warmup_step']
        warmup_ratio = resume_config['warmup_ratio']
    else:
        batch_size = config['batch_size']
        opt_type = config['opt']
        use_dynamic_lr = config['use_dynamic_lr']
        warmup_step = config['warmup_step']
        warmup_ratio = config['warmup_ratio']
    test_dev_batch_size = config['test_dev_batch_size']
    learning_rate = float(config['learning_rate'])
    epochs = config['epochs']
    loss_scale = config['loss_scale']

    # configuration of saving model checkpoint
    save_checkpoint_steps = config['save_checkpoint_steps']
    keep_checkpoint_max = config['keep_checkpoint_max']
    prefix = config['prefix'] + '_' + date
    model_dir = config['model_dir']

    # loss monitor
    loss_monitor_step = config['loss_monitor_step']
    # whether to use mindInsight summary
    use_summary = config['use_summary']

    # step_eval
    use_step_eval = config['use_step_eval']
    eval_step = config['eval_step']
    eval_epoch = config['eval_epoch']
    patience = config['patience']

    # eval in steps or epochs: eval_step == -1 means evaluate per epoch
    step_eval = True
    if eval_step == -1:
        step_eval = False
    # ************** end of configuration **************

    if device == 'GPU':
        context.set_context(mode=mode, device_target=device, device_id=device_id)
    elif device == 'Ascend':
        # ModelArts path: sync data/models via OBS; summaries are disabled.
        import moxing as mox
        from utils.const import DATA_PATH, MODEL_PATH, BEST_MODEL_PATH, LOG_PATH
        obs_datapath = config['obs_datapath']
        obs_saved_model = config['obs_saved_model']
        obs_best_model = config['obs_best_model']
        obs_log = config['obs_log']
        mox.file.copy_parallel(obs_datapath, DATA_PATH)
        mox.file.copy_parallel(MODEL_PATH, obs_saved_model)
        mox.file.copy_parallel(BEST_MODEL_PATH, obs_best_model)
        mox.file.copy_parallel(LOG_PATH, obs_log)
        context.set_context(mode=mode, device_target=device)
        use_summary = False

    # callbacks function
    callbacks = []

    # data
    train_loader, idx2label, label2idx = get_dataset(batch_size=batch_size,
                                                     phase='train',
                                                     test_dev_batch_size=test_dev_batch_size,
                                                     div=div,
                                                     num_parallel_workers=4)
    # eval_step == 0 means "once per epoch worth of steps"
    if eval_step == 0:
        eval_step = train_loader.get_dataset_size()

    # network
    net = DFCNN(num_classes=len(label2idx), padding=padding, pad_mode=pad_mode,
                has_bias=has_bias, use_dropout=use_dropout)

    # Criterion
    criterion = CTCLoss()

    # resume: restore network weights before wrapping with loss/train cells
    if resume:
        print("* Loading parameters...")
        param_dict = load_checkpoint(resume_model_path)
        # load the parameter into net
        load_param_into_net(net, param_dict)
        print(f'* Parameters loading from {resume_model_path} succeeded!')

    net.set_train(True)
    net.set_grad(True)

    # lr schedule
    if use_dynamic_lr:
        dataset_size = train_loader.get_dataset_size()
        learning_rate = Tensor(dynamic_lr(base_lr=learning_rate,
                                          warmup_step=warmup_step,
                                          warmup_ratio=warmup_ratio,
                                          epochs=epochs,
                                          steps_per_epoch=dataset_size), mstype.float32)
        print('* Using dynamic learning rate, which will be set up as :', learning_rate.asnumpy())

    # optim
    if opt_type == 'adam':
        opt = nn.Adam(net.trainable_params(), learning_rate=learning_rate,
                      beta1=0.9, beta2=0.999, weight_decay=0.0, eps=10e-8)
    elif opt_type == 'rms':
        opt = nn.RMSProp(params=net.trainable_params(), centered=True,
                         learning_rate=learning_rate, momentum=0.9,
                         loss_scale=loss_scale)
    elif opt_type == 'sgd':
        opt = nn.SGD(params=net.trainable_params(), learning_rate=learning_rate)
    else:
        raise ValueError(f"optimizer: {opt_type} is not supported for now!")

    if resume:
        # load the parameter into optimizer (restores optimizer state slots)
        load_param_into_net(opt, param_dict)

    # save_model
    config_ck = CheckpointConfig(save_checkpoint_steps=save_checkpoint_steps,
                                 keep_checkpoint_max=keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(prefix=prefix, directory=model_dir, config=config_ck)

    # logger
    the_logger = logger(config, date)
    log = Logging(logger=the_logger, model_ckpt=ckpt_cb)

    callbacks.append(ckpt_cb)
    callbacks.append(log)

    # Wrap: loss cell -> loss-scaled train-one-step cell.
    net = WithLossCell(net, criterion)
    scaling_sens = Tensor(np.full((1), loss_scale), dtype=mstype.float32)
    net = DFCNNCTCTrainOneStepWithLossScaleCell(net, opt, scaling_sens)
    net.set_train(True)
    model = Model(net)

    if use_step_eval:
        # step evaluation (note: `step_eval` is rebound from bool to callback)
        step_eval = StepAccInfo(model=model, name=prefix, div=div,
                                test_dev_batch_size=test_dev_batch_size,
                                step_eval=step_eval, eval_step=eval_step,
                                eval_epoch=eval_epoch, logger=the_logger,
                                patience=patience,
                                dataset_size=train_loader.get_dataset_size())
        callbacks.append(step_eval)

    # loss monitor
    loss_monitor = LossMonitor(loss_monitor_step)
    callbacks.append(loss_monitor)

    if use_summary:
        summary_dir = os.path.join(SUMMARY_DIR, date)
        if not os.path.exists(summary_dir):
            os.mkdir(summary_dir)
        # mindInsight
        summary_collector = SummaryCollector(summary_dir=summary_dir,
                                             collect_freq=1,
                                             max_file_size=4 * 1024 ** 3)
        callbacks.append(summary_collector)

    if resume:
        the_logger.update_acc_ckpt(best_acc, best_ckpt)

    print(f'* Start training...')
    model.train(epochs, train_loader, callbacks=callbacks,
                dataset_sink_mode=dataset_sink_mode)
def train():
    """Train model."""
    parser = argparse.ArgumentParser(description='GCN')
    parser.add_argument('--data_dir', type=str, default='./data/cora/cora_mr', help='Dataset directory')
    parser.add_argument('--seed', type=int, default=0, help='Random seed')
    parser.add_argument('--train_nodes_num', type=int, default=140, help='Nodes numbers for training')
    parser.add_argument('--eval_nodes_num', type=int, default=500, help='Nodes numbers for evaluation')
    parser.add_argument('--test_nodes_num', type=int, default=1000, help='Nodes numbers for test')
    parser.add_argument('--save_TSNE', type=ast.literal_eval, default=False, help='Whether to save t-SNE graph')
    args_opt = parser.parse_args()

    if not os.path.exists("ckpts"):
        os.mkdir("ckpts")
    set_seed(args_opt.seed)
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend", save_graphs=False)
    config = ConfigGCN()
    adj, feature, label_onehot, label = get_adj_features_labels(args_opt.data_dir)

    nodes_num = label_onehot.shape[0]
    # Node masks partition indices: [0, train) for training, the next
    # eval_nodes_num for validation, and the final test_nodes_num for test.
    train_mask = get_mask(nodes_num, 0, args_opt.train_nodes_num)
    eval_mask = get_mask(nodes_num, args_opt.train_nodes_num,
                         args_opt.train_nodes_num + args_opt.eval_nodes_num)
    test_mask = get_mask(nodes_num, nodes_num - args_opt.test_nodes_num, nodes_num)

    class_num = label_onehot.shape[1]
    gcn_net = GCN(config, adj, feature, class_num)
    gcn_net.add_flags_recursive(fp16=True)

    eval_net = LossAccuracyWrapper(gcn_net, label_onehot, eval_mask, config.weight_decay)
    train_net = TrainNetWrapper(gcn_net, label_onehot, train_mask, config)

    loss_list = []

    if args_opt.save_TSNE:
        # Capture the untrained embedding as frame 0 of the animation.
        out_feature = gcn_net()
        tsne_result = t_SNE(out_feature.asnumpy(), 2)
        graph_data = []
        graph_data.append(tsne_result)
        fig = plt.figure()
        scat = plt.scatter(tsne_result[:, 0], tsne_result[:, 1], s=2, c=label, cmap='rainbow')
        plt.title('t-SNE visualization of Epoch:0', fontsize='large', fontweight='bold',
                  verticalalignment='center')

    for epoch in range(config.epochs):
        t = time.time()

        train_net.set_train()
        train_result = train_net()
        train_loss = train_result[0].asnumpy()
        train_accuracy = train_result[1].asnumpy()

        eval_net.set_train(False)
        eval_result = eval_net()
        eval_loss = eval_result[0].asnumpy()
        eval_accuracy = eval_result[1].asnumpy()

        loss_list.append(eval_loss)
        print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(train_loss),
              "train_acc=", "{:.5f}".format(train_accuracy), "val_loss=", "{:.5f}".format(eval_loss),
              "val_acc=", "{:.5f}".format(eval_accuracy), "time=", "{:.5f}".format(time.time() - t))

        if args_opt.save_TSNE:
            out_feature = gcn_net()
            tsne_result = t_SNE(out_feature.asnumpy(), 2)
            graph_data.append(tsne_result)

        # Early stopping: validation loss above the mean of the recent window.
        if epoch > config.early_stopping and loss_list[-1] > np.mean(loss_list[-(config.early_stopping+1):-1]):
            print("Early stopping...")
            break

    save_checkpoint(gcn_net, "ckpts/gcn.ckpt")

    # Rebuild a fresh net and reload the checkpoint for the final test pass.
    gcn_net_test = GCN(config, adj, feature, class_num)
    load_checkpoint("ckpts/gcn.ckpt", net=gcn_net_test)
    gcn_net_test.add_flags_recursive(fp16=True)

    test_net = LossAccuracyWrapper(gcn_net_test, label_onehot, test_mask, config.weight_decay)
    t_test = time.time()
    test_net.set_train(False)
    test_result = test_net()
    test_loss = test_result[0].asnumpy()
    test_accuracy = test_result[1].asnumpy()
    print("Test set results:", "loss=", "{:.5f}".format(test_loss),
          "accuracy=", "{:.5f}".format(test_accuracy), "time=", "{:.5f}".format(time.time() - t_test))

    if args_opt.save_TSNE:
        ani = animation.FuncAnimation(fig, update_graph, frames=range(config.epochs + 1),
                                      fargs=(graph_data, scat, plt))
        ani.save('t-SNE_visualization.gif', writer='imagemagick')
# # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ """Evaluation NAML.""" from mindspore.common import set_seed from mindspore.train.serialization import load_checkpoint from src.naml import NAML, NAMLWithLossCell from src.option import get_args from src.dataset import MINDPreprocess from src.utils import NAMLMetric, get_metric if __name__ == '__main__': args = get_args("eval") set_seed(args.seed) net = NAML(args) net.set_train(False) net_with_loss = NAMLWithLossCell(net) load_checkpoint(args.checkpoint_path, net_with_loss) news_encoder = net.news_encoder user_encoder = net.user_encoder metric = NAMLMetric() mindpreprocess = MINDPreprocess(vars(args), dataset_path=args.eval_dataset_path) get_metric(args, mindpreprocess, news_encoder, user_encoder, metric)
def construct(self, minval, maxval):
    """Sample uniform values in [minval, maxval) with shape `self.shape`.

    NOTE(review): set_seed(20) re-fixes the global seed on every call, so
    repeated invocations restart the same draw stream — presumably intentional
    for a determinism test; confirm.
    """
    set_seed(20)
    return C.uniform(self.shape, minval, maxval, self.seed)
from mindspore import context from mindspore.context import ParallelMode from mindspore.communication.management import init, get_rank, get_group_size from mindspore.nn.optim.rmsprop import RMSProp from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor from mindspore.train.model import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.common import set_seed from mindspore.common import dtype as mstype from src.config import nasnet_a_mobile_config_gpu as cfg from src.dataset import create_dataset from src.nasnet_a_mobile import NASNetAMobileWithLoss from src.lr_generator import get_lr set_seed(cfg.random_seed) if __name__ == '__main__': parser = argparse.ArgumentParser( description='image classification training') parser.add_argument('--dataset_path', type=str, default='', help='Dataset path') parser.add_argument('--resume', type=str, default='', help='resume training with existed checkpoint') parser.add_argument('--is_distributed', action='store_true', default=False,
def train_net(distribute, imagenet):
    """Train net with finetune"""
    set_seed(1)
    device_id = int(os.getenv('DEVICE_ID', '0'))
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
                        save_graphs=False, device_id=device_id)

    # Select the training source: ImageNet pretraining, DIV2K super-resolution,
    # or the derain SRData variant.
    if imagenet == 1:
        train_dataset = ImgData(args)
    elif not args.derain:
        train_dataset = DIV2K(args, name=args.data_train, train=True, benchmark=False)
        train_dataset.set_scale(args.task_id)
    else:
        train_dataset = SRData(args, name=args.data_train, train=True, benchmark=False)
        train_dataset.set_scale(args.task_id)

    if distribute:
        init()
        rank_id = get_rank()
        rank_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=rank_size,
                                          gradients_mean=True)
        print('Rank {}, group_size {}'.format(rank_id, rank_size))
        if imagenet == 1:
            train_de_dataset = ds.GeneratorDataset(
                train_dataset,
                ["HR", "Rain", "LRx2", "LRx3", "LRx4", "scales", "filename"],
                num_shards=rank_size, shard_id=rank_id, shuffle=True)
        else:
            train_de_dataset = ds.GeneratorDataset(
                train_dataset, ["LR", "HR", "idx", "filename"],
                num_shards=rank_size, shard_id=rank_id, shuffle=True)
    else:
        if imagenet == 1:
            train_de_dataset = ds.GeneratorDataset(
                train_dataset,
                ["HR", "Rain", "LRx2", "LRx3", "LRx4", "scales", "filename"],
                shuffle=True)
        else:
            train_de_dataset = ds.GeneratorDataset(
                train_dataset, ["LR", "HR", "idx", "filename"], shuffle=True)

    # NOTE(review): this branch tests `args.imagenet` while the code above
    # tests the `imagenet` parameter — confirm they are always consistent.
    if args.imagenet == 1:
        # ImageNet batches go through a bicubic per-batch resize that also
        # renames columns to the generic (LR, HR, idx, filename) layout.
        resize_fuc = bicubic()
        train_de_dataset = train_de_dataset.batch(
            args.batch_size,
            input_columns=[
                "HR", "Rain", "LRx2", "LRx3", "LRx4", "scales", "filename"
            ],
            output_columns=["LR", "HR", "idx", "filename"],
            drop_remainder=True,
            per_batch_map=resize_fuc.forward)
    else:
        train_de_dataset = train_de_dataset.batch(args.batch_size, drop_remainder=True)

    train_loader = train_de_dataset.create_dict_iterator(output_numpy=True)
    net_m = IPT(args)
    print("Init net weights successfully")

    if args.pth_path:
        param_dict = load_checkpoint(args.pth_path)
        load_param_into_net(net_m, param_dict)
        print("Load net weight successfully")

    train_func = Trainer(args, train_loader, net_m)

    for epoch in range(0, args.epochs):
        train_func.update_learning_rate(epoch)
        train_func.train()
def construct(self, mean):
    """Draw Poisson-distributed samples of shape ``self.shape``.

    Args:
        mean: the Poisson rate parameter passed through to ``C.poisson``.

    Returns:
        A tensor of Poisson samples using the cell's stored shape and seed.
    """
    set_seed(20)
    samples = C.poisson(self.shape, mean, self.seed)
    return samples
# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ import numpy as np import mindspore.context as context import mindspore.nn as nn from mindspore import Tensor from mindspore.common import dtype as mstype from mindspore.ops import composite as C from mindspore.common import set_seed context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") set_seed(20) class Net(nn.Cell): def __init__(self, shape, seed=0): super(Net, self).__init__() self.shape = shape self.seed = seed def construct(self, mean): return C.poisson(self.shape, mean, self.seed) def test_net_1D(): seed = 10 shape = (3, 2, 4)
args = parser.parse_args()
# Select the dataset-specific config module based on the chosen dataset name.
if args.data_name == "ag":
    from src.config import config_ag as config
elif args.data_name == 'dbpedia':
    from src.config import config_db as config
elif args.data_name == 'yelp_p':
    from src.config import config_yelpp as config


def get_ms_timestamp():
    """Return the current wall-clock time in whole milliseconds."""
    t = time.time()
    return int(round(t * 1000))


set_seed(5)
# Module-level state used by the loss callback to track elapsed time.
time_stamp_init = False
time_stamp_first = 0
rank_id = os.getenv('DEVICE_ID')
context.set_context(mode=context.GRAPH_MODE,
                    save_graphs=False,
                    device_target="Ascend")


class LossCallBack(Callback):
    """
    Monitor the loss in training.
    If the loss is NAN or INF terminating training.
    Note:
# NOTE(review): this is the tail of a BERT pretraining setup function whose
# beginning lies outside this chunk; `allreduce_post`, `net_with_loss`,
# `optimizer`, `update_cell`, `accumulation_steps`, `enable_global_norm`,
# `cfg`, `ds`, `callback`, `new_repeat_count` and `args_opt` are bound above.
        # Gradient-accumulation wrapper: allreduce either after all
        # accumulation steps (post) or on each step.
        net_with_accumulation = (
            BertTrainAccumulationAllReducePostWithLossScaleCell
            if allreduce_post else
            BertTrainAccumulationAllReduceEachWithLossScaleCell)
        net_with_grads = net_with_accumulation(
            net_with_loss,
            optimizer=optimizer,
            scale_update_cell=update_cell,
            accumulation_steps=accumulation_steps,
            enable_global_norm=enable_global_norm)
    else:
        # No accumulation: plain one-step training cell.
        net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer)
    model = Model(net_with_grads)
    # Optionally convert to a THOR (second-order optimizer) model; frequency
    # controls how often the second-order information is refreshed.
    model = ConvertModelUtils().convert_to_thor_model(
        model,
        network=net_with_grads,
        optimizer=optimizer,
        frequency=cfg.Thor.frequency)
    model.train(new_repeat_count,
                ds,
                callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"),
                sink_size=args_opt.data_sink_steps)


if __name__ == '__main__':
    set_seed(0)
    run_pretrain()
def train():
    """Train EDSR on DIV2K with periodic PSNR evaluation (GPU, PyNative).

    Reads device placement from DEVICE_ID / RANK_ID / RANK_SIZE, optionally
    stages data from OBS when running on ModelArts, and checkpoints on
    device 0 only. Relies on module-level ``args`` and the project classes
    DIV2K/SRData/EDSR/PSNR/EvalCallBack imported at file scope.
    """
    set_seed(1)
    device_id = int(os.getenv('DEVICE_ID', '0'))
    rank_id = int(os.getenv('RANK_ID', '0'))
    device_num = int(os.getenv('RANK_SIZE', '1'))
    context.set_context(mode=context.PYNATIVE_MODE,
                        device_target="GPU",
                        save_graphs=False,
                        device_id=device_id)
    if device_num > 1:
        init()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            device_num=device_num,
            global_rank=device_id,
            gradients_mean=True)
    if args.modelArts_mode:
        # Running on ModelArts: copy the dataset from OBS to local cache.
        import moxing as mox
        local_data_url = '/cache/data'
        mox.file.copy_parallel(src_url=args.data_url, dst_url=local_data_url)
    train_dataset = DIV2K(args, name=args.data_train, train=True, benchmark=False)
    train_dataset.set_scale(args.task_id)
    print(len(train_dataset))
    train_de_dataset = ds.GeneratorDataset(train_dataset, ["LR", "HR"],
                                           num_shards=device_num,
                                           shard_id=rank_id,
                                           shuffle=True)
    train_de_dataset = train_de_dataset.batch(args.batch_size,
                                              drop_remainder=True)
    eval_dataset = SRData(args, name=args.data_test, train=False, benchmark=True)
    print(len(eval_dataset))
    eval_ds = ds.GeneratorDataset(eval_dataset, ['LR', 'HR'], shuffle=False)
    eval_ds = eval_ds.batch(1, drop_remainder=True)
    net_m = EDSR(args)
    print("Init net weights successfully")
    if args.ckpt_path:
        # BUG FIX: the guard tests args.ckpt_path but the original loaded
        # args.pth_path (copied from the IPT script); load the checkpoint
        # the guard actually checks.
        param_dict = load_checkpoint(args.ckpt_path)
        load_param_into_net(net_m, param_dict)
        print("Load net weight successfully")
    step_size = train_de_dataset.get_dataset_size()
    # Step-wise LR schedule: the LR is halved every 200 epochs.
    lr = []
    for i in range(0, args.epochs):
        cur_lr = args.lr / (2**((i + 1) // 200))
        lr.extend([cur_lr] * step_size)
    opt = nn.Adam(net_m.trainable_params(),
                  learning_rate=lr,
                  loss_scale=args.loss_scale)
    loss = nn.L1Loss()
    loss_scale_manager = DynamicLossScaleManager(
        init_loss_scale=args.init_loss_scale, scale_factor=2, scale_window=1000)
    eval_net = net_m
    model = Model(net_m,
                  loss_fn=loss,
                  optimizer=opt,
                  loss_scale_manager=loss_scale_manager)
    time_cb = TimeMonitor(data_size=step_size)
    loss_cb = LossMonitor()
    metrics = {
        "psnr": PSNR(rgb_range=args.rgb_range, shave=True),
    }
    # NOTE(review): step_size / args.batch_size is float division here;
    # presumably an integer step count is intended — confirm against
    # EvalCallBack's expected parameter type.
    eval_cb = EvalCallBack(eval_net,
                           eval_ds,
                           args.test_every,
                           step_size / args.batch_size,
                           metrics=metrics,
                           rank_id=rank_id)
    cb = [time_cb, loss_cb, eval_cb]
    config_ck = CheckpointConfig(
        save_checkpoint_steps=args.ckpt_save_interval * step_size,
        keep_checkpoint_max=args.ckpt_save_max)
    ckpt_cb = ModelCheckpoint(prefix=args.filename,
                              directory=args.ckpt_save_path,
                              config=config_ck)
    # Only device 0 writes checkpoints to avoid concurrent writers.
    if device_id == 0:
        cb += [ckpt_cb]
    model.train(args.epochs,
                train_de_dataset,
                callbacks=cb,
                dataset_sink_mode=True)
def construct(self, alpha, beta):
    """Draw Gamma-distributed samples of shape ``self.shape``.

    Args:
        alpha: the Gamma shape parameter forwarded to ``C.gamma``.
        beta: the Gamma rate parameter forwarded to ``C.gamma``.

    Returns:
        A tensor of Gamma samples using the cell's stored shape and seed.
    """
    set_seed(20)
    samples = C.gamma(self.shape, alpha, beta, self.seed)
    return samples
epoch_count=config.epochs, sink_mode=config.dataset_sink_mode, sink_step=config.dataset_sink_step) if config.test_dataset else None _build_training_pipeline(config=config, pre_training_dataset=pre_train_dataset, fine_tune_dataset=fine_tune_dataset, test_dataset=test_dataset) def _check_args(config): if not os.path.exists(config): raise FileNotFoundError("`config` is not existed.") if not isinstance(config, str): raise ValueError("`config` must be type of str.") if __name__ == '__main__': _rank_size = os.getenv('RANK_SIZE') args, _ = parser.parse_known_args() _check_args(args.config) _config = get_config(args.config) _config.dataset_schema = args.dataset_schema_train _config.pre_train_dataset = args.pre_train_dataset set_seed(_config.random_seed) if _rank_size is not None and int(_rank_size) > 1: train_parallel(_config) else: train_single(_config)
def train_and_eval(config):
    """ test_train_eval """
    set_seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    # Map the configured dataset-type string onto the DataType enum;
    # anything other than tfrecord/mindrecord falls back to H5.
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    parameter_server = bool(config.parameter_server)
    # NOTE(review): `cache_enable` is read from enclosing module scope, not
    # defined in this function — verify it is set before this is called.
    if cache_enable:
        config.full_batch = True
    print("epochs is {}".format(epochs))
    if config.full_batch:
        # Full-batch parallel: every rank sees the whole (group-sized) batch.
        context.set_auto_parallel_context(full_batch=True)
        ds.config.set_seed(1)
        ds_train = create_dataset(data_path,
                                  train_mode=True,
                                  epochs=1,
                                  batch_size=batch_size * get_group_size(),
                                  data_type=dataset_type)
        ds_eval = create_dataset(data_path,
                                 train_mode=False,
                                 epochs=1,
                                 batch_size=batch_size * get_group_size(),
                                 data_type=dataset_type)
    else:
        # Sharded data parallel: each rank reads its own shard.
        ds_train = create_dataset(data_path,
                                  train_mode=True,
                                  epochs=1,
                                  batch_size=batch_size,
                                  rank_id=get_rank(),
                                  rank_size=get_group_size(),
                                  data_type=dataset_type)
        ds_eval = create_dataset(data_path,
                                 train_mode=False,
                                 epochs=1,
                                 batch_size=batch_size,
                                 rank_id=get_rank(),
                                 rank_size=get_group_size(),
                                 data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))
    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
    if cache_enable:
        # Per-rank strategy checkpoint path so ranks do not overwrite
        # each other's parallel-strategy files.
        config.stra_ckpt = os.path.join(
            config.stra_ckpt + "-{}".format(get_rank()), "strategy.ckpt")
        context.set_auto_parallel_context(
            strategy_ckpt_save_file=config.stra_ckpt)
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config=config)
    if _is_role_worker():
        if cache_enable:
            ckptconfig = CheckpointConfig(
                save_checkpoint_steps=ds_train.get_dataset_size() * epochs,
                keep_checkpoint_max=1,
                integrated_save=False)
        else:
            ckptconfig = CheckpointConfig(
                save_checkpoint_steps=ds_train.get_dataset_size(),
                keep_checkpoint_max=5)
    else:
        # Non-worker roles (e.g. parameter server) keep a minimal checkpoint.
        ckptconfig = CheckpointConfig(save_checkpoint_steps=1,
                                      keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path + '/ckpt_' +
                                 str(get_rank()) + '/',
                                 config=ckptconfig)
    callback_list = [
        TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback
    ]
    # Only rank 0 writes model checkpoints.
    if get_rank() == 0:
        callback_list.append(ckpoint_cb)
    model.train(epochs,
                ds_train,
                callbacks=callback_list,
                dataset_sink_mode=bool(parameter_server and cache_enable))
from mindspore.common import dtype as mstype from mindspore.communication.management import get_rank from mindspore.train.model import Model from mindspore.train.loss_scale_manager import FixedLossScaleManager from mindspore.train.serialization import save_checkpoint from mindspore.common import set_seed from src.dataset import create_dataset, extract_features from src.lr_generator import get_lr from src.config import set_config from src.args import train_parse_args from src.utils import context_device_init, switch_precision, config_ckpoint from src.models import CrossEntropyWithLabelSmooth, define_net, load_ckpt set_seed(1) if __name__ == '__main__': args_opt = train_parse_args() args_opt.dataset_path = os.path.abspath(args_opt.dataset_path) config = set_config(args_opt) start = time.time() print(f"train args: {args_opt}\ncfg: {config}") #set context and device init context_device_init(config) # define network backbone_net, head_net, net = define_net(config, args_opt.is_training) dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, config=config)
def construct(self, mean, stddev):
    """Draw normally-distributed samples of shape ``self.shape``.

    Args:
        mean: mean of the normal distribution forwarded to ``C.normal``.
        stddev: standard deviation forwarded to ``C.normal``.

    Returns:
        A tensor of normal samples using the cell's stored shape and seed.
    """
    set_seed(20)
    samples = C.normal(self.shape, mean, stddev, self.seed)
    return samples