def modelarts_pre_process():
    '''modelarts pre process function.'''
    def unzip(zip_file, save_dir):
        import zipfile
        s_time = time.time()
        if not os.path.exists(os.path.join(save_dir, "face_recognition_dataset")):
            zip_isexist = zipfile.is_zipfile(zip_file)
            if zip_isexist:
                fz = zipfile.ZipFile(zip_file, 'r')
                data_num = len(fz.namelist())
                print("Extract Start...")
                print("unzip file num: {}".format(data_num))
                data_print = int(data_num / 100) if data_num > 100 else 1  # avoid modulo by zero for small archives
                i = 0
                for file in fz.namelist():
                    if i % data_print == 0:
                        print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True)
                    i += 1
                    fz.extract(file, save_dir)
                print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60),
                                                     int(int(time.time() - s_time) % 60)))
                print("Extract Done.")
            else:
                print("This is not zip.")
        else:
            print("Zip has been extracted.")

    if config.need_modelarts_dataset_unzip:
        zip_file_1 = os.path.join(config.data_path, "face_recognition_dataset.zip")
        save_dir_1 = os.path.join(config.data_path)

        sync_lock = "/tmp/unzip_sync.lock"

        # Each server contains at most 8 devices; only one process per server does the unzip.
        if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
            print("Zip file path: ", zip_file_1)
            print("Unzip file save dir: ", save_dir_1)
            unzip(zip_file_1, save_dir_1)
            print("===Finish extract data synchronization===")
            try:
                os.mknod(sync_lock)
            except IOError:
                pass

        # The remaining processes wait until the lock file appears.
        while True:
            if os.path.exists(sync_lock):
                break
            time.sleep(1)

        print("Device: {}, Finish sync unzip data from {} to {}.".format(
            get_device_id(), zip_file_1, save_dir_1))

    config.ckpt_path = os.path.join(config.output_path, str(get_rank_id()), config.ckpt_path)
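# A minimal usage sketch (assumption, not part of the original script): in these
# model-zoo scripts the pre-process hook above is normally attached to the training
# entry point through the moxing_wrapper decorator (imported from utils.moxing_adapter
# elsewhere in this repository), so the dataset is unzipped once per node before training
# starts. `run_train` below is a hypothetical entry point used only for illustration.
from utils.moxing_adapter import moxing_wrapper


@moxing_wrapper(pre_process=modelarts_pre_process)
def run_train():
    '''Hypothetical entry point: the real script builds the dataset and model here.'''
    print("dataset ready under:", config.data_path)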
def eval_net():
    '''eval net'''
    if config.dataset == 'MR':
        instance = MovieReview(root_dir=config.data_path, maxlen=config.word_len, split=0.9)
    elif config.dataset == 'SUBJ':
        instance = Subjectivity(root_dir=config.data_path, maxlen=config.word_len, split=0.9)
    elif config.dataset == 'SST2':
        instance = SST2(root_dir=config.data_path, maxlen=config.word_len, split=0.9)

    device_target = config.device_target
    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
    if device_target == "Ascend":
        context.set_context(device_id=get_device_id())

    dataset = instance.create_test_dataset(batch_size=config.batch_size)
    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
    net = TextCNN(vocab_len=instance.get_dict_len(), word_len=config.word_len,
                  num_classes=config.num_classes, vec_length=config.vec_length)
    opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()),
                  learning_rate=0.001, weight_decay=float(config.weight_decay))

    param_dict = load_checkpoint(config.checkpoint_file_path)
    print("load checkpoint from [{}].".format(config.checkpoint_file_path))
    load_param_into_net(net, param_dict)
    net.set_train(False)

    model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc': Accuracy()})
    acc = model.eval(dataset)
    print("accuracy: ", acc)
def run_eval():
    """eval method"""
    if not os.path.exists(config.output_path):
        os.makedirs(config.output_path)

    context.set_context(mode=context.GRAPH_MODE, device_target="Davinci",
                        save_graphs=False, device_id=get_device_id())

    layers = config.layers
    num_factors = config.num_factors
    topk = rconst.TOP_K
    num_eval_neg = rconst.NUM_EVAL_NEGATIVES

    ds_eval, num_eval_users, num_eval_items = create_dataset(
        test_train=False, data_dir=config.data_path, dataset=config.dataset,
        train_epochs=0, eval_batch_size=config.eval_batch_size)
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    ncf_net = NCFModel(num_users=num_eval_users,
                       num_items=num_eval_items,
                       num_factors=num_factors,
                       model_layers=layers,
                       mf_regularization=0,
                       mlp_reg_layers=[0.0, 0.0, 0.0, 0.0],
                       mf_dim=16)
    param_dict = load_checkpoint(config.checkpoint_file_path)
    load_param_into_net(ncf_net, param_dict)

    loss_net = NetWithLossClass(ncf_net)
    train_net = TrainStepWrap(loss_net)
    eval_net = PredictWithSigmoid(ncf_net, topk, num_eval_neg)

    ncf_metric = NCFMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"ncf": ncf_metric})

    ncf_metric.clear()
    out = model.eval(ds_eval)

    eval_file_path = os.path.join(config.output_path, config.eval_file_name)
    with open(eval_file_path, "a+") as eval_file:
        eval_file.write("EvalCallBack: HR = {}, NDCG = {}\n".format(out['ncf'][0], out['ncf'][1]))
    print("EvalCallBack: HR = {}, NDCG = {}".format(out['ncf'][0], out['ncf'][1]))
    print("=" * 100 + "Eval Finish!" + "=" * 100)
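# Illustrative sketch (assumption): the two numbers reported above are hit ratio (HR@K)
# and NDCG@K averaged over the evaluation users. For one user scored against
# 1 + num_eval_neg candidate items they reduce to the formulas below; the real
# NCFMetric implementation may differ in details such as tie handling.
import math


def hr_ndcg_single_user(rank_of_positive, topk):
    """rank_of_positive is the 1-based rank of the held-out positive item."""
    if rank_of_positive <= topk:
        hr = 1.0
        ndcg = 1.0 / math.log2(rank_of_positive + 1)
    else:
        hr, ndcg = 0.0, 0.0
    return hr, ndcg

# Example: positive item ranked 3rd with TOP_K = 10 -> HR = 1.0, NDCG = 1 / log2(4) = 0.5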
def eval_alexnet():
    print("============== Starting Testing ==============")

    device_num = get_device_num()
    if device_num > 1:
        # context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
        context.set_context(mode=context.GRAPH_MODE, device_target='Davinci', save_graphs=False)
        if config.device_target == "Ascend":
            context.set_context(device_id=get_device_id())
            init()
        elif config.device_target == "GPU":
            init()

    if config.dataset_name == 'cifar10':
        network = AlexNet(config.num_classes, phase='test')
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
        opt = nn.Momentum(network.trainable_params(), config.learning_rate, config.momentum)
        ds_eval = create_dataset_cifar10(config.data_path, config.batch_size, status="test",
                                         target=config.device_target)
        param_dict = load_checkpoint(load_path)
        print("load checkpoint from [{}].".format(load_path))
        load_param_into_net(network, param_dict)
        network.set_train(False)
        model = Model(network, loss, opt, metrics={"Accuracy": Accuracy()})
    elif config.dataset_name == 'imagenet':
        network = AlexNet(config.num_classes, phase='test')
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
        ds_eval = create_dataset_imagenet(config.data_path, config.batch_size, training=False)
        param_dict = load_checkpoint(load_path)
        print("load checkpoint from [{}].".format(load_path))
        load_param_into_net(network, param_dict)
        network.set_train(False)
        model = Model(network, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'})
    else:
        raise ValueError("Unsupported dataset.")

    if ds_eval.get_dataset_size() == 0:
        raise ValueError("Please check dataset size > 0 and batch_size <= dataset size")

    result = model.eval(ds_eval, dataset_sink_mode=config.dataset_sink_mode)
    print("result : {}".format(result))
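# Illustrative sketch (assumption): the 'top_1_accuracy' / 'top_5_accuracy' metric names
# passed above are computed roughly as below — a sample counts as correct for top-k when
# its true label is among the k highest logits. This is only a conceptual stand-in for
# the framework's built-in metrics.
import numpy as np


def topk_accuracy_sketch(logits, labels, k):
    topk_idx = np.argsort(logits, axis=1)[:, -k:]  # indices of the k largest logits per sample
    hits = [label in row for row, label in zip(topk_idx, labels)]
    return float(np.mean(hits))

# topk_accuracy_sketch(np.array([[0.1, 0.7, 0.2]]), np.array([2]), k=2) -> 1.0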
import mindspore
from mindspore import context
from mindspore.nn import Cell

from src.dataset_factory import get_de_dataset
from src.backbone.resnet import get_backbone
from src.metric_factory import get_metric_fc
from src.loss_factory import get_loss
from src.lrsche_factory import warmup_step_list, list_to_gen
from src.callback_factory import ProgressMonitor

from utils.moxing_adapter import moxing_wrapper
from utils.config import config
from utils.device_adapter import get_device_id, get_device_num, get_rank_id

mindspore.common.seed.set_seed(1)
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False,
                    device_id=get_device_id(), reserve_class_name_in_scope=False,
                    enable_auto_mixed_precision=False)


class DistributedHelper(Cell):
    '''DistributedHelper'''

    def __init__(self, backbone, margin_fc):
        super(DistributedHelper, self).__init__()
        self.backbone = backbone
        self.margin_fc = margin_fc
        if margin_fc is not None:
            self.has_margin_fc = 1
        else:
            self.has_margin_fc = 0
def train_net():
    '''train net'''
    # set context
    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
    context.set_context(device_id=get_device_id())

    if config.dataset == 'MR':
        instance = MovieReview(root_dir=config.data_path, maxlen=config.word_len, split=0.9)
    elif config.dataset == 'SUBJ':
        instance = Subjectivity(root_dir=config.data_path, maxlen=config.word_len, split=0.9)
    elif config.dataset == 'SST2':
        instance = SST2(root_dir=config.data_path, maxlen=config.word_len, split=0.9)

    dataset = instance.create_train_dataset(batch_size=config.batch_size, epoch_size=config.epoch_size)
    batch_num = dataset.get_dataset_size()

    base_lr = float(config.base_lr)
    learning_rate = []
    warm_up = [base_lr / math.floor(config.epoch_size / 5) * (i + 1)
               for _ in range(batch_num)
               for i in range(math.floor(config.epoch_size / 5))]
    shrink = [base_lr / (16 * (i + 1))
              for _ in range(batch_num)
              for i in range(math.floor(config.epoch_size * 3 / 5))]
    normal_run = [base_lr
                  for _ in range(batch_num)
                  for i in range(config.epoch_size - math.floor(config.epoch_size / 5)
                                 - math.floor(config.epoch_size * 2 / 5))]
    learning_rate = learning_rate + warm_up + normal_run + shrink

    net = TextCNN(vocab_len=instance.get_dict_len(), word_len=config.word_len,
                  num_classes=config.num_classes, vec_length=config.vec_length)
    # Continue training if pre_trained is set to True
    if config.pre_trained:
        param_dict = load_checkpoint(config.checkpoint_path)
        load_param_into_net(net, param_dict)

    opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()),
                  learning_rate=learning_rate, weight_decay=float(config.weight_decay))
    loss = SoftmaxCrossEntropyExpand(sparse=True)

    model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc': Accuracy()})

    config_ck = CheckpointConfig(save_checkpoint_steps=int(config.epoch_size * batch_num / 2),
                                 keep_checkpoint_max=config.keep_checkpoint_max)
    time_cb = TimeMonitor(data_size=batch_num)
    ckpt_save_dir = os.path.join(config.output_path, config.checkpoint_path)
    ckpoint_cb = ModelCheckpoint(prefix="train_textcnn", directory=ckpt_save_dir, config=config_ck)
    loss_cb = LossMonitor()
    model.train(config.epoch_size, dataset, callbacks=[time_cb, ckpoint_cb, loss_cb])
    print("train success")
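# Worked sketch (illustrative only, not part of the original script): with base_lr = 0.01,
# epoch_size = 10 and batch_num = 2, the three phases above generate
#   warm_up    -> 0.005, 0.01                         (epoch_size // 5 = 2 values)
#   normal_run -> 0.01 repeated 4 times               (10 - 2 - 4 = 4 values)
#   shrink     -> 0.000625, 0.0003125, 0.000208, ...  (epoch_size * 3 // 5 = 6 values)
# Each phase's per-epoch sequence is emitted batch_num times (the outer
# `for _ in range(batch_num)` loop), so the concatenated list is consumed one entry per
# training step; if the list ends up longer than the number of steps actually run, the
# tail entries are simply never reached.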
import os

import numpy as np
import mindspore
from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export

from src.lenet import LeNet5
from utils.config import config
from utils.device_adapter import get_device_id

if os.path.exists(config.data_path_local):
    ckpt_file = config.ckpt_path_local
else:
    ckpt_file = os.path.join(config.data_path, 'checkpoint_lenet-10_1875.ckpt')

context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
if config.device_target == "Ascend":
    context.set_context(device_id=get_device_id())

if __name__ == "__main__":
    # define fusion network
    network = LeNet5(config.num_classes)
    # load network checkpoint
    param_dict = load_checkpoint(ckpt_file)
    load_param_into_net(network, param_dict)

    # export network
    inputs = Tensor(np.ones([config.batch_size, 1, config.image_height, config.image_width]),
                    mindspore.float32)
    # The original call was truncated here; config.file_name and config.file_format are
    # assumed to be the usual export fields in these model-zoo configs.
    export(network, inputs, file_name=config.file_name, file_format=config.file_format)
def train():
    args = config
    if args.device_target == "CPU":
        context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="CPU")
    else:
        context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                            save_graphs=False, device_target="Ascend", device_id=get_device_id())

    # init multicards training
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True,
                                          device_num=args.group_size)

    # dataset
    dataset = data_generator.SegDataset(image_mean=args.image_mean,
                                        image_std=args.image_std,
                                        data_file=args.data_file,
                                        batch_size=args.batch_size,
                                        crop_size=args.crop_size,
                                        max_scale=args.max_scale,
                                        min_scale=args.min_scale,
                                        ignore_label=args.ignore_label,
                                        num_classes=args.num_classes,
                                        num_readers=2,
                                        num_parallel_calls=4,
                                        shard_id=args.rank,
                                        shard_num=args.group_size)
    dataset = dataset.get_dataset(repeat=1)

    # network
    if args.model == 'deeplab_v3_s16':
        network = net_factory.nets_map[args.model]('train', args.num_classes, 16, args.freeze_bn)
    elif args.model == 'deeplab_v3_s8':
        network = net_factory.nets_map[args.model]('train', args.num_classes, 8, args.freeze_bn)
    else:
        raise NotImplementedError('model [{:s}] not recognized'.format(args.model))

    # loss
    loss_ = loss.SoftmaxCrossEntropyLoss(args.num_classes, args.ignore_label)
    loss_.add_flags_recursive(fp32=True)
    train_net = BuildTrainNetwork(network, loss_)

    # load pretrained model
    if args.ckpt_pre_trained:
        param_dict = load_checkpoint(args.ckpt_pre_trained)
        if args.filter_weight:
            filter_list = ["network.aspp.conv2.weight", "network.aspp.conv2.bias"]
            for key in list(param_dict.keys()):
                for filter_key in filter_list:
                    if filter_key not in key:
                        continue
                    print('filter {}'.format(key))
                    del param_dict[key]
        load_param_into_net(train_net, param_dict)
        print('load_model {} success'.format(args.ckpt_pre_trained))

    # optimizer
    iters_per_epoch = dataset.get_dataset_size()
    total_train_steps = iters_per_epoch * args.train_epochs
    if args.lr_type == 'cos':
        lr_iter = learning_rates.cosine_lr(args.base_lr, total_train_steps, total_train_steps)
    elif args.lr_type == 'poly':
        lr_iter = learning_rates.poly_lr(args.base_lr, total_train_steps, total_train_steps,
                                         end_lr=0.0, power=0.9)
    elif args.lr_type == 'exp':
        lr_iter = learning_rates.exponential_lr(args.base_lr, args.lr_decay_step, args.lr_decay_rate,
                                                total_train_steps, staircase=True)
    else:
        raise ValueError('unknown learning rate type')
    opt = nn.Momentum(params=train_net.trainable_params(), learning_rate=lr_iter, momentum=0.9,
                      weight_decay=0.0001, loss_scale=args.loss_scale)

    # loss scale
    manager_loss_scale = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)
    amp_level = "O0" if args.device_target == "CPU" else "O3"
    model = Model(train_net, optimizer=opt, amp_level=amp_level, loss_scale_manager=manager_loss_scale)

    # callback for saving ckpts
    time_cb = TimeMonitor(data_size=iters_per_epoch)
    loss_cb = LossMonitor()
    cbs = [time_cb, loss_cb]

    if args.rank == 0:
        config_ck = CheckpointConfig(save_checkpoint_steps=args.save_steps,
                                     keep_checkpoint_max=args.keep_checkpoint_max)
        ckpoint_cb = ModelCheckpoint(prefix=args.model, directory=args.train_dir, config=config_ck)
        cbs.append(ckpoint_cb)

    model.train(args.train_epochs, dataset, callbacks=cbs,
                dataset_sink_mode=(args.device_target != "CPU"))
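# Minimal sketch (assumption): the 'poly' schedule selected above follows the usual
# polynomial decay, lr_i = (base_lr - end_lr) * (1 - i / total_steps) ** power + end_lr.
# The helper below is only illustrative; the repo's learning_rates module is the
# authoritative implementation.
import numpy as np


def poly_lr_sketch(base_lr, total_steps, power=0.9, end_lr=0.0):
    steps = np.arange(total_steps)
    return (base_lr - end_lr) * (1.0 - steps / total_steps) ** power + end_lr

# e.g. poly_lr_sketch(0.08, 4) -> approximately [0.08, 0.0618, 0.0429, 0.0230]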
def train_alexnet():
    print(config)
    print('device id:', get_device_id())
    print('device num:', get_device_num())
    print('rank id:', get_rank_id())
    print('job id:', get_job_id())

    device_target = config.device_target
    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
    context.set_context(save_graphs=False)

    device_num = get_device_num()
    if config.dataset_name == "cifar10":
        if device_num > 1:
            config.learning_rate = config.learning_rate * device_num
            config.epoch_size = config.epoch_size * 2
    elif config.dataset_name == "imagenet":
        pass
    else:
        raise ValueError("Unsupported dataset.")

    if device_num > 1:
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        if device_target == "Ascend":
            context.set_context(device_id=get_device_id())
            init()
        elif device_target == "GPU":
            init()
    else:
        context.set_context(device_id=get_device_id())

    if config.dataset_name == "cifar10":
        ds_train = create_dataset_cifar10(config.data_path, config.batch_size, target=config.device_target)
    elif config.dataset_name == "imagenet":
        ds_train = create_dataset_imagenet(config.data_path, config.batch_size)
    else:
        raise ValueError("Unsupported dataset.")

    if ds_train.get_dataset_size() == 0:
        raise ValueError("Please check dataset size > 0 and batch_size <= dataset size")

    network = AlexNet(config.num_classes, phase='train')

    loss_scale_manager = None
    metrics = None
    step_per_epoch = ds_train.get_dataset_size() if config.sink_size == -1 else config.sink_size
    if config.dataset_name == 'cifar10':
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
        lr = Tensor(get_lr_cifar10(0, config.learning_rate, config.epoch_size, step_per_epoch))
        opt = nn.Momentum(network.trainable_params(), lr, config.momentum)
        metrics = {"Accuracy": Accuracy()}
    elif config.dataset_name == 'imagenet':
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
        lr = Tensor(get_lr_imagenet(config.learning_rate, config.epoch_size, step_per_epoch))
        opt = nn.Momentum(params=get_param_groups(network),
                          learning_rate=lr,
                          momentum=config.momentum,
                          weight_decay=config.weight_decay,
                          loss_scale=config.loss_scale)

        from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager
        if config.is_dynamic_loss_scale == 1:
            loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536,
                                                         scale_factor=2,
                                                         scale_window=2000)
        else:
            loss_scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    else:
        raise ValueError("Unsupported dataset.")

    if device_target == "Ascend":
        model = Model(network, loss_fn=loss, optimizer=opt, metrics=metrics, amp_level="O2",
                      keep_batchnorm_fp32=False, loss_scale_manager=loss_scale_manager)
    elif device_target == "GPU":
        model = Model(network, loss_fn=loss, optimizer=opt, metrics=metrics,
                      loss_scale_manager=loss_scale_manager)
    else:
        raise ValueError("Unsupported platform.")

    if device_num > 1:
        ckpt_save_dir = os.path.join(config.checkpoint_path + "_" + str(get_rank()))
    else:
        ckpt_save_dir = config.checkpoint_path

    time_cb = TimeMonitor(data_size=step_per_epoch)
    config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_steps,
                                 keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_alexnet", directory=ckpt_save_dir, config=config_ck)

    print("============== Starting Training ==============")
    model.train(config.epoch_size, ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()],
                dataset_sink_mode=config.dataset_sink_mode, sink_size=config.sink_size)
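# Conceptual sketch (assumption, not MindSpore's actual implementation): the dynamic loss
# scaling configured above starts from init_loss_scale and is adjusted per step — reduced
# when an overflow is detected and increased after scale_window consecutive overflow-free
# steps, while FixedLossScaleManager keeps the scale constant.
class DynamicScaleSketch:
    def __init__(self, init_loss_scale=65536, scale_factor=2, scale_window=2000):
        self.scale = init_loss_scale
        self.factor = scale_factor
        self.window = scale_window
        self.good_steps = 0

    def update(self, overflow):
        if overflow:
            self.scale = max(self.scale / self.factor, 1)  # back off after an overflow
            self.good_steps = 0
        else:
            self.good_steps += 1
            if self.good_steps >= self.window:
                self.scale *= self.factor  # grow again once training has been stable
                self.good_steps = 0
        return self.scale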