def test_resnet50_cifar10_gpu():
    # Note: unlike the Ascend tests below, this test resolves paths from the
    # current working directory rather than from the file location.
    cur_path = os.getcwd()
    model_path = "{}/../../../../model_zoo/official/cv".format(cur_path)
    model_name = "resnet"
    utils.copy_files(model_path, cur_path, model_name)
    cur_model_path = os.path.join(cur_path, "resnet")
    # Cap training at 10 epochs so the test finishes quickly.
    old_list = ["total_epochs=config.epoch_size", "config.epoch_size - config.pretrain_epoch_size"]
    new_list = ["total_epochs=10", "10"]
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "train.py"))
    dataset_path = os.path.join(utils.data_root, "cifar-10-batches-bin")
    exec_network_shell = "cd resnet/scripts; sh run_distribute_train_gpu.sh resnet50 cifar10 {}"\
        .format(dataset_path)
    logger.warning("cmd [{}] is running...".format(exec_network_shell))
    os.system(exec_network_shell)
    # Wait until no train.py process is left before inspecting the logs.
    cmd = "ps -ef | grep python | grep train.py | grep -v grep"
    ret = utils.process_check(100, cmd)
    assert ret
    log_file = os.path.join(cur_model_path, "scripts/train_parallel/log")
    pattern = r"per step time: ([\d\.]+) ms"
    # Skip the first 8 recorded step times to keep warm-up out of the average.
    step_time_list = utils.parse_log_file(pattern, log_file)[8:]
    per_step_time = sum(step_time_list) / len(step_time_list)
    print("step time list is", step_time_list)
    assert per_step_time < 115
    # Average the last 8 loss values to smooth out step-to-step noise.
    loss_list = utils.get_loss_data_list(log_file)[-8:]
    print("loss_list is", loss_list)
    assert sum(loss_list) / len(loss_list) < 0.70
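# The tests in this file lean on helpers from the shared `utils` module, which
# lives outside this file. For orientation only, here is a minimal sketch of
# what `parse_log_file` is assumed to do (scan a log with a regex and return
# the captured groups as floats); the real implementation may differ.
def _sketch_parse_log_file(pattern, log_file):
    """Illustrative stand-in for utils.parse_log_file; not used by the tests."""
    import re
    with open(log_file) as f:
        return [float(match) for match in re.findall(pattern, f.read())]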
def test_resnet50_cifar10_ascend():
    cur_path = os.path.dirname(os.path.abspath(__file__))
    model_path = "{}/../../../../model_zoo/official/cv".format(cur_path)
    model_name = "resnet"
    utils.copy_files(model_path, cur_path, model_name)
    cur_model_path = os.path.join(cur_path, "resnet")
    # Cap training at 10 epochs so the test finishes quickly.
    old_list = ["total_epochs=config.epoch_size", "config.epoch_size - config.pretrain_epoch_size"]
    new_list = ["total_epochs=10", "10"]
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "train.py"))
    dataset_path = os.path.join(utils.data_root, "cifar-10-batches-bin")
    exec_network_shell = "cd resnet/scripts; bash run_distribute_train.sh resnet50 cifar10 {} {}"\
        .format(utils.rank_table_path, dataset_path)
    os.system(exec_network_shell)
    cmd = "ps -ef | grep python | grep train.py | grep -v grep"
    ret = utils.process_check(100, cmd)
    assert ret
    # Each of the 8 devices writes its own train_parallel{rank}/log.
    log_file = os.path.join(cur_model_path, "scripts/train_parallel{}/log")
    for i in range(8):
        per_step_time = utils.get_perf_data(log_file.format(i))
        assert per_step_time < 20.0
    # Check the mean of the last loss value reported on each device.
    loss_list = []
    for i in range(8):
        loss = utils.get_loss_data_list(log_file.format(i))
        loss_list.append(loss[-1])
    assert sum(loss_list) / len(loss_list) < 0.70
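# `get_perf_data` is likewise assumed to reduce the per-step timings in a log
# to a single average, along these lines (a sketch of the assumed behavior,
# not the real helper):
def _sketch_get_perf_data(log_file):
    """Illustrative stand-in for utils.get_perf_data; not used by the tests."""
    step_times = _sketch_parse_log_file(r"per step time: ([\d\.]+) ms", log_file)
    return sum(step_times) / len(step_times)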
def test_DeeplabV3_voc2007():
    cur_path = os.path.dirname(os.path.abspath(__file__))
    model_path = "{}/../../../../model_zoo/official/cv".format(cur_path)
    model_name = "deeplabv3"
    utils.copy_files(model_path, cur_path, model_name)
    cur_model_path = os.path.join(cur_path, model_name)
    # Point the launch script at local data, pretrained weights and rank table.
    old_list = ['/PATH/TO/EXPERIMENTS_DIR',
                '/PATH/TO/MODEL_ZOO_CODE',
                '/PATH/TO/MINDRECORD_NAME',
                '/PATH/TO/PRETRAIN_MODEL',
                "\\${train_code_path}/src/tools/rank_table_8p.json"]
    new_list = [cur_model_path + '/train',
                cur_model_path,
                os.path.join(utils.data_root, "voc/voc2012/mindrecord_train/vocaug_mindrecord0"),
                os.path.join(utils.ckpt_root, "deeplabv3/resnet101_ascend.ckpt"),
                utils.rank_table_path]
    utils.exec_sed_command(old_list, new_list,
                           os.path.join(cur_model_path, "scripts/run_distribute_train_s16_r1.sh"))
    # Shorten the run: 70 sink-mode "epochs" of 2 steps each.
    old_list = ['model.train(args.train_epochs', 'callbacks=cbs']
    new_list = ['model.train(70', 'callbacks=cbs, sink_size=2']
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "train.py"))
    exec_network_shell = "cd {}; sh scripts/run_distribute_train_s16_r1.sh".format(model_name)
    ret = os.system(exec_network_shell)
    assert ret == 0
    cmd = "ps -ef | grep python | grep train.py | grep -v grep"
    ret = utils.process_check(100, cmd)
    assert ret
    log_file = os.path.join(cur_model_path, "train/device{}/log")
    for i in range(8):
        per_step_time = utils.get_perf_data(log_file.format(i))
        print("per_step_time is", per_step_time)
        assert per_step_time < 530.0
    loss_list = []
    for i in range(8):
        loss = utils.get_loss_data_list(log_file.format(i))
        print("loss is", loss[-1])
        loss_list.append(loss[-1])
    assert sum(loss_list) / len(loss_list) < 2.5
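# `exec_sed_command(old_list, new_list, path)` is assumed to behave like
# `sed -i`, applying one literal substitution per old/new pair. A hypothetical
# pure-Python equivalent of that assumed behavior:
def _sketch_exec_sed_command(old_list, new_list, file_path):
    """Illustrative stand-in for utils.exec_sed_command; not used by the tests."""
    with open(file_path) as f:
        content = f.read()
    for old, new in zip(old_list, new_list):
        content = content.replace(old, new)
    with open(file_path, "w") as f:
        f.write(content)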
def test_transformer_export_mindir():
    cur_path = os.path.dirname(os.path.abspath(__file__))
    model_path = "{}/../../../../model_zoo/official/nlp".format(cur_path)
    model_name = "transformer"
    utils.copy_files(model_path, cur_path, model_name)
    cur_model_path = os.path.join(cur_path, model_name)
    export_file = "transformer80_bs_0"
    ckpt_path = os.path.join(utils.ckpt_root, "transformer/transformer_trained.ckpt")
    print("ckpt_path:", ckpt_path)
    # Point the eval config at the trained checkpoint.
    old_list = ["'model_file': '/your/path/checkpoint_file'"]
    new_list = ["'model_file': '{}'".format(ckpt_path)]
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "src/eval_config.py"))
    # Export does not need a specific device, so drop the device_id argument.
    old_list = ["context.set_context(device_id=args.device_id)"]
    new_list = ["context.set_context()"]
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "export.py"))
    exec_export_shell = "cd transformer; python -u export.py --file_name={}" \
                        " --file_format=MINDIR".format(export_file)
    os.system(exec_export_shell)
    assert os.path.exists(os.path.join(cur_model_path, "{}.mindir".format(export_file)))
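# A possible follow-up check (not part of the original test): reload the
# exported file to confirm it is a valid MindIR graph. This assumes a
# MindSpore version that provides `mindspore.load` and `nn.GraphCell`.
def _sketch_verify_mindir(mindir_path):
    """Hypothetical extra validation step; not called by the tests."""
    import mindspore
    import mindspore.nn as nn
    graph = mindspore.load(mindir_path)
    return nn.GraphCell(graph)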
def test_SSD_mobilenet_v1_fpn_coco2017():
    cur_path = os.path.dirname(os.path.abspath(__file__))
    model_path = "{}/../../../../model_zoo/official/cv".format(cur_path)
    model_name = "ssd"
    utils.copy_files(model_path, cur_path, model_name)
    cur_model_path = os.path.join(cur_path, model_name)
    # Point the model config at local mindrecord data and the backbone ckpt.
    old_list = ["/data/MindRecord_COCO",
                "/ckpt/mobilenet_v1.ckpt",
                "/data/coco2017"]
    new_list = [os.path.join(utils.data_root, "coco/coco2017/mindrecord_train/ssd_mindrecord"),
                os.path.join(utils.ckpt_root, "ssd_mobilenet_v1/mobilenet-v1.ckpt"),
                os.path.join(utils.data_root, "coco/coco2017")]
    utils.exec_sed_command(old_list, new_list,
                           os.path.join(cur_model_path, "src/config_ssd_mobilenet_v1_fpn.py"))
    old_list = ["ssd300"]
    new_list = ["ssd_mobilenet_v1_fpn"]
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "src/config.py"))
    # Shorten training: 5 sink-mode epochs of 100 steps each.
    old_list = ["args_opt.epoch_size", "dataset_sink_mode=dataset_sink_mode"]
    new_list = ["5", "dataset_sink_mode=dataset_sink_mode, sink_size=100"]
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "train.py"))
    exec_network_shell = "cd {0}; sh -x scripts/run_distribute_train.sh 8 {1} 0.2 coco {2}"\
        .format(model_name, 60, utils.rank_table_path)
    os.system(exec_network_shell)
    cmd = "ps -ef | grep train.py | grep coco | grep device_num | grep device_id | grep -v grep"
    ret = utils.process_check(120, cmd)
    assert ret
    log_file = os.path.join(cur_model_path, "LOG{}/log.txt")
    for i in range(8):
        per_step_time = utils.get_perf_data(log_file.format(i))
        print("per_step_time is", per_step_time)
        assert per_step_time < 545
    loss_list = []
    for i in range(8):
        loss = utils.get_loss_data_list(log_file.format(i))
        print("loss is", loss[-1])
        loss_list.append(loss[-1])
    assert sum(loss_list) / len(loss_list) < 2.72
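# `get_loss_data_list` is assumed to extract every loss value printed in a
# training log, in order. The exact log format (and therefore the regex)
# varies per model, so this is only a sketch of the assumed shape:
def _sketch_get_loss_data_list(log_file):
    """Illustrative stand-in for utils.get_loss_data_list; not used by the tests."""
    return _sketch_parse_log_file(r"loss[: ]+([\d\.]+)", log_file)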
def test_BGCF_amazon_beauty():
    cur_path = os.path.dirname(os.path.abspath(__file__))
    model_path = "{}/../../../../model_zoo/official/gnn".format(cur_path)
    model_name = "bgcf"
    utils.copy_files(model_path, cur_path, model_name)
    cur_model_path = os.path.join(cur_path, model_name)
    old_list = ["--datapath=../data_mr"]
    new_list = ["--datapath={}".format(
        os.path.join(utils.data_root, "amazon_beauty/mindrecord_train"))]
    utils.exec_sed_command(old_list, new_list,
                           os.path.join(cur_model_path, "scripts/run_train_ascend.sh"))
    # Cut the default epoch count from 600 to 50 to keep runtime down.
    old_list = ["default=600,"]
    new_list = ["default=50,"]
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "src/config.py"))
    old_list = ["context.set_context(device_id=int(parser.device))"]
    new_list = ["context.set_context()"]
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "train.py"))
    exec_network_shell = "cd {}/scripts; bash run_train_ascend.sh".format(model_name)
    ret = os.system(exec_network_shell)
    assert ret == 0
    cmd = "ps -ef | grep python | grep train.py | grep amazon_beauty | grep -v grep"
    ret = utils.process_check(300, cmd)
    assert ret
    log_file = os.path.join(cur_model_path, "scripts/train/log")
    # Average the loss over the last 5 epochs.
    pattern1 = r"loss ([\d\.\+]+),"
    loss_list = utils.parse_log_file(pattern1, log_file)[-5:]
    print("last 5 epoch average loss is", sum(loss_list) / len(loss_list))
    assert sum(loss_list) / len(loss_list) < 6400
    # Skip the first epoch time, which includes graph compilation.
    pattern2 = r"cost:([\d\.\+]+)"
    epoch_time_list = utils.parse_log_file(pattern2, log_file)[1:]
    print("per epoch time:", sum(epoch_time_list) / len(epoch_time_list))
    assert sum(epoch_time_list) / len(epoch_time_list) < 1.9
def test_yolov3_darknet_8p():
    cur_path = os.path.dirname(os.path.abspath(__file__))
    model_path = "{}/../../../../model_zoo/official/cv".format(cur_path)
    model_name = "yolov3_darknet53"
    dataset_path = os.path.join(utils.data_root, "coco/coco2014/")
    ckpt_path = os.path.join(utils.ckpt_root, "yolov3_darknet/yolov3_darknet53_pretrain.ckpt")
    utils.copy_files(model_path, cur_path, model_name)
    cur_model_path = os.path.join(cur_path, model_name)
    train_file = os.path.join(cur_model_path, "train.py")
    # Fix the training shape so step time is comparable across runs.
    old_list = ["--lr_scheduler=cosine_annealing"]
    new_list = ["--lr_scheduler=cosine_annealing --training_shape=416"]
    utils.exec_sed_command(old_list, new_list,
                           os.path.join(cur_model_path, "scripts/run_distribute_train.sh"))
    old_list = ["default=100", "max_epoch=args.max_epoch"]
    new_list = ["default=10", "max_epoch=1"]
    utils.exec_sed_command(old_list, new_list, train_file)
    # Train on only 100 batches per rank instead of the full dataset.
    old_list = ["sampler=distributed_sampler"]
    new_list = ["sampler=distributed_sampler, num_samples=100*batch_size"]
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "src/yolo_dataset.py"))
    exec_network_shell = "cd yolov3_darknet53/scripts; bash run_distribute_train.sh {0} {1} {2}"\
        .format(dataset_path, ckpt_path, utils.rank_table_path)
    os.system(exec_network_shell)
    cmd = "ps -ef | grep python | grep train.py | grep -v grep"
    ret = utils.process_check(120, cmd)
    assert ret
    train_log_file = os.path.join(cur_path, "yolov3_darknet53/scripts/train_parallel0/log.txt")
    pattern1 = r", *([\d\.]+) imgs/sec"
    pattern2 = r"loss:*([\d\.]+),"
    # Skip the first throughput sample, which includes graph compilation.
    fps_list = utils.parse_log_file(pattern1, train_log_file)[1:]
    assert sum(fps_list) / len(fps_list) > 480
    loss_list = utils.parse_log_file(pattern2, train_log_file)
    assert loss_list[-1] < 280
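# Every test above gates its log checks on `process_check(n, cmd)`. The
# assumed contract: poll `cmd` up to `n` times and return True once no
# training process is left (the grep pipeline exits non-zero when nothing
# matches). A sketch of that assumed behavior:
def _sketch_process_check(max_checks, cmd, interval_seconds=10):
    """Illustrative stand-in for utils.process_check; not used by the tests."""
    import subprocess
    import time
    for _ in range(max_checks):
        if subprocess.call(cmd, shell=True, stdout=subprocess.DEVNULL) != 0:
            return True  # grep found nothing: training processes have exited
        time.sleep(interval_seconds)
    return False  # processes still running after max_checks polls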