Exemplo n.º 1
0
def test_resnet50_cifar10_gpu():
    """Run the ResNet-50 / CIFAR-10 GPU distributed-training script and check its log.

    The ``resnet`` model is copied into the current working directory,
    ``train.py`` is patched so the epoch expressions become the literal ``10``,
    ``run_distribute_train_gpu.sh`` is launched, and the resulting log is
    checked against a per-step-time threshold and a loss threshold.
    """
    cur_path = os.getcwd()
    model_path = "{}/../../../../model_zoo/official/cv".format(cur_path)
    model_name = "resnet"
    utils.copy_files(model_path, cur_path, model_name)
    cur_model_path = os.path.join(cur_path, "resnet")
    old_list = ["total_epochs=config.epoch_size", "config.epoch_size - config.pretrain_epoch_size"]
    new_list = ["total_epochs=10", "10"]
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "train.py"))
    dataset_path = os.path.join(utils.data_root, "cifar-10-batches-bin")
    # The model was copied into the working directory above, so a relative cd is fine here.
    exec_network_shell = "cd resnet/scripts; sh run_distribute_train_gpu.sh resnet50 cifar10 {}".format(dataset_path)
    logger.warning("cmd [{}] is running...".format(exec_network_shell))
    os.system(exec_network_shell)
    cmd = "ps -ef | grep python | grep train.py | grep -v grep"
    # NOTE(review): presumably waits for the train.py processes to finish -- confirm in utils.process_check.
    ret = utils.process_check(100, cmd)
    assert ret
    log_file = os.path.join(cur_model_path, "scripts/train_parallel/log")
    pattern = r"per step time: ([\d\.]+) ms"
    # The first 8 samples are skipped before averaging.
    step_time_list = utils.parse_log_file(pattern, log_file)[8:]
    # Fail with a readable message instead of a ZeroDivisionError when the log is too short.
    assert step_time_list, \
        "no per-step-time samples left after skipping the first 8 in {}".format(log_file)
    per_step_time = sum(step_time_list) / len(step_time_list)
    print("step time list is", step_time_list)
    assert per_step_time < 115
    # Only the last 8 loss values are averaged.
    loss_list = utils.get_loss_data_list(log_file)[-8:]
    assert loss_list, "no loss values found in {}".format(log_file)
    print("loss_list is", loss_list)
    assert sum(loss_list) / len(loss_list) < 0.70
Exemplo n.º 2
0
def test_resnet50_cifar10_ascend():
    """Run the ResNet-50 / CIFAR-10 distributed-training script and check its logs.

    The ``resnet`` model is copied next to this test file, ``train.py`` is
    patched so the epoch expressions become the literal ``10``,
    ``run_distribute_train.sh`` is launched with the rank table and dataset
    path, and the logs ``train_parallel0`` .. ``train_parallel7`` are checked
    against a per-step-time threshold and an average final-loss threshold.
    """
    cur_path = os.path.dirname(os.path.abspath(__file__))
    model_path = "{}/../../../../model_zoo/official/cv".format(cur_path)
    model_name = "resnet"
    utils.copy_files(model_path, cur_path, model_name)
    cur_model_path = os.path.join(cur_path, model_name)
    old_list = ["total_epochs=config.epoch_size", "config.epoch_size - config.pretrain_epoch_size"]
    new_list = ["total_epochs=10", "10"]
    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "train.py"))
    dataset_path = os.path.join(utils.data_root, "cifar-10-batches-bin")
    # cd by absolute path: the copied model lives next to this file, which is
    # not necessarily the working directory the test runner was started from.
    exec_network_shell = "cd {}/scripts; bash run_distribute_train.sh resnet50 cifar10 {} {}"\
        .format(cur_model_path, utils.rank_table_path, dataset_path)
    os.system(exec_network_shell)
    cmd = "ps -ef | grep python | grep train.py | grep -v grep"
    # NOTE(review): presumably waits for the train.py processes to finish -- confirm in utils.process_check.
    ret = utils.process_check(100, cmd)
    assert ret
    log_file = os.path.join(cur_model_path, "scripts/train_parallel{}/log")
    for i in range(8):
        per_step_time = utils.get_perf_data(log_file.format(i))
        assert per_step_time < 20.0
    loss_list = []
    for i in range(8):
        loss = utils.get_loss_data_list(log_file.format(i))
        # Only the last logged loss of each log contributes to the average.
        loss_list.append(loss[-1])
    assert sum(loss_list) / len(loss_list) < 0.70
Exemplo n.º 3
0
def test_DeeplabV3_voc2007():
    """Run the DeepLabV3 ``run_distribute_train_s16_r1.sh`` script and check its logs.

    The ``deeplabv3`` model is copied next to this test file, the placeholder
    paths in the launch script are replaced with real ones, ``train.py`` is
    patched (epoch argument -> ``70``, ``sink_size=2`` added), the script is
    run, and the logs ``train/device0`` .. ``train/device7`` are checked
    against a per-step-time threshold and an average final-loss threshold.

    NOTE(review): the function name says voc2007 but the dataset path below
    points at ``voc/voc2012`` -- confirm which one is intended.
    """
    cur_path = os.path.dirname(os.path.abspath(__file__))
    model_path = "{}/../../../../model_zoo/official/cv".format(cur_path)
    model_name = "deeplabv3"
    utils.copy_files(model_path, cur_path, model_name)
    cur_model_path = os.path.join(cur_path, model_name)

    # Placeholders in the launch script and the values that replace them,
    # matched up by position.
    old_list = [
        '/PATH/TO/EXPERIMENTS_DIR', '/PATH/TO/MODEL_ZOO_CODE',
        '/PATH/TO/MINDRECORD_NAME', '/PATH/TO/PRETRAIN_MODEL',
        "\\${train_code_path}/src/tools/rank_table_8p.json"
    ]
    new_list = [
        cur_model_path + '/train', cur_model_path,
        os.path.join(utils.data_root,
                     "voc/voc2012/mindrecord_train/vocaug_mindrecord0"),
        os.path.join(utils.ckpt_root, "deeplabv3/resnet101_ascend.ckpt"),
        utils.rank_table_path
    ]
    utils.exec_sed_command(
        old_list, new_list,
        os.path.join(cur_model_path, "scripts/run_distribute_train_s16_r1.sh"))

    old_list = ['model.train(args.train_epochs', 'callbacks=cbs']
    new_list = ['model.train(70', 'callbacks=cbs, sink_size=2']
    utils.exec_sed_command(old_list, new_list,
                           os.path.join(cur_model_path, "train.py"))

    # cd by absolute path: the copied model lives next to this file, which is
    # not necessarily the working directory the test runner was started from.
    exec_network_shell = "cd {}; sh scripts/run_distribute_train_s16_r1.sh".format(
        cur_model_path)
    ret = os.system(exec_network_shell)
    assert ret == 0
    cmd = "ps -ef | grep python | grep train.py | grep -v grep"
    # NOTE(review): presumably waits for the train.py processes to finish -- confirm in utils.process_check.
    ret = utils.process_check(100, cmd)
    assert ret

    log_file = os.path.join(cur_model_path, "train/device{}/log")
    for i in range(8):
        per_step_time = utils.get_perf_data(log_file.format(i))
        print("per_step_time is", per_step_time)
        assert per_step_time < 530.0
    loss_list = []
    for i in range(8):
        loss = utils.get_loss_data_list(log_file.format(i))
        print("loss is", loss[-1])
        # Only the last logged loss of each log contributes to the average.
        loss_list.append(loss[-1])
    assert sum(loss_list) / len(loss_list) < 2.5
Exemplo n.º 4
0
def test_transformer_export_mindir():
    """Run the transformer ``export.py`` with ``--file_format=MINDIR`` and check the output file exists.

    The ``transformer`` model is copied next to this test file, the checkpoint
    placeholder in ``src/eval_config.py`` is replaced with a real checkpoint
    path, the explicit ``device_id`` argument is removed from the
    ``context.set_context`` call in ``export.py``, and the export is run. The
    test passes when ``<export_file>.mindir`` exists in the model directory.
    """
    cur_path = os.path.dirname(os.path.abspath(__file__))
    model_path = "{}/../../../../model_zoo/official/nlp".format(cur_path)
    model_name = "transformer"
    utils.copy_files(model_path, cur_path, model_name)
    cur_model_path = os.path.join(cur_path, model_name)
    export_file = "transformer80_bs_0"
    ckpt_path = os.path.join(utils.ckpt_root,
                             "transformer/transformer_trained.ckpt")
    print("ckpt_path:", ckpt_path)
    old_list = ["'model_file': '/your/path/checkpoint_file'"]
    new_list = ["'model_file': '{}'".format(ckpt_path)]
    utils.exec_sed_command(old_list, new_list,
                           os.path.join(cur_model_path, "src/eval_config.py"))
    old_list = ["context.set_context(device_id=args.device_id)"]
    new_list = ["context.set_context()"]
    utils.exec_sed_command(old_list, new_list,
                           os.path.join(cur_model_path, "export.py"))
    # cd by absolute path: the copied model (and therefore the exported file
    # checked below) lives next to this file, which is not necessarily the
    # working directory the test runner was started from.
    exec_export_shell = "cd {}; python -u export.py --file_name={}" \
                        " --file_format=MINDIR".format(cur_model_path, export_file)
    os.system(exec_export_shell)
    assert os.path.exists(
        os.path.join(cur_model_path, "{}.mindir".format(export_file)))
Exemplo n.º 5
0
def test_SSD_mobilenet_v1_fpn_coco2017():
    """Run the SSD ``run_distribute_train.sh`` script and check its logs.

    The ``ssd`` model is copied next to this test file, the dataset and
    checkpoint placeholders in ``src/config_ssd_mobilenet_v1_fpn.py`` are
    replaced, ``src/config.py`` is switched from ``ssd300`` to
    ``ssd_mobilenet_v1_fpn``, ``train.py`` is patched (epoch size -> ``5``,
    ``sink_size=100`` added), the script is run, and the logs ``LOG0`` ..
    ``LOG7`` are checked against a per-step-time threshold and an average
    final-loss threshold.
    """
    cur_path = os.path.dirname(os.path.abspath(__file__))
    model_path = "{}/../../../../model_zoo/official/cv".format(cur_path)
    model_name = "ssd"
    utils.copy_files(model_path, cur_path, model_name)
    cur_model_path = os.path.join(cur_path, model_name)

    # Placeholders in the model config and the values that replace them,
    # matched up by position.
    old_list = [
        "/data/MindRecord_COCO", "/ckpt/mobilenet_v1.ckpt", "/data/coco2017"
    ]
    new_list = [
        os.path.join(utils.data_root,
                     "coco/coco2017/mindrecord_train/ssd_mindrecord"),
        os.path.join(utils.ckpt_root, "ssd_mobilenet_v1/mobilenet-v1.ckpt"),
        os.path.join(utils.data_root, "coco/coco2017")
    ]
    utils.exec_sed_command(
        old_list, new_list,
        os.path.join(cur_model_path, "src/config_ssd_mobilenet_v1_fpn.py"))
    old_list = ["ssd300"]
    new_list = ["ssd_mobilenet_v1_fpn"]
    utils.exec_sed_command(old_list, new_list,
                           os.path.join(cur_model_path, "src/config.py"))
    old_list = ["args_opt.epoch_size", "dataset_sink_mode=dataset_sink_mode"]
    new_list = ["5", "dataset_sink_mode=dataset_sink_mode, sink_size=100"]
    utils.exec_sed_command(old_list, new_list,
                           os.path.join(cur_model_path, "train.py"))

    # cd by absolute path: the copied model lives next to this file, which is
    # not necessarily the working directory the test runner was started from.
    exec_network_shell = "cd {0}; sh -x scripts/run_distribute_train.sh 8 {1} 0.2 coco {2}"\
        .format(cur_model_path, 60, utils.rank_table_path)
    os.system(exec_network_shell)
    cmd = "ps -ef | grep train.py | grep coco | grep device_num | grep device_id | grep -v grep"
    # NOTE(review): presumably waits for the train.py processes to finish -- confirm in utils.process_check.
    ret = utils.process_check(120, cmd)
    assert ret

    log_file = os.path.join(cur_model_path, "LOG{}/log.txt")
    for i in range(8):
        per_step_time = utils.get_perf_data(log_file.format(i))
        print("per_step_time is", per_step_time)
        assert per_step_time < 545
    loss_list = []
    for i in range(8):
        loss = utils.get_loss_data_list(log_file.format(i))
        print("loss is", loss[-1])
        # Only the last logged loss of each log contributes to the average.
        loss_list.append(loss[-1])
    assert sum(loss_list) / len(loss_list) < 2.72
Exemplo n.º 6
0
def test_BGCF_amazon_beauty():
    """Run the BGCF ``run_train_ascend.sh`` script and check loss and epoch cost from its log.

    The ``bgcf`` model is copied next to this test file, the data path in the
    launch script is replaced, ``default=600,`` in ``src/config.py`` becomes
    ``default=50,``, the explicit ``device_id`` argument is removed from the
    ``context.set_context`` call in ``train.py``, and the script is run. The
    average of the last 5 logged losses and the average logged ``cost:``
    value (first entry skipped) are then compared against thresholds.
    """
    cur_path = os.path.dirname(os.path.abspath(__file__))
    model_path = "{}/../../../../model_zoo/official/gnn".format(cur_path)
    model_name = "bgcf"
    utils.copy_files(model_path, cur_path, model_name)
    cur_model_path = os.path.join(cur_path, model_name)

    old_list = ["--datapath=../data_mr"]
    new_list = [
        "--datapath={}".format(
            os.path.join(utils.data_root, "amazon_beauty/mindrecord_train"))
    ]
    utils.exec_sed_command(
        old_list, new_list,
        os.path.join(cur_model_path, "scripts/run_train_ascend.sh"))
    old_list = ["default=600,"]
    new_list = ["default=50,"]
    utils.exec_sed_command(old_list, new_list,
                           os.path.join(cur_model_path, "src/config.py"))
    old_list = ["context.set_context(device_id=int(parser.device))"]
    new_list = ["context.set_context()"]
    utils.exec_sed_command(old_list, new_list,
                           os.path.join(cur_model_path, "train.py"))
    # cd by absolute path: the copied model lives next to this file, which is
    # not necessarily the working directory the test runner was started from.
    exec_network_shell = "cd {}/scripts; bash run_train_ascend.sh".format(
        cur_model_path)
    ret = os.system(exec_network_shell)
    assert ret == 0

    cmd = "ps -ef|grep python |grep train.py|grep amazon_beauty|grep -v grep"
    # NOTE(review): presumably waits for the train.py process to finish -- confirm in utils.process_check.
    ret = utils.process_check(300, cmd)
    assert ret

    log_file = os.path.join(cur_model_path, "scripts/train/log")
    loss_pattern = r"loss ([\d\.\+]+)\,"
    loss_list = utils.parse_log_file(loss_pattern, log_file)
    loss_list = loss_list[-5:]
    # Fail with a readable message instead of a ZeroDivisionError on an empty log.
    assert loss_list, "no loss values found in {}".format(log_file)
    print("last 5 epoch average loss is", sum(loss_list) / len(loss_list))
    assert sum(loss_list) / len(loss_list) < 6400

    cost_pattern = r"cost:([\d\.\+]+)"
    # The first cost entry is skipped before averaging.
    epoch_time_list = utils.parse_log_file(cost_pattern, log_file)[1:]
    assert epoch_time_list, \
        "no cost values left after skipping the first one in {}".format(log_file)
    print("per epoch time:", sum(epoch_time_list) / len(epoch_time_list))
    assert sum(epoch_time_list) / len(epoch_time_list) < 1.9
Exemplo n.º 7
0
def test_yolov3_darknet_8p():
    """Run the YOLOv3-DarkNet53 ``run_distribute_train.sh`` script and check throughput and loss.

    The ``yolov3_darknet53`` model is copied next to this test file, the
    launch script gains ``--training_shape=416``, ``train.py`` is patched
    (``default=100`` -> ``default=10``, ``max_epoch`` -> ``1``), the dataset
    sampler call gains ``num_samples=100*batch_size``, and the script is run.
    The ``train_parallel0`` log is then checked: the average ``imgs/sec``
    (first entry skipped) must exceed a threshold and the last logged loss
    must be below a threshold.
    """
    cur_path = os.path.dirname(os.path.abspath(__file__))
    model_path = "{}/../../../../model_zoo/official/cv".format(cur_path)
    model_name = "yolov3_darknet53"
    dataset_path = os.path.join(utils.data_root, "coco/coco2014/")
    ckpt_path = os.path.join(utils.ckpt_root,
                             "yolov3_darknet/yolov3_darknet53_pretrain.ckpt")
    utils.copy_files(model_path, cur_path, model_name)
    cur_model_path = os.path.join(cur_path, model_name)
    train_file = os.path.join(cur_model_path, "train.py")
    old_list = ["--lr_scheduler=cosine_annealing"]
    new_list = ["--lr_scheduler=cosine_annealing --training_shape=416"]
    utils.exec_sed_command(
        old_list, new_list,
        os.path.join(cur_model_path, "scripts/run_distribute_train.sh"))
    old_list = ["default=100", "max_epoch=args.max_epoch"]
    new_list = ["default=10", "max_epoch=1"]
    utils.exec_sed_command(old_list, new_list, train_file)
    old_list = ["sampler=distributed_sampler"]
    new_list = ["sampler=distributed_sampler, num_samples=100*batch_size"]
    utils.exec_sed_command(old_list, new_list,
                           os.path.join(cur_model_path, "src/yolo_dataset.py"))
    # cd by absolute path: the copied model lives next to this file, which is
    # not necessarily the working directory the test runner was started from.
    exec_network_shell = "cd {0}/scripts; bash run_distribute_train.sh {1} {2} {3}"\
        .format(cur_model_path, dataset_path, ckpt_path, utils.rank_table_path)
    os.system(exec_network_shell)
    cmd = "ps -ef | grep python | grep train.py | grep -v grep"
    # NOTE(review): presumably waits for the train.py processes to finish -- confirm in utils.process_check.
    ret = utils.process_check(120, cmd)
    assert ret
    train_log_file = os.path.join(cur_model_path,
                                  "scripts/train_parallel0/log.txt")
    fps_pattern = r", *([\d\.]+) imgs/sec"
    loss_pattern = r"loss:*([\d\.]+),"
    # The first throughput entry is skipped before averaging.
    fps_list = utils.parse_log_file(fps_pattern, train_log_file)[1:]
    # Fail with a readable message instead of a ZeroDivisionError on a short log.
    assert fps_list, \
        "no imgs/sec values left after skipping the first one in {}".format(train_log_file)
    assert sum(fps_list) / len(fps_list) > 480
    loss_list = utils.parse_log_file(loss_pattern, train_log_file)
    # Fail with a readable message instead of an IndexError on an empty log.
    assert loss_list, "no loss values found in {}".format(train_log_file)
    assert loss_list[-1] < 280