Example #1
import os


def get_ckpt_dir(args):
    """Return the checkpoint directory, one subdirectory per KungFu worker."""
    directory = args.ckpt_dir
    if args.use_kungfu:
        # kfops: the project's KungFu bindings, imported elsewhere in the repo.
        size = kfops.kungfu_current_cluster_size()
        rank = kfops.kungfu_current_rank()
        # Key the subdirectory by (cluster size, rank), e.g. 'ckpt/4-2'.
        directory = os.path.join(directory, '%d-%d' % (size, rank))
    return directory
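As a quick illustration (not from the source project): the non-KungFu branch is runnable as-is, while the KungFu result is shown in a comment because kfops needs a running cluster. Assumed values: args.ckpt_dir = 'checkpoints' and a 4-worker cluster.

from types import SimpleNamespace

args = SimpleNamespace(ckpt_dir='checkpoints', use_kungfu=False)
print(get_ckpt_dir(args))  # checkpoints
# With use_kungfu=True on a 4-worker cluster, worker 2 would get
# 'checkpoints/4-2', so runs with different cluster sizes keep
# separate checkpoint trees.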
Example #2
import glob
import os
import time

import mindspore as ms
from mindspore.train.serialization import save_checkpoint

# LeNet5, build_optimizer, create_dataset, load_ckpt, train, test and the
# kfops bindings are defined elsewhere in the source project.


def run(args):
    ms.context.set_context(
        mode=ms.context.GRAPH_MODE,
        device_target=args.device,
        save_graphs=False,
    )

    net = LeNet5(
        num_class=10,
        num_channel=3,
        use_bn=args.use_bn,
        dbg_log_tensor=args.log_tensor,
    )

    loss = ms.nn.loss.SoftmaxCrossEntropyWithLogits(
        sparse=True,
        reduction='mean',
    )
    opt = build_optimizer(args, net)

    if args.mode == 'init':
        save_checkpoint(
            net,
            ckpt_file_name=os.path.join('seeds', '%d.ckpt' % (time.time())),
        )

    if args.mode == 'train':
        ds_train = create_dataset(
            args=args,
            data_path=os.path.join(args.data_path, 'train'),
            batch_size=args.device_batch_size,
        )
        if args.init_ckpt:
            print('using init checkpoint %s' % (args.init_ckpt))
            load_ckpt(net, args.init_ckpt)
        train(args, net, loss, opt, ds_train)

    if args.mode == 'test':
        if args.use_kungfu:
            rank = kfops.kungfu_current_rank()
            if rank > 0:
                return
        ds_test = create_dataset(
            args=args,
            data_path=os.path.join(args.data_path, 'test'),
            batch_size=args.device_batch_size,
        )

        if args.ckpt_files:
            checkpoints = args.ckpt_files.split(',')
        else:
            checkpoint_dir = get_ckpt_dir(args)
            print('checkpoint_dir: %s' % (checkpoint_dir))
            checkpoints = list(sorted(glob.glob(checkpoint_dir + '/*.ckpt')))
        print('will test %d checkpoints' % (len(checkpoints)))
        # for i, n in enumerate(checkpoints):
        #     print('[%d]=%s' % (i, n))
        test(args, net, loss, opt, ds_test, checkpoints)
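run() reads a number of attributes off args; the parser itself is not part of this example, so the following argparse sketch only infers flag names from the usage above and is an assumption, not the project's actual CLI (build_optimizer and create_dataset may read further flags).

import argparse

def parse_args():
    # Hypothetical parser inferred from the attributes run() accesses.
    p = argparse.ArgumentParser()
    p.add_argument('--mode', choices=['init', 'train', 'test'], default='train')
    p.add_argument('--device', default='GPU')
    p.add_argument('--data-path', dest='data_path', default='./cifar10')
    p.add_argument('--device-batch-size', dest='device_batch_size',
                   type=int, default=32)
    p.add_argument('--use-bn', dest='use_bn', action='store_true')
    p.add_argument('--log-tensor', dest='log_tensor', action='store_true')
    p.add_argument('--use-kungfu', dest='use_kungfu', action='store_true')
    p.add_argument('--init-ckpt', dest='init_ckpt', default='')
    p.add_argument('--ckpt-files', dest='ckpt_files', default='')
    p.add_argument('--ckpt-dir', dest='ckpt_dir', default='checkpoints')
    return p.parse_args()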
Example #3
import os
import time


def get_eval_result_filename(args):
    """Build a unique result filename from the batch sizes, a timestamp
    and, under KungFu, the worker rank."""
    timestamp = time.time()
    filename = 'lbs-%d+dbs-%d-%d.txt' % (args.logical_batch_size,
                                         args.device_batch_size, timestamp)
    if args.use_kungfu:
        rank = kfops.kungfu_current_rank()
        filename = 'worker.%d.%s' % (rank, filename)
    filename = os.path.join('plot', filename)
    return filename
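For illustration only (all values assumed): with logical batch size 4096, device batch size 64 and KungFu rank 2, the pieces above combine into a path such as the following.

import os

rank, timestamp = 2, 1700000000        # assumed rank and time.time() value
filename = 'lbs-%d+dbs-%d-%d.txt' % (4096, 64, timestamp)
filename = 'worker.%d.%s' % (rank, filename)
print(os.path.join('plot', filename))  # plot/worker.2.lbs-4096+dbs-64-1700000000.txt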
Example #4
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset.transforms.c_transforms as C2
import mindspore.dataset.vision.c_transforms as C


def create_dataset(args, data_path, batch_size):
    if args.mode == 'train' and args.use_kungfu:
        rank = kfops.kungfu_current_rank()
        size = kfops.kungfu_current_cluster_size()
        ds = de.Cifar10Dataset(
            data_path,
            num_parallel_workers=8,
            shuffle=False,
            num_shards=size,
            shard_id=rank,
        )
        print('using shard %d of %d' % (rank, size))
    else:
        ds = de.Cifar10Dataset(
            data_path,
            num_parallel_workers=8,
            shuffle=False,
        )

    # define map operations
    trans = []
    # if do_train:
    #     trans += [
    #         # C.RandomCrop((32, 32), (4, 4, 4, 4)),
    #         # C.RandomHorizontalFlip(prob=0.5)
    #     ]

    trans += [
        C.Rescale(1.0 / 255.0, 0.0),
        C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
        C.HWC2CHW()
    ]

    type_cast_op = C2.TypeCast(mstype.int32)

    ds = ds.map(operations=type_cast_op,
                input_columns="label",
                num_parallel_workers=8)
    ds = ds.map(operations=trans,
                input_columns="image",
                num_parallel_workers=8)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)

    return ds
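Because the KungFu branch passes num_shards=size and shard_id=rank, each worker reads a disjoint 1/size slice of the data, and drop_remainder=True discards the last partial batch. A back-of-the-envelope sketch, assuming the standard CIFAR-10 train split of 50000 images and illustrative cluster and batch sizes:

total_images = 50000          # standard CIFAR-10 train split
size = 4                      # assumed cluster size
device_batch_size = 64        # assumed per-device batch size

images_per_shard = total_images // size                   # 12500 per worker
steps_per_epoch = images_per_shard // device_batch_size   # 195 steps; remainder dropped
print(images_per_shard, steps_per_epoch)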
Example #5
import os


def get_ckpt_dir(args):
    """Variant of the function in Example #1 that keys the subdirectory
    by worker rank only."""
    directory = args.ckpt_dir
    if args.use_kungfu:
        rank = kfops.kungfu_current_rank()
        # One subdirectory per worker rank, e.g. 'ckpt/2'.
        directory = os.path.join(directory, '%d' % (rank))
    return directory
Example #6
        # GPU target (excerpt from a larger training script: the code below
        # configures data-parallel training and per-worker checkpoint dirs)
        else:
            init()
            context.set_auto_parallel_context(
                device_num=get_group_size(),
                parallel_mode=ParallelMode.DATA_PARALLEL,
                gradients_mean=True)
            if args_opt.net == "resnet50":
                context.set_auto_parallel_context(
                    all_reduce_fusion_config=[85, 160])
        ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(
            get_rank()) + "/"

    if args_opt.run_kungfu:
        kfops.init(args_opt.device_target)
        rank = kfops.kungfu_current_rank()
        size = kfops.kungfu_current_cluster_size()
        print('kungfu rank=%d, size=%d' % (rank, size))
        if args_opt.elastic:
            version = os.getenv('KUNGFU_INIT_CLUSTER_VERSION')
            ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(
                rank) + '@' + version + "/"
        else:
            ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(
                rank) + "/"

    # create dataset
    dataset = create_dataset(dataset_path=args_opt.dataset_path,
                             do_train=True,
                             repeat_num=100,
                             batch_size=config.batch_size)
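The elastic branch tags each checkpoint directory with the cluster version so that checkpoints written before and after a resize do not collide. A small illustration with assumed values (the rank and path below are stand-ins, not the project's objects):

import os

rank = 2                                   # assumed KungFu rank
save_checkpoint_path = './checkpoints/'    # stand-in for config.save_checkpoint_path
version = os.getenv('KUNGFU_INIT_CLUSTER_VERSION', '0')
print(save_checkpoint_path + 'ckpt_' + str(rank) + '@' + version + '/')
# -> ./checkpoints/ckpt_2@0/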