Example #1
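    # parse command-line options, then read the launcher environment to configure the distributed run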
    parser = argparse.ArgumentParser()
    parser.add_argument("--run_num", default='00', type=str)
    parser.add_argument("--yaml_config",
                        default='./config/UNet.yaml',
                        type=str)
    parser.add_argument("--config", default='default', type=str)
    parser.add_argument("--comm_mode", default='slurm-nccl', type=str)
    parser.add_argument("--io_only", action="store_true")
    parser.add_argument("--enable_amp", action="store_true")
    parser.add_argument("--cpu_pipeline", action="store_true")
    parser.add_argument("--no_copy", action="store_true")
    args = parser.parse_args()

    run_num = args.run_num

    params = YParams(os.path.abspath(args.yaml_config), args.config)

    # get env variables
    if args.comm_mode == "openmpi-nccl":
        # use the PMIx server address: only works for a single node
        addrport = os.getenv("PMIX_SERVER_URI2").split("//")[1]
        comm_addr = addrport.split(":")[0]
        comm_rank = int(os.getenv('OMPI_COMM_WORLD_RANK', 0))
        comm_size = int(os.getenv("OMPI_COMM_WORLD_SIZE", 0))
    elif args.comm_mode == "slurm-nccl":
        comm_addr = os.getenv("SLURM_SRUN_COMM_HOST")
        comm_size = int(os.getenv("SLURM_NTASKS"))
        comm_rank = int(os.getenv("PMI_RANK"))

    # common to both launch modes: derive the local rank (GPU index on this node) from the global rank
    comm_local_rank = comm_rank % torch.cuda.device_count()
Example #2
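        # restore model weights, optimizer state, and training progress from the checkpoint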
        self.model.load_state_dict(checkpoint['model_state'])
        self.iters = checkpoint['iters']
        self.startEpoch = checkpoint['epoch'] + 1
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", default=0, type=int)
    parser.add_argument("--yaml_config",
                        default='./config/cifar100.yaml',
                        type=str)
    parser.add_argument("--config", default='default', type=str)
    args = parser.parse_args()

    params = YParams(os.path.abspath(args.yaml_config), args.config)

    # set up distributed training variables and initialize the process group if needed
    params['world_size'] = 1
    if 'WORLD_SIZE' in os.environ:
        params['world_size'] = int(os.environ['WORLD_SIZE'])

    params['local_rank'] = args.local_rank
    params['world_rank'] = 0
    if params['world_size'] > 1:
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(backend='nccl', init_method='env://')
        params['world_rank'] = dist.get_rank()
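        # keep the global batch size for reference and split it evenly across ranks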
        params['global_batch_size'] = params.batch_size
        params['batch_size'] = int(params.batch_size // params['world_size'])
Example #3
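# Stage the training data into node-local DRAM (tmpfs) before running distributed training over NCCL.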
import os
import time

import torch
import torch.distributed as dist

from utils.YParams import YParams

stage_target = "lustre"

# get env variables
comm_addr = os.getenv("SLURM_SRUN_COMM_HOST")
comm_size = int(os.getenv("SLURM_NTASKS"))
comm_rank = int(os.getenv("PMI_RANK"))
comm_local_rank = comm_rank % torch.cuda.device_count()
comm_port = "29500"
os.environ["MASTER_ADDR"] = comm_addr
os.environ["MASTER_PORT"] = comm_port

# init process group
dist.init_process_group(backend="nccl", rank=comm_rank, world_size=comm_size)

# load parameters
params = YParams("config/UNet_transpose.yaml", "default")
device = torch.device("cuda:{}".format(comm_local_rank))

# setup
dist.barrier()
tstart = time.time()
# stage in?
if stage_target == "dram":
    # copy the input file into local DRAM for each socket:
    tmpfs_root = '/dev/shm'
    #tmpfs_root = '/tmp'
    #tmpfs_root = '/run/cosmo_data'
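    # assumes a dual-socket node: pick the NUMA domain that matches this rank's GPU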
    gpus_per_socket = torch.cuda.device_count() // 2
    socket = 0 if comm_local_rank < gpus_per_socket else 1
    new_data_path = os.path.join(tmpfs_root, 'numa{}'.format(socket),
                                 os.path.basename(params.data_path))
Example #4
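                    # generator/discriminator weights and optimizer states stored in one checkpoint file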
                    'G_state': netG.state_dict(),
                    'D_state': netD.state_dict(),
                    'optimizerG_state_dict': optimizerG.state_dict(),
                    'optimizerD_state_dict': optimizerD.state_dict()
                }, params.checkpoint_file)


if __name__ == '__main__':

    torch.backends.cudnn.benchmark = True
    if len(sys.argv) != 3:
        logging.error("Usage", sys.argv[0], "configuration_YAML_file",
                      "configuration")
        exit()

    params = YParams(os.path.abspath(sys.argv[1]), sys.argv[2])
    if not os.path.exists(params.experiment_dir):
        os.makedirs(os.path.abspath(params.experiment_dir))

    logging_utils.log_to_file(logger_name=None,
                              log_filename=os.path.join(
                                  params.experiment_dir, 'out.log'))
    params.log()
    tboard_writer = SummaryWriter(
        log_dir=os.path.join(params.experiment_dir, 'logs/'))

    params.experiment_dir = os.path.abspath(params.experiment_dir)
    params.checkpoint_file = os.path.join(params.experiment_dir, 'checkpt.tar')

    if params.seed:
        random.seed(params.seed)
Example #5
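            # (debug) print shapes, then record the normalized and unnormalized parameters for this batch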
            print(upars.shape)
            print(unnormalized(upars[:, 0], 'R0').shape)
            pars_norm[i * batch:(i + 1) * batch, :] = upars
            pars_orig[i * batch:(i + 1) * batch, :] = np.stack([
                unnormalized(upars[:, 0], 'R0'),
                unnormalized(upars[:, 1], 'WFHcomp'),
                unnormalized(upars[:, 2], 'WFHdays')
            ], axis=-1)
    print("Output: shape %s, type %s, size %f MB" %
          (str(out.shape), str(out.dtype), out.nbytes / 1e6))
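    # write the generated cube and both parameter sets into a single HDF5 file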
    with h5py.File(outname, 'w') as f:
        f.create_dataset('symptomatic3D', data=out)
        f.create_dataset('parBio', data=pars_orig)
        f.create_dataset('uniBio', data=pars_norm)
    print("Saved output to %s" % (outname))


if __name__ == '__main__':

    torch.backends.cudnn.benchmark = True
    if len(sys.argv) != 5:
        print("Usage", sys.argv[0], "configuration_YAML_file", "configuration",
              "checkpoint", "outfile")
        exit()

    params = YParams(os.path.abspath(sys.argv[1]), sys.argv[2])
    N = 64
    bs = 64
    generate(params, sys.argv[3], sys.argv[4], num=N, batch=bs)
Example #6
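# Benchmark: iterate once over the training data loader and report the iteration rate.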
from utils.YParams import YParams
import torch
from utils.data_loader_opt import get_data_loader_distributed
import time

# load parameters
params = YParams("config/UNet.yaml", "default")
device = torch.device("cuda:0")

train_data_loader = get_data_loader_distributed(params, 0)

it = 0
tstart = time.time()
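# one full pass over the loader, copying each batch to the GPU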
for inp, tar in train_data_loader:
    inp = inp.to(device)
    tar = tar.to(device)
    it += 1
tend = time.time()
print("Iterations took {}s for {} iterations ({} iter/s)".format(tend - tstart, it, float(it)/(tend - tstart)))
Example #7
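# Same throughput benchmark, but using the DALI-based data loader.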
from utils.YParams import YParams
import torch
from utils.data_loader_dali_smooth import get_data_loader_distributed
import time

# load parameters
params = YParams("config/UNet_transpose.yaml", "default")
device = torch.device("cuda:0")

# get data loader
train_data_loader = get_data_loader_distributed(params, 0)

it = 0
tstart = time.time()
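# one full pass over the DALI loader; unlike above, no explicit .to(device) copies are made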
for inp, tar in train_data_loader:
    it += 1
tend = time.time()
print("Iterations took {}s for {} iterations ({} iter/s)".format(
    tend - tstart, it,
    float(it) / (tend - tstart)))
Example #8

if __name__ == '__main__':
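    # launch-utility style: each process receives --local_rank and reads WORLD_SIZE from the environment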

    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", default=0, type=int)
    parser.add_argument("--run_num", default='00', type=str)
    parser.add_argument("--yaml_config",
                        default='./config/UNet.yaml',
                        type=str)
    parser.add_argument("--config", default='default', type=str)
    args = parser.parse_args()

    run_num = args.run_num

    params = YParams(os.path.abspath(args.yaml_config), args.config)

    params.distributed = False
    if 'WORLD_SIZE' in os.environ:
        params.distributed = int(os.environ['WORLD_SIZE']) > 1

    world_rank = 0
    if params.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.gpu = args.local_rank
        world_rank = torch.distributed.get_rank()

    torch.backends.cudnn.benchmark = True
Example #9
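    # parse run options (including a global_timing flag) and read the launcher environment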
    parser = argparse.ArgumentParser()
    parser.add_argument("--run_num", default='00', type=str)
    parser.add_argument("--yaml_config",
                        default='./config/UNet.yaml',
                        type=str)
    parser.add_argument("--config", default='default', type=str)
    parser.add_argument("--comm_mode", default='slurm-nccl', type=str)
    parser.add_argument("--io_only", action="store_true")
    parser.add_argument("--enable_amp", action="store_true")
    parser.add_argument("--global_timing", action="store_true")
    args = parser.parse_args()

    run_num = args.run_num

    params = YParams(os.path.abspath(args.yaml_config), args.config)
    params.distributed = True

    # get env variables
    if args.comm_mode == "openmpi-nccl":
        # use the PMIx server address: only works for a single node
        addrport = os.getenv("PMIX_SERVER_URI2").split("//")[1]
        comm_addr = addrport.split(":")[0]
        comm_rank = int(os.getenv('OMPI_COMM_WORLD_RANK', 0))
        comm_size = int(os.getenv("OMPI_COMM_WORLD_SIZE", 0))
    elif args.comm_mode == "slurm-nccl":
        comm_addr = os.getenv("SLURM_SRUN_COMM_HOST")
        comm_size = int(os.getenv("SLURM_NTASKS"))
        comm_rank = int(os.getenv("PMI_RANK"))

    # common to both launch modes: derive the local rank from the global rank