Example #1
def train_distributed(replica_id, replica_count, port, args, params):
    # Rendezvous over localhost so every replica joins the same NCCL process group.
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = str(port)
    torch.distributed.init_process_group(
        "nccl", rank=replica_id, world_size=replica_count
    )

    # One process per GPU: pin this replica's device, then wrap the model so
    # gradients are averaged across replicas.
    device = torch.device("cuda", replica_id)
    torch.cuda.set_device(device)
    model = DiffWave(params).to(device)
    model = DistributedDataParallel(model, device_ids=[replica_id])
    dataset = dataset_from_path(
        args.data_dirs,
        params,
        is_distributed=True,
        spec_filename_suffix=args.spec_filename_suffix,
        duplicates_suffix_regex=args.duplicates_suffix_regex,
    )
    _train_impl(
        replica_id,
        model,
        dataset,
        args,
        params,
        args.checkpoint,
    )
Example #2
def train(args, params):
    # Single-GPU path: same pipeline as above, minus the process group and DDP wrapper.
    dataset = dataset_from_path(
        args.data_dirs,
        params,
        spec_filename_suffix=args.spec_filename_suffix,
        duplicates_suffix_regex=args.duplicates_suffix_regex,
    )
    model = DiffWave(params).cuda()
    _train_impl(0, model, dataset, args, params, args.checkpoint)
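Examples #1 and #2 are the distributed and single-process entry points, respectively. Below is a minimal launcher sketch showing how they might be dispatched; the main() name and the free-port lookup via a throwaway socket are assumptions, not part of the snippets above. torch.multiprocessing.spawn passes the process index as the first argument, which becomes replica_id.

import socket
import torch
from torch.multiprocessing import spawn

def main(args, params):  # hypothetical entry point, not shown in the snippets
    replica_count = torch.cuda.device_count()
    if replica_count > 1:
        # Grab an unused TCP port for the NCCL rendezvous on localhost.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.bind(("localhost", 0))
            port = sock.getsockname()[1]
        # spawn() prepends the process index to args, so each child runs
        # train_distributed(replica_id, replica_count, port, args, params).
        spawn(train_distributed, args=(replica_count, port, args, params),
              nprocs=replica_count, join=True)
    else:
        train(args, params)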
Example #3
def train_distributed(replica_id, replica_count, port, args, params):
  os.environ['MASTER_ADDR'] = 'localhost'
  os.environ['MASTER_PORT'] = str(port)
  torch.distributed.init_process_group('nccl', rank=replica_id, world_size=replica_count)

  device = torch.device('cuda', replica_id)
  torch.cuda.set_device(device)
  model = DiffWave(params).to(device)
  model = DistributedDataParallel(model, device_ids=[replica_id])
  _train_impl(replica_id, model, dataset_from_path(args.data_dirs, params, is_distributed=True), args, params)
Example #4
def train(args, params):
    dataset = dataset_from_path(args.data_dirs, params)
    model = DiffWave(params).cuda()
    _train_impl(0, model, dataset, args, params)
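All four snippets read their inputs from an args namespace and a params object. As a rough illustration only (the flag spellings are assumptions; only the data_dirs and checkpoint attributes actually appear in the snippets, and params would come from the model's hyperparameter definitions), the command-line side might look like this:

import argparse

def parse_args():  # hypothetical CLI wiring, not taken from the snippets above
    parser = argparse.ArgumentParser(description="train a DiffWave model")
    parser.add_argument("data_dirs", nargs="+",
                        help="one or more directories of training data")
    parser.add_argument("--checkpoint", default=None,
                        help="checkpoint to resume from (referenced by Examples #1 and #2)")
    return parser.parse_args()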