def train_distributed(replica_id, replica_count, port, args, params): os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = str(port) torch.distributed.init_process_group( "nccl", rank=replica_id, world_size=replica_count ) device = torch.device("cuda", replica_id) torch.cuda.set_device(device) model = DiffWave(params).to(device) model = DistributedDataParallel(model, device_ids=[replica_id]) dataset = dataset_from_path( args.data_dirs, params, is_distributed=True, spec_filename_suffix=args.spec_filename_suffix, duplicates_suffix_regex=args.duplicates_suffix_regex, ) _train_impl( replica_id, model, dataset, args, params, args.checkpoint, )
# Single-GPU training entry point.
def train(args, params):
    dataset = dataset_from_path(
        args.data_dirs,
        params,
        spec_filename_suffix=args.spec_filename_suffix,
        duplicates_suffix_regex=args.duplicates_suffix_regex,
    )
    model = DiffWave(params).cuda()
    _train_impl(0, model, dataset, args, params, args.checkpoint)
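# Illustrative launcher sketch (not part of the original module): spawn one
# worker process per visible GPU and hand each its replica id via
# torch.multiprocessing.spawn. The `main` wrapper and the fixed rendezvous
# port are assumptions about how a caller might drive train_distributed.
import torch
import torch.multiprocessing as mp


def main(args, params):
    replica_count = torch.cuda.device_count()
    port = 12355  # arbitrary free TCP port for the process-group rendezvous
    if replica_count > 1:
        # spawn calls fn(i, *args), so each worker receives its replica_id first.
        mp.spawn(
            train_distributed,
            args=(replica_count, port, args, params),
            nprocs=replica_count,
            join=True,
        )
    else:
        train(args, params)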