def main(args):
    """Launch one training worker process per local GPU.

    Creates ``<args.dir>/ckpt`` if needed and records it on ``args`` as
    ``ckptpath``. ``args.world_size`` is rescaled from its incoming value
    to that value multiplied by the number of local GPUs before the
    workers are spawned.

    Returns:
        None. Returns early (after printing a message) when CUDA is not
        available.

    Raises:
        AssertionError: the checkpoint directory already contains files
            while ``args.debug`` is falsy and ``args.resume`` is None.
    """
    if not torch.cuda.is_available():
        utils.highlight_msg("CPU only training is unsupported.")
        return None

    ckpt_dir = args.dir+'/ckpt'
    os.makedirs(ckpt_dir, exist_ok=True)
    args.ckptpath = ckpt_dir

    # Refuse to clobber an earlier experiment unless debugging or resuming.
    if os.listdir(args.ckptpath) and not args.debug and args.resume is None:
        utils.highlight_msg(
            f"ERROR:\nCheckpoint path `{args.ckptpath}` is not empty!\nRefuse to run the experiment, otherwise previous files would be overwritten.")
        raise AssertionError

    ngpus_per_node = torch.cuda.device_count()
    total_gpus = ngpus_per_node * args.world_size
    args.world_size = total_gpus
    print(f"Global number of GPUs: {total_gpus}")

    mp.spawn(
        main_worker,
        nprocs=ngpus_per_node,
        args=(ngpus_per_node, args),
    )
def main(args):
    """Dispatch inference to the CPU, a single GPU, or all local GPUs.

    * Without CUDA, everything runs through ``single_worker('cpu', ...)``.
    * When ``args.nj`` does not exceed the local GPU count, only
      ``cuda:0`` is used.
    * Otherwise one ``main_worker`` process is spawned per local GPU. If
      the dataset length is not a multiple of ``args.world_size``, the
      workers are spawned with one job fewer and an extra single-GPU
      pass is started with ``len(dataset) - remainder`` as its last
      argument.

    ``args.world_size`` is multiplied by the local GPU count in place.
    Always returns None.
    """
    if not torch.cuda.is_available():
        utils.highlight_msg("Using CPU.")
        single_worker('cpu', args.nj, args)
        return None

    gpu_count = torch.cuda.device_count()
    args.world_size = gpu_count * args.world_size
    print(f"> Global number of GPUs: {args.world_size}")

    job_count = args.nj
    if job_count <= gpu_count:
        utils.highlight_msg(
            f"Number of jobs (--nj={job_count}) is too small.\nUse only one GPU for avoiding errors.")
        single_worker("cuda:0", job_count, args)
        return None

    dataset = InferDataset(args.input_scp)
    remainder = len(dataset) % args.world_size

    if remainder != 0:
        # This is a hack for a data length that is not divisible by the
        # number of GPUs.
        utils.highlight_msg("Using hack to deal with undivisible data length.")
        mp.spawn(
            main_worker,
            nprocs=gpu_count,
            args=(gpu_count, args, job_count-1),
        )
        single_worker("cuda:0", 1, args, len(dataset)-remainder)
        return None

    mp.spawn(
        main_worker,
        nprocs=gpu_count,
        args=(gpu_count, args, job_count),
    )
    return None
def build_model(args, configuration, train=True) -> nn.Module:
    """Build the network described by ``configuration['net']``.

    Args:
        args: run options; ``args.rank`` and ``args.gpu`` are read and
            ``args.iscrf`` is written (training mode only).
        configuration: mapping with a ``'net'`` entry providing ``'type'``
            (attribute name looked up in ``model_zoo``) and ``'kwargs'``,
            plus optional ``'lossfn'``, ``'lamb'`` and ``'specaug'``.
        train: when False, return the bare network built from
            ``kwargs`` with no loss wrapper, device placement or DDP.

    Returns:
        The bare network (``train=False``) or a ``CAT_Model`` moved to
        ``args.gpu`` and wrapped in ``DistributedDataParallel``.
    """
    net_cfg = configuration['net']
    net_kwargs = net_cfg['kwargs']
    net_cls = getattr(model_zoo, net_cfg['type'])

    if not train:
        return net_cls(**net_kwargs)

    # Loss function: falls back to 'crf' with a warning.
    if 'lossfn' in net_cfg:
        lossfn = net_cfg['lossfn']
    else:
        lossfn = 'crf'
        utils.highlight_msg(
            "Warning: not specified \'lossfn\' in configuration.\nDefaultly set to \'crf\'")
    is_crf = lossfn == 'crf'

    # Loss weight: falls back to 0.01; the warning is only shown for CRF.
    if 'lamb' in net_cfg:
        lamb = net_cfg['lamb']
    else:
        lamb = 0.01
        if is_crf:
            utils.highlight_msg(
                "Warning: not specified \'lamb\' in configuration.\nDefaultly set to 0.01")

    # SpecAug is optional; only rank 0 reports that it is disabled.
    if 'specaug' in net_cfg:
        specaug = SpecAug(**net_cfg['specaug'])
    else:
        specaug = None
        if args.rank == 0:
            utils.highlight_msg("Disable SpecAug.")

    args.iscrf = is_crf

    wrapped = CAT_Model(net_cls, lossfn, lamb, net_kwargs, specaug)
    torch.cuda.set_device(args.gpu)
    wrapped.cuda(args.gpu)
    return torch.nn.parallel.DistributedDataParallel(
        wrapped, device_ids=[args.gpu])
# Remaining command-line options, followed by the positional inputs.
parser.add_argument(
    "-f", "--format",
    type=str,
    choices=["hdf5", "pickle"],
    default="pickle",
)
parser.add_argument("-W", "--warning", action="store_true", default=False)
for positional in ("scp", "label", "weight", "output_path"):
    parser.add_argument(positional, type=str)

args = parser.parse_args()

if args.warning:
    utils.highlight_msg(
        "Calculation of CTC loss requires the input sequence to be longer than ctc_len(labels).\nCheck that in 'ctc-crf/convert_to.py' if your model does subsampling on seq.\nMake your modify at line 'if feature.shape[0] < ctc_len(label):' to filter unqualified seq.\nIf you have already done, ignore this."
    )

# Label file: first field is the key, the remaining fields are parsed as
# ints. A later duplicate key overrides an earlier one.
with open(args.label, 'r') as label_file:
    label_dict = {
        fields[0]: np.asarray([int(tok) for tok in fields[1:]])
        for fields in map(str.split, label_file)
    }

# Weight file: first field is the key, the second field is parsed as a
# float and stored as a one-element array.
with open(args.weight, 'r') as weight_file:
    weight_dict = {
        fields[0]: np.asarray([float(fields[1])])
        for fields in map(str.split, weight_file)
    }
def main_worker(gpu, ngpus_per_node, args):
    """Per-process training routine started once for each local GPU.

    Args:
        gpu: local GPU index of this process; stored as ``args.gpu``.
        ngpus_per_node: number of GPUs on this node. Used to derive the
            global rank (``args.rank * ngpus_per_node + gpu``) and to
            divide ``args.batch_size`` (integer division).
        args: run options, mutated in place (``gpu``, ``rank``,
            ``batch_size``).

    The function joins the process group, builds the train/dev datasets
    and loaders, constructs ``utils.Manager``, lets rank 0 print the model
    size and write ``readme.md``, then calls ``manager.run``. When
    ``args.iscrf`` is true, the CTC-CRF environment is initialised before
    the run and released after it.
    """
    args.gpu = gpu
    args.rank = args.rank * ngpus_per_node + gpu
    print(f"Use GPU: local[{args.gpu}] | global[{args.rank}]")

    dist.init_process_group(
        backend=args.dist_backend,
        init_method=args.dist_url,
        world_size=args.world_size,
        rank=args.rank,
    )

    # The configured batch size is split across the GPUs of this node.
    args.batch_size = args.batch_size // ngpus_per_node

    print("> Data prepare")
    if args.h5py:
        data_format = "hdf5"
        utils.highlight_msg("H5py reading might cause error with Multi-GPUs.")
        dataset_cls = DataSet.SpeechDataset
    else:
        data_format = "pickle"
        dataset_cls = DataSet.SpeechDatasetPickle

    data_root = f"{args.data}/{data_format}"
    train_set = dataset_cls(f"{data_root}/tr.{data_format}")
    dev_set = dataset_cls(f"{data_root}/cv.{data_format}")
    print("Data prepared.")

    train_sampler = DistributedSampler(train_set)
    dev_sampler = DistributedSampler(dev_set)
    dev_sampler.set_epoch(1)

    def _make_loader(dataset, sampler):
        # Both loaders share every option except the dataset/sampler pair.
        return DataLoader(
            dataset,
            batch_size=args.batch_size,
            shuffle=(sampler is None),
            num_workers=args.workers,
            pin_memory=True,
            sampler=sampler,
            collate_fn=DataSet.sortedPadCollate(),
        )

    trainloader = _make_loader(train_set, train_sampler)
    testloader = _make_loader(dev_set, dev_sampler)

    logger = OrderedDict([
        ('log_train', ['epoch,loss,loss_real,net_lr,time']),
        ('log_eval', ['loss_real,time']),
    ])
    manager = utils.Manager(logger, build_model, args)

    # GPU info is gathered on every rank; only rank 0 reports it.
    gpu_info = utils.gather_all_gpu_info(args.gpu)

    if args.rank == 0:
        print("> Model built.")
        param_count = utils.count_parameters(manager.model)
        print(f"Model size:{param_count/1e6:.2f}M")
        utils.gen_readme(
            args.dir+'/readme.md',
            model=manager.model,
            gpu_info=gpu_info,
        )

    # Init ctc-crf; args.iscrf is set in build_model.
    if args.iscrf:
        gpus = torch.IntTensor([args.gpu])
        ctc_crf_base.init_env(f"{args.data}/den_meta/den_lm.fst", gpus)

    # Training.
    manager.run(train_sampler, trainloader, testloader, args)

    if args.iscrf:
        ctc_crf_base.release_env(gpus)
help="Directory to save the log and model files.")
# NOTE(review): the line above is the tail of an add_argument call whose
# beginning is outside this view.

# Logging and data-loading options.
parser.add_argument('-p', '--print-freq', default=10, type=int, metavar='N', help='print frequency (default: 10)')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', help='number of data loading workers (default: 4)')

# Distributed-training options.
# NOTE(review): --rank and --world-size default to -1; presumably the
# launcher is expected to pass explicit values -- TODO confirm.
parser.add_argument('--rank', default=-1, type=int, help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://127.0.0.1:13943', type=str, help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str, help='distributed backend')
parser.add_argument('--world-size', default=-1, type=int, help='number of nodes for distributed training')
parser.add_argument('--gpu', default=None, type=int, help='GPU id to use.')

args = parser.parse_args()

# Seed the torch CPU generator, every CUDA device and NumPy from --seed,
# and set the cuDNN deterministic flag.
SEED = args.seed
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)
torch.backends.cudnn.deterministic = True

if args.debug:
    utils.highlight_msg("Debugging.")

# Hand the parsed options to the entry point.
main(args)