Example #1
def main(args):
    if not torch.cuda.is_available():
        utils.highlight_msg("CPU only training is unsupported.")
        return None

    os.makedirs(args.dir+'/ckpt', exist_ok=True)
    setattr(args, 'ckptpath', args.dir+'/ckpt')
    if os.listdir(args.ckptpath) != [] and not args.debug and args.resume is None:
        utils.highlight_msg(
            f"ERROR:\nCheckpoint path `{args.ckptpath}` is not empty!\nRefuse to run the experiment, otherwise previous files would be overwritten.")
        raise AssertionError

    ngpus_per_node = torch.cuda.device_count()
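    # args.world_size initially holds the number of nodes (see --world-size in
    # Example #6); scaling by ngpus_per_node gives the total process count.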
    args.world_size = ngpus_per_node * args.world_size
    print(f"Global number of GPUs: {args.world_size}")
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
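
# For reference: torch.multiprocessing.spawn calls its target with the process
# index as the first positional argument, which main_worker (Example #5) uses
# as the local GPU id. A minimal, self-contained sketch (demo_worker is
# hypothetical, not part of this codebase):
import torch.multiprocessing as mp

def demo_worker(local_rank, total):
    # Each of the nprocs processes receives its own index 0..nprocs-1.
    print(f"worker {local_rank} of {total}")

if __name__ == "__main__":
    mp.spawn(demo_worker, nprocs=2, args=(2,))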
Example #2
def main(args):
    if not torch.cuda.is_available():
        utils.highlight_msg("Using CPU.")
        single_worker('cpu', args.nj, args)
        return None

    ngpus_per_node = torch.cuda.device_count()
    args.world_size = ngpus_per_node * args.world_size
    print(f"> Global number of GPUs: {args.world_size}")
    num_jobs = args.nj
    if num_jobs <= ngpus_per_node:
        utils.highlight_msg(
            f"Number of jobs (--nj={num_jobs}) is too small.\nUse only one GPU for avoiding errors.")
        single_worker("cuda:0", num_jobs, args)
        return None

    inferset = InferDataset(args.input_scp)
    res = len(inferset) % args.world_size

    if res == 0:
        mp.spawn(main_worker, nprocs=ngpus_per_node,
                 args=(ngpus_per_node, args, num_jobs))
        return None
    else:
        # Workaround for a dataset length that is not divisible by the number of GPUs
        utils.highlight_msg("Using a hack to handle a data length that is not divisible by the number of GPUs.")
        mp.spawn(main_worker, nprocs=ngpus_per_node,
                 args=(ngpus_per_node, args, num_jobs-1))
        single_worker("cuda:0", 1, args, len(inferset)-res)
Example #3
def build_model(args, configuration, train=True) -> nn.Module:

    netconfigs = configuration['net']
    net_kwargs = netconfigs['kwargs']
    net = getattr(model_zoo, netconfigs['type'])

    if not train:
        infer_model = net(**net_kwargs)
        return infer_model

    if 'lossfn' not in netconfigs:
        lossfn = 'crf'
        utils.highlight_msg(
            "Warning: 'lossfn' not specified in configuration.\nDefaulting to 'crf'.")
    else:
        lossfn = netconfigs['lossfn']

    if 'lamb' not in netconfigs:
        lamb = 0.01
        if lossfn == 'crf':
            utils.highlight_msg(
                "Warning: 'lamb' not specified in configuration.\nDefaulting to 0.01.")
    else:
        lamb = netconfigs['lamb']

    if 'specaug' not in netconfigs:
        specaug = None
        if args.rank == 0:
            utils.highlight_msg("Disable SpecAug.")
    else:
        specaug = SpecAug(**netconfigs['specaug'])

    setattr(args, 'iscrf', lossfn == 'crf')
    model = CAT_Model(net, lossfn, lamb, net_kwargs, specaug)

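    # Pin this process to its GPU and wrap the model for distributed
    # data-parallel training: one replica per process/GPU.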
    torch.cuda.set_device(args.gpu)
    model.cuda(args.gpu)
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[args.gpu])
    return model
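
# The configuration layout implied by build_model, as a hypothetical example;
# the model type and its kwargs depend on what model_zoo actually provides:
configuration = {
    'net': {
        'type': 'BLSTM',                       # resolved via getattr(model_zoo, ...)
        'kwargs': {'idim': 80, 'hdim': 512},   # hypothetical model arguments
        'lossfn': 'crf',                       # optional; defaults to 'crf'
        'lamb': 0.01,                          # optional; defaults to 0.01
        'specaug': {}                          # optional; omit to disable SpecAug
    }
}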
Example #4
    parser.add_argument("-f",
                        "--format",
                        type=str,
                        choices=["hdf5", "pickle"],
                        default="pickle")
    parser.add_argument("-W", "--warning", action="store_true", default=False)
    parser.add_argument("scp", type=str)
    parser.add_argument("label", type=str)
    parser.add_argument("weight", type=str)
    parser.add_argument("output_path", type=str)

    args = parser.parse_args()

    if args.warning:
        utils.highlight_msg(
            "Calculating the CTC loss requires the input sequence to be longer than ctc_len(labels).\nIf your model subsamples the sequence, check 'ctc-crf/convert_to.py':\nmodify the line 'if feature.shape[0] < ctc_len(label):' to filter out unqualified sequences.\nIf you have already done so, ignore this."
        )

    label_dict = {}
    with open(args.label, 'r') as fi:
        for line in fi:
            sp = line.split()
            label_dict[sp[0]] = np.asarray([int(x) for x in sp[1:]])

    weight_dict = {}
    with open(args.weight, 'r') as fi:
        for line in fi:
            sp = line.split()
            weight_dict[sp[0]] = np.asarray([float(sp[1])])
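
# The parsing above implies one utterance per line in both files
# (illustrative contents; real IDs and values will differ):
#   label file:   <uttid> <token-id> <token-id> ...   e.g. "utt001 23 7 41"
#   weight file:  <uttid> <weight>                    e.g. "utt001 -3.27"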
Example #5
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu

    args.rank = args.rank * ngpus_per_node + gpu
    print(f"Use GPU: local[{args.gpu}] | global[{args.rank}]")

    dist.init_process_group(
        backend=args.dist_backend, init_method=args.dist_url,
        world_size=args.world_size, rank=args.rank)

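    # Split the configured batch size evenly across this node's GPUs.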
    args.batch_size = args.batch_size // ngpus_per_node

    print("> Data prepare")
    if args.h5py:
        data_format = "hdf5"
        utils.highlight_msg("H5py reading might cause error with Multi-GPUs.")
        Dataset = DataSet.SpeechDataset
    else:
        data_format = "pickle"
        Dataset = DataSet.SpeechDatasetPickle

    tr_set = Dataset(
        f"{args.data}/{data_format}/tr.{data_format}")
    test_set = Dataset(
        f"{args.data}/{data_format}/cv.{data_format}")
    print("Data prepared.")

    train_sampler = DistributedSampler(tr_set)
    test_sampler = DistributedSampler(test_set)
    test_sampler.set_epoch(1)
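    # With a DistributedSampler supplied, shuffle=(sampler is None) below is
    # False, as DataLoader requires when a custom sampler is used;
    # set_epoch(1) above fixes the evaluation sampler's shuffling order.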

    trainloader = DataLoader(
        tr_set, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True,
        sampler=train_sampler, collate_fn=DataSet.sortedPadCollate())

    testloader = DataLoader(
        test_set, batch_size=args.batch_size, shuffle=(test_sampler is None),
        num_workers=args.workers, pin_memory=True,
        sampler=test_sampler, collate_fn=DataSet.sortedPadCollate())

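    # Initial list entries are CSV header rows; training presumably appends
    # one row per logged event.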
    logger = OrderedDict({
        'log_train': ['epoch,loss,loss_real,net_lr,time'],
        'log_eval': ['loss_real,time']
    })
    manager = utils.Manager(logger, build_model, args)

    # get GPU info
    gpu_info = utils.gather_all_gpu_info(args.gpu)

    if args.rank == 0:
        print("> Model built.")
        print("Model size:{:.2f}M".format(
            utils.count_parameters(manager.model)/1e6))

        utils.gen_readme(args.dir+'/readme.md',
                         model=manager.model, gpu_info=gpu_info)

    # initialize the CTC-CRF environment; args.iscrf is set in build_model
    if args.iscrf:
        gpus = torch.IntTensor([args.gpu])
        ctc_crf_base.init_env(f"{args.data}/den_meta/den_lm.fst", gpus)

    # training
    manager.run(train_sampler, trainloader, testloader, args)

    if args.iscrf:
        ctc_crf_base.release_env(gpus)
Example #6
                        help="Directory to save the log and model files.")

    parser.add_argument('-p', '--print-freq', default=10, type=int,
                        metavar='N', help='print frequency (default: 10)')

    parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                        help='number of data loading workers (default: 4)')
    parser.add_argument('--rank', default=-1, type=int,
                        help='node rank for distributed training')
    parser.add_argument('--dist-url', default='tcp://127.0.0.1:13943', type=str,
                        help='url used to set up distributed training')
    parser.add_argument('--dist-backend', default='nccl', type=str,
                        help='distributed backend')
    parser.add_argument('--world-size', default=-1, type=int,
                        help='number of nodes for distributed training')
    parser.add_argument('--gpu', default=None, type=int,
                        help='GPU id to use.')

    args = parser.parse_args()

    SEED = args.seed
    torch.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    np.random.seed(SEED)
    torch.backends.cudnn.deterministic = True

    if args.debug:
        utils.highlight_msg("Debugging.")

    main(args)
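
# A single-node launch consistent with these flags might look like this
# (the script name and any flags not shown above are hypothetical):
#   python train.py --world-size 1 --rank 0 \
#       --dist-url tcp://127.0.0.1:13943 --dist-backend nccl --workers 4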