Example #1
def multi_main(args):
    """ Spawns 1 process per GPU """
    init_logger()

    nb_gpu = args.world_size
    mp = torch.multiprocessing.get_context('spawn')

    # Create a thread to listen for errors in the child processes.
    error_queue = mp.SimpleQueue()
    error_handler = ErrorHandler(error_queue)

    # Train with multiprocessing.
    procs = []
    for i in range(nb_gpu):
        device_id = i
        procs.append(
            mp.Process(target=run,
                       args=(
                           args,
                           device_id,
                           error_queue,
                       ),
                       daemon=True))
        procs[i].start()
        logger.info(" Starting process pid: %d  " % procs[i].pid)
        error_handler.add_child(procs[i].pid)
    for p in procs:
        p.join()
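
The `run` target and `ErrorHandler` used above are defined elsewhere in the project; the handler typically watches `error_queue` and shuts the remaining workers down when one of them fails. Below is a minimal, self-contained sketch of the same spawn/error-queue pattern; the `worker` function is a hypothetical stand-in for `run`:

import traceback
import torch.multiprocessing

def worker(rank, error_queue):
    """Per-process entry point: forward any crash to the parent via the queue."""
    try:
        print("worker %d running" % rank)  # real per-GPU training would happen here
    except KeyboardInterrupt:
        pass  # parent decides how to shut down
    except Exception:
        error_queue.put((rank, traceback.format_exc()))

if __name__ == "__main__":
    mp = torch.multiprocessing.get_context('spawn')
    error_queue = mp.SimpleQueue()
    procs = [mp.Process(target=worker, args=(i, error_queue), daemon=True)
             for i in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()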
Example #2
def train(args, device_id):
    # Start logger.
    init_logger(args.log_file)

    # Configure training device.
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)

    # Configure manual seed.
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    # Set CUDA device.
    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    # Dataloader used for training.
    def train_iter_fct():
        return Dataloader(args,
                          load_dataset(args, 'train', shuffle=True),
                          args.batch_size,
                          device,
                          shuffle=True,
                          is_test=False)

    # Build the model.
    model = Summarizer(args, device, load_pretrained=True)

    # Configure the checkpoint.
    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)

        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if k in model_flags:
                setattr(args, k, opt[k])

        model.load_cp(checkpoint)
        optim = builder.build_optim(args, model, checkpoint)
    else:
        optim = builder.build_optim(args, model, None)
    logger.info(model)

    # Train the model
    trainer = build_trainer(args, device_id, model, optim)
    trainer.train(train_iter_fct, args.train_steps)
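
In the checkpoint branch, `map_location=lambda storage, loc: storage` keeps every tensor on the CPU regardless of the device it was saved from, and only the options listed in `model_flags` (the ones that fix the model architecture) are copied from the checkpoint back into `args`. A runnable sketch of that override pattern, with assumed contents for `model_flags` and an in-memory stand-in for the loaded checkpoint:

from argparse import Namespace

# Stand-in for a checkpoint dict; the real one comes from torch.load(...,
# map_location=lambda storage, loc: storage), which maps every tensor to CPU.
checkpoint = {'opt': Namespace(hidden_size=512, ff_size=2048, lr=0.1)}

model_flags = ['hidden_size', 'ff_size']   # assumed: architecture options only
args = Namespace(hidden_size=256, ff_size=1024, lr=2e-3)

for k, v in vars(checkpoint['opt']).items():
    if k in model_flags:
        setattr(args, k, v)   # sizes must match the checkpointed model
print(args)                   # sizes follow the checkpoint; lr keeps the new value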
Example #3
    # Batch configuration.
    parser.add_argument("-shard_size", default=2000, type=int)
    parser.add_argument('-min_nsents', default=3, type=int)
    parser.add_argument('-max_nsents', default=100, type=int)
    parser.add_argument('-min_src_ntokens', default=5, type=int)
    parser.add_argument('-max_src_ntokens', default=1500, type=int)

    parser.add_argument("-lower",
                        type=str2bool,
                        nargs='?',
                        const=False,
                        default=False)
    parser.add_argument("-log_file", default='../logs/preprocess.log')
    parser.add_argument(
        '-dataset',
        default='',
        help='train, valid or test, default will process all datasets')
    parser.add_argument('-n_cpus', default=2, type=int)

    # Stanford CoreNLP.
    parser.add_argument("-tokenizer_dir",
                        type=str,
                        default="../stanford-corenlp/")
    parser.add_argument("-tokenizer_date", type=str, default="2018-10-05")
    parser.add_argument("-tokenizer_ver", type=str, default="3.9.2")

    args = parser.parse_args()
    init_logger(args.log_file)

    process(args)
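
The `str2bool` used for the `-lower` flag is not part of argparse; a typical implementation looks like this (a sketch, the project's own helper may differ):

import argparse

def str2bool(v):
    """Parse common textual booleans so flags like -lower accept 'true'/'false'."""
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')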