示例#1
0
def main(args):
    """Train a seq2seq model (or evaluate a checkpoint when ``args.evaluate``).

    Handles distributed setup, logging, data loading, model construction,
    trainer creation, optional checkpoint resume, and the epoch loop.
    All configuration comes from the parsed command-line ``args`` namespace.
    """
    set_global_seeds(args.seed)
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    args.distributed = args.local_rank >= 0 or args.world_size > 1

    if args.distributed:
        args.device_ids = args.local_rank
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_init,
                                world_size=args.world_size,
                                rank=args.local_rank)
    else:
        args.device_ids = literal_eval(args.device_ids)
    # Only rank 0 (or the sole process when not distributed) creates
    # directories and emits real log output.
    main_node = not (args.distributed and torch.distributed.get_rank() > 0)

    if args.evaluate:
        args.results_dir = '/tmp'
    # BUG FIX: was `args.save is ''` -- identity comparison against a string
    # literal relies on interning (and is a SyntaxWarning on modern Python);
    # use equality instead.
    if args.save == '':
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)

    if main_node and not os.path.exists(save_path):
        os.makedirs(save_path)

    setup_logging(os.path.join(save_path, 'log.txt'), dummy=not main_node)

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)

    device = args.device
    if 'cuda' in args.device:
        # Pick the "main" GPU from whichever shape device_ids takes
        # (tuple of ids, single id, or a role->id dict keyed by 'input').
        main_gpu = 0
        if isinstance(args.device_ids, tuple):
            main_gpu = args.device_ids[0]
        elif isinstance(args.device_ids, int):
            main_gpu = args.device_ids
        elif isinstance(args.device_ids, dict):
            main_gpu = args.device_ids.get('input', 0)
        torch.cuda.set_device(main_gpu)
        cudnn.benchmark = True
        device = torch.device(device, main_gpu)

    dataset = getattr(datasets, args.dataset)
    args.data_config = literal_eval(args.data_config)
    args.grad_clip = literal_eval(args.grad_clip)
    train_data = dataset(args.dataset_dir, split='train', **args.data_config)
    val_data = dataset(args.dataset_dir, split='dev', **args.data_config)
    # Exactly two tokenizers are expected: source then target.
    src_tok, target_tok = train_data.tokenizers.values()

    regime = literal_eval(args.optimization_config)
    model_config = literal_eval(args.model_config)

    # Inject vocabulary sizes derived from the tokenizers into the config.
    model_config.setdefault('encoder', {})
    model_config.setdefault('decoder', {})
    if hasattr(src_tok, 'vocab_size'):
        model_config['encoder']['vocab_size'] = src_tok.vocab_size
    model_config['decoder']['vocab_size'] = target_tok.vocab_size
    model_config['vocab_size'] = model_config['decoder']['vocab_size']
    args.model_config = model_config

    model = getattr(models, args.model)(**model_config)

    model.to(device)
    batch_first = getattr(model, 'batch_first', False)

    logging.info(model)
    pack_encoder_inputs = getattr(model.encoder, 'pack_inputs', False)

    # define data loaders
    if args.distributed:
        train_sampler = DistributedSampler(train_data)
    else:
        train_sampler = None
    train_loader = train_data.get_loader(batch_size=args.batch_size,
                                         batch_first=batch_first,
                                         shuffle=train_sampler is None,
                                         sampler=train_sampler,
                                         pack=pack_encoder_inputs,
                                         max_length=args.max_length,
                                         fixed_length=args.fixed_length,
                                         num_workers=args.workers,
                                         drop_last=True)
    val_loader = val_data.get_loader(batch_size=args.eval_batch_size
                                     or args.batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     pack=pack_encoder_inputs,
                                     max_length=args.max_length,
                                     fixed_length=args.fixed_length,
                                     num_workers=args.workers)

    trainer_options = dict(grad_clip=args.grad_clip,
                           embedding_grad_clip=args.embedding_grad_clip,
                           label_smoothing=args.label_smoothing,
                           save_path=save_path,
                           save_info={
                               'tokenizers': train_data.tokenizers,
                               'config': args
                           },
                           regime=regime,
                           max_tokens=args.max_tokens,
                           chunk_batch=args.chunk_batch,
                           distributed=args.distributed,
                           local_rank=args.local_rank,
                           device_ids=args.device_ids,
                           device=device,
                           dtype=args.dtype,
                           print_freq=args.print_freq,
                           save_freq=args.save_freq,
                           eval_freq=args.eval_freq)

    trainer_options['model'] = model
    trainer = getattr(trainers, args.trainer)(**trainer_options)

    def num_parameters(model):
        # Total element count of a (possibly absent) sub-module.
        return 0 if model is None else sum(
            [l.nelement() for l in model.parameters()])

    logging.info("\nEncoder - number of parameters: %d",
                 num_parameters(getattr(model, 'encoder', None)))
    logging.info("Decoder - number of parameters: %d",
                 num_parameters(getattr(model, 'decoder', None)))
    logging.info("Total number of parameters: %d\n", num_parameters(model))

    if args.uniform_init is not None:
        # NOTE(review): uniform_(from_, to) is called as (init, -init);
        # for a positive uniform_init this is (hi, lo) -- confirm the
        # intended sign convention for args.uniform_init.
        for param in model.parameters():
            param.data.uniform_(args.uniform_init, -args.uniform_init)

    # optionally resume from a checkpoint
    if args.evaluate:
        trainer.load(args.evaluate)
        trainer.evaluate(val_loader)
        return
    elif args.resume:
        checkpoint_file = args.resume
        # A directory means "use the best model saved inside it".
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(checkpoint_file,
                                           'model_best.pth.tar')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    logging.info('training regime: %s\n', regime)
    trainer.epoch = args.start_epoch

    while trainer.epoch < args.epochs:
        # train for one epoch
        trainer.run(train_loader, val_loader)
示例#2
0
def main(args):
    """Load a saved checkpoint and evaluate it on the dev split.

    The checkpoint supplies the model class, its configuration, weights,
    and the tokenizers; ``args`` supplies dataset and loader settings.
    """
    stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    out_dir = os.path.join('/tmp', stamp)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    setup_logging(os.path.join(out_dir, 'log.txt'))

    logging.info("saving to %s", out_dir)
    logging.debug("run arguments: %s", args)

    args.devices = literal_eval(args.devices)
    if 'cuda' in args.type:
        # Resolve the primary GPU from tuple / int / dict device specs.
        devs = args.devices
        if isinstance(devs, tuple):
            gpu = devs[0]
        elif isinstance(devs, int):
            gpu = devs
        elif isinstance(devs, dict):
            gpu = devs.get('input', 0)
        else:
            gpu = 0
        torch.cuda.set_device(gpu)
        cudnn.benchmark = True

    # Load everything onto CPU first; device placement happens later.
    state = torch.load(args.checkpoint,
                       map_location=lambda storage, loc: storage)
    config = state['config']
    # exactly two tokenizers expected: source then target
    src_tok, target_tok = state['tokenizers'].values()

    args.data_config = literal_eval(args.data_config)
    args.data_config['tokenizers'] = state['tokenizers']
    dataset_cls = getattr(datasets, args.dataset)
    dev_data = dataset_cls(args.dataset_dir, split='dev', **args.data_config)

    model = getattr(models, config.model)(**config.model_config)
    model.load_state_dict(state['state_dict'])

    batch_first = getattr(model, 'batch_first', False)
    logging.info(model)

    # define data loaders
    dev_loader = dev_data.get_loader(batch_size=args.batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     augment=False,
                                     pack=args.pack_encoder_inputs,
                                     max_length=args.max_length,
                                     max_tokens=args.max_tokens,
                                     num_workers=args.workers)

    trainer = getattr(trainers, args.trainer)(model=model,
                                              save_path=out_dir,
                                              devices=args.devices,
                                              print_freq=args.print_freq)
    logging.info("number of parameters: %d",
                 sum(p.nelement() for p in model.parameters()))

    model.type(args.type)

    trainer.evaluate(dev_loader)
示例#3
0
def main(args):
    """Restore a checkpointed model and run a single dev-set evaluation."""
    run_dir = os.path.join(
        '/tmp', datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    if not os.path.exists(run_dir):
        os.makedirs(run_dir)
    setup_logging(os.path.join(run_dir, 'log.txt'))

    logging.info("saving to %s", run_dir)
    logging.debug("run arguments: %s", args)

    args.devices = literal_eval(args.devices)
    if 'cuda' in args.type:
        # devices may be a tuple of ids, a single id, or a role->id dict.
        ids = args.devices
        if isinstance(ids, tuple):
            primary = ids[0]
        elif isinstance(ids, int):
            primary = ids
        elif isinstance(ids, dict):
            primary = ids.get('input', 0)
        else:
            primary = 0
        torch.cuda.set_device(primary)
        cudnn.benchmark = True

    # map_location keeps all tensors on CPU during deserialization
    checkpoint = torch.load(args.checkpoint,
                            map_location=lambda storage, loc: storage)
    config = checkpoint['config']
    # unpack expects exactly two tokenizers: source, target
    src_tok, target_tok = checkpoint['tokenizers'].values()

    args.data_config = literal_eval(args.data_config)
    args.data_config['tokenizers'] = checkpoint['tokenizers']
    data_cls = getattr(datasets, args.dataset)
    dev_set = data_cls(args.dataset_dir, split='dev', **args.data_config)

    # rebuild the model exactly as it was trained, then load weights
    model = getattr(models, config.model)(**config.model_config)
    model.load_state_dict(checkpoint['state_dict'])

    logging.info(model)

    # define data loaders
    loader = dev_set.get_loader(
        batch_size=args.batch_size,
        batch_first=getattr(model, 'batch_first', False),
        shuffle=False,
        augment=False,
        pack=args.pack_encoder_inputs,
        max_length=args.max_length,
        max_tokens=args.max_tokens,
        num_workers=args.workers)

    trainer = getattr(trainers, args.trainer)(model=model,
                                              save_path=run_dir,
                                              devices=args.devices,
                                              print_freq=args.print_freq)
    logging.info("number of parameters: %d",
                 sum(p.nelement() for p in model.parameters()))

    model.type(args.type)

    trainer.evaluate(loader)
示例#4
0
def main(args):
    """Train a seq2seq model; optionally load a checkpoint for evaluation.

    Builds the save/log directories, data loaders, model and trainer from
    the parsed command-line ``args``, then loops over training epochs.
    """
    # set up save path
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    # BUG FIX: was `args.save is ''` -- identity comparison against a string
    # literal relies on interning (SyntaxWarning on modern Python); use
    # equality instead.
    if args.save == '':
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # set up logging
    setup_logging(os.path.join(save_path, 'log_%s.txt' % time_stamp))

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)

    # set up cuda
    args.devices = literal_eval(args.devices)
    if 'cuda' in args.type:
        # devices may be a tuple of ids, a single id, or a role->id dict
        main_gpu = 0
        if isinstance(args.devices, tuple):
            main_gpu = args.devices[0]
        elif isinstance(args.devices, int):
            main_gpu = args.devices
        elif isinstance(args.devices, dict):
            main_gpu = args.devices.get('input', 0)
        torch.cuda.set_device(main_gpu)
        cudnn.benchmark = True
    # set dataset
    dataset = getattr(datasets, args.dataset)
    args.data_config = literal_eval(args.data_config)
    train_data = dataset(args.dataset_dir, split='train', **args.data_config)
    val_data = dataset(args.dataset_dir, split='dev', **args.data_config)
    # exactly two tokenizers expected: source then target
    src_tok, target_tok = train_data.tokenizers.values()

    regime = literal_eval(args.optimization_config)
    model_config = literal_eval(args.model_config)

    # inject vocabulary sizes from the tokenizers into the model config
    model_config.setdefault('encoder', {})
    model_config.setdefault('decoder', {})
    if hasattr(src_tok, 'vocab_size'):
        model_config['encoder']['vocab_size'] = src_tok.vocab_size
    model_config['decoder']['vocab_size'] = target_tok.vocab_size
    model_config['vocab_size'] = model_config['decoder']['vocab_size']
    args.model_config = model_config

    model = getattr(models, args.model)(**model_config)
    batch_first = getattr(model, 'batch_first', False)

    logging.info(model)

    # define data loaders
    train_loader = train_data.get_loader(batch_size=args.batch_size,
                                         batch_first=batch_first,
                                         shuffle=True,
                                         pack=args.pack_encoder_inputs,
                                         max_length=args.max_length,
                                         num_workers=args.workers)
    val_loader = val_data.get_loader(batch_size=args.batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     pack=args.pack_encoder_inputs,
                                     max_length=args.max_length,
                                     num_workers=args.workers)

    trainer_options = dict(grad_clip=args.grad_clip,
                           embedding_grad_clip=args.embedding_grad_clip,
                           save_path=save_path,
                           save_info={
                               'tokenizers': train_data.tokenizers,
                               'config': args
                           },
                           regime=regime,
                           devices=args.devices,
                           print_freq=args.print_freq,
                           save_freq=args.save_freq,
                           eval_freq=args.eval_freq)

    trainer_options['model'] = model
    trainer = getattr(trainers, args.trainer)(**trainer_options)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info("number of parameters: %d", num_parameters)

    model.type(args.type)
    if args.uniform_init is not None:
        # NOTE(review): uniform_(from_, to) is called as (init, -init);
        # for a positive uniform_init this is (hi, lo) -- confirm the
        # intended sign convention for args.uniform_init.
        for param in model.parameters():
            param.data.uniform_(args.uniform_init, -args.uniform_init)

    # optionally resume from a checkpoint
    if args.evaluate:
        trainer.load(args.evaluate)
    elif args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            # NOTE(review): `results` is not defined in this function --
            # presumably a module-level results log; verify it is in scope
            # at call time.
            results.load(os.path.join(checkpoint_file, 'results.csv'))
            checkpoint_file = os.path.join(checkpoint_file,
                                           'model_best.pth.tar')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    logging.info('training regime: %s', regime)
    trainer.epoch = args.start_epoch

    while trainer.epoch < args.epochs:
        # train for one epoch
        trainer.run(train_loader, val_loader)
示例#5
0
def main(args):
    """Train a seq2seq model (or evaluate a checkpoint when ``args.evaluate``).

    Sets up logging, CUDA device selection, datasets, the model and its
    trainer from the parsed command-line ``args``, then runs the epoch loop.
    """
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    # BUG FIX: was `args.save is ''` -- identity comparison against a string
    # literal relies on interning (SyntaxWarning on modern Python); use
    # equality instead.
    if args.save == '':
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    setup_logging(os.path.join(save_path, 'log.txt'))

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)

    args.device_ids = literal_eval(args.device_ids)
    device = args.device
    if 'cuda' in args.device:
        # device_ids may be a tuple of ids, a single id, or a role->id dict
        main_gpu = 0
        if isinstance(args.device_ids, tuple):
            main_gpu = args.device_ids[0]
        elif isinstance(args.device_ids, int):
            main_gpu = args.device_ids
        elif isinstance(args.device_ids, dict):
            main_gpu = args.device_ids.get('input', 0)
        torch.cuda.set_device(main_gpu)
        cudnn.benchmark = True
        device = torch.device(device, main_gpu)

    dataset = getattr(datasets, args.dataset)
    args.data_config = literal_eval(args.data_config)
    args.grad_clip = literal_eval(args.grad_clip)
    train_data = dataset(args.dataset_dir, split='train', **args.data_config)
    val_data = dataset(args.dataset_dir, split='dev', **args.data_config)
    # exactly two tokenizers expected: source then target
    src_tok, target_tok = train_data.tokenizers.values()

    regime = literal_eval(args.optimization_config)
    model_config = literal_eval(args.model_config)

    # inject vocabulary sizes from the tokenizers into the model config
    model_config.setdefault('encoder', {})
    model_config.setdefault('decoder', {})
    if hasattr(src_tok, 'vocab_size'):
        model_config['encoder']['vocab_size'] = src_tok.vocab_size
    model_config['decoder']['vocab_size'] = target_tok.vocab_size
    model_config['vocab_size'] = model_config['decoder']['vocab_size']
    args.model_config = model_config

    model = getattr(models, args.model)(**model_config)
    model.to(device)
    batch_first = getattr(model, 'batch_first', False)

    logging.info(model)
    pack_encoder_inputs = getattr(model.encoder, 'pack_inputs', False)

    # define data loaders
    train_loader = train_data.get_loader(batch_size=args.batch_size,
                                         batch_first=batch_first,
                                         shuffle=True,
                                         augment=True,
                                         pack=pack_encoder_inputs,
                                         max_length=args.max_length,
                                         max_tokens=args.max_tokens,
                                         num_workers=args.workers)
    val_loader = val_data.get_loader(batch_size=args.batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     augment=False,
                                     pack=pack_encoder_inputs,
                                     max_length=args.max_length,
                                     max_tokens=args.max_tokens,
                                     num_workers=args.workers)

    trainer_options = dict(
        grad_clip=args.grad_clip,
        embedding_grad_clip=args.embedding_grad_clip,
        label_smoothing=args.label_smoothing,
        save_path=save_path,
        save_info={'tokenizers': train_data.tokenizers,
                   'config': args},
        regime=regime,
        limit_num_tokens=args.limit_num_tokens,
        device_ids=args.device_ids,
        device=device,
        dtype=args.dtype,
        print_freq=args.print_freq,
        save_freq=args.save_freq,
        eval_freq=args.eval_freq)

    trainer_options['model'] = model
    trainer = getattr(trainers, args.trainer)(**trainer_options)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info("number of parameters: %d", num_parameters)

    if args.uniform_init is not None:
        # NOTE(review): uniform_(from_, to) is called as (init, -init);
        # for a positive uniform_init this is (hi, lo) -- confirm the
        # intended sign convention for args.uniform_init.
        for param in model.parameters():
            param.data.uniform_(args.uniform_init, -args.uniform_init)

    # optionally resume from a checkpoint
    if args.evaluate:
        trainer.load(args.evaluate)
        trainer.evaluate(val_loader)
        return
    elif args.resume:
        checkpoint_file = args.resume
        # a directory means "use the best model saved inside it"
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(
                checkpoint_file, 'model_best.pth.tar')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    logging.info('training regime: %s', regime)
    trainer.epoch = args.start_epoch

    while trainer.epoch < args.epochs:
        # train for one epoch
        trainer.run(train_loader, val_loader)