Example #1
def main(args):
    print(args)

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = ['train', 'valid']
    if data.has_binary_files(args.data, splits):
        dataset = data.load_dataset(
            args.data, splits, args.source_lang, args.target_lang)
    else:
        dataset = data.load_raw_text_dataset(
            args.data, splits, args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst
    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    for split in splits:
        print('| {} {} {} examples'.format(args.data, split, len(dataset.splits[split])))

    # Build model and criterion
    model = models.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = criterions.build_criterion(args, dataset.src_dict, dataset.dst_dict)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {}'.format(sum(p.data.numel() for p in model.parameters())))

    # Build trainer
    trainer = Trainer(args, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Load the latest checkpoint if one is available
    os.makedirs(args.save_dir, exist_ok=True)
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    extra_state = trainer.load_checkpoint(checkpoint_path)
    if extra_state is not None:
        epoch = extra_state['epoch']
        batch_offset = extra_state['batch_offset']
        print('| loaded checkpoint {} (epoch {})'.format(checkpoint_path, epoch))
        if batch_offset == 0:
            trainer.lr_step(epoch)
            epoch += 1
    else:
        epoch, batch_offset = 1, 0

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:
        # train for one epoch
        train(args, trainer, dataset, epoch, batch_offset)

        # evaluate on validation set
        for k, subset in enumerate(args.valid_subset.split(',')):
            val_loss = validate(args, trainer, dataset, subset, epoch)
            if k == 0:
                # only use first validation loss to update the learning schedule
                lr = trainer.lr_step(epoch, val_loss)

                # save checkpoint
                if not args.no_save:
                    save_checkpoint(trainer, args, epoch, 0, val_loss)

        epoch += 1
        batch_offset = 0
    train_meter.stop()

    print('| done training in {:.1f} seconds'.format(train_meter.sum))
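
Example #1 receives an already parsed args namespace, so a separate driver has to build it. The sketch below is an assumption modeled on the options/utils helpers that appear in Examples #3 and #4 further down; the exact helper names, and the extra device/distributed flags this variant reads (device_id, distributed_world_size), depend on the fairseq revision in use.

if __name__ == '__main__':
    # Hypothetical driver reusing the option helpers from Examples #3/#4.
    parser = options.get_parser('Trainer')
    options.add_dataset_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)
    args = utils.parse_args_and_arch(parser)
    main(args)
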
Example #2
def main(args):
    print(args)

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = ['train', 'valid']
    if data.has_binary_files(args.data, splits):
        dataset = data.load_dataset(args.data, splits, args.source_lang,
                                    args.target_lang)
    else:
        dataset = data.load_raw_text_dataset(args.data, splits,
                                             args.source_lang,
                                             args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst
    print('| [{}] dictionary: {} types'.format(dataset.src,
                                               len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst,
                                               len(dataset.dst_dict)))
    for split in splits:
        print('| {} {} {} examples'.format(args.data, split,
                                           len(dataset.splits[split])))

    # Build model and criterion
    model = models.build_model(args, dataset.src_dict, dataset.dst_dict)
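    # Assumption: when 0 < rank_scale < 1, patch_transformer swaps the
    # transformer's dense layers for factored U/VT modules (the ones whose
    # norms are logged to TensorBoard below), and spectral_init gives the
    # model a spectral-decomposition-based initialization.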
    if 0.0 < args.rank_scale < 1.0:
        patch_transformer(args, model)
        if args.wd2fd:
            no_decay, skiplist = [
                'fc1', 'fc2', 'embed_tokens', 'embed_positions', 'out_embed'
            ], []
        else:
            no_decay, skiplist = [], [
                'fc1', 'fc2', 'embed_tokens', 'embed_positions', 'out_embed'
            ]
    else:
        no_decay, skiplist = [], []
    spectral_init(args, model)

    criterion = criterions.build_criterion(args, dataset.src_dict,
                                           dataset.dst_dict)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {}'.format(
        sum(p.data.numel() for p in model.parameters())))

    # Build trainer
    no_decay, skiplist = [], []
    if args.wd2fd_quekey:
        no_decay.extend(['_query.weight', '_key.weight'])
    else:
        skiplist.append('quekey')
    if args.wd2fd_outval:
        no_decay.extend(['_value.weight', 'output_perform.weight'])
    else:
        skiplist.append('outval')

    trainer = Trainer(args,
                      model,
                      criterion,
                      skiplist=skiplist,
                      no_decay=no_decay)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Load the latest checkpoint if one is available
    os.makedirs(args.save_dir, exist_ok=True)
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    extra_state = trainer.load_checkpoint(checkpoint_path)
    if extra_state is not None:
        epoch = extra_state['epoch']
        batch_offset = extra_state['batch_offset']
        print('| loaded checkpoint {} (epoch {})'.format(
            checkpoint_path, epoch))
        if batch_offset == 0:
            trainer.lr_step(epoch)
            epoch += 1
    else:
        epoch, batch_offset = 1, 0

    if args.distributed_rank <= 0:
        writer = SummaryWriter(args.save_dir)
        with open(os.path.join(args.save_dir, 'args.json'), 'w') as f:
            json.dump(vars(args), f, indent=4)
    else:
        writer = SummaryWriter(
            os.path.join(args.save_dir, str(args.distributed_rank)))

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:

        if args.distributed_rank <= 0:
            writer.add_scalar('hyper/lr', lr, epoch)
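            # For each factored attention projection (QueKey / OutVal), log the
            # Frobenius norm of the U·VT product, its nuclear norm, the
            # (||U||_F^2 + ||VT||_F^2) / 2 upper bound on that nuclear norm,
            # and a non-orthogonality measure, averaged over the matched modules.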
            for form in ['QueKey', 'OutVal']:
                frobnorm, nucnorm, bound, nonorth = [], [], [], []
                for module in model.modules():
                    if hasattr(module, form.lower()):
                        U, VT = getattr(module, form.lower()).get_UVT()
                        for u, vt in zip(U, VT):
                            frobnorm.append(frobenius_norm(u, vt))
                            nucnorm.append(
                                torch.norm(torch.matmul(u, vt), 'nuc'))
                            bound.append(
                                (u.pow(2).sum() + vt.pow(2).sum()) / 2.)
                            nonorth.append(sum(non_orthogonality(u, vt)) / 2.)
                writer.add_scalar('FrobNorm/' + form,
                                  sum(frobnorm) / len(frobnorm), epoch)
                writer.add_scalar('NucNorm/' + form,
                                  sum(nucnorm) / len(nucnorm), epoch)
                writer.add_scalar('NucNorm/' + form + '-Bound',
                                  sum(bound) / len(bound), epoch)
                writer.add_scalar('NonOrth/' + form,
                                  sum(nonorth) / len(nonorth), epoch)
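            # Same statistics for the remaining factored layers (modules that
            # expose frobgrad together with explicit U/VT factors) and, for
            # ordinary weight matrices, just the Frobenius and nuclear norms;
            # embedding and attention-projection modules are filtered out by name.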
            frobnorm, nucnorm, bound, nonorth = [], [], [], []
            for name, module in model.named_modules():
                if not any(
                        block in name for block in
                    ['embed', '_query', '_key', '_value', 'output_perform']):
                    if hasattr(module,
                               'frobgrad') and not hasattr(module, 'get_UVT'):
                        U, VT = module.U.data, module.VT.data
                        frobnorm.append(frobenius_norm(U, VT))
                        nucnorm.append(torch.norm(torch.matmul(U, VT), 'nuc'))
                        nonorth.append(sum(non_orthogonality(U, VT)) / 2.)
                        bound.append((U.pow(2).sum() + VT.pow(2).sum()) / 2.)
                    elif hasattr(module, 'weight'):
                        frobnorm.append(torch.norm(module.weight.data))
                        nucnorm.append(torch.norm(module.weight.data, 'nuc'))
            writer.add_scalar('FrobNorm/Linear',
                              sum(frobnorm) / len(frobnorm), epoch)
            writer.add_scalar('NucNorm/Linear',
                              sum(nucnorm) / len(nucnorm), epoch)
            if nonorth:
                writer.add_scalar('NucNorm/Linear-Bound',
                                  sum(bound) / len(bound), epoch)
                writer.add_scalar('NonOrth/Linear',
                                  sum(nonorth) / len(nonorth), epoch)

        # train for one epoch
        train(args, trainer, dataset, epoch, batch_offset)

        # evaluate on validation set
        if epoch % args.validate_interval == 0:
            for k, subset in enumerate(args.valid_subset.split(',')):
                val_loss = validate(args, trainer, dataset, subset, epoch)
                if k == 0:
                    # only use first validation loss to update the learning schedule
                    lr = trainer.lr_step(epoch, val_loss)

                    # save checkpoint
                    if not args.no_save:
                        save_checkpoint(trainer, args, epoch, 0, val_loss)
            for k in ['loss', 'nll_loss']:
                writer.add_scalar('valid/' + k,
                                  trainer.meters['valid_' + k].avg, epoch)
                writer.add_scalar('train/' + k,
                                  trainer.meters['train_' + k].avg, epoch)
        else:
            lr = trainer.lr_step(epoch)

        epoch += 1
        batch_offset = 0

        if trainer.get_num_updates() >= max_update:
            break
    train_meter.stop()

    print('| done training in {:.1f} seconds'.format(train_meter.sum))
    writer.flush()
    newpar = sum(p.numel() for p in model.parameters())
    if 0.0 < args.rank_scale < 1.0:
        args.rank_scale = 1.0
        origpar = sum(p.numel() for p in models.build_model(
            args, dataset.src_dict, dataset.dst_dict).parameters())
    else:
        origpar = newpar
    if args.distributed_rank <= 0:
        with open(os.path.join(args.save_dir, 'results.json'), 'w') as f:
            json.dump(
                {
                    'final validation loss':
                    trainer.meters['valid_nll_loss'].avg,
                    'original parameter count': origpar,
                    'compressed parameter count': newpar,
                    'compression ratio': newpar / origpar
                },
                f,
                indent=4)
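
For reference, the results.json written at the end records the factored parameter count, the parameter count of a full-rank rebuild of the same architecture (rank_scale reset to 1.0), and their ratio; for example, a 45M-parameter factored model against a 60M-parameter rebuild would be logged with a compression ratio of 0.75 (numbers purely illustrative).
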
Example #3
def main():
    parser = options.get_parser('Trainer')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--max-tokens',
                              default=6000,
                              type=int,
                              metavar='N',
                              help='maximum number of tokens in a batch')
    dataset_args.add_argument('--max-sentences',
                              type=int,
                              metavar='N',
                              help='maximum number of sentences in a batch')
    dataset_args.add_argument(
        '--train-subset',
        default='train',
        metavar='SPLIT',
        choices=['train', 'valid', 'test'],
        help='data subset to use for training (train, valid, test)')
    dataset_args.add_argument(
        '--valid-subset',
        default='valid',
        metavar='SPLIT',
        help='comma separated list of data subsets '
        'to use for validation (train, valid, valid1, test, test1)')
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)

    args = utils.parse_args_and_arch(parser)

    if args.no_progress_bar and args.log_format is None:
        args.log_format = 'simple'

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = ['train', 'valid']
    if data.has_binary_files(args.data, splits):
        dataset = data.load_dataset(args.data, splits, args.source_lang,
                                    args.target_lang)
    else:
        dataset = data.load_raw_text_dataset(args.data, splits,
                                             args.source_lang,
                                             args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(args)
    print('| [{}] dictionary: {} types'.format(dataset.src,
                                               len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst,
                                               len(dataset.dst_dict)))
    for split in splits:
        print('| {} {} {} examples'.format(args.data, split,
                                           len(dataset.splits[split])))

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    num_gpus = torch.cuda.device_count()

    print(
        '| using {} GPUs (with max tokens per GPU = {} and max sentences per GPU = {})'
        .format(num_gpus, args.max_tokens, args.max_sentences))

    # Build model and criterion
    model = utils.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = utils.build_criterion(args, dataset.src_dict, dataset.dst_dict)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))

    # The max number of positions can be different for train and valid
    # e.g., RNNs may support more positions at test time than seen in training
    max_positions_train = (args.max_source_positions,
                           args.max_target_positions)
    max_positions_valid = (min(args.max_source_positions,
                               model.max_encoder_positions()),
                           min(args.max_target_positions,
                               model.max_decoder_positions()))

    # Start multiprocessing
    trainer = MultiprocessingTrainer(args, model, criterion)

    # Load the latest checkpoint if one is available
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    extra_state = trainer.load_checkpoint(checkpoint_path)
    if extra_state is not None:
        epoch = extra_state['epoch']
        batch_offset = extra_state['batch_offset']
        print('| loaded checkpoint {} (epoch {})'.format(
            checkpoint_path, epoch))
        if batch_offset == 0:
            epoch += 1
    else:
        epoch, batch_offset = 1, 0

    # Train until the learning rate gets too small
    val_loss = None
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:
        # train for one epoch
        train(args, epoch, batch_offset, trainer, dataset, max_positions_train,
              num_gpus)

        # evaluate on validation set
        for k, subset in enumerate(args.valid_subset.split(',')):
            val_loss = validate(args, epoch, trainer, dataset,
                                max_positions_valid, subset, num_gpus)
            if k == 0:
                if not args.no_save:
                    # save checkpoint
                    save_checkpoint(trainer, args, epoch, 0, val_loss)
                # only use first validation loss to update the learning schedule
                lr = trainer.lr_step(val_loss, epoch)

        epoch += 1
        batch_offset = 0
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))

    # Stop multiprocessing
    trainer.stop()
Example #4
def main():
    parser = options.get_parser('Trainer')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--max-tokens', default=6000, type=int, metavar='N',
                              help='maximum number of tokens in a batch')
    dataset_args.add_argument('--max-sentences', type=int, metavar='N',
                              help='maximum number of sentences in a batch')
    dataset_args.add_argument('--train-subset', default='train', metavar='SPLIT',
                              choices=['train', 'valid', 'test'],
                              help='data subset to use for training (train, valid, test)')
    dataset_args.add_argument('--valid-subset', default='valid', metavar='SPLIT',
                              help='comma separated list of data subsets '
                                   'to use for validation (train, valid, valid1, test, test1)')
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)

    args = utils.parse_args_and_arch(parser)

    if args.no_progress_bar and args.log_format is None:
        args.log_format = 'simple'

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = ['train', 'valid']
    if data.has_binary_files(args.data, splits):
        dataset = data.load_dataset(args.data, splits, args.source_lang, args.target_lang)
    else:
        dataset = data.load_raw_text_dataset(args.data, splits, args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(args)
    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    for split in splits:
        print('| {} {} {} examples'.format(args.data, split, len(dataset.splits[split])))

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    num_gpus = torch.cuda.device_count()

    print('| using {} GPUs (with max tokens per GPU = {} and max sentences per GPU = {})'.format(
        num_gpus, args.max_tokens, args.max_sentences))

    # Build model and criterion
    model = utils.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = utils.build_criterion(args, dataset.src_dict, dataset.dst_dict)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))

    # The max number of positions can be different for train and valid
    # e.g., RNNs may support more positions at test time than seen in training
    max_positions_train = (args.max_source_positions, args.max_target_positions)
    max_positions_valid = (
        min(args.max_source_positions, model.max_encoder_positions()),
        min(args.max_target_positions, model.max_decoder_positions())
    )

    # Start multiprocessing
    trainer = MultiprocessingTrainer(args, model, criterion)

    # Load the latest checkpoint if one is available
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    extra_state = trainer.load_checkpoint(checkpoint_path)
    if extra_state is not None:
        epoch = extra_state['epoch']
        batch_offset = extra_state['batch_offset']
        print('| loaded checkpoint {} (epoch {})'.format(checkpoint_path, epoch))
        if batch_offset == 0:
            epoch += 1
    else:
        epoch, batch_offset = 1, 0

    # Train until the learning rate gets too small
    val_loss = None
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:
        # train for one epoch
        train(args, epoch, batch_offset, trainer, dataset, max_positions_train, num_gpus)

        # evaluate on validation set
        for k, subset in enumerate(args.valid_subset.split(',')):
            val_loss = validate(args, epoch, trainer, dataset, max_positions_valid, subset, num_gpus)
            if k == 0:
                if not args.no_save:
                    # save checkpoint
                    save_checkpoint(trainer, args, epoch, 0, val_loss)
                # only use first validation loss to update the learning schedule
                lr = trainer.lr_step(val_loss, epoch)

        epoch += 1
        batch_offset = 0
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))

    # Stop multiprocessing
    trainer.stop()
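
A hedged sketch of driving Example #4's main() programmatically. Only --max-tokens, --max-sentences, --train-subset and --valid-subset are registered in the snippet itself; the positional data directory, --arch and --save-dir below are assumptions about what options.add_dataset_args, options.add_model_args and options.add_checkpoint_args contribute, so adjust them to the fairseq revision actually in use.

import sys

if __name__ == '__main__':
    # Hypothetical command line; every path and value is a placeholder.
    sys.argv = [
        'train.py',
        'data-bin/iwslt14.tokenized.de-en',  # consumed as args.data (assumed positional)
        '--arch', 'fconv_iwslt_de_en',       # assumed to come from add_model_args
        '--save-dir', 'checkpoints',         # assumed to come from add_checkpoint_args
        '--max-tokens', '6000',
        '--train-subset', 'train',
        '--valid-subset', 'valid',
    ]
    main()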