Example #1
    def __init__(self,
                 actor_critic,
                 clip_param,
                 ppo_epoch,
                 num_mini_batch,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 eps=None,
                 max_grad_norm=None,
                 use_clipped_value_loss=True,
                 optimizer='adam',
                 beta1=0.0,
                 beta2=0.999):
        self.actor_critic = actor_critic

        self.clip_param = clip_param
        self.ppo_epoch = ppo_epoch
        self.num_mini_batch = num_mini_batch

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef

        self.max_grad_norm = max_grad_norm
        self.use_clipped_value_loss = use_clipped_value_loss
        if optimizer == 'adam':
            print("using adam optimizer!")
            self.optimizer = optim.Adam(actor_critic.parameters(),
                                        lr=lr,
                                        eps=eps,
                                        betas=(beta1, beta2))

        elif optimizer == 'lamb':
            print("using lamb optimizer!")
            self.optimizer = Lamb(actor_critic.parameters(),
                                  lr=lr,
                                  eps=eps,
                                  betas=(beta1, beta2))

        elif optimizer == 'sgd':
            print("using SGD optimizer!")
            self.optimizer = optim.SGD(actor_critic.parameters(),
                                       lr=lr,
                                       momentum=0.0)

        elif optimizer == 'nero':
            print("using nero optimizer!")
            self.optimizer = Nero(actor_critic.parameters(), lr=lr)
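For reference, a minimal self-contained sketch of the same optimizer-selection pattern; the two-layer network, the hyperparameter values, and the build_optimizer helper below are placeholders, and Lamb is assumed to come from the third-party lamb module these examples import.

import torch.nn as nn
import torch.optim as optim
# from lamb import Lamb  # assumed third-party LAMB implementation, as in the examples

actor_critic = nn.Sequential(nn.Linear(8, 64), nn.Tanh(), nn.Linear(64, 2))

def build_optimizer(name, params, lr=1e-3, eps=1e-8, beta1=0.9, beta2=0.999):
    # Mirrors the branch structure of the constructor above.
    if name == 'adam':
        return optim.Adam(params, lr=lr, eps=eps, betas=(beta1, beta2))
    # elif name == 'lamb':
    #     return Lamb(params, lr=lr, eps=eps, betas=(beta1, beta2))
    elif name == 'sgd':
        return optim.SGD(params, lr=lr, momentum=0.0)
    raise ValueError("unknown optimizer: %s" % name)

optimizer = build_optimizer('adam', actor_critic.parameters())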
Example #2
    def create_optimizer_and_scheduler(self, num_training_steps: int):
        """
        Setup the optimizer and the learning rate scheduler.

        We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
        Trainer's init through :obj:`optimizers`, or subclass and override this method in a subclass.
        """
        if self.optimizer is None:
            no_decay = ["bias", "LayerNorm.weight"]
            optimizer_grouped_parameters = [
                {
                    "params": [
                        p for n, p in self.model.named_parameters()
                        if not any(nd in n for nd in no_decay)
                    ],
                    "weight_decay":
                    self.args.weight_decay,
                },
                {
                    "params": [
                        p for n, p in self.model.named_parameters()
                        if any(nd in n for nd in no_decay)
                    ],
                    "weight_decay":
                    0.0,
                },
            ]
            if self.args.optimizer_str == "adamw":
                self.optimizer = AdamW(
                    optimizer_grouped_parameters,
                    lr=self.args.learning_rate,
                    betas=(self.args.adam_beta1, self.args.adam_beta2),
                    eps=self.args.adam_epsilon,
                )
            elif self.args.optimizer_str == "lamb":
                self.optimizer = Lamb(optimizer_grouped_parameters,
                                      lr=self.args.learning_rate,
                                      eps=self.args.adam_epsilon)
            else:
                raise NotImplementedError("Optimizer must be adamw or lamb")
        if self.lr_scheduler is None:
            self.lr_scheduler = get_linear_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=self.args.warmup_steps,
                num_training_steps=num_training_steps)
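For reference, a small self-contained sketch of the same weight-decay grouping and linear-warmup pattern outside the Trainer plumbing; the toy model, learning rate, and step counts are placeholders, and torch.optim.AdamW stands in for the AdamW/Lamb used above.

import torch.nn as nn
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(16, 16)
        self.LayerNorm = nn.LayerNorm(16)  # name chosen so the "LayerNorm.weight" filter applies
        self.out = nn.Linear(16, 2)

model = Toy()
no_decay = ["bias", "LayerNorm.weight"]
grouped_parameters = [
    {"params": [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     "weight_decay": 0.01},
    {"params": [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     "weight_decay": 0.0},
]
optimizer = AdamW(grouped_parameters, lr=5e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=100,
                                            num_training_steps=1000)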
Example #3
def model_fn(config):
    max_seq_length = config['max_seq_length']
    input_ids = keras.Input(shape=(max_seq_length, ), name='input_ids')
    attention_mask = keras.Input(shape=(max_seq_length, ),
                                 name='attention_mask')
    segment_ids = keras.Input(shape=(max_seq_length, ), name='segment_ids')

    albert = Bert(config)
    first_tokens, output = albert([input_ids, attention_mask, segment_ids])

    # albert_model = keras.Model(inputs=[input_ids, attention_mask, segment_ids], outputs=output)
    albert_model = keras.Model(inputs={
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'segment_ids': segment_ids
    },
                               outputs=[first_tokens, output])

    lm_predictor = PreTrainLMPredictor(
        config['input_hidden'],
        config['vocab_size'],
        max_seq_length,
    )
    lm_predict_outputs = lm_predictor(
        [output, albert.embedding.embeddings, albert.projection])
    next_sentence_predictor = PreTrainNextSentencePredictor(2)
    next_sentence_predict_outputs = next_sentence_predictor(first_tokens)

    classifier_model = keras.Model(
        inputs={
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'segment_ids': segment_ids
        },
        outputs=[lm_predict_outputs, next_sentence_predict_outputs])

    # Optimizer
    lamb_optimizer = Lamb()
    return classifier_model, albert_model, lamb_optimizer
Example #4
def main(lr=0.1):
    global best_acc
    args.lr = lr
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    best_acc = 0  # best test accuracy
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Data
    print('==> Preparing data..')
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    trainset = torchvision.datasets.CIFAR10(root='/tmp/cifar10',
                                            train=True,
                                            download=True,
                                            transform=transform_train)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=2)

    testset = torchvision.datasets.CIFAR10(root='/tmp/cifar10',
                                           train=False,
                                           download=True,
                                           transform=transform_test)
    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=100,
                                             shuffle=False,
                                             num_workers=2)

    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
               'ship', 'truck')

    # Model
    print('==> Building model..')
    # net = VGG('VGG19')
    # net = ResNet18()
    # net = PreActResNet18()
    # net = GoogLeNet()
    # net = DenseNet121()
    # net = ResNeXt29_2x64d()
    # net = MobileNet()
    # net = MobileNetV2()
    # net = DPN92()
    # net = ShuffleNetG2()
    # net = SENet18()
    # net = ShuffleNetV2(1)
    # net = EfficientNetB0()
    # net = RegNetX_200MF()
    net = ResNet50()
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    ckpt = './checkpoint/' + args.optimizer + str(lr) + '_ckpt.pth'

    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isdir(
            'checkpoint'), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(ckpt)
        net.load_state_dict(checkpoint['net'])
        best_acc = checkpoint['acc']
        start_epoch = checkpoint['epoch']

    criterion = nn.CrossEntropyLoss()
    if args.optimizer.lower() == 'sgd':
        optimizer = optim.SGD(net.parameters(),
                              lr=args.lr,
                              weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'sgdwm':
        optimizer = optim.SGD(net.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'adam':
        optimizer = torch.optim.Adam(net.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'rmsprop':
        optimizer = optim.RMSprop(net.parameters(),
                                  lr=args.lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'adagrad':
        optimizer = optim.Adagrad(net.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'radam':
        from radam import RAdam
        optimizer = RAdam(net.parameters(),
                          lr=args.lr,
                          weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'lars':  #no tensorboardX
        from lars import LARS
        optimizer = LARS(net.parameters(),
                         lr=args.lr,
                         momentum=args.momentum,
                         weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'lamb':
        from lamb import Lamb
        optimizer = Lamb(net.parameters(),
                         lr=args.lr,
                         weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'novograd':
        from novograd import NovoGrad
        optimizer = NovoGrad(net.parameters(),
                             lr=args.lr,
                             weight_decay=args.weight_decay)
    else:
        optimizer = optim.SGD(net.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
    # lrs = create_lr_scheduler(args.warmup_epochs, args.lr_decay)
    # lr_scheduler = LambdaLR(optimizer,lrs)
    # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_decay, gamma=0.1)
    train_acc = []
    valid_acc = []

    # Training
    def train(epoch):
        print('\nEpoch: %d' % epoch)
        net.train()
        train_loss = 0
        correct = 0
        total = 0
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            print(batch_idx)
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            # lr_scheduler.step()
            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
        print(100. * correct / total)
        train_acc.append(correct / total)

    def test(epoch):
        global best_acc
        net.eval()
        test_loss = 0
        correct = 0
        total = 0
        print('test')
        with torch.no_grad():
            for batch_idx, (inputs, targets) in enumerate(testloader):
                print(batch_idx)
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = net(inputs)
                loss = criterion(outputs, targets)

                test_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

        # Save checkpoint.
        acc = 100. * correct / total
        print(acc)
        valid_acc.append(correct / total)

        if acc > best_acc:
            print('Saving..')
            state = {
                'net': net.state_dict(),
                'acc': acc,
                'epoch': epoch,
            }
            if not os.path.isdir('checkpoint'):
                os.mkdir('checkpoint')
            torch.save(state, ckpt)
            best_acc = acc

    for epoch in range(200):
        if epoch in args.lr_decay:
            checkpoint = torch.load(ckpt)
            net.load_state_dict(checkpoint['net'])
            best_acc = checkpoint['acc']
            args.lr *= 0.1
            if args.optimizer.lower() == 'sgd':
                optimizer = optim.SGD(net.parameters(),
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'sgdwm':
                optimizer = optim.SGD(net.parameters(),
                                      lr=args.lr,
                                      momentum=args.momentum,
                                      weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'adam':
                optimizer = optim.Adam(net.parameters(),
                                       lr=args.lr,
                                       weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'rmsprop':
                optimizer = optim.RMSprop(net.parameters(),
                                          lr=args.lr,
                                          momentum=args.momentum,
                                          weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'adagrad':
                optimizer = optim.Adagrad(net.parameters(),
                                          lr=args.lr,
                                          weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'radam':
                from radam import RAdam

                optimizer = RAdam(net.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'lars':  # no tensorboardX
                optimizer = LARS(net.parameters(),
                                 lr=args.lr,
                                 momentum=args.momentum,
                                 weight_decay=args.weight_decay,
                                 dampening=args.damping)
            elif args.optimizer.lower() == 'lamb':
                optimizer = Lamb(net.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'novograd':
                optimizer = NovoGrad(net.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.weight_decay)
            else:
                optimizer = optim.SGD(net.parameters(),
                                      lr=args.lr,
                                      momentum=args.momentum,
                                      weight_decay=args.weight_decay)
        train(epoch)
        test(epoch)
    with open(args.optimizer + str(lr) + 'log.json', 'w+') as log_file:
        json.dump([train_acc, valid_acc], log_file)
    return best_acc
Example #5
    # NUM_LAYERS = 3
    NUM_DIRECTIONS = 2
    # DROPOUT = 0.3
    device = torch.device('cuda')
    model = DocumentReader(args.hidden_dim,
                           args.emb_dim,
                           args.layers,
                           NUM_DIRECTIONS,
                           args.dropout,
                           device).to(device)
    model = torch.nn.DataParallel(model)

    writer = SummaryWriter(comment="_nlp_%s_%s_%s" % (args.optimizer, args.batch_size, args.learning_rate))
    weight_decay = args.learning_rate / args.epochs
    if args.optimizer == 'lamb':
        optimizer = Lamb(model.parameters(), lr=args.learning_rate, weight_decay=weight_decay,
                         betas=(.9, .999), adam=False, writer=writer)
    elif args.optimizer == 'lars':
        base_optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=0.9, weight_decay=weight_decay)
        optimizer = LARS(optimizer=base_optimizer, eps=1e-8, trust_coef=0.001, writer=writer)
    elif args.optimizer == 'sgd':
        optimizer = SGD(model.parameters(), lr=args.learning_rate, momentum=0.9,
                        weight_decay=weight_decay, writer=writer)
    else:
        # use adam optimizer
        optimizer = Lamb(model.parameters(), lr=args.learning_rate, weight_decay=weight_decay,
                         betas=(.9, .999), adam=True, writer=writer)
    print(f'The model has {count_parameters(model):,} trainable parameters')
    ckpt_dir_name = "%s_%s_%s" % (args.working_dir, args.optimizer, args.batch_size)
    model, optimizer = load_pretrained_model(model, optimizer,
                                             "%s/ckpt/%s" % (ckpt_dir_name, "best_weights.pt"))
Example #6
if __name__ == "__main__":

    # tokenizer = RobertaTokenizerFast.from_pretrained(
    #    "roberta-large" if not bool(int(os.environ.get("ROBERTA"))) else "xlm-roberta-base")
    # model = DualEncoderPerformer(num_tokens=tokenizer.vocab_size, max_seq_len=512, dim=512, depth=6, heads=8)
    # distilbert-base-multilingual-cased
    # distilroberta-base
    # DeepPavlov/bert-base-multilingual-cased-sentence
    # bert-base-multilingual-cased

    tokenizer = AutoTokenizer.from_pretrained(
        "distilbert-base-multilingual-cased")
    model = DualEncoder()
    model.save_pretrained("./results/test.bin")
    model = DualEncoder.from_pretrained("./results/test.bin")
    optimizer = Lamb(model.parameters(), lr=0.001)  # Lamb
    sentence1_tensor = tokenizer(
        ["Ich bin Andre", "Ich brauche hilfe", "Du magst tanzen?"],
        add_special_tokens=True,
        return_tensors="pt",
        padding=True)
    sentence2_tensor = tokenizer(
        ["I am Andre", "I need support", "do you like dancing?"],
        add_special_tokens=True,
        return_tensors="pt",
        padding=True)

    sentence1_test = tokenizer(["Ich bin Andre", "Ich bin Andre"],
                               add_special_tokens=True,
                               return_tensors="pt",
                               padding=True)
Example #7
File: train.py Project: jxbz/nero
        optimizer = optim.SGD(net.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.wd)

    elif args.optimizer == 'adam':
        print("using adam!")
        optimizer = optim.Adam(net.parameters(),
                               lr=args.lr,
                               betas=(args.momentum, args.beta),
                               weight_decay=args.wd)

    elif args.optimizer == 'lamb':
        print("using lamb!")
        optimizer = Lamb(net.parameters(),
                         lr=args.lr,
                         betas=(args.momentum, args.beta),
                         weight_decay=args.wd)
    elif args.optimizer == 'lambcs':
        print("using lambcs!")
        optimizer = LambCS(net.parameters(),
                           lr=args.lr,
                           betas=(args.momentum, args.beta),
                           weight_decay=args.wd,
                           constraints=True)

    elif args.optimizer == 'madam':
        print("using madam!")
        optimizer = Madam(net.parameters(), lr=args.lr)

    elif args.optimizer == 'madamcs':
        print("using madamcs!")
Example #8
if args.optimizer.lower() == 'adam':
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'sgdwm':
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'rmsprop':
    optimizer = optim.RMSprop(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'radam':
    optimizer = RAdam(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'lars':  #no tensorboardX
    optimizer = LARS(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'lamb':
    optimizer = Lamb(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'novograd':
    optimizer = NovoGrad(model.parameters(), lr=args.lr, weight_decay=0.0001)
else:
    optimizer = optim.SGD(model.parameters(), lr=0.01)

optname = args.optimizer if len(sys.argv) >= 2 else 'sgd'

# log = open(optname+'log.txt','w+')

log = None

criterion = nn.CrossEntropyLoss()

model, optimizer, _ = training_loop(model, criterion, optimizer, train_loader,
                                    valid_loader, N_EPOCHS, DEVICE, log)
Example #9
def train(args, model, tokenizer, query_cache, passage_cache):
    """ Train the model """
    #if args.local_rank in [-1, 0]:
    tb_writer = None
    if is_first_worker():
        tb_writer = SummaryWriter(log_dir=args.log_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    real_batch_size = args.train_batch_size * args.gradient_accumulation_steps * (
        torch.distributed.get_world_size() if args.local_rank != -1 else 1)

    # layerwise optimization for lamb
    optimizer_grouped_parameters = []
    for layer_name in [
            "roberta.embeddings", "score_out", "downsample1", "downsample2",
            "downsample3"
    ]:
        layer = getattr_recursive(model, layer_name)
        if layer is not None:
            optimizer_grouped_parameters.append({"params": layer.parameters()})
    if getattr_recursive(model, "roberta.encoder.layer") is not None:
        for layer in model.roberta.encoder.layer:
            optimizer_grouped_parameters.append({"params": layer.parameters()})

    if len(optimizer_grouped_parameters) == 0:
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                args.weight_decay,
            },
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0
            },
        ]

    if args.optimizer.lower() == "lamb":
        optimizer = Lamb(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         eps=args.adam_epsilon)
    elif args.optimizer.lower() == "adamw":
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
    else:
        raise Exception(
            "optimizer {0} not recognized! Can only be lamb or adamW".format(
                args.optimizer))

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(
            os.path.join(args.model_name_or_path,
                         "optimizer.pt")) and args.load_optimizer_scheduler:
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    #logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Max steps = %d", args.max_steps)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)

    global_step = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to global_step of last saved checkpoint from model path
        if "-" in args.model_name_or_path:
            global_step = int(
                args.model_name_or_path.split("-")[-1].split("/")[0])
        else:
            global_step = 0
        logger.info(
            "  Continuing training from checkpoint, will skip to saved global_step"
        )
        logger.info("  Continuing training from global step %d", global_step)

    tr_loss = 0.0
    model.zero_grad()
    model.train()
    set_seed(args)  # Added here for reproducibility

    last_ann_no = -1
    train_dataloader = None
    train_dataloader_iter = None
    dev_ndcg = 0
    step = 0

    if args.single_warmup:
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_steps,
            num_training_steps=args.max_steps)

    while global_step < args.max_steps:

        if step % args.gradient_accumulation_steps == 0 and global_step % args.logging_steps == 0:
            # check if new ann training data is available
            ann_no, ann_path, ndcg_json = get_latest_ann_data(args.ann_dir)
            if ann_path is not None and ann_no != last_ann_no:
                logger.info("Training on new add data at %s", ann_path)
                with open(ann_path, 'r') as f:
                    ann_training_data = f.readlines()
                dev_ndcg = ndcg_json['ndcg']
                ann_checkpoint_path = ndcg_json['checkpoint']
                ann_checkpoint_no = get_checkpoint_no(ann_checkpoint_path)

                aligned_size = (len(ann_training_data) //
                                args.world_size) * args.world_size
                ann_training_data = ann_training_data[:aligned_size]

                logger.info("Total ann queries: %d", len(ann_training_data))
                if args.triplet:
                    train_dataset = StreamingDataset(
                        ann_training_data,
                        GetTripletTrainingDataProcessingFn(
                            args, query_cache, passage_cache))
                else:
                    train_dataset = StreamingDataset(
                        ann_training_data,
                        GetTrainingDataProcessingFn(args, query_cache,
                                                    passage_cache))
                train_dataloader = DataLoader(train_dataset,
                                              batch_size=args.train_batch_size)
                train_dataloader_iter = iter(train_dataloader)

                # re-warmup
                if not args.single_warmup:
                    scheduler = get_linear_schedule_with_warmup(
                        optimizer,
                        num_warmup_steps=args.warmup_steps,
                        num_training_steps=len(ann_training_data))

                if args.local_rank != -1:
                    dist.barrier()

                if is_first_worker():
                    # add ndcg at checkpoint step used instead of current step
                    tb_writer.add_scalar("dev_ndcg", dev_ndcg,
                                         ann_checkpoint_no)
                    if last_ann_no != -1:
                        tb_writer.add_scalar("epoch", last_ann_no,
                                             global_step - 1)
                    tb_writer.add_scalar("epoch", ann_no, global_step)
                last_ann_no = ann_no

        try:
            batch = next(train_dataloader_iter)
        except StopIteration:
            logger.info("Finished iterating current dataset, begin reiterate")
            train_dataloader_iter = iter(train_dataloader)
            batch = next(train_dataloader_iter)

        batch = tuple(t.to(args.device) for t in batch)
        step += 1

        if args.triplet:
            inputs = {
                "query_ids": batch[0].long(),
                "attention_mask_q": batch[1].long(),
                "input_ids_a": batch[3].long(),
                "attention_mask_a": batch[4].long(),
                "input_ids_b": batch[6].long(),
                "attention_mask_b": batch[7].long()
            }
        else:
            inputs = {
                "input_ids_a": batch[0].long(),
                "attention_mask_a": batch[1].long(),
                "input_ids_b": batch[3].long(),
                "attention_mask_b": batch[4].long(),
                "labels": batch[6]
            }

        # sync gradients only at gradient accumulation step
        if step % args.gradient_accumulation_steps == 0:
            outputs = model(**inputs)
        else:
            with model.no_sync():
                outputs = model(**inputs)
        # model outputs are always a tuple in transformers (see docs)
        loss = outputs[0]

        if args.n_gpu > 1:
            loss = loss.mean()  # average loss on multi-GPU parallel training
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps

        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            if step % args.gradient_accumulation_steps == 0:
                loss.backward()
            else:
                with model.no_sync():
                    loss.backward()

        tr_loss += loss.item()
        if step % args.gradient_accumulation_steps == 0:
            if args.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1

            if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                logs = {}
                loss_scalar = tr_loss / args.logging_steps
                learning_rate_scalar = scheduler.get_lr()[0]
                logs["learning_rate"] = learning_rate_scalar
                logs["loss"] = loss_scalar
                tr_loss = 0

                if is_first_worker():
                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    logger.info(json.dumps({**logs, **{"step": global_step}}))

            if (is_first_worker() and args.save_steps > 0
                    and global_step % args.save_steps == 0):
                # Save model checkpoint
                output_dir = os.path.join(args.output_dir,
                                          "checkpoint-{}".format(global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = (
                    model.module if hasattr(model, "module") else model
                )  # Take care of distributed/parallel training
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)

                torch.save(args, os.path.join(output_dir, "training_args.bin"))
                logger.info("Saving model checkpoint to %s", output_dir)

                torch.save(optimizer.state_dict(),
                           os.path.join(output_dir, "optimizer.pt"))
                torch.save(scheduler.state_dict(),
                           os.path.join(output_dir, "scheduler.pt"))
                logger.info("Saving optimizer and scheduler states to %s",
                            output_dir)

    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        tb_writer.close()

    return global_step
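getattr_recursive is used above to fetch nested sub-modules by dotted name but is not shown in the snippet; a minimal sketch (the source project's version may differ):

def getattr_recursive(obj, name):
    # Walk a dotted attribute path such as "roberta.encoder.layer";
    # return None if any component is missing.
    for attr in name.split("."):
        if not hasattr(obj, attr):
            return None
        obj = getattr(obj, attr)
    return obj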
Example #10
  def __init__(self, G_ch=64, dim_z=128, bottom_width=4, resolution=128,
               G_kernel_size=3, G_attn='64', n_classes=1000,
               num_G_SVs=1, num_G_SV_itrs=1,
               G_shared=True, shared_dim=0, hier=False,
               cross_replica=False, mybn=False,
               G_activation=nn.ReLU(inplace=False),
               G_lr=5e-5, G_B1=0.0, G_B2=0.999, adam_eps=1e-8,
               BN_eps=1e-5, SN_eps=1e-12, G_mixed_precision=False, G_fp16=False,
               G_init='ortho', skip_init=False, no_optim=False,
               G_param='SN', norm_style='bn', lamb=False,
               **kwargs):
    super(Generator, self).__init__()
    # Channel width multiplier
    self.ch = G_ch
    # Dimensionality of the latent space
    self.dim_z = dim_z
    # The initial spatial dimensions
    self.bottom_width = bottom_width
    # Resolution of the output
    self.resolution = resolution
    # Kernel size?
    self.kernel_size = G_kernel_size
    # Attention?
    self.attention = G_attn
    # number of classes, for use in categorical conditional generation
    self.n_classes = n_classes
    # Use shared embeddings?
    self.G_shared = G_shared
    # Dimensionality of the shared embedding? Unused if not using G_shared
    self.shared_dim = shared_dim if shared_dim > 0 else dim_z
    # Hierarchical latent space?
    self.hier = hier
    # Cross replica batchnorm?
    self.cross_replica = cross_replica
    # Use my batchnorm?
    self.mybn = mybn
    # nonlinearity for residual blocks
    self.activation = G_activation
    # Initialization style
    self.init = G_init
    # Parameterization style
    self.G_param = G_param
    # Normalization style
    self.norm_style = norm_style
    # Epsilon for BatchNorm?
    self.BN_eps = BN_eps
    # Epsilon for Spectral Norm?
    self.SN_eps = SN_eps
    # fp16?
    self.fp16 = G_fp16
    # Architecture dict
    self.arch = G_arch(self.ch, self.attention)[resolution]

    # If using hierarchical latents, adjust z
    if self.hier:
      # Number of places z slots into
      self.num_slots = len(self.arch['in_channels']) + 1
      self.z_chunk_size = (self.dim_z // self.num_slots)
      # Recalculate latent dimensionality for even splitting into chunks
      self.dim_z = self.z_chunk_size *  self.num_slots
    else:
      self.num_slots = 1
      self.z_chunk_size = 0

    #print("G PARAM: {}".format(self.G_param))
    # Which convs, batchnorms, and linear layers to use
    if self.G_param == 'SN':
      self.which_conv = functools.partial(layers.SNConv2d,
                          kernel_size=3, padding=1,
                          num_svs=num_G_SVs, num_itrs=num_G_SV_itrs,
                          eps=self.SN_eps)
      self.which_linear = functools.partial(layers.SNLinear,
                          num_svs=num_G_SVs, num_itrs=num_G_SV_itrs,
                          eps=self.SN_eps)
    else:
      self.which_conv = functools.partial(nn.Conv2d, kernel_size=3, padding=1)
      self.which_linear = nn.Linear
      
    # We use a non-spectral-normed embedding here regardless;
    # For some reason applying SN to G's embedding seems to randomly cripple G
    self.which_embedding = nn.Embedding
    bn_linear = (functools.partial(self.which_linear, bias=False) if self.G_shared
                 else self.which_embedding)
    self.which_bn = functools.partial(layers.ccbn,
                          which_linear=bn_linear,
                          cross_replica=self.cross_replica,
                          mybn=self.mybn,
                          input_size=(self.shared_dim + self.z_chunk_size if self.G_shared
                                      else self.n_classes),
                          norm_style=self.norm_style,
                          eps=self.BN_eps)


    # Prepare model
    # If not using shared embeddings, self.shared is just a passthrough
    self.shared = (self.which_embedding(n_classes, self.shared_dim) if G_shared 
                    else layers.identity())
    # First linear layer
    self.linear = self.which_linear(self.dim_z // self.num_slots,
                                    self.arch['in_channels'][0] * (self.bottom_width **2))

    # self.blocks is a doubly-nested list of modules, the outer loop intended
    # to be over blocks at a given resolution (resblocks and/or self-attention)
    # while the inner loop is over a given block
    self.blocks = []
    for index in range(len(self.arch['out_channels'])):
      self.blocks += [[layers.GBlock(in_channels=self.arch['in_channels'][index],
                             out_channels=self.arch['out_channels'][index],
                             which_conv=self.which_conv,
                             which_bn=self.which_bn,
                             activation=self.activation,
                             upsample=(functools.partial(F.interpolate, scale_factor=2)
                                       if self.arch['upsample'][index] else None))]]

      # If attention on this block, attach it to the end
      if self.arch['attention'][self.arch['resolution'][index]]:
        print('Adding attention layer in G at resolution %d' % self.arch['resolution'][index])
        self.blocks[-1] += [layers.Attention(self.arch['out_channels'][index], self.which_conv)]

    # Turn self.blocks into a ModuleList so that it's all properly registered.
    self.blocks = nn.ModuleList([nn.ModuleList(block) for block in self.blocks])

    # output layer: batchnorm-relu-conv.
    # Consider using a non-spectral conv here
    self.output_layer = nn.Sequential(layers.bn(self.arch['out_channels'][-1],
                                                cross_replica=self.cross_replica,
                                                mybn=self.mybn),
                                    self.activation,
                                    self.which_conv(self.arch['out_channels'][-1], 3))

    # Initialize weights. Optionally skip init for testing.
    if not skip_init:
      self.init_weights()

    # DEBUG / TODO: remove
    print("LAMB: {}".format(lamb))

    # Set up optimizer
    # If this is an EMA copy, no need for an optim, so just return now
    if no_optim:
      return
    self.lr, self.B1, self.B2, self.adam_eps = G_lr, G_B1, G_B2, adam_eps
    if G_mixed_precision:
      print('Using fp16 adam in G...')
      import utils
      self.optim = utils.Adam16(params=self.parameters(), lr=self.lr,
                           betas=(self.B1, self.B2), weight_decay=0,
                           eps=self.adam_eps)
    elif lamb:
      self.optim = Lamb(self.parameters(), lr=self.lr, weight_decay=0,
                        betas=(self.B1, self.B2), eps=self.adam_eps, adam=False)
    else:
      self.optim = optim.Adam(params=self.parameters(), lr=self.lr,
                           betas=(self.B1, self.B2), weight_decay=0,
                           eps=self.adam_eps)
Example #11
n_data_test = len(dataset_test)
batch_size = 2**10
dataloader_train = torch.utils.data.DataLoader(dataset_train,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=4)
dataloader_test = torch.utils.data.DataLoader(dataset_test,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              num_workers=4)
print(n_data_train)
print(n_data_test)

lr = 3e-4
betas = (0.9, 0.999)
optimizer = Lamb(model.parameters(), lr=lr, betas=betas, weight_decay=0.1)
#optimizer = torch.optim.AdamW(model.parameters(), lr=lr, betas=betas)
#optimizer = torch.optim.SGD(model.parameters(), lr=lr)#, betas=betas)

n_epochs = 1000
lossliste = torch.zeros(n_epochs).to(device)
accliste_train = torch.zeros(n_epochs).to(device)
accliste_test = torch.zeros(n_epochs).to(device)

param_idx = 0
print("gogogo")
for epoch in range(start_epoch, n_epochs + start_epoch):
    epochstart = dt.now().timestamp()
    total_loss = 0
    acc_train = 0
    acc_test = 0
Example #13
def train(args, train_dataset, model_d, model_g, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_d_grouped_parameters = [
        {
            "params": [
                p for n, p in model_d.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model_d.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    # optimizer_d = AdamW(optimizer_d_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    optimizer_d = Lamb(optimizer_d_grouped_parameters,
                       lr=args.learning_rate,
                       betas=(0.9, 0.999),
                       eps=1e-6)
    scheduler_d = get_linear_schedule_with_warmup(
        optimizer_d,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    optimizer_g_grouped_parameters = [
        {
            "params": [
                p for n, p in model_g.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model_g.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    # optimizer_g = AdamW(optimizer_g_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    optimizer_g = Lamb(optimizer_g_grouped_parameters,
                       lr=args.learning_rate,
                       betas=(0.9, 0.999),
                       eps=1e-6)
    scheduler_g = get_linear_schedule_with_warmup(
        optimizer_g,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer_d.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler_d.pt")):
        # Load in optimizer and scheduler states
        optimizer_d.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path,
                                    "optimizer_d.pt")))
        scheduler_d.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path,
                                    "scheduler_d.pt")))
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer_g.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler_g.pt")):
        # Load in optimizer and scheduler states
        optimizer_g.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path,
                                    "optimizer_g.pt")))
        scheduler_g.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path,
                                    "scheduler_g.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model_d, optimizer_d = amp.initialize(model_d,
                                              optimizer_d,
                                              opt_level=args.fp16_opt_level)
        model_g, optimizer_g = amp.initialize(model_g,
                                              optimizer_g,
                                              opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model_d = torch.nn.DataParallel(model_d)
        model_g = torch.nn.DataParallel(model_g)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model_d = torch.nn.parallel.DistributedDataParallel(
            model_d,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )
        model_g = torch.nn.parallel.DistributedDataParallel(
            model_g,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to global_step of last saved checkpoint from model path
        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) //
                                         args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (
            len(train_dataloader) // args.gradient_accumulation_steps)

        logger.info(
            "  Continuing training from checkpoint, will skip to saved global_step"
        )
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch",
                    steps_trained_in_current_epoch)

    model_to_resize_d = model_d.module if hasattr(
        model_d,
        "module") else model_d  # Take care of distributed/parallel training
    # model_to_resize_d.resize_token_embeddings(len(tokenizer))
    model_to_resize_g = model_g.module if hasattr(
        model_g,
        "module") else model_g  # Take care of distributed/parallel training
    # model_to_resize_g.resize_token_embeddings(len(tokenizer))

    # model_to_resize_d.bert.embeddings = model_to_resize_g.bert.embeddings

    tr_loss, logging_loss = 0.0, 0.0
    tr_loss_d, logging_loss_d = 0.0, 0.0
    tr_loss_g, logging_loss_g = 0.0, 0.0
    model_d.zero_grad()
    model_g.zero_grad()
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs),
        desc="Epoch",
        disable=args.local_rank not in [-1, 0],
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model_d.train()
            model_g.train()
            # batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3]
            }
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2]
                    if args.model_type in ["bert", "xlnet", "albert"] else None
                )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
            # outputs = model(**inputs)
            # loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            masked_input_ids, mask_labels = mask_tokens(
                inputs['input_ids'], tokenizer, args)
            outputs_g = model_g(
                input_ids=masked_input_ids.to(args.device),
                masked_lm_labels=mask_labels.to(args.device),
                attention_mask=inputs['attention_mask'].to(args.device),
                token_type_ids=inputs['token_type_ids'].to(args.device))
            masked_lm_loss, prediction_scores_g = outputs_g[0], outputs_g[1]

            prediction_g = prediction_scores_g.max(dim=-1)[1].cpu()
            acc_g = (prediction_g[mask_labels >= 0] == mask_labels[
                mask_labels >= 0]).float().mean().item()

            prediction_probs_g = F.softmax(prediction_scores_g, dim=-1).cpu()
            bsz, seq_len, vocab_size = prediction_probs_g.size()
            prediction_samples_g = torch.multinomial(prediction_probs_g.view(
                -1, vocab_size),
                                                     num_samples=1)
            prediction_samples_g = prediction_samples_g.view(bsz, seq_len)
            input_ids_replace = inputs['input_ids'].clone()
            input_ids_replace[mask_labels >= 0] = prediction_samples_g[
                mask_labels >= 0]
            labels_d = input_ids_replace.eq(inputs['input_ids']).long()

            special_tokens_mask = [
                tokenizer.get_special_tokens_mask(
                    val, already_has_special_tokens=True)
                for val in inputs['input_ids'].tolist()
            ]
            labels_d.masked_fill_(torch.tensor(special_tokens_mask,
                                               dtype=torch.bool),
                                  value=-100)
            padding_mask = inputs['input_ids'].eq(tokenizer.pad_token_id)
            labels_d.masked_fill_(padding_mask, value=-100)

            labels_d_ones = labels_d[labels_d >= 0].float().mean().item()
            acc_replace = 1 - ((labels_d == 0).sum().float() /
                               (mask_labels >= 0).sum().float()).item()

            outputs_d = model_d(
                input_ids=input_ids_replace.to(args.device),
                attention_mask=inputs['attention_mask'].to(args.device),
                token_type_ids=inputs['token_type_ids'].to(args.device),
                labels=labels_d.to(args.device))
            loss_d, prediction_scores_d = outputs_d[0], outputs_d[1]
            prediction_d = prediction_scores_d.max(dim=-1)[1].cpu()
            acc_d = (prediction_d[labels_d >= 0] == labels_d[labels_d >= 0]
                     ).float().mean().item()
            acc_d_0 = (prediction_d[labels_d == 0] == labels_d[labels_d == 0]
                       ).float().mean().item()
            acc_d_1 = (prediction_d[labels_d == 1] == labels_d[labels_d == 1]
                       ).float().mean().item()

            if args.n_gpu > 1:
                loss_d = loss_d.mean()  # average loss on multi-GPU parallel training
                masked_lm_loss = masked_lm_loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss_d = loss_d / args.gradient_accumulation_steps
                masked_lm_loss = masked_lm_loss / args.gradient_accumulation_steps

            lambd = 50
            loss = loss_d * lambd + masked_lm_loss
            if args.fp16:
                loss_d = loss_d * lambd
                with amp.scale_loss(loss_d, optimizer_d) as scaled_loss_d:
                    scaled_loss_d.backward()
                with amp.scale_loss(masked_lm_loss,
                                    optimizer_g) as scaled_loss_g:
                    scaled_loss_g.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            tr_loss_d += loss_d.item()
            tr_loss_g += masked_lm_loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer_d), args.max_grad_norm)
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer_g), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model_d.parameters(),
                                                   args.max_grad_norm)
                    torch.nn.utils.clip_grad_norm_(model_g.parameters(),
                                                   args.max_grad_norm)

                optimizer_d.step()
                scheduler_d.step()  # Update learning rate schedule
                model_d.zero_grad()
                optimizer_g.step()
                scheduler_g.step()  # Update learning rate schedule
                model_g.zero_grad()

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    # if (
                    #     args.local_rank == -1 and args.evaluate_during_training
                    # ):  # Only evaluate when single GPU otherwise metrics may not average well
                    #     results = evaluate(args, model, tokenizer)
                    #     for key, value in results.items():
                    #         eval_key = "eval_{}".format(key)
                    #         logs[eval_key] = value

                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    loss_scalar_d = (tr_loss_d -
                                     logging_loss_d) / args.logging_steps
                    loss_scalar_g = (tr_loss_g -
                                     logging_loss_g) / args.logging_steps
                    learning_rate_scalar_d = scheduler_d.get_lr()[0]
                    learning_rate_scalar_g = scheduler_g.get_lr()[0]
                    logs["learning_rate_d"] = learning_rate_scalar_d
                    logs["learning_rate_g"] = learning_rate_scalar_g
                    logs["loss"] = loss_scalar
                    logs["loss_d"] = loss_scalar_d
                    logs["loss_g"] = loss_scalar_g
                    logs["acc_repalce"] = acc_replace
                    logs["acc_d"] = acc_d
                    logs["acc_d_0"] = acc_d_0
                    logs["acc_d_1"] = acc_d_1
                    logs["acc_g"] = acc_g
                    logs["labels_d_ones"] = labels_d_ones
                    logs["masked_ratio"] = (mask_labels >= 0).float().sum(
                    ).item() / (labels_d >= 0).sum().float().item()
                    logging_loss = tr_loss
                    logging_loss_d = tr_loss_d
                    logging_loss_g = tr_loss_g

                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    print(json.dumps({**logs, **{"step": global_step}}))

                # print(args.save_steps)
                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    output_dir_d = os.path.join(
                        output_dir, "checkpoint-d-{}".format(global_step))
                    output_dir_g = os.path.join(
                        output_dir, "checkpoint-g-{}".format(global_step))
                    if not os.path.exists(output_dir_d):
                        os.makedirs(output_dir_d)
                    if not os.path.exists(output_dir_g):
                        os.makedirs(output_dir_g)
                    model_to_save_d = (
                        model_d.module if hasattr(model_d, "module") else
                        model_d)  # Take care of distributed/parallel training
                    model_to_save_g = (
                        model_g.module if hasattr(model_g, "module") else
                        model_g)  # Take care of distributed/parallel training
                    model_to_save_d.save_pretrained(output_dir_d)
                    model_to_save_g.save_pretrained(output_dir_g)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer_d.state_dict(),
                               os.path.join(output_dir_d, "optimizer_d.pt"))
                    torch.save(scheduler_d.state_dict(),
                               os.path.join(output_dir_d, "scheduler_d.pt"))
                    torch.save(optimizer_g.state_dict(),
                               os.path.join(output_dir_g, "optimizer_g.pt"))
                    torch.save(scheduler_g.state_dict(),
                               os.path.join(output_dir_g, "scheduler_g.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

                global_step += 1

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Exemplo n.º 14
0
        mlp_dim=param,  # 64, # dimension of the MLP in the transformer
        transformer_dropout=0.,  #1, # dropout of the MLP in the transformer
        num_classes=num_classes,  # number of classes
        use_mlp=True  # use an MLP in the encoder
    ).to(device)
    #model = VisualTransformer(inner_dim=p, transformer_depth=1, dim_head=49, attn_heads=3, mlp_dim=49, num_classes=num_classes).to(device)
    #model = torch.load("models/"+modelnames[param_idx]+".pt")
    print(sum([params.numel() for params in model.parameters()]))
    print("mlp_dim", param)

    with open("where.txt", "a+") as file:
        file.write("--- mlp_dim " + str(param) + ", " + str(round(starttime)) +
                   70 * "-" + "\n")
    for epoch in range(start_epoch, n_epochs + start_epoch):
        if epoch == 0:  # warmup
            optimizer = Lamb(model.parameters(), lr=1e-5, betas=betas)
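        # After the single low-LR warmup epoch, raise the learning rate to its target value.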
        if epoch == 1:
            for g in optimizer.param_groups:
                g["lr"] = 1e-3
        epochstart = dt.now().timestamp()
        total_loss = 0
        acc_train = 0
        acc_test = 0

        # Training
        model.train()
        for img, labels in dataloader_train:
            #img, labels = batch
            img, labels = img.to(device), labels.to(device)
            #print(labels[0])
            #labelsmat = F.one_hot(labels, num_classes=10).to(device)
Exemplo n.º 15
0
        "BATCH_SIZE_PER_GPU", 5)),  # batch size for evaluation
    warmup_steps=int(os.environ.get("WARMUP_NUM_STEPS", 5)),
    save_steps=int(os.environ.get("STEPS_PER_SAVE", 1000000)),
    logging_steps=int(os.environ.get(
        "STEPS_PER_PRINT",
        1)),  # number of steps between log outputs
    weight_decay=0.01,  # strength of weight decay
    logging_dir='./tensorboard',  # directory for storing logs
    evaluation_strategy=EvaluationStrategy.STEPS,
    eval_steps=int(os.environ.get("STEPS_PER_SAVE", int(50 / 5))),
    save_total_limit=5,
    prediction_loss_only=True,
    gradient_accumulation_steps=int(
        os.environ.get("GRADIENT_ACCUMULATION_STEPS", 1)),
    max_grad_norm=0.5)
optimizer = Lamb(auto_encoder.parameters(),
                 float(os.environ.get("LEARNING_RATE", 5e-3)))
trainer = CustomTrainer(
    model=auto_encoder,  # the instantiated 🤗 Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset,  # evaluation dataset
    data_collator=data_collector_huggingface,
    optimizers=(optimizer, None))
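# Note: the second element of the optimizers tuple is an LR scheduler; passing
# None presumably lets the (Custom)Trainer fall back to its default schedule.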

start_time = time.time()
print(f"Starttime {datetime.now()}")
output = trainer.train()
print("Running final evaluation")
trainer.evaluate(eval_dataset)
print(f"Endtime {datetime.now()}")
end_time = time.time()
Exemplo n.º 16
0
print("\nDiscriminator:")
print(f"{sum(p.numel() for p in netD.parameters())} parameters")
print(f"{len(list(netD.parameters()))} tensors")

print(args.optim)

if args.optim == 'adam':
    optG = optim.Adam(netG.parameters(),
                      lr=args.initial_lr,
                      betas=(0.0, 0.999))
    optD = optim.Adam(netD.parameters(),
                      lr=args.initial_lr,
                      betas=(0.9, 0.999))
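    # beta1 = 0.0 disables first-moment momentum for the generator update,
    # while the discriminator keeps the Adam default of 0.9.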

elif args.optim == 'lamb':
    optG = Lamb(netG.parameters(), lr=args.initial_lr, betas=(0.0, 0.999))
    optD = Lamb(netD.parameters(), lr=args.initial_lr, betas=(0.0, 0.999))

elif args.optim == 'sgd':
    optG = optim.SGD(netG.parameters(), lr=args.initial_lr, momentum=0.0)
    optD = optim.SGD(netD.parameters(), lr=args.initial_lr, momentum=0.0)

elif args.optim == 'nero':
    optG = Nero(netG.parameters(), lr=args.initial_lr)
    optD = Nero(netD.parameters(), lr=args.initial_lr)

else:
    raise Exception("Unsupported optim")

#########################################
#### Train ##############################
Exemplo n.º 17
0
def train(args, model, tokenizer, shuffled_fh, train_fn, configObj, logger):
    """ Train the model """
    #if args.local_rank in [-1, 0]:
    tb_writer = None
    if is_first_worker():
        tb_writer = SummaryWriter(log_dir=args.log_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    real_batch_size = args.train_batch_size * args.gradient_accumulation_steps * (
        torch.distributed.get_world_size() if args.local_rank != -1 else 1)

    total_train_steps = len(
        shuffled_fh) * args.num_train_epochs // real_batch_size
    if args.warmup_steps <= 0:
        args.warmup_steps = int(total_train_steps * args.warmup_proportion)

    if args.max_steps > 0:
        t_total = args.max_steps
        #args.num_train_epochs = args.max_steps // (args.expected_train_size // args.gradient_accumulation_steps) + 1
    else:
        # t_total = args.expected_train_size // real_batch_size * args.num_train_epochs
        t_total = total_train_steps
        args.max_steps = total_train_steps

    # layerwise optimization for lamb
    optimizer_grouped_parameters = []
    no_decay = ["bias", "LayerNorm.weight", "layer_norm", "LayerNorm"]
    layer_optim_params = set()
    for layer_name in [
            "bert.embeddings", "score_out", "downsample1", "downsample2",
            "downsample3", "embeddingHead"
    ]:
        layer = getattr_recursive(model, layer_name)
        if layer is not None:
            optimizer_grouped_parameters.append({"params": layer.parameters()})
            for p in layer.parameters():
                layer_optim_params.add(p)

    if getattr_recursive(model, "bert.encoder.layer") is not None:
        for layer in model.bert.encoder.layer:
            optimizer_grouped_parameters.append({"params": layer.parameters()})
            for p in layer.parameters():
                layer_optim_params.add(p)
    optimizer_grouped_parameters.append({
        "params":
        [p for p in model.parameters() if p not in layer_optim_params]
    })
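    # One parameter group per matched layer, plus a final catch-all group for
    # any parameters not collected in layer_optim_params above.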

    if len(optimizer_grouped_parameters) == 0:

        optimizer_grouped_parameters = [{
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.01
        }, {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        }]
    logger.info("len(optimizer_grouped_parameters): {}".format(
        len(optimizer_grouped_parameters)))  # 1

    if args.optimizer.lower() == "lamb":
        optimizer = Lamb(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         eps=args.adam_epsilon)
    elif args.optimizer.lower() == "adamw":
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
    else:
        raise Exception(
            "optimizer {0} not recognized! Can only be lamb or adamW".format(
                args.optimizer))

    if args.scheduler.lower() == "linear":
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_steps,
            num_training_steps=t_total)
    elif args.scheduler.lower() == "cosine":
        scheduler = CosineAnnealingLR(optimizer, t_total, 1e-8)
    else:
        raise Exception(
            "Scheduler {0} not recognized! Can only be linear or cosine".
            format(args.scheduler))

    # Check if saved optimizer or scheduler states exist
    # TODO: we find this consume huge amount of additional GPU memory with pytorch, thus disable for now
    # if os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt")) and args.resume:
    # Load in optimizer and scheduler states
    # if is_first_worker():
    #     op_state = torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))
    #     print([len(x['params']) for x in op_state['param_groups']])
    #     real_op_state = optimizer.state_dict()
    #     print([len(x['params']) for x in real_op_state['param_groups']])
    # optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
    # scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("   Train dataset size = %d", len(shuffled_fh))
    logger.info("   Num Epochs = %d", args.num_train_epochs)
    logger.info("   Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "   Total train batch size (w. parallel, distributed & accumulation) = %d",
        real_batch_size)
    logger.info("   Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("   Total optimization steps = %d", t_total)
    logger.info("   LR warmup steps = %d", args.warmup_steps)

    global_step = 0
    eval_cnt = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if (os.path.exists(args.model_name_or_path)
            and args.resume) or args.starting_step > 0:
        # set global_step to global_step of last saved checkpoint from model path
        try:
            global_step = args.starting_step

            if global_step <= 0:
                global_step = int(
                    args.model_name_or_path.split("-")[-1].split("/")[0])

            epochs_trained = global_step // (args.expected_train_size //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                args.expected_train_size // args.gradient_accumulation_steps)

            logger.info(
                "   Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("   Continuing training from epoch %d", epochs_trained)
            logger.info("   Continuing training from global step %d",
                        global_step)
            logger.info("   Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except:
            logger.info("  Start training from a pretrained model")

    tr_loss = 0.0

    tensorboard_scalars = {}
    model.zero_grad()

    eval_cfg = args.eval_configObj  # this is also produced in the load_model_config() method
    eval_fn = wrapped_process_fn(tokenizer, args, eval_cfg)

    ideal_path = args.eval_ideal_path
    is_first_eval = (eval_cnt == 0)

    best_checkpoints = []
    set_seed(args)  # Added here for reproducibility

    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    for m_epoch in train_iterator:
        # shuffle input after first epoch
        if m_epoch > 0:
            shuffled_fh.change_seed(m_epoch)
        sds = SimplifiedStreamingDataset(shuffled_fh, train_fn,
                                         configObj.ix_func)
        train_dataloader = DataLoader(sds,
                                      batch_size=args.per_gpu_train_batch_size,
                                      num_workers=4,
                                      pin_memory=True)
        acc_accum = []
        model.train()
        for step, batch in tqdm(enumerate(train_dataloader),
                                desc="Iteration",
                                disable=args.local_rank not in [-1, 0]):
            if step % 100 == 0 and step > 0:
                logger.info('train_step: {}'.format(step))
            # Skip past any already trained steps if resuming training
            # if steps_trained_in_current_epoch > 0:
            #     steps_trained_in_current_epoch -= 1
            #     continue

            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "query_ids": batch[0].long(),
                "query_attn_mask": batch[1].long(),
                "meta_ids": batch[3].long(),
                "meta_attn_mask": batch[4].long(),
                "labels": batch[6].float()
            }

            # sync gradients only at gradient accumulation step
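            # model.no_sync() skips DDP's gradient all-reduce on the
            # intermediate micro-batches of an accumulation window.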
            if (step + 1) % args.gradient_accumulation_steps == 0:
                outputs = model(**inputs)
            else:
                with model.no_sync():
                    outputs = model(**inputs)

            loss_combine = outputs[0]
            # assert len(loss_combine) == 3
            loss = loss_combine["Loss/total_loss"]
            sim_combine = outputs[1]
            # assert len(sim_combine) == 8
            acc = outputs[2]
            acc_accum.append(acc.item())

            if args.n_gpu > 1:
                # mean() to average on multi-gpu parallel training
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    loss.backward()
                else:
                    with model.no_sync():
                        loss.backward()
            tr_loss += loss.item()

            # if is_first_worker():
            # print("unique labels: ", torch.unique(inputs["labels"]).int())
            #    print("Similarity combinations: ", sim_combine)

            for key, value in loss_combine.items():
                tensorboard_scalars[key] = tensorboard_scalars.setdefault(
                    key, 0.0) + value.item()
            for key, value in sim_combine.items():
                # print(f"{key}: {value.mean().item()}")
                value = value.mean()
                value[value != value] = 0
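                # value != value is True only for NaN entries, so this zeroes
                # out NaN similarity values before they are logged.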
                tensorboard_scalars[key] = tensorboard_scalars.setdefault(
                    key, 0.0) + value.item()
                # print(f"tensorboardscalars: {key} : {tensorboard_scalars[key]}")

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    #for key, value in tensorboard_scalars.items():
                    #    print(f"{key}: {value}")
                    if args.evaluate_during_training and global_step % (
                            args.logging_steps_per_eval *
                            args.logging_steps) == 0:
                        if is_first_worker():
                            save_checkpoint(args,
                                            -1,
                                            model,
                                            tokenizer,
                                            logger=logger)

                        model.eval()
                        is_first_eval = (eval_cnt == 0)
                        args.global_step = global_step
                        init_time = time()
                        fidelity = eval_fidelity(args, model, eval_fn,
                                                 eval_cfg.path, ideal_path,
                                                 args.cache_dir, is_first_eval,
                                                 args.eval_full, logger)
                        logger.info("Eval cost time: {}".format(time() -
                                                                init_time))
                        eval_cnt += 1

                        model.train()

                        if is_first_worker():
                            if len(best_checkpoints) < 3:
                                save_checkpoint(args,
                                                global_step,
                                                model,
                                                tokenizer,
                                                optimizer,
                                                scheduler,
                                                logger=logger)
                                best_checkpoints.append(
                                    (global_step, fidelity))
                            else:
                                worst_checkpoint = sorted(
                                    best_checkpoints, key=lambda x: x[1])[0]
                                if fidelity > worst_checkpoint[1]:
                                    save_checkpoint(args,
                                                    global_step,
                                                    model,
                                                    tokenizer,
                                                    optimizer,
                                                    scheduler,
                                                    logger=logger)
                                    worst_cp_path = os.path.join(
                                        args.output_dir,
                                        "checkpoint-{}".format(
                                            str(worst_checkpoint[0])))
                                    shutil.rmtree(worst_cp_path)
                                    best_checkpoints.remove(worst_checkpoint)
                                    best_checkpoints.append(
                                        (global_step, fidelity))
                                else:
                                    logger.info("Fidelity not in top 3!")
                                assert len(best_checkpoints) == 3
                            tb_writer.add_scalar("fidelity", fidelity,
                                                 global_step)

                            logger.info("Fidelity: {0}".format(fidelity))
                        dist.barrier()

                    learning_rate_scalar = scheduler.get_lr()[0]
                    avg_acc = sum(acc_accum) * 1.0 / len(acc_accum)
                    logger.info("Train acc: {}".format(avg_acc))
                    if is_first_worker():
                        tb_writer.add_scalar("Training/learning_rate",
                                             learning_rate_scalar, global_step)
                        tb_writer.add_scalar("Training/epoch", m_epoch,
                                             global_step)
                        tb_writer.add_scalar("Training/accuracy", avg_acc,
                                             global_step)
                        for key, value in tensorboard_scalars.items():
                            tb_writer.add_scalar(key,
                                                 value / args.logging_steps,
                                                 global_step)
                        logger.info(
                            json.dumps({
                                **tensorboard_scalars,
                                **{
                                    "learning_rate": learning_rate_scalar,
                                    "Accuracy": avg_acc,
                                    "step": global_step
                                }
                            }))

                    tensorboard_scalars = {}
                    dist.barrier()

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        tb_writer.close()

    return global_step, tr_loss / global_step
Exemplo n.º 18
0
elif sys.argv[1] == 'adam':
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
elif sys.argv[1] == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=0.01)
elif sys.argv[1] == 'sgdwm':
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
elif sys.argv[1] == 'rmsprop':
    optimizer = optim.RMSprop(model.parameters(), lr=0.001, momentum=0.9)
elif sys.argv[1] == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=0.01)
elif sys.argv[1] == 'radam':
    optimizer = RAdam(model.parameters())
elif sys.argv[1] == 'lars':  #no tensorboardX
    optimizer = LARS(model.parameters(), lr=0.1, momentum=0.9)
elif sys.argv[1] == 'lamb':
    optimizer = Lamb(model.parameters())
elif sys.argv[1] == 'novograd':
    optimizer = NovoGrad(model.parameters(), lr=0.01, weight_decay=0.001)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                     3 * len(train_loader),
                                                     1e-4)

    def train(train_loader, model, criterion, optimizer, scheduler, device):
        '''
        Function for the training step of the training loop
        '''

        model.train()
        running_loss = 0

        for X, y_true in train_loader:
Exemplo n.º 19
0
def train(args, model, tokenizer, train_dataloader):
    """ Train the model """
    #if args.local_rank in [-1, 0]:
    tb_writer = None
    if is_first_worker():
        tb_writer = SummaryWriter(log_dir=args.log_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    real_batch_size = args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)

    if args.max_steps > 0:
        t_total = args.max_steps
        #args.num_train_epochs = args.max_steps // (args.expected_train_size // args.gradient_accumulation_steps) + 1 
    else:
        t_total = args.expected_train_size // real_batch_size * args.num_train_epochs    

    # layerwise optimization for lamb
    optimizer_grouped_parameters = []
    layer_optim_params = set()
    for layer_name in ["roberta.embeddings", "score_out", "downsample1", "downsample2", "downsample3", "embeddingHead"]:
        layer = getattr_recursive(model, layer_name)
        if layer is not None:
            optimizer_grouped_parameters.append({"params": layer.parameters()})
            for p in layer.parameters():
                layer_optim_params.add(p)
    if getattr_recursive(model, "roberta.encoder.layer") is not None:
        for layer in model.roberta.encoder.layer:
            optimizer_grouped_parameters.append({"params": layer.parameters()})
            for p in layer.parameters():
                layer_optim_params.add(p)
    optimizer_grouped_parameters.append({"params": [p for p in model.parameters() if p not in layer_optim_params]})
    if len(optimizer_grouped_parameters)==0:
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        ]   

    if args.optimizer.lower()=="lamb":
        optimizer = Lamb(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    elif args.optimizer.lower()=="adamw":
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    else:
        raise Exception("optimizer {0} not recognized! Can only be lamb or adamW".format(args.optimizer))
    
    if args.scheduler.lower()=="linear":
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
        )
    elif args.scheduler.lower()=="cosine":
        scheduler = CosineAnnealingLR(optimizer, t_total, 1e-8)
    else:
        raise Exception("Scheduler {0} not recognized! Can only be linear or cosine".format(args.scheduler))

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
        os.path.join(args.model_name_or_path, "scheduler.pt")
    ) and args.load_optimizer_scheduler:
        # Load in optimizer and scheduler states
        # if is_first_worker():
        #     op_state = torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))
        #     print([len(x['params']) for x in op_state['param_groups']])
        #     real_op_state = optimizer.state_dict()
        #     print([len(x['params']) for x in real_op_state['param_groups']])
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    #logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to global_step of last saved checkpoint from model path
        try:
            global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
            epochs_trained = global_step // (args.expected_train_size // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (args.expected_train_size // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except:
            logger.info("  Start training from a pretrained model") 

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
    )
    set_seed(args)  # Added here for reproducibility
    for m_epoch in train_iterator:
        #epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in tqdm(enumerate(train_dataloader), desc="Iteration", disable=args.local_rank not in [-1, 0]):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device).long() for t in batch)

            if (step + 1) % args.gradient_accumulation_steps == 0:
                outputs = model(*batch)
            else:
                with model.no_sync():
                    outputs = model(*batch)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    loss.backward()
                else:
                    with model.no_sync():
                        loss.backward()          

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if is_first_worker() and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
                dist.barrier()

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    if args.evaluate_during_training and global_step % (args.logging_steps_per_eval*args.logging_steps)==0:
                        model.eval()
                        reranking_mrr, full_ranking_mrr = passage_dist_eval(args, model, tokenizer)
                        if is_first_worker():
                            print("Reranking/Full ranking mrr: {0}/{1}".format(str(reranking_mrr), str(full_ranking_mrr)))
                            mrr_dict = {"reranking": float(reranking_mrr), "full_ranking": float(full_ranking_mrr)}
                            tb_writer.add_scalars("mrr", mrr_dict, global_step)
                            print(args.output_dir)

                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs["learning_rate"] = learning_rate_scalar
                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss


                    if is_first_worker():
                        for key, value in logs.items():
                            print(key, type(value))
                            tb_writer.add_scalar(key, value, global_step)
                        tb_writer.add_scalar("epoch", m_epoch, global_step)
                        print(json.dumps({**logs, **{"step": global_step}}))
                    dist.barrier()

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        tb_writer.close()

    return global_step, tr_loss / global_step
Exemplo n.º 20
0
def train(args, model, tokenizer, query_cache, passage_cache):
    """ Train the model """
    #if args.local_rank in [-1, 0]:
    tb_writer = None
    if is_first_worker():
        tb_writer = SummaryWriter(log_dir=args.log_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    real_batch_size = args.train_batch_size * args.gradient_accumulation_steps * (
        torch.distributed.get_world_size() if args.local_rank != -1 else 1)

    # layerwise optimization for lamb
    optimizer_grouped_parameters = []

    no_decay = ["bias", "w", "b", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]

    if args.optimizer.lower() == "lamb":
        optimizer = Lamb(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         eps=args.adam_epsilon)
    elif args.optimizer.lower() == "adamw":
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
    else:
        raise Exception(
            "optimizer {0} not recognized! Can only be lamb or adamW".format(
                args.optimizer))

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(
            os.path.join(args.model_name_or_path,
                         "optimizer.pt")) and args.load_optimizer_scheduler:
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    #logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Max steps = %d", args.max_steps)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)

    global_step = 0
    eval_cnt = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to global_step of last saved checkpoint from model path
        if "-" in args.model_name_or_path:
            try:
                global_step = int(
                    args.model_name_or_path.split("-")[-1].split("/")[0])
            except:
                global_step = 0

        else:
            global_step = 0
        logger.info(
            "  Continuing training from checkpoint, will skip to saved global_step"
        )
        logger.info("  Continuing training from global step %d", global_step)

    tr_loss = 0.0
    model.zero_grad()
    model.train()
    set_seed(args)  # Added here for reproducibility

    last_ann_no = -1
    train_dataloader = None
    train_dataloader_iter = None
    dev_ndcg = 0
    step = 0

    eval_path = os.path.join(args.data_dir, "eval_full.tsv")
    eval_cfg = L1InputConfig("eval",
                             model,
                             None,
                             eval_path,
                             args.configObj.chunk_cfg,
                             qid=0,
                             docid=1,
                             query=4,
                             title=5,
                             anchor=6,
                             url=7,
                             click=8,
                             desc=9,
                             rating=10,
                             market=12,
                             lang=13)

    def eval_fn(line, i):
        return L1_process_fn(line, i, tokenizer, args, eval_cfg.map,
                             eval_cfg.chunk_cfg)

    model.eval()
    ideal_path = os.path.join(args.data_dir, "ideal_map_UN.tsv")
    is_first_eval = (eval_cnt == 0)

    args.global_step = global_step
    # fidelity = eval_fidelity(args, model, eval_fn, eval_cfg.path, ideal_path, args.data_cache_dir, is_first_eval)
    # eval_cnt+=1
    # print("Fidelity: {0}".format(fidelity))
    # if is_first_worker():
    #     tb_writer.add_scalar("fidelity", fidelity, global_step)

    best_checkpoints = []
    acc_accum = []
    scheduler = None
    while global_step < args.max_steps:
        if step % args.gradient_accumulation_steps == 0 and global_step % args.logging_steps == 0:
            # check if new ann training data is available
            ann_no, ann_path, ndcg_json = get_latest_ann_data(args.ann_dir)

            while ann_no == -1 or (ann_path is not None
                                   and ann_no != last_ann_no):
                try:
                    logger.info("Training on new add data at %s", ann_path)
                    with open(ann_path, 'r') as f:
                        ann_training_data = f.readlines()
                    dev_ndcg = ndcg_json['ndcg']
                    ann_checkpoint_path = ndcg_json['checkpoint']
                    ann_checkpoint_no = get_checkpoint_no(ann_checkpoint_path)

                    aligned_size = (len(ann_training_data) //
                                    args.world_size) * args.world_size
                    ann_training_data = ann_training_data[:aligned_size]
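                    # Truncate so every distributed worker sees the same
                    # number of ANN training examples.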

                    logger.info("Total ann queries: %d",
                                len(ann_training_data))
                    if len(ann_training_data) == 0:
                        time.sleep(300)
                        continue

                    train_dataset = SimplifiedStreamingDataset(
                        ann_training_data,
                        args.configObj.process_fn(args, query_cache,
                                                  passage_cache))
                    train_dataloader = DataLoader(
                        train_dataset,
                        batch_size=args.train_batch_size,
                        num_workers=1)
                    train_dataloader_iter = iter(train_dataloader)

                    # re-warmup
                    if scheduler is None:
                        scheduler = get_linear_schedule_with_warmup(
                            optimizer,
                            num_warmup_steps=args.warmup_steps,
                            num_training_steps=args.max_steps)

                    if args.local_rank != -1:
                        dist.barrier()

                    if is_first_worker():
                        # add ndcg at checkpoint step used instead of current step
                        # tb_writer.add_scalar("dev_ndcg", dev_ndcg, ann_checkpoint_no)
                        if last_ann_no != -1:
                            tb_writer.add_scalar("epoch", last_ann_no,
                                                 global_step - 1)
                        tb_writer.add_scalar("epoch", ann_no, global_step)
                    last_ann_no = ann_no
                    break
                except:
                    if is_first_worker():
                        print("wait")
                    time.sleep(300)

                ann_no, ann_path, ndcg_json = get_latest_ann_data(args.ann_dir)

        try:
            batch = next(train_dataloader_iter)
        except StopIteration:
            logger.info("Finished iterating current dataset, begin reiterate")
            train_dataloader_iter = iter(train_dataloader)
            batch = next(train_dataloader_iter)

        batch = tuple(t.to(args.device).long() for t in batch)
        step += 1
        model.train()
        # if args.triplet:
        #     inputs = {"query_ids": batch[0].long(), "attention_mask_q": batch[1].long(),
        #                 "input_ids_a": batch[3].long(), "attention_mask_a": batch[4].long(),
        #                 "input_ids_b": batch[6].long(), "attention_mask_b": batch[7].long()}
        # else:
        #     inputs = {"input_ids_a": batch[0].long(), "attention_mask_a": batch[1].long(),
        #                 "input_ids_b": batch[3].long(), "attention_mask_b": batch[4].long(),
        #                 "labels": batch[6]}

        # sync gradients only at gradient accumulation step
        if step % args.gradient_accumulation_steps == 0:
            outputs = model(*batch)
        else:
            with model.no_sync():
                outputs = model(*batch)
        # model outputs are always tuple in transformers (see doc)
        loss = outputs[0]
        acc = outputs[2]
        acc_accum.append(acc.item())

        if args.n_gpu > 1:
            # mean() to average on multi-gpu parallel training
            loss = loss.mean()
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps

        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            if step % args.gradient_accumulation_steps == 0:
                loss.backward()
            else:
                with model.no_sync():
                    loss.backward()

        tr_loss += loss.item()
        if step % args.gradient_accumulation_steps == 0:
            if args.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            # print("w grad:", model.module.w.grad)
            # print("w:", model.module.w)

            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1

            if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                logs = {}
                if global_step % (args.logging_steps_per_eval *
                                  args.logging_steps) == 0:
                    print("Train acc:", sum(acc_accum) * 1.0 / len(acc_accum))
                    acc_accum = []
                    model.eval()
                    is_first_eval = (eval_cnt == 0)
                    args.global_step = global_step
                    fidelity = eval_fidelity(args, model, eval_fn,
                                             eval_cfg.path, ideal_path,
                                             args.data_cache_dir,
                                             is_first_eval)
                    eval_cnt += 1

                    if is_first_worker():
                        if len(best_checkpoints) < 10:
                            save_checkpoint(args, global_step, model,
                                            tokenizer, optimizer, scheduler)
                            best_checkpoints.append((global_step, fidelity))
                        else:
                            worst_checkpoint = sorted(best_checkpoints,
                                                      key=lambda x: x[1])[0]
                            if fidelity > worst_checkpoint[1]:
                                save_checkpoint(args, global_step, model,
                                                tokenizer, optimizer,
                                                scheduler)
                                worst_cp_path = os.path.join(
                                    args.output_dir, "checkpoint-{}".format(
                                        str(worst_checkpoint[0])))
                                shutil.rmtree(worst_cp_path)
                                best_checkpoints.remove(worst_checkpoint)
                                best_checkpoints.append(
                                    (global_step, fidelity))
                            else:
                                print("Fidelity not in top 10!")
                            assert len(best_checkpoints) == 10

                        save_checkpoint(args, -1, model, tokenizer)
                        print("Fidelity: {0}".format(fidelity))
                        logs["fidelity"] = fidelity
                    dist.barrier()
                loss_scalar = tr_loss / args.logging_steps
                learning_rate_scalar = scheduler.get_lr()[0]
                logs["learning_rate"] = learning_rate_scalar
                logs["loss"] = loss_scalar
                tr_loss = 0

                if is_first_worker():
                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    logger.info(json.dumps({**logs, **{"step": global_step}}))

    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        tb_writer.close()

    return global_step
Exemplo n.º 21
0
    optimizer = torch.optim.Adam(net.parameters(), lr=args.lr,
                                 weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'rmsprop':
    optimizer = optim.RMSprop(net.parameters(),lr=args.lr, momentum=args.momentum,
                      weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'radam':
    from radam import RAdam
    optimizer = RAdam(net.parameters(),lr=args.lr,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lars':#no tensorboardX
    from lars import LARS
    optimizer = LARS(net.parameters(), lr=args.lr,momentum=args.momentum,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lamb':
    from lamb import Lamb
    optimizer  = Lamb(net.parameters(),lr=args.lr,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'novograd':
    from novograd import NovoGrad
    optimizer = NovoGrad(net.parameters(), lr=args.lr,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'dyna':
    from dyna import Dyna
    optimizer = Dyna(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
else:
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.weight_decay)
# lrs = create_lr_scheduler(args.warmup_epochs, args.lr_decay)
# lr_scheduler = LambdaLR(optimizer,lrs)
# lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_decay, gamma=0.1)

batch_accumulate = args.batch_size // 256
batch_per_step = len(trainloader) // batch_accumulate + int(
    len(trainloader) % batch_accumulate > 0)
Exemplo n.º 22
0
def worker_fn(rank, world_size):
    setup(rank, world_size)

    weights_filename = "weights.pt"
    batch_size = 512
    epochs = 240
    warmup_epochs = 8
    use_mixed_precision = True

    batch_size = batch_size // world_size #batch size per worker

    #Data
    all_data = os.listdir(datapath_preprocessed)
    train_filenames = [p for p in all_data if re.match(r'^PGM_' + re.escape(dataset_name) + r'_train_(\d+)\.npz$', p) is not None]
    val_filenames = [p for p in all_data if re.match(r'^PGM_' + re.escape(dataset_name) + r'_val_(\d+)\.npz$', p) is not None]
    train_dataset = PgmDataset(train_filenames)
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=8, pin_memory=False, sampler=train_sampler)#shuffle is done by the sampler
    val_dataloader = DataLoader(PgmDataset(val_filenames), batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=False)

    #Model
    device_ids = [rank]

    model = WReN(2).to(device_ids[0])#3-layer MLRN

    if weights_filename is not None and os.path.isfile("./" + weights_filename):
        model.load_state_dict(torch.load(weights_filename, map_location='cpu'))
        print('Weights loaded')
        cold_start = False
    else:
        print('No weights found')
        cold_start = True

    #Loss and optimizer
    final_lr = 2e-3

    def add_module_params_with_decay(module, weight_decay, param_groups):#adds parameters with decay unless they are bias parameters, which shouldn't receive decay
        group_with_decay = []
        group_without_decay = []
        for name, param in module.named_parameters():
            if not param.requires_grad: continue
            if name == 'bias' or name.endswith('bias'):
                group_without_decay.append(param)
            else:
                group_with_decay.append(param)
        param_groups.append({"params": group_with_decay, "weight_decay": weight_decay})
        param_groups.append({"params": group_without_decay})

    optimizer_param_groups = [
    ]

    add_module_params_with_decay(model.conv, 2e-1, optimizer_param_groups)
    add_module_params_with_decay(model.post_cnn_linear, 2e-1, optimizer_param_groups)
    add_module_params_with_decay(model.g, 2e-1, optimizer_param_groups)
    add_module_params_with_decay(model.h, 2e-1, optimizer_param_groups)
    add_module_params_with_decay(model.f, 2e-1, optimizer_param_groups)
    add_module_params_with_decay(model.f_final, 2e-1, optimizer_param_groups)
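    # All major submodules share the same weight decay (2e-1); bias parameters
    # collected by the helper above receive no decay.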

    optimizer = Lamb(optimizer_param_groups, lr=final_lr)

    base_model = model
    if use_mixed_precision:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1") #Mixed Precision

    lossFunc = torch.nn.CrossEntropyLoss()
    softmax = torch.nn.Softmax(dim=1)

    #Parallel distributed model
    device = device_ids[0]
    torch.cuda.set_device(device)
    parallel_model = torch.nn.parallel.DistributedDataParallel(model, device_ids)

    if rank == 0:
        #accuracy logging
        sess = tf.Session()
        train_acc_placeholder = tf.placeholder(tf.float32, shape=())
        train_acc_summary = tf.summary.scalar('training_acc', train_acc_placeholder)
        val_acc_placeholder = tf.placeholder(tf.float32, shape=())
        val_acc_summary = tf.summary.scalar('validation_acc', val_acc_placeholder)
        writer = tf.summary.FileWriter("log", sess.graph)

    #training loop
    acc = []
    global_step = 0
    for epoch in range(epochs): 
        train_sampler.set_epoch(epoch) 

        # Validation
        val_acc = []
        parallel_model.eval()
        with torch.no_grad():
            for i, (local_batch, local_labels) in enumerate(val_dataloader):
                local_batch, targets = local_batch.to(device), local_labels.to(device)

                #answer = model(local_batch.type(torch.float32))
                answer, _ = parallel_model(local_batch.type(torch.float32))

                #Calc accuracy
                answerSoftmax = softmax(answer)
                maxIndex = answerSoftmax.argmax(dim=1)

                correct = maxIndex.eq(targets)
                accuracy = correct.type(dtype=torch.float16).mean(dim=0)
                val_acc.append(accuracy)

                if i % 50 == 0 and rank == 0:
                    print("batch " + str(i))

        total_val_acc = sum(val_acc) / len(val_acc)
        print('Validation accuracy: ' + str(total_val_acc.item()))
        if rank == 0:
            summary = sess.run(val_acc_summary, feed_dict={val_acc_placeholder: total_val_acc.item()})
            writer.add_summary(summary, global_step=global_step)

        # Training
        parallel_model.train()
        for i, (local_batch, local_labels) in enumerate(train_dataloader):
            global_step = global_step + 1

            if cold_start and epoch < warmup_epochs:  # linear warmup: ramp the lr up to final_lr over the first warmup_epochs epochs
                lr = final_lr * global_step / (warmup_epochs*len(train_dataset) / (batch_size * world_size))
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr

            local_batch, targets = local_batch.to(device_ids[0]), local_labels.to(device_ids[0])

            optimizer.zero_grad()
            answer, activation_loss = parallel_model(local_batch.type(torch.float32))

            loss = lossFunc(answer, targets) + activation_loss * 2e-3

            #Calc accuracy
            answerSoftmax = softmax(answer)
            maxIndex = answerSoftmax.argmax(dim=1)

            correct = maxIndex.eq(targets)
            accuracy = correct.type(dtype=torch.float16).mean(dim=0)
            acc.append(accuracy)
            
            #Training step
            if use_mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss: #Mixed precision
                    scaled_loss.backward()
            else:
                loss.backward()

            grad_norm = torch.nn.utils.clip_grad_norm_(parallel_model.parameters(), 1e1)

            optimizer.step()

            if i % 50 == 0 and rank == 0:
                print("epoch " + str(epoch) + " batch " + str(i))
                print("loss", loss)
                print("activation loss", activation_loss)
                print(grad_norm)

            #logging and saving weights
            if i % 1000 == 999:
                trainAcc = sum(acc) / len(acc)
                acc = []
                print('Training accuracy: ' + str(trainAcc.item()))
                if rank == 0:
                    if weights_filename is not None:
                        torch.save(base_model.state_dict(), weights_filename)
                        print('Weights saved')

                    summary = sess.run(train_acc_summary, feed_dict={train_acc_placeholder: trainAcc.item()})
                    writer.add_summary(summary, global_step=global_step)  

        if cold_start and weights_filename is not None and epoch % 10 == 0 and rank == 0:
            torch.save(base_model.state_dict(), weights_filename + "_cp" + str(epoch))
            print('Checkpoint saved')


    cleanup()
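# Not part of the original example: a minimal sketch of the setup()/cleanup()
# helpers and launcher that worker_fn assumes. The master address/port and the
# 'nccl' backend are illustrative assumptions, not taken from the snippet.
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def setup(rank, world_size):
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)


def cleanup():
    dist.destroy_process_group()


if __name__ == "__main__":
    world_size = torch.cuda.device_count()
    mp.spawn(worker_fn, args=(world_size,), nprocs=world_size, join=True)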
Exemplo n.º 23
0
import pandas as pd

from lamb import Lamb  # Lamb-wave dispersion class, not the LAMB optimizer

# Load phase and group velocity data from an Excel file exported from
# Dispersion software (1 mm aluminum plate).

df_vp = pd.read_excel('alum1mm.xlsx', skiprows = 9, usecols = 'U:AN', 
                      header=None)
df_vg = pd.read_excel('alum1mm.xlsx', skiprows = 9, usecols = 'A:T', 
                      header=None)

# Create an instance of the same material using the Lamb class.

alum = Lamb(thickness=1, 
            nmodes_sym=5, 
            nmodes_antisym=5, 
            fd_max=10000, 
            vp_max=15000, 
            c_L=6420,
            c_S=3040)

# Plot phase velocity using the Lamb class.

fig1, ax1 = alum.plot_phase_velocity(material_velocities=False,
                                     cutoff_frequencies=False,
                                     sym_style={'color' : 'black'}, 
                                     antisym_style={'color' : 'black'})

# Remove the legend that labels Symmetric and Antisymmetric modes
# (we are interested in labeling only Lamb module and Dispersion).

ax1.get_legend().remove()  
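# Not part of the original example: one possible way to overlay the data
# exported from Dispersion on the same axes and rebuild the legend with only
# the two sources. The column layout assumed for df_vp (frequency-thickness in
# the first column, one phase-velocity curve per remaining column) is an
# assumption about the exported spreadsheet, not something the snippet shows.
import matplotlib.pyplot as plt

fd = df_vp.iloc[:, 0]
for col in df_vp.columns[1:]:
    ax1.plot(fd, df_vp[col], linestyle='--', color='red')

# Dummy handles so the legend lists only 'Lamb module' and 'Dispersion'.
ax1.plot([], [], color='black', label='Lamb module')
ax1.plot([], [], linestyle='--', color='red', label='Dispersion')
ax1.legend()

plt.show()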
Exemplo n.º 24
0
    model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                           args.nlayers, args.dropout, args.tied).to(device)

criterion = nn.NLLLoss()

if args.optim == 'sgd':
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=args.momentum)
elif args.optim == 'adam':
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 betas=(args.momentum, args.beta))
elif args.optim == 'lamb':
    optimizer = Lamb(model.parameters(),
                     lr=args.lr,
                     betas=(args.momentum, args.beta))
elif args.optim == 'nero':
    optimizer = Nero(model.parameters(), lr=args.lr)

###############################################################################
# Training code
###############################################################################


def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""

    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)
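# Not part of the original example: a minimal sketch of how repackage_hidden,
# the selected optimizer and the NLLLoss criterion fit together in one training
# step. It assumes the model returns log-probabilities already flattened to
# (seq_len * batch, ntokens), as in the reference word-language-model script
# this snippet follows; the clipping value 0.25 is illustrative.
def train_step(data, targets, hidden):
    model.train()
    hidden = repackage_hidden(hidden)  # detach the hidden state from the previous graph
    optimizer.zero_grad()
    output, hidden = model(data, hidden)
    loss = criterion(output, targets)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
    optimizer.step()
    return loss.item(), hidden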
Exemplo n.º 25
0
import numpy as np

from lamb import Lamb

E = 68.9e9  # E = Young's modulus, in Pa.
p = 2700  # p = Density (rho), in kg/m3.
v = 0.33  # v = Poisson's ratio (nu).

c_L = np.sqrt(E * (1 - v) / (p * (1 + v) * (1 - 2 * v)))
c_S = np.sqrt(E / (2 * p * (1 + v)))
c_R = c_S * ((0.862 + 1.14 * v) / (1 + v))

# Example: A 10 mm aluminum plate.

alum = Lamb(thickness=10,
            nmodes_sym=5,
            nmodes_antisym=5,
            fd_max=10000,
            vp_max=15000,
            c_L=c_L,
            c_S=c_S,
            c_R=c_R,
            material='Aluminum')

# Plot phase velocity, group velocity and wavenumber.

alum.plot_phase_velocity()
alum.plot_group_velocity()
alum.plot_wave_number()

# Plot wave structure (displacement profiles across thickness) for A0
# and S0 modes at different fd values.

alum.plot_wave_structure(mode='A0',
Exemplo n.º 26
0
                              weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'radam':
    from radam import RAdam
    optimizer = RAdam(model.parameters(),
                      lr=args.base_lr,
                      weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lars':  #no tensorboardX
    from lars import LARS
    optimizer = LARS(model.parameters(),
                     lr=args.base_lr,
                     momentum=args.momentum,
                     weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lamb':
    from lamb import Lamb
    optimizer = Lamb(model.parameters(),
                     lr=args.base_lr,
                     weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'novograd':
    from novograd import NovoGrad
    optimizer = NovoGrad(model.parameters(),
                         lr=args.base_lr,
                         weight_decay=args.weight_decay)
    lr_scheduler = [
        optim.lr_scheduler.CosineAnnealingLR(optimizer, 3 * len(train_loader),
                                             1e-4)
    ]
else:
    optimizer = optim.SGD(model.parameters(),
                          lr=args.base_lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)