Example #1
#-----------------------------------------
# Tensorboard writer:
tb_writer = SummaryWriter(log_dir=f'{logs_path}/{datetime.now().strftime("%d%m%Y-%H_%M_%S")}/')
#-----------------------------------------

#-----------------------------------------
# Creating the optimiser:
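# Parameters whose names contain 'bias' or 'LayerNorm.weight' are excluded from weight decay.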
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': mlp.parameters(), 'weight_decay': args.weight_decay},
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
#-----------------------------------------

#-----------------------------------------
# Load the auxiliary checkpoint contents if resuming; otherwise initialize fresh training state:
if aux_checkpoint:
    global_step = aux_checkpoint['global_step']
    epoch = aux_checkpoint['epoch']
    optimizer.load_state_dict(aux_checkpoint['optimizer'])
    best_acc = aux_checkpoint['best_acc']
    mlp.load_state_dict(aux_checkpoint['mlp_state_dict'])
    best_checkpoint_path = aux_checkpoint['best_checkpoint_path']
else:
    global_step = 0
    best_acc = 0.0
    epoch = 0
Example #2
def trainner(model, play_history, train_config: dict):
    model.train()
    train_history, valid_history, split_point = play_history.get_train_valid_data(rate=train_config['traindata_rate'])
    train_dataset = AlphaDataset(play_histry=train_history)
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=train_config['batch_size'],
        shuffle=True,
        num_workers=train_config['num_workers'],
        collate_fn=AlphaDataset.collate_fn,
        pin_memory=True,
    )

    if valid_history is not None:
        valid_dataset = AlphaDataset(play_histry=valid_history)
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=train_config['batch_size'] * 2,
            shuffle=False,
            num_workers=train_config['num_workers'],
            collate_fn=AlphaDataset.collate_fn,
            pin_memory=True,
        )
    else:
        valid_loader = None

    optimizer = AdamW(params=model.parameters(), lr=train_config['base_lr'])
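    # Cosine annealing decays the learning rate from base_lr to min_lr over T_max steps (one scheduler step per epoch below).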
    scheduler = lr_scheduler.CosineAnnealingLR(
        optimizer=optimizer, T_max=train_config['epochs'], eta_min=train_config['min_lr']
    )

    for epoch in range(train_config['epochs']):
        train_value_mean = Avg()
        train_policy_mean = Avg()
        for states, actions, winners in train_loader:
            optimizer.zero_grad()

            value, policy = model(states)
            value_loss = functional.mse_loss(input=value.view(-1), target=winners)
            policy_loss = functional.cross_entropy(input=policy, target=actions)

            loss = train_config['value_loss_weight'] * value_loss + train_config['policy_loss_weight'] * policy_loss

            loss.backward()
            optimizer.step()

            train_value_mean.update(value=value_loss.item())
            train_policy_mean.update(value=policy_loss.item())

        scheduler.step()

        if valid_loader is not None:
            valid_value_mean = Avg()
            valid_policy_mean = Avg()
            for states, actions, winners in valid_loader:
                with torch.no_grad():
                    value, policy = model(states)
                    value_loss = functional.mse_loss(input=value.view(-1), target=winners)
                    policy_loss = functional.cross_entropy(input=policy, target=actions)

                value_loss = value_loss.item()
                policy_loss = policy_loss.item()

                valid_value_mean.update(value=value_loss)
                valid_policy_mean.update(value=policy_loss)

        msg = f'epochs: [{epoch}/{train_config["epochs"]}]'
        msg += f' - train value loss: {train_value_mean():.6f} - train policy loss: {train_policy_mean():.6f}'
        if valid_loader is not None:
            msg += f' - valid value loss: {valid_value_mean():.6f} - valid policy loss: {valid_policy_mean():.6f}'
        logging.info(msg=msg)
    model.eval()
Example #3
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

lr = 0.00002

# bioBERT:
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
pretrained_model = BertForMaskedLM.from_pretrained('dmis-lab/biobert-v1.1')

pretrained_model.cuda()
pretrained_model.eval()

optimizer = AdamW(pretrained_model.parameters(), lr=lr, eps=1e-8)

data = []
f = open('corpus.txt', 'r')
for line in f:
    data.append(line)

input_ids = []
attention_masks = []
for sentence in data:
    # encoded_sentence = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
    encoded_sentence = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=128,
        truncation=True,
Example #4
    def __init__(self, params: dict, dataset: LmSeqsDataset,
                 token_probs: torch.tensor, student: nn.Module,
                 teacher: nn.Module):
        logger.info('Initializing Distiller')
        self.params = params
        self.dump_path = params.dump_path
        self.multi_gpu = params.multi_gpu
        self.fp16 = params.fp16

        self.student = student
        self.teacher = teacher

        self.student_config = student.config
        self.vocab_size = student.config.vocab_size

        if params.n_gpu <= 1:
            sampler = RandomSampler(dataset)
        else:
            sampler = DistributedSampler(dataset)

        if params.group_by_size:
            groups = create_lengths_groups(lengths=dataset.lengths,
                                           k=params.max_model_input_size)
            sampler = GroupedBatchSampler(sampler=sampler,
                                          group_ids=groups,
                                          batch_size=params.batch_size)
        else:
            sampler = BatchSampler(sampler=sampler,
                                   batch_size=params.batch_size,
                                   drop_last=False)

        self.dataloader = DataLoader(dataset=dataset,
                                     batch_sampler=sampler,
                                     collate_fn=dataset.batch_sequences)

        self.temperature = params.temperature
        assert self.temperature > 0.

        self.alpha_ce = params.alpha_ce
        self.alpha_mlm = params.alpha_mlm
        self.alpha_clm = params.alpha_clm
        self.alpha_mse = params.alpha_mse
        self.alpha_cos = params.alpha_cos

        self.mlm = params.mlm
        if self.mlm:
            logger.info(f'Using MLM loss for LM step.')
            self.mlm_mask_prop = params.mlm_mask_prop
            assert 0.0 <= self.mlm_mask_prop <= 1.0
            assert params.word_mask + params.word_keep + params.word_rand == 1.0
            self.pred_probs = torch.FloatTensor(
                [params.word_mask, params.word_keep, params.word_rand])
            self.pred_probs = self.pred_probs.to(
                f'cuda:{params.local_rank}'
            ) if params.n_gpu > 0 else self.pred_probs
            self.token_probs = token_probs.to(
                f'cuda:{params.local_rank}'
            ) if params.n_gpu > 0 else token_probs
            if self.fp16:
                self.pred_probs = self.pred_probs.half()
                self.token_probs = self.token_probs.half()
        else:
            logger.info(f'Using CLM loss for LM step.')

        self.epoch = 0
        self.n_iter = 0
        self.n_total_iter = 0
        self.n_sequences_epoch = 0
        self.total_loss_epoch = 0
        self.last_loss = 0
        self.last_loss_ce = 0
        self.last_loss_mlm = 0
        self.last_loss_clm = 0
        if self.alpha_mse > 0.: self.last_loss_mse = 0
        if self.alpha_cos > 0.: self.last_loss_cos = 0
        self.last_log = 0

        self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean')
        self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
        if self.alpha_mse > 0.:
            self.mse_loss_fct = nn.MSELoss(reduction='sum')
        if self.alpha_cos > 0.:
            self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction='mean')

        logger.info('--- Initializing model optimizer')
        assert params.gradient_accumulation_steps >= 1
        self.num_steps_epoch = len(self.dataloader)
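        # Total optimizer steps: batches per epoch divided by gradient-accumulation steps, times the number of epochs.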
        num_train_optimization_steps = int(
            self.num_steps_epoch / params.gradient_accumulation_steps *
            params.n_epoch) + 1

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in student.named_parameters()
                if not any(nd in n for nd in no_decay) and p.requires_grad
            ],
            'weight_decay':
            params.weight_decay
        }, {
            'params': [
                p for n, p in student.named_parameters()
                if any(nd in n for nd in no_decay) and p.requires_grad
            ],
            'weight_decay':
            0.0
        }]
        logger.info(
            "------ Number of trainable parameters (student): %i" % sum([
                p.numel() for p in self.student.parameters() if p.requires_grad
            ]))
        logger.info("------ Number of parameters (student): %i" %
                    sum([p.numel() for p in self.student.parameters()]))
        self.optimizer = AdamW(optimizer_grouped_parameters,
                               lr=params.learning_rate,
                               eps=params.adam_epsilon,
                               betas=(0.9, 0.98))

        warmup_steps = math.ceil(num_train_optimization_steps *
                                 params.warmup_prop)
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=num_train_optimization_steps)

        if self.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
            logger.info(
                f"Using fp16 training: {self.params.fp16_opt_level} level")
            self.student, self.optimizer = amp.initialize(
                self.student,
                self.optimizer,
                opt_level=self.params.fp16_opt_level)
            self.teacher = self.teacher.half()

        if self.multi_gpu:
            if self.fp16:
                from apex.parallel import DistributedDataParallel
                logger.info(
                    "Using apex.parallel.DistributedDataParallel for distributed training."
                )
                self.student = DistributedDataParallel(self.student)
            else:
                from torch.nn.parallel import DistributedDataParallel
                logger.info(
                    "Using nn.parallel.DistributedDataParallel for distributed training."
                )
                self.student = DistributedDataParallel(
                    self.student,
                    device_ids=[params.local_rank],
                    output_device=params.local_rank,
                    find_unused_parameters=True)

        self.is_master = params.is_master
        if self.is_master:
            logger.info('--- Initializing Tensorboard')
            self.tensorboard = SummaryWriter(
                log_dir=os.path.join(self.dump_path, 'log', 'train'))
            self.tensorboard.add_text(tag='config/training',
                                      text_string=str(self.params),
                                      global_step=0)
            self.tensorboard.add_text(tag='config/student',
                                      text_string=str(self.student_config),
                                      global_step=0)
Example #5
def main(rank, args):

    # Distributed setup

    if args.distributed:
        setup_distributed(rank, args.world_size)

    not_main_rank = args.distributed and rank != 0

    logging.info("Start time: %s", datetime.now())

    # Explicitly set seed to make sure models created in separate processes
    # start from same random weights and biases
    torch.manual_seed(args.seed)

    # Empty CUDA cache
    torch.cuda.empty_cache()

    # Change backend for flac files
    torchaudio.set_audio_backend("soundfile")

    # Transforms

    melkwargs = {
        "n_fft": args.win_length,
        "n_mels": args.n_bins,
        "hop_length": args.hop_length,
    }

    sample_rate_original = 16000

    if args.type == "mfcc":
        transforms = torch.nn.Sequential(
            torchaudio.transforms.MFCC(
                sample_rate=sample_rate_original,
                n_mfcc=args.n_bins,
                melkwargs=melkwargs,
            ), )
        num_features = args.n_bins
    elif args.type == "waveform":
        transforms = torch.nn.Sequential(UnsqueezeFirst())
        num_features = 1
    else:
        raise ValueError("Model type not supported")

    if args.normalize:
        transforms = torch.nn.Sequential(transforms, Normalize())

    augmentations = torch.nn.Sequential()
    if args.freq_mask:
        augmentations = torch.nn.Sequential(
            augmentations,
            torchaudio.transforms.FrequencyMasking(
                freq_mask_param=args.freq_mask),
        )
    if args.time_mask:
        augmentations = torch.nn.Sequential(
            augmentations,
            torchaudio.transforms.TimeMasking(time_mask_param=args.time_mask),
        )

    # Text preprocessing

    char_blank = "*"
    char_space = " "
    char_apostrophe = "'"
    labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase
    language_model = LanguageModel(labels, char_blank, char_space)

    # Dataset

    training, validation = split_process_librispeech(
        [args.dataset_train, args.dataset_valid],
        [transforms, transforms],
        language_model,
        root=args.dataset_root,
        folder_in_archive=args.dataset_folder_in_archive,
    )

    # Decoder

    if args.decoder == "greedy":
        decoder = GreedyDecoder()
    else:
        raise ValueError("Selected decoder not supported")

    # Model

    model = Wav2Letter(
        num_classes=language_model.length,
        input_type=args.type,
        num_features=num_features,
    )

    if args.jit:
        model = torch.jit.script(model)

    if args.distributed:
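        # Give each process its own contiguous slice of the visible GPUs.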
        n = torch.cuda.device_count() // args.world_size
        devices = list(range(rank * n, (rank + 1) * n))
        model = model.to(devices[0])
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=devices)
    else:
        devices = ["cuda" if torch.cuda.is_available() else "cpu"]
        model = model.to(devices[0], non_blocking=True)
        model = torch.nn.DataParallel(model)

    n = count_parameters(model)
    logging.info("Number of parameters: %s", n)

    # Optimizer

    if args.optimizer == "adadelta":
        optimizer = Adadelta(
            model.parameters(),
            lr=args.learning_rate,
            weight_decay=args.weight_decay,
            eps=args.eps,
            rho=args.rho,
        )
    elif args.optimizer == "sgd":
        optimizer = SGD(
            model.parameters(),
            lr=args.learning_rate,
            momentum=args.momentum,
            weight_decay=args.weight_decay,
        )
    elif args.optimizer == "adam":
        optimizer = Adam(
            model.parameters(),
            lr=args.learning_rate,
            weight_decay=args.weight_decay,
        )
    elif args.optimizer == "adamw":
        optimizer = AdamW(
            model.parameters(),
            lr=args.learning_rate,
            weight_decay=args.weight_decay,
        )
    else:
        raise ValueError("Selected optimizer not supported")

    if args.scheduler == "exponential":
        scheduler = ExponentialLR(optimizer, gamma=args.gamma)
    elif args.scheduler == "reduceonplateau":
        scheduler = ReduceLROnPlateau(optimizer, patience=10, threshold=1e-3)
    else:
        raise ValueError("Selected scheduler not supported")

    criterion = torch.nn.CTCLoss(blank=language_model.mapping[char_blank],
                                 zero_infinity=False)

    # Data Loader

    collate_fn_train = collate_factory(model_length_function, augmentations)
    collate_fn_valid = collate_factory(model_length_function)

    loader_training_params = {
        "num_workers": args.workers,
        "pin_memory": True,
        "shuffle": True,
        "drop_last": True,
    }
    loader_validation_params = loader_training_params.copy()
    loader_validation_params["shuffle"] = False

    loader_training = DataLoader(
        training,
        batch_size=args.batch_size,
        collate_fn=collate_fn_train,
        **loader_training_params,
    )
    loader_validation = DataLoader(
        validation,
        batch_size=args.batch_size,
        collate_fn=collate_fn_valid,
        **loader_validation_params,
    )

    # Setup checkpoint

    best_loss = 1.0

    load_checkpoint = args.checkpoint and os.path.isfile(args.checkpoint)

    if args.distributed:
        torch.distributed.barrier()

    if load_checkpoint:
        logging.info("Checkpoint: loading %s", args.checkpoint)
        checkpoint = torch.load(args.checkpoint)

        args.start_epoch = checkpoint["epoch"]
        best_loss = checkpoint["best_loss"]

        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        scheduler.load_state_dict(checkpoint["scheduler"])

        logging.info("Checkpoint: loaded '%s' at epoch %s", args.checkpoint,
                     checkpoint["epoch"])
    else:
        logging.info("Checkpoint: not found")

        save_checkpoint(
            {
                "epoch": args.start_epoch,
                "state_dict": model.state_dict(),
                "best_loss": best_loss,
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler.state_dict(),
            },
            False,
            args.checkpoint,
            not_main_rank,
        )

    if args.distributed:
        torch.distributed.barrier()

    torch.autograd.set_detect_anomaly(False)

    for epoch in range(args.start_epoch, args.epochs):

        logging.info("Epoch: %s", epoch)

        train_one_epoch(
            model,
            criterion,
            optimizer,
            scheduler,
            loader_training,
            decoder,
            language_model,
            devices[0],
            epoch,
            args.clip_grad,
            not_main_rank,
            not args.reduce_lr_valid,
        )

        loss = evaluate(
            model,
            criterion,
            loader_validation,
            decoder,
            language_model,
            devices[0],
            epoch,
            not_main_rank,
        )

        if args.reduce_lr_valid and isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(loss)

        is_best = loss < best_loss
        best_loss = min(loss, best_loss)
        save_checkpoint(
            {
                "epoch": epoch + 1,
                "state_dict": model.state_dict(),
                "best_loss": best_loss,
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler.state_dict(),
            },
            is_best,
            args.checkpoint,
            not_main_rank,
        )

    logging.info("End time: %s", datetime.now())

    if args.distributed:
        torch.distributed.destroy_process_group()
Example #6
def main():
    # my dice shows 777 only. period.
    random.seed(EXPCONF.seed)
    np.random.seed(EXPCONF.seed)
    torch.manual_seed(EXPCONF.seed)
    torch.cuda.manual_seed_all(EXPCONF.seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    tempconf = EXPCONF.copy()
    tempconf.datamode = 'test'

    testloader, ___, _____ = get_loader(tempconf)
    trainloader, __, _trainds = get_loader(EXPCONF, getdev=False)
    devloader, _, _devds = get_loader(EXPCONF, getdev=True)

    assert len(trainloader) > 0, f"trainloader is empty!"
    assert len(devloader) > 0, f"devloader is empty!"

    # this is disgraceful.... but just specify things below
    model_weight, vocab, trained_condition = loadmodel_info(EXPCONF)

    albertconf = retrieve_conf(trained_condition, vocab)
    albert = AlbertForPreTraining(albertconf)
    albert.load_state_dict(model_weight)
    albert = albert.to(device)

    global_step = 0
    L = len(trainloader)
    bsz = len(trainloader[0])

    if not EXPCONF.infer_now:
        albert = albert.albert
        albert.eval()  # freeze

        cls = MLP(EXPCONF, albertconf.hidden_size, 2).to(device)
        cls.train()
        for p in cls.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

        # huggingface example is doing this for language modeling...
        # https://github.com/huggingface/transformers/blob/v2.6.0/examples/run_language_modeling.py
        optimizer = AdamW(cls.parameters(),
                          lr=EXPCONF.cls_lr)  # otherwise, use default
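        # Only the MLP classifier head is optimized; the frozen ALBERT encoder is not passed to the optimizer.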
        getsch = get_cosine_schedule_with_warmup if EXPCONF.cls_sch == 'cosine' else get_linear_schedule_with_warmup
        scheduler = getsch(optimizer, EXPCONF.cls_warmups,
                           EXPCONF.cls_numsteps)

        ## train cls only!
        while global_step < EXPCONF.cls_numsteps:
            lossep_pp = 0
            accep_pp = 0
            cls.train()
            for i, (b, l, datasetids) in enumerate(
                    tqdm(trainloader, desc="iterations progress"), 1):
                outputs = albert(**b, return_dict=True)
                global_step += 1

                logits = cls(outputs.pooler_output)
                losspp = F.cross_entropy(logits, l)

                lossppval = losspp.item()
                acc = accuracy(logits.clone().detach(), l)

                wandb.log({
                    'step':
                    global_step,
                    'cls.train_step/learning_rate':
                    get_lr_from_optim(optimizer),
                    'cls.train_step/pp_loss':
                    lossppval,
                    'cls.train_step/pp_acc':
                    acc,
                })

                losspp.backward()
                optimizer.step()
                scheduler.step()
                cls.zero_grad()

                lossep_pp += lossppval
                accep_pp += acc
                if global_step % EXPCONF.logevery == 0:
                    lossep_pp /= L
                    accep_pp /= L

                    wandb.log({
                        'cls.train_ep/pp_loss': lossep_pp,
                        'cls.train_ep/pp_acc': accep_pp,
                    })
                    devpp_loss, devpp_acc = evaldev(EXPCONF, albert, cls,
                                                    devloader, global_step)
                    if devpp_acc > EXPCONF.savethld:
                        savemodel(EXPCONF,
                                  albert,
                                  cls,
                                  vocab,
                                  global_step,
                                  acc=devpp_acc)
                        write_sub(EXPCONF,
                                  albert,
                                  cls,
                                  global_step,
                                  acc=devpp_acc,
                                  testloader=testloader)

    else:  # infer now
        cls = None
        devpp_loss, devpp_acc = evaldev(EXPCONF,
                                        albert,
                                        cls,
                                        devloader,
                                        global_step,
                                        infernow=EXPCONF.infer_now)
        write_sub(EXPCONF,
                  albert,
                  cls,
                  global_step,
                  acc=devpp_acc,
                  testloader=testloader,
                  infernow=EXPCONF.infer_now)

    return None
Example #7
bertlm_t.load_state_dict(torch.load(f'./bert_pytorch/model/model_saved/bert.ep248.mdl'))

bertlm.bert.embedding = bertlm_t.bert.embedding

bertlm_t.eval()
for param in bertlm_t.parameters():
    param.requires_grad = False

if start_epoch != 0:
    try:
        bertlm.load_state_dict(torch.load(f'./bert_pytorch/model/model_saved_kd/bert_kd.ep{start_epoch-1}.mdl'))
    except:
        raise Exception("No File detected")

optimizer = AdamW(bertlm.bert.encoder.parameters(), lr=1e-4)
criteria = nn.CrossEntropyLoss()
kd_criteria = FSPLoss(t_layer=12, s_layer=6, stride=2)

params = filter(lambda p: p.requires_grad, bertlm.parameters())
num_params = sum([np.prod(p.size()) for p in params])
print("# of params:", num_params)

cuda = True
loss_list = []
batch_size = args.batch_size
epoch = args.epoch
mask_only = args.mask_only == 'True'

print(f"mask_only {mask_only}")
Example #8
def get_optimizer(optimizer_name: str,
                  parameters,
                  learning_rate: float,
                  weight_decay=1e-5,
                  **kwargs):
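    """Build an optimizer by name; extra keyword arguments are passed through. Unknown names raise ValueError."""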
    if optimizer_name.lower() == "sgd":
        return SGD(parameters,
                   lr=learning_rate,
                   momentum=0.9,
                   nesterov=True,
                   weight_decay=weight_decay,
                   **kwargs)

    if optimizer_name.lower() == "adam":
        return Adam(parameters,
                    lr=learning_rate,
                    weight_decay=weight_decay,
                    eps=1e-5,
                    **kwargs)  # As Jeremy suggests

    if optimizer_name.lower() == "rms":
        return RMSprop(parameters,
                       lr=learning_rate,
                       weight_decay=weight_decay,
                       **kwargs)

    if optimizer_name.lower() == "adamw":
        return AdamW(parameters,
                     lr=learning_rate,
                     weight_decay=weight_decay,
                     eps=1e-5,
                     **kwargs)

    if optimizer_name.lower() == "radam":
        return RAdam(parameters,
                     lr=learning_rate,
                     weight_decay=weight_decay,
                     eps=1e-5,
                     **kwargs)  # As Jeremy suggests

    if optimizer_name.lower() == "over9000":
        return Over9000(parameters,
                        lr=learning_rate,
                        weight_decay=weight_decay,
                        eps=1e-5,
                        **kwargs)

    # if optimizer_name.lower() == "ranger":
    #     return Ranger(parameters, learning_rate, weight_decay=weight_decay,
    #                   **kwargs)

    # if optimizer_name.lower() == "qhadamw":
    #     return QHAdamW(parameters, learning_rate, weight_decay=weight_decay,
    #                    **kwargs)
    #
    if optimizer_name.lower() == "lamb":
        return Lamb(parameters,
                    learning_rate,
                    weight_decay=weight_decay,
                    **kwargs)

    if optimizer_name.lower() == "fused_lamb":
        from apex.optimizers import FusedLAMB

        return FusedLAMB(parameters,
                         learning_rate,
                         weight_decay=weight_decay,
                         **kwargs)

    if optimizer_name.lower() == "fused_adam":
        from apex.optimizers import FusedAdam

        return FusedAdam(parameters,
                         learning_rate,
                         eps=1e-5,
                         weight_decay=weight_decay,
                         adam_w_mode=True,
                         **kwargs)

    raise ValueError("Unsupported optimizer name " + optimizer_name)
Example #9
def main():
    global WANDB_STEP
    args = get_args()
    print(args)

    set_seed(args.seed)

    device = th.device("cpu" if args.devid < 0 else f"cuda:{args.devid}")
    args.device = device
    aux_device = th.device(
        "cpu" if args.aux_devid < 0 else f"cuda:{args.aux_devid}")
    args.aux_device = aux_device

    TEXT = torchtext.data.Field(batch_first=True)

    if args.dataset == "ptb":
        Dataset = PennTreebank
    elif args.dataset == "wikitext2":
        Dataset = WikiText2

    train, valid, test = Dataset.splits(
        TEXT,
        newline_eos=True,
    )

    TEXT.build_vocab(train)
    V = TEXT.vocab

    def batch_size_tokens(new, count, sofar):
        return max(len(new.text), sofar)

    def batch_size_sents(new, count, sofar):
        return count

    if args.iterator == "bucket":
        train_iter, valid_iter, test_iter = BucketIterator.splits(
            (train, valid, test),
            batch_sizes=[args.bsz, args.eval_bsz, args.eval_bsz],
            device=device,
            sort_key=lambda x: len(x.text),
            batch_size_fn=batch_size_tokens
            if args.bsz_fn == "tokens" else batch_size_sents,
        )
    elif args.iterator == "bptt":
        train_iter, valid_iter, test_iter = BPTTIterator.splits(
            (train, valid, test),
            batch_sizes=[args.bsz, args.eval_bsz, args.eval_bsz],
            device=device,
            bptt_len=args.bptt,
            sort=False,
        )
    else:
        raise ValueError(f"Invalid iterator {args.iterator}")

    if args.no_shuffle_train:
        train_iter.shuffle = False

    name = get_name(args)
    import tempfile
    wandb.init(project="hmm-lm",
               name=name,
               config=args,
               dir=tempfile.mkdtemp())
    args.name = name

    model = None
    from models.factoredhmmlm import FactoredHmmLm
    model = FactoredHmmLm(V, args)
    model.to(device)
    print(model)
    num_params, num_trainable_params = count_params(model)
    print(f"Num params, trainable: {num_params:,}, {num_trainable_params:,}")
    wandb.run.summary["num_params"] = num_params

    if args.eval_only:
        model.load_state_dict(th.load(args.eval_only)["model"])
        v_start_time = time.time()
        if args.model == "mshmm" or args.model == "factoredhmm":
            if args.num_classes > 2**15:
                eval_fn = mixed_cached_eval_loop
            else:
                eval_fn = cached_eval_loop
        elif args.model == "hmm":
            eval_fn = cached_eval_loop
        else:
            eval_fn = eval_loop
        valid_losses, valid_n = eval_fn(
            args,
            V,
            valid_iter,
            model,
        )
        report(valid_losses, valid_n, f"Valid perf", v_start_time)

        t_start_time = time.time()
        test_losses, test_n = eval_fn(
            args,
            V,
            test_iter,
            model,
        )
        report(test_losses, test_n, f"Test perf", t_start_time)

        sys.exit()

    parameters = list(model.parameters())
    if args.optimizer == "adamw":
        optimizer = AdamW(
            parameters,
            lr=args.lr,
            betas=(args.beta1, args.beta2),
            weight_decay=args.wd,
        )
    elif args.optimizer == "sgd":
        optimizer = SGD(
            parameters,
            lr=args.lr,
        )
    if args.schedule == "reducelronplateau":
        scheduler = ReduceLROnPlateau(
            optimizer,
            factor=1. / args.decay,
            patience=args.patience,
            verbose=True,
            mode="max",
        )
    elif args.schedule == "noam":
        warmup_steps = args.warmup_steps

        def get_lr(step):
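            # Noam-style schedule: linear warmup for warmup_steps steps, then inverse square-root decay.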
            scale = warmup_steps**0.5 * min(step**(-0.5),
                                            step * warmup_steps**(-1.5))
            return args.lr * scale

        scheduler = LambdaLR(
            optimizer,
            get_lr,
            last_epoch=-1,
            verbose=True,
        )
    else:
        raise ValueError("Invalid schedule options")

    for e in range(args.num_epochs):
        start_time = time.time()
        if args.log_counts > 0 and args.keep_counts > 0:
            # reset at START of epoch
            model.state_counts.fill_(0)
        train_losses, train_n = train_loop(
            args,
            V,
            train_iter,
            model,
            parameters,
            optimizer,
            scheduler,
            valid_iter=valid_iter if not args.overfit else None,
            verbose=True,
        )
        total_time = report(train_losses, train_n, f"Train epoch {e}",
                            start_time)

        v_start_time = time.time()
        if args.model == "mshmm" or args.model == "factoredhmm":
            if args.num_classes > 2**15:
                eval_fn = mixed_cached_eval_loop
            else:
                eval_fn = cached_eval_loop
        elif args.model == "hmm":
            eval_fn = cached_eval_loop
        else:
            eval_fn = eval_loop
        valid_losses, valid_n = eval_fn(args, V, valid_iter, model)
        report(valid_losses, valid_n, f"Valid epoch {e}", v_start_time)

        if args.schedule in valid_schedules:
            scheduler.step(valid_losses.evidence
                           if not args.overfit else train_losses.evidence)

        update_best_valid(valid_losses, valid_n, model, optimizer, scheduler,
                          args.name)

        wandb.log(
            {
                "train_loss": train_losses.evidence / train_n,
                "train_ppl": math.exp(-train_losses.evidence / train_n),
                "epoch_time": total_time,
                "valid_loss": valid_losses.evidence / valid_n,
                "valid_ppl": math.exp(-valid_losses.evidence / valid_n),
                "best_valid_loss": BEST_VALID / valid_n,
                "best_valid_ppl": math.exp(-BEST_VALID / valid_n),
                "epoch": e,
            },
            step=WANDB_STEP)

        if args.log_counts > 0 and args.keep_counts > 0:
            counts = (model.counts / model.counts.sum(0, keepdim=True))[:, 4:]
            c, v = counts.shape
            cg2 = counts > 1e-2

            # state counts
            # log these once per epoch, then set back to zero
            sc0 = (model.state_counts == 0).sum()
            sc1 = (model.state_counts == 1).sum()
            sc2 = (model.state_counts == 2).sum()
            sc3 = (model.state_counts == 3).sum()
            sc4 = (model.state_counts == 4).sum()
            sc5 = (model.state_counts >= 5).sum()

            wandb.log(
                {
                    "avgcounts@1e-2": cg2.sum().item() / float(v),
                    "maxcounts@1e-2": cg2.sum(0).max().item(),
                    "mincounts@1e-2": cg2.sum(0).min().item(),
                    "maxcounts": counts.sum(0).max().item(),
                    "mincounts": counts.sum(0).min().item(),
                    "statecounts=0": sc0,
                    "statecounts=1": sc1,
                    "statecounts=2": sc2,
                    "statecounts=3": sc3,
                    "statecounts=4": sc4,
                    "statecounts>=5": sc5,
                },
                step=WANDB_STEP)
            del cg2
            del counts

    # won't use best model. Rerun with eval_only
    t_start_time = time.time()
    test_losses, test_n = eval_fn(
        args,
        V,
        test_iter,
        model,
    )
    report(test_losses, test_n, f"Test perf", t_start_time)
Example #10
                          num_workers=num_workers)
valid_loader = DataLoader(valid_dataset,
                          batch_size=bs,
                          shuffle=False,
                          num_workers=num_workers)

loaders = {"train": train_loader, "valid": valid_loader}

num_epochs = get_dict_value_or_default(config, 'epochs', 100)
logdir = "/var/data/bengali" + str(args.fold) + '_config_' + str(
    args.config) + '_comment_' + args.comment

lr = get_dict_value_or_default(dict_=config, key='lr', default_value=args.lr)

if config['opt'] == 'adamw':
    optimizer = AdamW(params=model.parameters(), lr=lr)
elif config['opt'] == 'adam':
    optimizer = Adam(params=model.parameters(), lr=lr)
elif config['opt'] == 'sgd':
    optimizer = SGD(params=model.parameters(),
                    lr=lr,
                    momentum=0.9,
                    nesterov=True)
elif config['opt'] == 'rmsprop':
    optimizer = torch.optim.RMSprop(params=model.parameters(), lr=lr)
elif config['opt'] == 'radam':
    optimizer = RAdam(params=model.parameters(), lr=lr)
else:
    raise Exception(config['opt'] + ' is not supported')

scheduler = make_scheduler_from_config(optimizer=optimizer, config=config)
Example #11
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('--mode', choices=['train', 'validate', 'predict'], default='train')
    arg('--run_root', default='.')
    arg('--batch-size', type=int, default=16)
    arg('--step', type=int, default=1)
    arg('--workers', type=int, default=0)
    arg('--lr', type=float, default=0.00003)
    arg('--adam_epsilon', type=float, default=1e-8)
    arg('--weight_decay', type=float, default=0.0)
    arg('--fold', type=int, default=0)
    arg('--warmup', type=float, default=0.05)
    arg('--limit', type=int)
    arg('--patience', type=int, default=1)
    arg('--clean', action='store_true')
    arg('--n-epochs', type=int, default=20)
    arg('--vocab-size', type=int, default=13318)
    arg('--multi-gpu', type=int, default=0)
    arg('--print-num', type=int, default=5)
    arg('--temperature', type=float)

    args = parser.parse_args()

    df = pd.read_table('../data/dialog-rewrite/corpus.txt',
                       sep="\t\t",
                       names=['a', 'b', 'current', 'label'],
                       dtype=str)
    df.dropna(how='any', inplace=True)
    train_length = int(len(df) * 0.9)

    train_df = df.iloc[:train_length].iloc[:, :]
    valid_df = df.iloc[train_length:]
    print(valid_df.head())
    if args.mode == 'predict':
        # valid_df['current'] = valid_df['label']
        valid_df = pd.read_table('../data/dialog-rewrite/test.csv',
                                 sep=",",
                                 names=['a', 'b', 'current', 'label'],
                                 dtype=str)
        print(valid_df.tail())
    valid_df['eval_label'] = valid_df['label'].apply(
        lambda x: ' '.join(list(x)))

    if args.limit:
        train_df = train_df.iloc[0:args.limit]
        valid_df = valid_df.iloc[0:args.limit]
    # train_df['len'] = train_df['content'].apply(lambda x: len(x))

    run_root = Path('../experiments/' + args.run_root)
    tokenizer = BertTokenizer.from_pretrained("../rbt3")
    valid_set = TaggerRewriterDataset(valid_df, tokenizer, valid=True)
    valid_index = np.array(valid_set.valid_index)
    # np.save('index.npy', valid_index)
    valid_df = valid_df.reset_index().loc[valid_index, :]
    ner_index = np.array(valid_set.label_type) == 1
    valid_loader = DataLoader(valid_set,
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.workers,
                              collate_fn=tagger_collate_fn)

    config = BertConfig.from_json_file('../rbt3/config.json')
    config.num_labels = 5
    # # config.is_decoder = True
    # decoder = BertModel.from_pretrained("../rbt3", config=config)
    # encoder = BertModel.from_pretrained("../rbt3")
    # args.vocab_size = config.vocab_size
    bert_path = '../rbt3'
    model = TaggerRewriteModel(config, bert_path)
    model.cuda()

    if args.mode == 'train':
        if run_root.exists() and args.clean:
            shutil.rmtree(run_root)
        run_root.mkdir(exist_ok=True, parents=True)
        (run_root / 'params.json').write_text(
            json.dumps(vars(args), indent=4, sort_keys=True))

        train_set = TaggerRewriterDataset(train_df, tokenizer)

        # np.save('index.npy', train_set.valid_index)

        train_loader = DataLoader(train_set,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.workers,
                                  collate_fn=tagger_collate_fn)

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                args.weight_decay,
            },
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0
            },
        ]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.lr,
                          eps=args.adam_epsilon)
        t_total = int(len(train_df) * args.n_epochs / args.batch_size)
        warmup_steps = int(t_total * args.warmup)
        # scheduler = get_linear_schedule_with_warmup(
        # optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
        # )
        scheduler = get_constant_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps)
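        # Apex O2 mixed precision: FP16 model and activations with FP32 master weights held by the optimizer.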
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level='O2',
                                          verbosity=0)

        train(args,
              model,
              optimizer,
              scheduler,
              tokenizer,
              ner_index,
              train_loader=train_loader,
              valid_df=valid_df,
              valid_loader=valid_loader,
              epoch_length=len(train_df))

    elif args.mode == 'validate':
        model_path = run_root / ('tagger_model-%d.pt' % args.fold)
        load_model(model, model_path)
        valid_metrics = validate(model,
                                 valid_loader,
                                 valid_df,
                                 args,
                                 tokenizer,
                                 ner_index,
                                 decode_mode='beam_search')

    elif args.mode == 'predict':
        model_path = run_root / ('tagger_model-%d.pt' % args.fold)
        load_model(model, model_path)
        valid_metrics = validate(model,
                                 valid_loader,
                                 valid_df,
                                 args,
                                 tokenizer,
                                 decode_mode='beam_search')
Example #12
            top1_acc = validate(val_loader, model)
            print("Epoch: {}    Training Loss: {}    Validation Top1 Acc: {}".
                  format(epoch, loss, top1_acc))


# define model
model = ViViT(
    image_size=224,  #
    patch_size=16,  #
    tubelet_temporal_size=2,  #
    num_classes=51,  #
    num_frames=32,  #
    dim=768,  #
    layer_spacial=12,  #
    layer_temporal=4,  #
    heads=12,  #
    dim_head=64,
    dropout=0.,
    emb_dropout=0,
    mlp_dim=3072,
    pretrain=True)
parameters = filter(lambda p: p.requires_grad, model.parameters())
parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
print('Trainable Parameters: %.3fM' % parameters)

#define optimizer, criterion
criterion = nn.CrossEntropyLoss().to(device)
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=1e-3)

train(train_loader, model, criterion, optimizer, 200)
Example #13
    else:
        F1 = torch.jit.load(pt2_model.ILLUSTRATION2VEC)
        F2 = torch.jit.load(pt2_model.VGG16)

        S = pt2_model.Embedding(args.latent_dim)
        G = pt2_model.Generator(args.latent_dim, args.capacity)
        D = pt2_model.Discriminator(args.capacity)

        GP = pt2_model.GradientPenalty(D, λ2)
        MSE = nn.MSELoss()

    to_cuda(F1, F2, S, G, D, GP, MSE)
    to_eval(F1, F2)

    GS_parameters = list(G.parameters()) + list(S.parameters())
    optim_GS = AdamW(GS_parameters, lr=α, betas=β)
    optim_D = AdamW(D.parameters(), lr=α, betas=β)

    # ===============
    # VALIDATION DATA
    # ===============
    _, v_composition, v_hints, v_style, v_illustration = dataset[7]
    c, h, w = v_composition.size()

    v_composition = v_composition.unsqueeze(0).cuda()
    v_hints = v_hints.unsqueeze(0).cuda()
    v_style = v_style.unsqueeze(0).cuda()
    v_illustration = v_illustration.unsqueeze(0).cuda()
    v_noise = torch.rand((1, 1, h, w)).cuda()

    with torch.no_grad():
Example #14
def get_optimizer(args, model):
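    # Warmup steps are a fixed proportion of the total number of training steps.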
    args.warmup_steps = math.ceil(args.warmup_prop * args.max_train_steps)
    if args.optimizer == 'adamw-bert':
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        optimizer = huggingfaceOptim.AdamW(optimizer_grouped_parameters,
                                           lr=args.learning_rate,
                                           eps=args.adam_epsilon,
                                           betas=(args.beta1, args.beta2))
        scheduler = huggingfaceOptim.WarmupLinearSchedule(
            optimizer,
            warmup_steps=args.warmup_steps,
            t_total=args.max_train_steps)
        debug_print('\n - Use Huggingface\'s AdamW Optimizer')
    elif args.optimizer == 'adamw-torch':
        try:
            from torch.optim import AdamW
        except ImportError as e:
            debug_print(f'torch version: {torch.__version__}')
            raise e
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay) and p.requires_grad
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay) and p.requires_grad
            ],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon,
                          betas=(args.beta1, args.beta2))
        scheduler = huggingfaceOptim.WarmupLinearSchedule(
            optimizer,
            warmup_steps=args.warmup_steps,
            t_total=args.max_train_steps)
    elif args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate)
        scheduler = None
    elif args.optimizer == 'adagrad':
        optimizer = torch.optim.Adagrad(model.parameters(),
                                        lr=args.learning_rate)
        scheduler = None
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.learning_rate,
                                     betas=args.betas,
                                     eps=args.eps,
                                     weight_decay=args.weight_decay)
        scheduler = None
    elif args.optimizer == 'adamax':
        optimizer = torch.optim.Adamax(model.parameters())  # use default lr
        scheduler = None
    else:
        raise Exception("Unsupported optimizer: {}".format(args.optimizer))
    return optimizer, scheduler
Example #15
train_input_ids, train_attention_mask = tokenizer_data(train_sentences)
test_input_ids, test_attention_mask = tokenizer_data(test_sentences)

print("inputs id is ready")

train_dataset = TensorDataset(torch.tensor(train_input_ids),
                              torch.tensor(train_attention_mask),
                              torch.tensor(train_labels))
train_loader = DataLoader(train_dataset, batch_size=64)

test_dataset = TensorDataset(torch.tensor(test_input_ids),
                             torch.tensor(test_attention_mask))
test_loader = DataLoader(test_dataset, batch_size=64)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
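# AdamW over all model parameters with a small learning rate, as is typical for BERT fine-tuning.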

epochs = 5

print("training")
model.to(device)
for epoch in range(0, epochs):
    model.train()
    for step, batch in enumerate(train_loader):
        train_input_ids = batch[0].to(device)
        train_attention_mask = batch[1].to(device)
        train_labels = batch[2].to(device)
        model.zero_grad()
        output = model(input_ids=train_input_ids,
                       attention_mask=train_attention_mask,
                       labels=train_labels)
Example #16
File: train.py Project: cakn15/bonito
def main(args):

    workdir = os.path.expanduser(args.training_directory)

    if os.path.exists(workdir) and not args.force:
        print("[error] %s exists." % workdir)
        exit(1)

    init(args.seed, args.device)
    device = torch.device(args.device)

    print("[loading data]")
    chunks, chunk_lengths, targets, target_lengths = load_data(
        limit=args.chunks, shuffle=True, directory=args.directory)

    split = np.floor(chunks.shape[0] * args.validation_split).astype(np.int32)
    train_dataset = ChunkDataSet(chunks[:split], chunk_lengths[:split],
                                 targets[:split], target_lengths[:split])
    test_dataset = ChunkDataSet(chunks[split:], chunk_lengths[split:],
                                targets[split:], target_lengths[split:])
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch,
                              shuffle=True,
                              num_workers=4,
                              pin_memory=True)
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch,
                             num_workers=4,
                             pin_memory=True)

    config = toml.load(args.config)
    argsdict = dict(training=vars(args))

    print("[loading model]")
    model = Model(config)

    weights = os.path.join(workdir, 'weights.tar')
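    # Resume from previously saved weights if a checkpoint file already exists in the working directory.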
    if os.path.exists(weights): model.load_state_dict(torch.load(weights))

    model.to(device)
    model.train()

    os.makedirs(workdir, exist_ok=True)
    toml.dump({
        **config,
        **argsdict
    }, open(os.path.join(workdir, 'config.toml'), 'w'))

    optimizer = AdamW(model.parameters(), amsgrad=True, lr=args.lr)

    if args.amp:
        try:
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level="O1",
                                              verbosity=0)
        except NameError:
            print(
                "[error]: Cannot use AMP: Apex package needs to be installed manually, See https://github.com/NVIDIA/apex"
            )
            exit(1)

    schedular = CosineAnnealingLR(optimizer, args.epochs * len(train_loader))

    for epoch in range(1, args.epochs + 1):

        try:
            train_loss, duration = train(model,
                                         device,
                                         train_loader,
                                         optimizer,
                                         use_amp=args.amp)
            val_loss, val_mean, val_median = test(model, device, test_loader)
        except KeyboardInterrupt:
            break

        print(
            "[epoch {}] directory={} loss={:.4f} mean_acc={:.3f}% median_acc={:.3f}%"
            .format(epoch, workdir, val_loss, val_mean, val_median))

        torch.save(model.state_dict(),
                   os.path.join(workdir, "weights_%s.tar" % epoch))
        with open(os.path.join(workdir, 'training.csv'), 'a',
                  newline='') as csvfile:
            csvw = csv.writer(csvfile, delimiter=',')
            if epoch == 1:
                csvw.writerow([
                    'time', 'duration', 'epoch', 'train_loss',
                    'validation_loss', 'validation_mean', 'validation_median'
                ])
            csvw.writerow([
                datetime.today(),
                int(duration),
                epoch,
                train_loss,
                val_loss,
                val_mean,
                val_median,
            ])

        schedular.step()
Example #17
def main():
    # Use the GPU if it is available, otherwise use the CPU
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("Use " + str(device))

    # Create the output directory
    if not os.path.exists(config.output_path):
        os.mkdir(config.output_path)

    # Create the dataset
    file_list = None
    for path, dirs, files in os.walk(config.img_path, topdown=False):
        file_list = list(files)

    train_dataset = image_dataset(file_list, config.img_path, transform=get_transforms(config.img_size))
    train_loader = DataLoader(dataset=train_dataset, batch_size=config.batchSize, shuffle=True)

    # Get the discriminator D and generator G network models
    G_model = get_G_model(config.from_old_model, device, config.G_model_path)
    D_model = get_D_model(config.from_old_model, device, config.D_model_path)

    # Define the optimizers for G and D; AdamW is used here
    G_optimizer = AdamW(G_model.parameters(), lr=3e-4, weight_decay=1e-6)
    D_optimizer = AdamW(D_model.parameters(), lr=3e-4, weight_decay=1e-6)

    # Loss function
    criterion = config.criterion

    # Mixed-precision acceleration
    if config.use_apex:
        G_model, G_optimizer = amp.initialize(G_model, G_optimizer, opt_level="O1")
        D_model, D_optimizer = amp.initialize(D_model, D_optimizer, opt_level="O1")

    # Record the training start time
    train_start = time.time()

    # Start training, epoch by epoch
    for epoch in range(config.epochs):
        print("start epoch "+str(epoch+1)+":")
        # Variables for tracking progress and losses
        batch_num = len(train_loader)
        D_loss_sum = 0
        G_loss_sum = 0
        count = 0

        # Pull data from the dataloader
        for index, images in enumerate(train_loader):
            count += 1
            # Move the images onto the compute device
            images = images.to(device)

            # Real labels with label smoothing: random values between 0.9 and 1
            # real_labels = (1 - torch.rand(config.batchSize, 1)/10).to(device)
            # Real labels, all ones
            # real_labels = Variable(torch.ones(config.batchSize, 1)).to(device)
            # Real labels, all 0.9
            real_labels = (Variable(torch.ones(config.batchSize, 1))-0.1).to(device)

            # Fake labels: one-sided smoothing, so the generator labels are not smoothed; all zeros
            fake_labels = Variable(torch.zeros(config.batchSize, 1)).to(device)

            # Feed random seed data into the generator to produce fake images
            img_seeds = torch.randn(config.batchSize, config.img_seed_dim).to(device)
            fake_images = G_model(img_seeds)

            # Track whether the real and fake labels have been swapped
            exchange_labels = False

            # With some probability, swap the labels while training the discriminator
            if random.uniform(0, 1) < config.D_train_label_exchange:
                real_labels, fake_labels = fake_labels, real_labels
                exchange_labels = True

            # Train the discriminator D
            D_optimizer.zero_grad()
            # Feed real samples into the discriminator
            real_output = D_model(images)

            # For the last batch, which may be smaller than the batch size, trim the real labels to match the output length
            if len(real_labels) > len(real_output):
                D_loss_real = criterion(real_output, real_labels[:len(real_output)])
            else:
                D_loss_real = criterion(real_output, real_labels)
            # Feed fake samples into the discriminator
            fake_output = D_model(fake_images)
            D_loss_fake = criterion(fake_output, fake_labels)
            # Sum the real and fake losses to get the discriminator loss
            D_loss = D_loss_real + D_loss_fake
            D_loss_sum += D_loss.item()

            # Reset the optimizer
            D_optimizer.zero_grad()
            # Update the discriminator D with the loss
            if config.use_apex:
                with amp.scale_loss(D_loss, D_optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                D_loss.backward()
            D_optimizer.step()

            # If the labels were swapped earlier, swap them back now
            if exchange_labels:
                real_labels, fake_labels = fake_labels, real_labels

            # train the generator G
            # feed random seeds into G to produce fake images
            img_seeds = torch.randn(config.batchSize, config.img_seed_dim).to(device)
            fake_images = G_model(img_seeds)
            # feed the fake images to the discriminator
            fake_output = D_model(fake_images)
            # compare the discriminator's output on fakes against the real labels to get the loss
            G_loss = criterion(fake_output, real_labels)
            G_loss_sum += G_loss.item()

            # reset the optimizer's gradients
            G_optimizer.zero_grad()
            # update the generator G with the loss
            if config.use_apex:
                with amp.scale_loss(G_loss, G_optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                G_loss.backward()
            G_optimizer.step()

            # print training progress
            if (index + 1) % 200 == 0:
                print("Epoch: %2d, Batch: %4d / %4d" % (epoch + 1, index + 1, batch_num))

        if (epoch+1) % 10 == 0:
            # every N epochs, save the model weights to disk
            torch.save(G_model.state_dict(), config.G_model_path)
            torch.save(D_model.state_dict(), config.D_model_path)
            # every N epochs, write a batch of generated images to the output directory
            img_seeds = torch.randn(config.batchSize, config.img_seed_dim).to(device)
            fake_images = G_model(img_seeds).cuda().data
            # rescale the fake images into the [0, 1] range
            fake_images = 0.5 * (fake_images + 1)
            fake_images = fake_images.clamp(0, 1)
            # stack the generated images and write them to disk with save_image()
            fake_images = fake_images.view(-1, 3, config.img_size, config.img_size)
            save_image(fake_images, config.output_path+str(epoch+1)+'.png')

        # print the epoch's losses, elapsed time, etc. for reference
        print("D_loss:", round(D_loss_sum / count, 3))
        print("G_loss:", round(G_loss_sum / count, 3))
        current_time = time.time()
        pass_time = int(current_time - train_start)
        time_string = str(pass_time // 3600) + " hours, " + str((pass_time % 3600) // 60) + " minutes, " + str(
            pass_time % 60) + " seconds."
        print("Time pass:"******"Done.")
예제 #18
0
    # model = BertForQuestionAnswering.from_pretrained('bert-base-chinese')
    model = BertForQuestionAnswering.from_pretrained(
        'hfl/chinese-roberta-wwm-ext-large')
    # model = BertForQuestionAnswering.from_pretrained('roberta_base_lm_finetune')
    # model = BertForQuestionAnswering.from_pretrained('roberta_large_lm_finetune')
    ranker = BertForSequenceClassification.from_pretrained(
        'hfl/chinese-roberta-wwm-ext-large')

    # pdb.set_trace()

    model.to(device)
    ranker.to(device)
    optimizer = AdamW(model.parameters(),
                      lr=lr,
                      betas=(0.9, 0.999),
                      eps=1e-8,
                      weight_decay=1e-2,
                      amsgrad=False)
    optimizer_rank = AdamW(ranker.parameters(),
                           lr=lr,
                           betas=(0.9, 0.999),
                           eps=1e-8,
                           weight_decay=1e-2,
                           amsgrad=False)

    optimizer.zero_grad()
    optimizer_rank.zero_grad()

    # em_current, f1_current = validate(model)

    em_dev_best = 0
예제 #19
0
def main():
    parser = ArgumentParser()
    parser.add_argument("--epoch", type=int, required=True)
    parser.add_argument("--seed", type=int, required=True)
    parser.add_argument("--emb_file", type=str, required=True)
    parser.add_argument("--checkpoint", type=str, required=True)
    parser.add_argument("--save_dir", type=str, required=True)
    parser.add_argument("--train_file", type=str, required=True)
    parser.add_argument("--log_file", type=str, required=False)
    parser.add_argument("--ratio", type=str, required=True)
    parser.add_argument("--vocab_size", type=int, required=True)
    parser.add_argument("--emb_size", type=int, required=True)
    parser.add_argument("--learning_rate", type=float, required=True)
    parser.add_argument("--batch_size", type=int, required=True)
    parser.add_argument("--max_length", type=int, required=True)
    parser.add_argument("--max_grad_norm", type=int, required=True)

    args = parser.parse_args()

    split_ratio = [float(val) for val in args.ratio.split(",")]

    has_cuda = torch.cuda.is_available()

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
    DATE_FORMAT = "%m/%d/%Y %H:%M:%S %p"
    logging.basicConfig(filename=args.log_file,
                        level=logging.INFO,
                        format=LOG_FORMAT,
                        datefmt=DATE_FORMAT)

    logging.info("start preparing data")
    data_preprocessor = DataPreprocess()
    emb, word_idx_map = data_preprocessor.build_emb_vocab(args.emb_file)
    data_preprocessor.load(args.train_file, use_mask=False, is_test=False)
    train_dataset, dev_dataset = data_preprocessor.generate_train_dev_dataset(
        ratio=split_ratio)
    train_dataset, dev_dataset = CompDataSet(
        train_dataset,
        word_idx_map,
        max_len=args.max_length,
        emb_size=args.emb_size), CompDataSet(dev_dataset,
                                             word_idx_map,
                                             max_len=args.max_length,
                                             emb_size=args.emb_size)

    train_dataset = DataLoader(train_dataset,
                               batch_size=args.batch_size,
                               shuffle=True)
    dev_dataset = DataLoader(dev_dataset,
                             batch_size=args.batch_size,
                             shuffle=True)

    logging.info("init model")
    start_epoch = 0
    if args.checkpoint:
        model = torch.load(args.checkpoint)
        start_epoch = re.findall(r"\d+(?=_\d+\.pt)", args.checkpoint)
        start_epoch = int(start_epoch[0]) + 1
    else:
        model = ESIM(args.vocab_size,
                     args.emb_size,
                     emb,
                     max_len=args.max_length)

    optimizer = AdamW(model.parameters(), lr=args.learning_rate)
    criterion = FocalLoss()

    if has_cuda:
        model = model.cuda()

    logging.info("start training")
    neg_auc, pos_auc = validate(model, dev_dataset)
    logging.info(f"pre-train neg_auc {str(neg_auc)} pos_auc {str(pos_auc)}")

    for epoch in range(start_epoch, args.epoch):
        running_loss = 0.0
        for step, data in enumerate(train_dataset):
            model.train()
            start_time = time.time()
            optimizer.zero_grad()

            outputs = model(data["premise"], data["premise_mask"],
                            data["hypothese"], data["hypothese_mask"])
            loss = criterion(outputs["probs"], data["label"])
            loss.backward()

            clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()

            end_time = time.time()
            running_loss += loss.item()
            if step % 100 == 99:
                logging.info(
                    f"epoch: {epoch}, step: {step}, time: {end_time - start_time} loss: {running_loss / 100}"
                )
                running_loss = 0
            if step % 500 == 499:
                neg_auc, pos_auc = validate(model, dev_dataset)
                logging.info(
                    f"epoch {epoch} step {step} neg_auc {str(neg_auc)} pos_auc {str(pos_auc)}")
                torch.save(model, Path(args.save_dir) / f"{epoch}_{step}.pt")
예제 #20
0
    def __init__(self,
                 image_encoder,
                 text_encoder,
                 image_mha,
                 bert_model,
                 optimizer='adam',
                 lr=1e-3,
                 l2_regularization=1e-2,
                 margin_loss=1e-2,
                 max_violation=True,
                 cost_style='mean',
                 use_lr_scheduler=False,
                 grad_clip=0,
                 num_training_steps=30000,
                 device='cuda'):
        self.image_mha = image_mha
        self.image_encoder = image_encoder
        self.text_encoder = text_encoder
        self.bert_model = bert_model
        self.device = device

        self.use_lr_scheduler = use_lr_scheduler
        self.params = []
        self.params = list(self.image_mha.parameters())
        self.params += list(self.text_encoder.parameters())
        self.params += list(self.image_encoder.parameters())
        self.params += list(self.bert_model.parameters())
        self.grad_clip = grad_clip
        self.frozen = False
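        # note: in both optimizer branches below, the pretrained BERT backbone gets a smaller learning rate (3e-5) than the image/text encoders and the image MHA (1e-4)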
        if optimizer == 'adamW':
            self.optimizer = AdamW([
                {'params': list(self.bert_model.parameters()), 'lr': 3e-5},
                {'params': list(self.image_encoder.parameters()) +
                           list(self.text_encoder.parameters()) +
                           list(self.image_mha.parameters()),
                 'lr': 1e-4},
            ])
        elif optimizer == 'adam':
            self.optimizer = torch.optim.Adam([
                {'params': list(self.bert_model.parameters()), 'lr': 3e-5},
                {'params': list(self.image_encoder.parameters()) +
                           list(self.text_encoder.parameters()) +
                           list(self.image_mha.parameters()),
                 'lr': 1e-4},
            ])

            # self.optimizer = torch.optim.Adam([{'params':list(self.bert_model.parameters()),'lr':3e-5},
            #                     {'params':list(self.text_encoder.parameters()) + list(self.image_mha.parameters()),'lr':1e-4}])

        if self.use_lr_scheduler:
            self.lr_scheduler = get_linear_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=100,
                num_training_steps=num_training_steps)
        self.lr_scheduler_0 = get_constant_schedule(self.optimizer)
        # loss
        self.mrl_loss = MarginRankingLoss(margin=margin_loss,
                                          max_violation=max_violation,
                                          cost_style=cost_style,
                                          direction='bidir')
예제 #21
0
    def __init__(self, config, pretrained=True):

        self.config = config
        self.model, self.vocab = build_model(config)

        self.device = config['device']
        self.num_iters = config['trainer']['iters']
        self.beamsearch = config['predictor']['beamsearch']

        self.data_root = config['dataset']['data_root']
        self.train_annotation = config['dataset']['train_annotation']
        self.valid_annotation = config['dataset']['valid_annotation']
        self.dataset_name = config['dataset']['name']

        self.batch_size = config['trainer']['batch_size']
        self.print_every = config['trainer']['print_every']
        self.valid_every = config['trainer']['valid_every']

        self.checkpoint = config['trainer']['checkpoint']
        self.export_weights = config['trainer']['export']
        self.metrics = config['trainer']['metrics']
        logger = config['trainer']['log']

        if logger:
            self.logger = Logger(logger)

        if pretrained:
            weight_file = download_weights(**config['pretrain'],
                                           quiet=config['quiet'])
            self.load_weights(weight_file)

        self.iter = 0

        self.optimizer = AdamW(self.model.parameters(),
                               betas=(0.9, 0.98),
                               eps=1e-09)
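        # betas=(0.9, 0.98) and eps=1e-09 follow the original Transformer recipe; the effective learning rate is set per step by the OneCycleLR scheduler below (presumably via max_lr in config['optimizer'])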
        self.scheduler = OneCycleLR(self.optimizer,
                                    total_steps=self.num_iters,
                                    **config['optimizer'])
        #        self.optimizer = ScheduledOptim(
        #            Adam(self.model.parameters(), betas=(0.9, 0.98), eps=1e-09),
        #            #config['transformer']['d_model'],
        #            512,
        #            **config['optimizer'])

        self.criterion = LabelSmoothingLoss(len(self.vocab),
                                            padding_idx=self.vocab.pad,
                                            smoothing=0.1)

        transforms = ImgAugTransform()

        self.train_gen = self.data_gen('train_{}'.format(self.dataset_name),
                                       self.data_root,
                                       self.train_annotation,
                                       transform=transforms)
        if self.valid_annotation:
            self.valid_gen = self.data_gen(
                'valid_{}'.format(self.dataset_name), self.data_root,
                self.valid_annotation)

        self.train_losses = []
예제 #22
0
        nn.Linear(768, 1536), nn.Tanh(), nn.Linear(1536, 1536), nn.Tanh(),
        nn.Linear(1536, len(label_encoder.classes_)))
    model.classifier = classifier_head

    print(f"\t- Tokenizing data.")
    train_ds = tokenize_inputs(train_ds, args.text_col, tokenizer)
    test_ds = tokenize_inputs(test_ds, args.text_col, tokenizer)
    print(f"\t- Preparing inputs for training and evaluation.")
    train_ds = prepare_inputs(train_ds, args.text_col, args.label_col)
    test_ds = prepare_inputs(test_ds, args.text_col, args.label_col)

    warmup_steps = math.ceil((len(train_ds) / args.bs) * args.epochs *
                             0.1)  # 10% of the total training steps for warm-up
    train_steps = int(args.epochs * len(train_ds) / args.bs)
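    # illustrative numbers: with len(train_ds)=10_000, bs=32 and epochs=3, train_steps = int(937.5) = 937 and warmup_steps = ceil(93.75) = 94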

    optimizer = AdamW(model.parameters(), lr=args.lr)
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=train_steps,
                                                num_cycles=0.5)

    training_args = TrainingArguments(
        output_dir=args.checkpoint_dir,  # output directory
        num_train_epochs=args.epochs,  # total number of training epochs
        per_device_train_batch_size=args.bs,  # batch size per device during training
        per_device_eval_batch_size=args.bs,  # batch size for evaluation
        # warmup_steps=warmup_steps,                                                                            # number of warmup steps for learning rate scheduler
        weight_decay=args.wd,  # strength of weight decay
        evaluation_strategy="epoch",  # evaluation interval
        logging_dir=args.checkpoint_dir,  # directory for storing logs
예제 #23
0
def main() -> None:
    global best_loss

    args = parser.parse_args()

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    start_epoch = 0

    vcf_reader = VCFReader(args.train_data, args.classification_map,
                           args.chromosome, args.class_hierarchy)
    vcf_writer = vcf_reader.get_vcf_writer()
    train_dataset, validation_dataset = vcf_reader.get_datasets(
        args.validation_split)
    train_sampler = BatchByLabelRandomSampler(args.batch_size,
                                              train_dataset.labels)
    train_loader = DataLoader(train_dataset, batch_sampler=train_sampler)

    if args.validation_split != 0:
        validation_sampler = BatchByLabelRandomSampler(
            args.batch_size, validation_dataset.labels)
        validation_loader = DataLoader(validation_dataset,
                                       batch_sampler=validation_sampler)

    kwargs = {
        'total_size': vcf_reader.positions.shape[0],
        'window_size': args.window_size,
        'num_layers': args.layers,
        'num_classes': len(vcf_reader.label_encoder.classes_),
        'num_super_classes': len(vcf_reader.super_label_encoder.classes_)
    }
    model = WindowedMLP(**kwargs)
    model.to(get_device(args))

    optimizer = AdamW(model.parameters(), lr=args.learning_rate)

    #######
    if args.resume_path is not None:
        if os.path.isfile(args.resume_path):
            print("=> loading checkpoint '{}'".format(args.resume_path))
            checkpoint = torch.load(args.resume_path)
            if kwargs != checkpoint['model_kwargs']:
                raise ValueError(
                    'The checkpoint\'s kwargs don\'t match the ones used to initialize the model'
                )
            if vcf_reader.snps.shape[0] != checkpoint['vcf_writer'].snps.shape[
                    0]:
                raise ValueError(
                    'The data on which the checkpoint was trained had a different number of snp positions'
                )
            start_epoch = checkpoint['epoch']
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume_path, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    #############

    if args.validate:
        validate(validation_loader, model,
                 nn.functional.binary_cross_entropy_with_logits,
                 len(vcf_reader.label_encoder.classes_),
                 len(vcf_reader.super_label_encoder.classes_), vcf_reader.maf,
                 args)
        return

    for epoch in range(start_epoch, args.epochs + start_epoch):
        loss = train(train_loader, model,
                     nn.functional.binary_cross_entropy_with_logits, optimizer,
                     len(vcf_reader.label_encoder.classes_),
                     len(vcf_reader.super_label_encoder.classes_),
                     vcf_reader.maf, epoch, args)

        if epoch % args.save_freq == 0 or epoch == args.epochs + start_epoch - 1:
            if args.validation_split != 0:
                validation_loss = validate(
                    validation_loader, model,
                    nn.functional.binary_cross_entropy_with_logits,
                    len(vcf_reader.label_encoder.classes_),
                    len(vcf_reader.super_label_encoder.classes_),
                    vcf_reader.maf, args)
                is_best = validation_loss < best_loss
                best_loss = min(validation_loss, best_loss)
            else:
                is_best = loss < best_loss
                best_loss = min(loss, best_loss)

            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'model_kwargs': kwargs,
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict(),
                    'vcf_writer': vcf_writer,
                    'label_encoder': vcf_reader.label_encoder,
                    'super_label_encoder': vcf_reader.super_label_encoder,
                    'maf': vcf_reader.maf
                }, is_best, args.chromosome, args.model_name, args.model_dir)
예제 #24
0
                             pin_memory=True)

model = InceptionI3d(157, in_channels=3, output_method='avg_pool')
model.load_state_dict(
    torch.load('../../data/external_models/i3d_rgb_charades.pt'))
model.replace_logits(2)
#model = nn.DataParallel(model)

model = model.to(DEVICE)

criterion_train = MixupBCELoss()
criterion_test = torch.nn.CrossEntropyLoss()

lr = 1e-3

optimizer = AdamW((p for p in model.parameters() if p.requires_grad), lr=lr)
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=lr, steps_per_epoch=len(dataloader_train), epochs=EPOCHS)
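# OneCycleLR schedules the learning rate per optimizer step (total_steps = steps_per_epoch * epochs), so scheduler.step() is meant to be called once per batch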

for epoch in range(EPOCHS):
    print(f"\nEPOCH {epoch + 1} of {EPOCHS}")
    # TRAIN
    model.train()
    losses, accuracy = Am(), Am()
    for i, (x, y_true) in enumerate(dataloader_train):

        x = x.to(DEVICE)
        t = x.size(2)
        y_true = y_true.to(DEVICE)

        x, index, lam = cutmix_apply(x, CUTMIX_ALPHA)
예제 #25
0
    def __init__(self, input_size, output_size):
        super().__init__()
        self.layer1 = Mem(input_size, 10)
        self.layer2 = Mem(10, 10)
        self.layer3 = Mem(10, output_size)

    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        return x

placeholders_ = [[col.empty() for col in st.beta_columns(1)] for x in range(5)]
placeholders = [[col.empty() for col in st.beta_columns(1)] for x in range(5)]
net = Net(12, 12)
optimizer = AdamW(net.parameters(), lr=0.001)
criterion = nn.MSELoss()

x_ = [torch.tensor([
    [0, 1, 0, 0],
    [1, 1, 1, 1],
    [0, 1, 0, 0],
], dtype=float), torch.tensor([
    [0, 0, 1, 0],
    [1, 1, 1, 1],
    [0, 0, 1, 0],
], dtype=float)]
y_ = [torch.tensor([
    [1, 0.5, 0.5, 1],
    [1, 0, 0, 0],
    [1, 0, 0, 0],
예제 #26
0
def train(hyp, opt, device,
          callbacks):  # hyp is path/to/hyp.yaml or hyp dictionary
    save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, freeze = \
        Path(opt.save_dir), opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \
        opt.resume, opt.noval, opt.nosave, opt.workers, opt.freeze
    callbacks.run('on_pretrain_routine_start')

    # Directories
    w = save_dir / 'weights'  # weights dir
    (w.parent if evolve else w).mkdir(parents=True, exist_ok=True)  # make dir
    last, best = w / 'last.pt', w / 'best.pt'

    # Hyperparameters
    if isinstance(hyp, str):
        with open(hyp, errors='ignore') as f:
            hyp = yaml.safe_load(f)  # load hyps dict
    LOGGER.info(
        colorstr('hyperparameters: ') + ', '.join(f'{k}={v}'
                                                  for k, v in hyp.items()))

    # Save run settings
    if not evolve:
        with open(save_dir / 'hyp.yaml', 'w') as f:
            yaml.safe_dump(hyp, f, sort_keys=False)
        with open(save_dir / 'opt.yaml', 'w') as f:
            yaml.safe_dump(vars(opt), f, sort_keys=False)

    # Loggers
    data_dict = None
    if RANK in [-1, 0]:
        loggers = Loggers(save_dir, weights, opt, hyp,
                          LOGGER)  # loggers instance
        if loggers.wandb:
            data_dict = loggers.wandb.data_dict
            if resume:
                weights, epochs, hyp, batch_size = opt.weights, opt.epochs, opt.hyp, opt.batch_size

        # Register actions
        for k in methods(loggers):
            callbacks.register_action(k, callback=getattr(loggers, k))

    # Config
    plots = not evolve  # create plots
    cuda = device.type != 'cpu'
    init_seeds(1 + RANK)
    with torch_distributed_zero_first(LOCAL_RANK):
        data_dict = data_dict or check_dataset(data)  # check if None
    train_path, val_path = data_dict['train'], data_dict['val']
    nc = 1 if single_cls else int(data_dict['nc'])  # number of classes
    names = ['item'] if single_cls and len(
        data_dict['names']) != 1 else data_dict['names']  # class names
    assert len(
        names
    ) == nc, f'{len(names)} names found for nc={nc} dataset in {data}'  # check
    is_coco = isinstance(val_path, str) and val_path.endswith(
        'coco/val2017.txt')  # COCO dataset

    # Model
    check_suffix(weights, '.pt')  # check weights
    pretrained = weights.endswith('.pt')
    if pretrained:
        with torch_distributed_zero_first(LOCAL_RANK):
            weights = attempt_download(
                weights)  # download if not found locally
        ckpt = torch.load(weights, map_location='cpu'
                          )  # load checkpoint to CPU to avoid CUDA memory leak
        model = Model(cfg or ckpt['model'].yaml,
                      ch=3,
                      nc=nc,
                      anchors=hyp.get('anchors')).to(device)  # create
        exclude = [
            'anchor'
        ] if (cfg or hyp.get('anchors')) and not resume else []  # exclude keys
        csd = ckpt['model'].float().state_dict(
        )  # checkpoint state_dict as FP32
        csd = intersect_dicts(csd, model.state_dict(),
                              exclude=exclude)  # intersect
        model.load_state_dict(csd, strict=False)  # load
        LOGGER.info(
            f'Transferred {len(csd)}/{len(model.state_dict())} items from {weights}'
        )  # report
    else:
        model = Model(cfg, ch=3, nc=nc,
                      anchors=hyp.get('anchors')).to(device)  # create

    # Freeze
    freeze = [
        f'model.{x}.'
        for x in (freeze if len(freeze) > 1 else range(freeze[0]))
    ]  # layers to freeze
    for k, v in model.named_parameters():
        v.requires_grad = True  # train all layers
        if any(x in k for x in freeze):
            LOGGER.info(f'freezing {k}')
            v.requires_grad = False

    # Image size
    gs = max(int(model.stride.max()), 32)  # grid size (max stride)
    imgsz = check_img_size(opt.imgsz, gs,
                           floor=gs * 2)  # verify imgsz is gs-multiple

    # Batch size
    if RANK == -1 and batch_size == -1:  # single-GPU only, estimate best batch size
        batch_size = check_train_batch_size(model, imgsz)
        loggers.on_params_update({"batch_size": batch_size})

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / batch_size),
                     1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= batch_size * accumulate / nbs  # scale weight_decay
    LOGGER.info(f"Scaled weight_decay = {hyp['weight_decay']}")

    g0, g1, g2 = [], [], []  # optimizer parameter groups
    for v in model.modules():
        if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):  # bias
            g2.append(v.bias)
        if isinstance(v, nn.BatchNorm2d):  # weight (no decay)
            g0.append(v.weight)
        elif hasattr(v, 'weight') and isinstance(
                v.weight, nn.Parameter):  # weight (with decay)
            g1.append(v.weight)

    if opt.optimizer == 'Adam':
        optimizer = Adam(g0, lr=hyp['lr0'],
                         betas=(hyp['momentum'],
                                0.999))  # adjust beta1 to momentum
    elif opt.optimizer == 'AdamW':
        optimizer = AdamW(g0, lr=hyp['lr0'],
                          betas=(hyp['momentum'],
                                 0.999))  # adjust beta1 to momentum
    else:
        optimizer = SGD(g0,
                        lr=hyp['lr0'],
                        momentum=hyp['momentum'],
                        nesterov=True)

    optimizer.add_param_group({
        'params': g1,
        'weight_decay': hyp['weight_decay']
    })  # add g1 with weight_decay
    optimizer.add_param_group({'params': g2})  # add g2 (biases)
    LOGGER.info(
        f"{colorstr('optimizer:')} {type(optimizer).__name__} with parameter groups "
        f"{len(g0)} weight (no decay), {len(g1)} weight, {len(g2)} bias")
    del g0, g1, g2

    # Scheduler
    if opt.cos_lr:
        lf = one_cycle(1, hyp['lrf'], epochs)  # cosine 1->hyp['lrf']
    else:
        lf = lambda x: (1 - x / epochs) * (1.0 - hyp['lrf']) + hyp['lrf']  # linear
    scheduler = lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lf)  # plot_lr_scheduler(optimizer, scheduler, epochs)

    # EMA
    ema = ModelEMA(model) if RANK in [-1, 0] else None

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # EMA
        if ema and ckpt.get('ema'):
            ema.ema.load_state_dict(ckpt['ema'].float().state_dict())
            ema.updates = ckpt['updates']

        # Epochs
        start_epoch = ckpt['epoch'] + 1
        if resume:
            assert start_epoch > 0, f'{weights} training to {epochs} epochs is finished, nothing to resume.'
        if epochs < start_epoch:
            LOGGER.info(
                f"{weights} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {epochs} more epochs."
            )
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt, csd

    # DP mode
    if cuda and RANK == -1 and torch.cuda.device_count() > 1:
        LOGGER.warning(
            'WARNING: DP not recommended, use torch.distributed.run for best DDP Multi-GPU results.\n'
            'See Multi-GPU Tutorial at https://github.com/ultralytics/yolov5/issues/475 to get started.'
        )
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and RANK != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        LOGGER.info('Using SyncBatchNorm()')

    # Trainloader
    train_loader, dataset = create_dataloader(
        train_path,
        imgsz,
        batch_size // WORLD_SIZE,
        gs,
        single_cls,
        hyp=hyp,
        augment=True,
        cache=None if opt.cache == 'val' else opt.cache,
        rect=opt.rect,
        rank=LOCAL_RANK,
        workers=workers,
        image_weights=opt.image_weights,
        quad=opt.quad,
        prefix=colorstr('train: '),
        shuffle=True)
    mlc = int(np.concatenate(dataset.labels, 0)[:, 0].max())  # max label class
    nb = len(train_loader)  # number of batches
    assert mlc < nc, f'Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}'

    # Process 0
    if RANK in [-1, 0]:
        val_loader = create_dataloader(val_path,
                                       imgsz,
                                       batch_size // WORLD_SIZE * 2,
                                       gs,
                                       single_cls,
                                       hyp=hyp,
                                       cache=None if noval else opt.cache,
                                       rect=True,
                                       rank=-1,
                                       workers=workers * 2,
                                       pad=0.5,
                                       prefix=colorstr('val: '))[0]

        if not resume:
            labels = np.concatenate(dataset.labels, 0)
            # c = torch.tensor(labels[:, 0])  # classes
            # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
            # model._initialize_biases(cf.to(device))
            if plots:
                plot_labels(labels, names, save_dir)

            # Anchors
            if not opt.noautoanchor:
                check_anchors(dataset,
                              model=model,
                              thr=hyp['anchor_t'],
                              imgsz=imgsz)
            model.half().float()  # pre-reduce anchor precision

        callbacks.run('on_pretrain_routine_end')

    # DDP mode
    if cuda and RANK != -1:
        model = DDP(model, device_ids=[LOCAL_RANK], output_device=LOCAL_RANK)

    # Model attributes
    nl = de_parallel(
        model).model[-1].nl  # number of detection layers (to scale hyps)
    hyp['box'] *= 3 / nl  # scale to layers
    hyp['cls'] *= nc / 80 * 3 / nl  # scale to classes and layers
    hyp['obj'] *= (imgsz / 640)**2 * 3 / nl  # scale to image size and layers
    hyp['label_smoothing'] = opt.label_smoothing
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.class_weights = labels_to_class_weights(
        dataset.labels, nc).to(device) * nc  # attach class weights
    model.names = names

    # Start training
    t0 = time.time()
    nw = max(round(hyp['warmup_epochs'] * nb),
             100)  # number of warmup iterations, max(3 epochs, 100 iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    last_opt_step = -1
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)
    stopper = EarlyStopping(patience=opt.patience)
    compute_loss = ComputeLoss(model)  # init loss class
    callbacks.run('on_train_start')
    LOGGER.info(
        f'Image sizes {imgsz} train, {imgsz} val\n'
        f'Using {train_loader.num_workers * WORLD_SIZE} dataloader workers\n'
        f"Logging results to {colorstr('bold', save_dir)}\n"
        f'Starting training for {epochs} epochs...')
    for epoch in range(
            start_epoch, epochs
    ):  # epoch ------------------------------------------------------------------
        callbacks.run('on_train_epoch_start')
        model.train()

        # Update image weights (optional, single-GPU only)
        if opt.image_weights:
            cw = model.class_weights.cpu().numpy() * (
                1 - maps)**2 / nc  # class weights
            iw = labels_to_image_weights(dataset.labels,
                                         nc=nc,
                                         class_weights=cw)  # image weights
            dataset.indices = random.choices(range(dataset.n),
                                             weights=iw,
                                             k=dataset.n)  # rand weighted idx

        # Update mosaic border (optional)
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(3, device=device)  # mean losses
        if RANK != -1:
            train_loader.sampler.set_epoch(epoch)
        pbar = enumerate(train_loader)
        LOGGER.info(
            ('\n' + '%10s' * 7) %
            ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'labels', 'img_size'))
        if RANK in (-1, 0):
            pbar = tqdm(
                pbar, total=nb,
                bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}')  # progress bar
        optimizer.zero_grad()
        for i, (
                imgs, targets, paths, _
        ) in pbar:  # batch -------------------------------------------------------------
            callbacks.run('on_train_batch_start')
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float(
            ) / 255  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # compute_loss.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
                accumulate = max(
                    1,
                    np.interp(ni, xi, [1, nbs / batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [
                        hyp['warmup_bias_lr'] if j == 2 else 0.0,
                        x['initial_lr'] * lf(epoch)
                    ])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(
                            ni, xi, [hyp['warmup_momentum'], hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.5,
                                      imgsz * 1.5 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]
                          ]  # new shape (stretched to gs-multiple)
                    imgs = nn.functional.interpolate(imgs,
                                                     size=ns,
                                                     mode='bilinear',
                                                     align_corners=False)

            # Forward
            with amp.autocast(enabled=cuda):
                pred = model(imgs)  # forward
                loss, loss_items = compute_loss(
                    pred, targets.to(device))  # loss scaled by batch_size
                if RANK != -1:
                    loss *= WORLD_SIZE  # gradient averaged between devices in DDP mode
                if opt.quad:
                    loss *= 4.

            # Backward
            scaler.scale(loss).backward()

            # Optimize
            if ni - last_opt_step >= accumulate:
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)
                last_opt_step = ni

            # Log
            if RANK in (-1, 0):
                mloss = (mloss * i + loss_items) / (i + 1
                                                    )  # update mean losses
                mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G'  # (GB)
                pbar.set_description(('%10s' * 2 + '%10.4g' * 5) %
                                     (f'{epoch}/{epochs - 1}', mem, *mloss,
                                      targets.shape[0], imgs.shape[-1]))
                callbacks.run('on_train_batch_end', ni, model, imgs, targets,
                              paths, plots, opt.sync_bn)
                if callbacks.stop_training:
                    return
            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for loggers
        scheduler.step()

        if RANK in (-1, 0):
            # mAP
            callbacks.run('on_train_epoch_end', epoch=epoch)
            ema.update_attr(model,
                            include=[
                                'yaml', 'nc', 'hyp', 'names', 'stride',
                                'class_weights'
                            ])
            final_epoch = (epoch + 1 == epochs) or stopper.possible_stop
            if not noval or final_epoch:  # Calculate mAP
                results, maps, _ = val.run(data_dict,
                                           batch_size=batch_size //
                                           WORLD_SIZE * 2,
                                           imgsz=imgsz,
                                           model=ema.ema,
                                           single_cls=single_cls,
                                           dataloader=val_loader,
                                           save_dir=save_dir,
                                           plots=False,
                                           callbacks=callbacks,
                                           compute_loss=compute_loss)

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
            if fi > best_fitness:
                best_fitness = fi
            log_vals = list(mloss) + list(results) + lr
            callbacks.run('on_fit_epoch_end', log_vals, epoch, best_fitness,
                          fi)

            # Save model
            if (not nosave) or (final_epoch and not evolve):  # if save
                ckpt = {
                    'epoch': epoch,
                    'best_fitness': best_fitness,
                    'model': deepcopy(de_parallel(model)).half(),
                    'ema': deepcopy(ema.ema).half(),
                    'updates': ema.updates,
                    'optimizer': optimizer.state_dict(),
                    'wandb_id':
                    loggers.wandb.wandb_run.id if loggers.wandb else None,
                    'date': datetime.now().isoformat()
                }

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                if (epoch > 0) and (opt.save_period >
                                    0) and (epoch % opt.save_period == 0):
                    torch.save(ckpt, w / f'epoch{epoch}.pt')
                del ckpt
                callbacks.run('on_model_save', last, epoch, final_epoch,
                              best_fitness, fi)

            # Stop Single-GPU
            if RANK == -1 and stopper(epoch=epoch, fitness=fi):
                break

            # Stop DDP TODO: known issues shttps://github.com/ultralytics/yolov5/pull/4576
            # stop = stopper(epoch=epoch, fitness=fi)
            # if RANK == 0:
            #    dist.broadcast_object_list([stop], 0)  # broadcast 'stop' to all ranks

        # Stop DPP
        # with torch_distributed_zero_first(RANK):
        # if stop:
        #    break  # must break all DDP ranks

        # end epoch ----------------------------------------------------------------------------------------------------
    # end training -----------------------------------------------------------------------------------------------------
    if RANK in (-1, 0):
        LOGGER.info(
            f'\n{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.'
        )
        for f in last, best:
            if f.exists():
                strip_optimizer(f)  # strip optimizers
                if f is best:
                    LOGGER.info(f'\nValidating {f}...')
                    results, _, _ = val.run(
                        data_dict,
                        batch_size=batch_size // WORLD_SIZE * 2,
                        imgsz=imgsz,
                        model=attempt_load(f, device).half(),
                        iou_thres=0.65 if is_coco else
                        0.60,  # best pycocotools results at 0.65
                        single_cls=single_cls,
                        dataloader=val_loader,
                        save_dir=save_dir,
                        save_json=is_coco,
                        verbose=True,
                        plots=True,
                        callbacks=callbacks,
                        compute_loss=compute_loss)  # val best model with plots
                    if is_coco:
                        callbacks.run('on_fit_epoch_end',
                                      list(mloss) + list(results) + lr, epoch,
                                      best_fitness, fi)

        callbacks.run('on_train_end', last, best, plots, epoch, results)
        LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}")

    torch.cuda.empty_cache()
    return results
예제 #27
0
def main():
    anchors = [30, 54, 95]
    shuffle = not (args.no_shuffle)
    exp = args.exp
    warm_up_epoch = 3

    # Load and process data

    if args.fold:
        df_train = pd.read_csv(args.data_path +
                               'k_fold/official_train_fold%d.csv' %
                               (args.fold))
        df_val = pd.read_csv(args.data_path +
                             'k_fold/official_val_fold%d.csv' % (args.fold))
    else:
        df_train = pd.read_csv(args.data_path + 'official_train.csv')
        df_val = pd.read_csv(args.data_path + 'official_val.csv')

    train = df_train.image_path.to_list()
    val = df_val.image_path.to_list()
    if exp:
        y_train = df_train.anchor.to_list()
        y_val = df_val.anchor.to_list()
        reg_train_gt = df_train.exp_wind.to_list()
        reg_val_gt = df_val.exp_wind.to_list()
    else:
        y_train = df_train.wind_speed.to_list()
        y_val = df_val.wind_speed.to_list()

    train_transform, val_transform = get_transform(args.image_size)

    train_dataset = WindDataset(image_list=train,
                                target=y_train,
                                exp_target=reg_train_gt if exp else None,
                                transform=train_transform)

    val_dataset = WindDataset(image_list=val,
                              target=y_val,
                              exp_target=reg_val_gt if exp else None,
                              transform=val_transform)

    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=shuffle,
                              num_workers=args.num_workers,
                              drop_last=True)

    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.num_workers,
                            drop_last=True)

    warm_loader = DataLoader(dataset=train_dataset,
                             batch_size=args.batch_size * 14,
                             shuffle=shuffle,
                             num_workers=args.num_workers,
                             drop_last=True)

    # Load model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    last_epoch = 0

    # model = ResNet50_BN_idea()
    if not exp:
        model = Effnet_Wind_B7()
        # model = Effnet_Wind_B5()
    else:
        model = Effnet_Wind_B5_exp_6()
    # model = ResNetExample()
    # if not exp:
    #     model = Seresnext_Wind()
    # else:
    #     model = Seresnext_Wind_Exp()

    # Optimizer
    if args.opt == 'radam':
        optimizer = RAdam(
            model.parameters(),
            lr=args.lr,
            betas=(0.9, 0.999),
            eps=1e-8,
            weight_decay=args.weight_decay,
        )
    elif args.opt == 'adamw':
        optimizer = AdamW(model.parameters(), args.lr)

    elif args.opt == 'adam':
        optimizer = Adam(model.parameters(),
                         args.lr,
                         weight_decay=args.weight_decay)
    else:
        optimizer = SGD(model.parameters(),
                        args.lr,
                        momentum=0.9,
                        nesterov=True,
                        weight_decay=args.weight_decay)

    if args.weights:
        # model.load_state_dict(torch.load(args.weights))
        last_epoch = extract_number(args.weights)
        try:
            checkpoint = torch.load(args.weights)
            model.load_state_dict(checkpoint['model_state_dict'])
            if checkpoint['pre_opt'] == args.opt:
                optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
                print(optimizer)
        except:
            model.load_state_dict(torch.load(args.weights))
    else:
        model.apply(reset_m_batchnorm)

    model.to(device)

    # Loss function
    if exp:
        criterion = JointLoss2()
    else:
        criterion = RMSELoss()

    # generate log and visualization
    save_path = args.save_path

    log_cache = (args.batch_size, args.image_size, shuffle, exp)

    write_log(args.save_path, model, optimizer, criterion, log_cache)

    plot_dict = {'train': list(), 'val': list()}

    log_train_path = save_path + 'training_log.txt'
    plot_train_path = save_path + 'log.json'

    write_mode = 'w'

    if os.path.exists(log_train_path) and os.path.exists(plot_train_path):
        write_mode = 'a'
        with open(plot_train_path, 'r') as j:
            plot_dict = json.load(j)
            plot_dict['train'] = plot_dict['train'][:last_epoch]
            plot_dict['val'] = plot_dict['val'][:last_epoch]

    # Training
    print('Start warm up')
    model.freeze_except_last()
    for epoch in range(warm_up_epoch):
        warm_up(
            model=model,
            dataloader=warm_loader,
            optimizer=optimizer,
            criterion=criterion,
            device=device,
        )
    model.unfreeze()
    with open(log_train_path, write_mode) as f:
        for epoch in range(1, args.epoch + 1):
            print('Epoch:', epoch + last_epoch)
            f.write('Epoch: %d\n' % (epoch + last_epoch))
            loss = train_epoch(model=model,
                               dataloader=train_loader,
                               optimizer=optimizer,
                               criterion=criterion,
                               device=device,
                               exp=exp)
            RMSE = val_epoch(model=model,
                             dataloader=val_loader,
                             device=device,
                             exp=exp,
                             anchors=anchors)
            if not exp:
                f.write('Training loss: %.4f\n' % (loss))
                f.write('RMSE val: %.4f\n' % (RMSE))
                print('RMSE loss: %.4f' % (loss))
                print('RMSE val: %.4f' % (RMSE))
            else:
                loss, classify, regress = loss
                RMSE, accuracy = RMSE
                f.write('Training loss: %.4f\n' % (loss))
                f.write('Classification loss: %.4f\n' % (classify))
                f.write('Regression loss: %.4f\n' % (regress))
                f.write('Accuracy val: %.4f\n' % (accuracy))
                f.write('RMSE val: %.4f\n' % (RMSE))
                print('Training loss: %.4f' % (loss))
                print('Classification loss: %.4f' % (classify))
                print('Regression loss: %.4f' % (regress))
                print('Accuracy val: %.4f' % (accuracy))
                print('RMSE val: %.4f' % (RMSE))

            # torch.save(model.state_dict(), save_path + 'epoch%d.pth'%(epoch+last_epoch))
            save_name = save_path + 'epoch%d.pth' % (epoch + last_epoch)
            save_pth(save_name, epoch + last_epoch, model, optimizer, args.opt)

            plot_dict['train'].append(loss)
            plot_dict['val'].append(RMSE)
            with open(plot_train_path, 'w') as j:
                json.dump(plot_dict, j)
예제 #28
0
def main():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout), logging.FileHandler('logs/log' + date_now() + '.log')],
    )
    logger.setLevel(logging.INFO)

    epoch = 10
    batch_size = 64
    data = pd.read_pickle(os.path.join(COMMENT_DIR, 'comment_continue_train_balance.pkl'))
    val_data = pd.read_csv('/ai/223/person/lichunyu/datasets/kaggle/jigsaw/rate/validation_data.csv')
    tokenizer = BertTokenizer.from_pretrained('/ai/223/person/lichunyu/pretrain-models/bert-base-uncased')

    model = BertRegress()
    dataset = JigsawDataset(data, tokenizer)
    less_val_dataset = JigsawValDataset(val_data, 'less_toxic', tokenizer)
    more_val_dataset = JigsawValDataset(val_data, 'more_toxic', tokenizer)
    train_dataloader = DataLoader(dataset, batch_size=batch_size)
    less_val_dataloader = DataLoader(less_val_dataset, batch_size=batch_size)
    more_val_dataloader = DataLoader(more_val_dataset, batch_size=batch_size)
    # optimizer = SGD(model.parameters(), lr=4e-4, weight_decay=2)
    optimizer = AdamW(
        [
            {'params': model.bert.parameters()},
            {'params': model.regress.parameters(), 'lr':5e-4}
        ],
        lr=5e-5,
    )
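    # parameter groups: model.bert uses the top-level lr=5e-5, while the regression head overrides it with lr=5e-4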

    model.cuda()

    for e in range(epoch):

        model.train()
        train_total_loss = 0
        step = 0
        for n, batch in enumerate(tqdm(train_dataloader)):
            model.zero_grad()
            step += 1
            input_ids = batch[0].cuda()
            attention_mask = batch[1].cuda()
            y = batch[2].cuda()
            model_output = model(input_ids, attention_mask, y)
            loss = model_output['loss']
            train_total_loss += loss.item()
            if (n % 50) == 0:
                logger.info(f'the loss of batch {n} is {loss.item()}')
            loss.backward()
            optimizer.step()

        logger.info('train step loss is {}'.format(train_total_loss/step))


        model.eval()
        less_toxic_scores = np.array([])
        more_toxic_scores = np.array([])
        for batch in tqdm(less_val_dataloader):
            input_ids = batch[0].cuda()
            attention_mask = batch[1].cuda()
            with torch.no_grad():
                model_output = model(input_ids, attention_mask)
                score = model_output['output']
                score = score.detach().clone().cpu().numpy().flatten()
                less_toxic_scores = np.append(less_toxic_scores, score)

        for batch in tqdm(more_val_dataloader):
            input_ids = batch[0].cuda()
            attention_mask = batch[1].cuda()
            with torch.no_grad():
                model_output = model(input_ids, attention_mask)
                score = model_output['output']
                score = score.detach().clone().cpu().numpy().flatten()
                more_toxic_scores = np.append(more_toxic_scores, score)

        acc_item = (less_toxic_scores < more_toxic_scores).sum()
        logger.info(f'~~~~~~ Acc item is {acc_item}  ~~~~~~~')
        acc = acc_item / len(less_toxic_scores)
        logger.info(f'~~~~~~ Acc score is {acc}  ~~~~~~~')

        current_ckpt = os.path.join(COMMENT_MODEL_DIR, f'bert-epoch-{e}-acc-{acc}.pth')
        torch.save(model.state_dict(), current_ckpt)
예제 #29
0
        dog_noised = image_transforms["non_shape_transforms"](dog)
        dog = image_transforms["crop_224"](dog)
        dog_noised = image_transforms["crop_224"](dog_noised)
        dog_pt = image_transforms["to_pytorch"](dog)
        dog_noised_pt = image_transforms["to_pytorch"](dog_noised)

        with torch.no_grad():
            reconstructed = model.student(dog_noised_pt.unsqueeze(0), dog_pt.unsqueeze(0))["reconstruct"].squeeze(0)
            reconstructed = image_transforms["from_pytorch"](reconstructed)

        dog.show()
        dog_noised.show()
        reconstructed.show()

    all_params = list(filter(lambda p: p.requires_grad, model.student.parameters()))
    optimizer = AdamW(all_params, lr=lr, eps=1e-6, weight_decay=1e-2)
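    # only the student's trainable (requires_grad=True) parameters are handed to AdamW; frozen parameters are left untouched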
    torch.autograd.set_detect_anomaly(True)
    optimizer.zero_grad()

    try:
        from torch.cuda.amp import GradScaler, autocast

        scaler = GradScaler()
    except:
        pass
    if forward_only:
        _ = model.eval()
    else:
        _ = model.train()

    def get_unused_params(model):
예제 #30
0
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        params = model.parameters()
        if args.svo:
            params = [p for p in params if p.requires_grad]
        if args.fused_adam:
            args.opt_level = "O1"
            args.loss_scale = None
            args.keep_batchnorm_fp32 = None
            optim = FusedAdam(params,
                              lr=args.learning_rate,
                              eps=args.adam_epsilon)
        else:
            args.keep_batchnorm_fp32 = None
            optim = AdamW(params, lr=args.learning_rate, eps=args.adam_epsilon)

        if args.loss_scale == 0:
            args.loss_scale = None

        if args.opt_level == "O1":
            args.keep_batchnorm_fp32 = None
            args.loss_scale = "dynamic"

        model, optim = amp.initialize(
            model,
            optim,
            opt_level=args.opt_level,
            loss_scale=args.loss_scale,
            keep_batchnorm_fp32=args.keep_batchnorm_fp32)
    else: