#----------------------------------------- # Tensorboard writer: tb_writer = SummaryWriter(log_dir=f'{logs_path}/{datetime.now().strftime("%d%m%Y-%H_%M_%S")}/') #----------------------------------------- #----------------------------------------- # Creating the optimiser: no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': mlp.parameters(), 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) #----------------------------------------- #----------------------------------------- # Loading the contents of the auxiliary checkpoint and instantiating the contents if not resuming: if aux_checkpoint: global_step = aux_checkpoint['global_step'] epoch = aux_checkpoint['epoch'] optimizer.load_state_dict(aux_checkpoint['optimizer']) best_acc = aux_checkpoint['best_acc'] mlp.load_state_dict(aux_checkpoint['mlp_state_dict']) best_checkpoint_path = aux_checkpoint['best_checkpoint_path'] else: global_step = 0 best_acc = 0.0 epoch = 0
def trainner(model, play_history, train_config: dict):
    """Train `model` on self-play history for a fixed number of epochs.

    Optimises a weighted sum of a value loss (MSE against game winners) and a
    policy loss (cross-entropy against chosen actions) with AdamW and a
    cosine-annealed learning rate.  If `train_config['traindata_rate']` leaves
    a validation split, validation losses are computed and logged each epoch.
    The model is left in eval mode on return.

    Args:
        model: network mapping states -> (value, policy-logits).
        play_history: object exposing `get_train_valid_data(rate=...)`.
        train_config: dict with keys 'traindata_rate', 'batch_size',
            'num_workers', 'base_lr', 'min_lr', 'epochs',
            'value_loss_weight', 'policy_loss_weight'.
    """
    model.train()
    train_history, valid_history, split_point = play_history.get_train_valid_data(
        rate=train_config['traindata_rate'])
    # NOTE: 'play_histry' is the (misspelled) keyword AlphaDataset declares.
    train_dataset = AlphaDataset(play_histry=train_history)
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=train_config['batch_size'],
        shuffle=True,
        num_workers=train_config['num_workers'],
        collate_fn=AlphaDataset.collate_fn,
        pin_memory=True,
    )
    if valid_history is not None:
        valid_dataset = AlphaDataset(play_histry=valid_history)
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=train_config['batch_size'] * 2,  # no grads: larger batches fit
            shuffle=False,
            num_workers=train_config['num_workers'],
            collate_fn=AlphaDataset.collate_fn,
            pin_memory=True,
        )
    else:
        valid_loader = None
    optimizer = AdamW(params=model.parameters(), lr=train_config['base_lr'])
    scheduler = lr_scheduler.CosineAnnealingLR(
        optimizer=optimizer,
        T_max=train_config['epochs'],
        eta_min=train_config['min_lr']
    )
    for epoch in range(train_config['epochs']):
        train_value_mean = Avg()
        train_policy_mean = Avg()
        for states, actions, winners in train_loader:
            optimizer.zero_grad()
            value, policy = model(states)
            value_loss = functional.mse_loss(input=value.view(-1), target=winners)
            policy_loss = functional.cross_entropy(input=policy, target=actions)
            loss = (train_config['value_loss_weight'] * value_loss
                    + train_config['policy_loss_weight'] * policy_loss)
            loss.backward()
            optimizer.step()
            train_value_mean.update(value=value_loss.item())
            train_policy_mean.update(value=policy_loss.item())
        scheduler.step()
        if valid_loader is not None:
            # FIX: validate in eval mode so dropout/batch-norm are disabled;
            # the original computed validation losses with the model still in
            # train mode, making the numbers noisy/incomparable.
            model.eval()
            valid_value_mean = Avg()
            valid_policy_mean = Avg()
            for states, actions, winners in valid_loader:
                with torch.no_grad():
                    value, policy = model(states)
                    value_loss = functional.mse_loss(input=value.view(-1), target=winners)
                    policy_loss = functional.cross_entropy(input=policy, target=actions)
                valid_value_mean.update(value=value_loss.item())
                valid_policy_mean.update(value=policy_loss.item())
            model.train()  # back to training mode for the next epoch
        msg = f'epochs: [{epoch}/{train_config["epochs"]}]'
        msg += f' - train value loss: {train_value_mean():.6f} - train policy loss: {train_policy_mean():.6f}'
        if valid_loader is not None:
            msg += f' - valid value loss: {valid_value_mean():.6f} - valid policy loss: {valid_policy_mean():.6f}'
        logging.info(msg=msg)
    model.eval()
print('We will use the GPU:', torch.cuda.get_device_name(0)) else: print('No GPU available, using the CPU instead.') device = torch.device("cpu") lr = 0.00002 # bioBERT: tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1") pretrained_model = BertForMaskedLM.from_pretrained('dmis-lab/biobert-v1.1') pretrained_model.cuda() pretrained_model.eval() optimizer = AdamW(pretrained_model.parameters(), lr=lr, eps=1e-8) data = [] f = open('corpus.txt', 'r') for line in f: data.append(line) input_ids = [] attention_masks = [] for sentence in data: # encoded_sentence = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0) encoded_sentence = tokenizer.encode_plus( sentence, add_special_tokens=True, max_length=128, truncation=True,
def __init__(self, params: dict, dataset: LmSeqsDataset, token_probs: torch.tensor, student: nn.Module, teacher: nn.Module):
    """Set up a knowledge-distillation training run.

    Args:
        params: namespace-like bag of hyperparameters and runtime flags
            (dump_path, fp16, n_gpu, batch_size, loss weights, ...).
        dataset: tokenized LM sequences to distill on.
        token_probs: per-token probabilities used when sampling MLM masks.
        student: model being trained.
        teacher: frozen reference model providing soft targets.
    """
    logger.info('Initializing Distiller')
    self.params = params
    self.dump_path = params.dump_path
    self.multi_gpu = params.multi_gpu
    self.fp16 = params.fp16
    self.student = student
    self.teacher = teacher
    self.student_config = student.config
    self.vocab_size = student.config.vocab_size
    # Sampler: plain random within one process, distributed otherwise.
    if params.n_gpu <= 1:
        sampler = RandomSampler(dataset)
    else:
        sampler = DistributedSampler(dataset)
    if params.group_by_size:
        # Bucket sequences of similar length together to limit padding waste.
        groups = create_lengths_groups(lengths=dataset.lengths, k=params.max_model_input_size)
        sampler = GroupedBatchSampler(sampler=sampler, group_ids=groups, batch_size=params.batch_size)
    else:
        sampler = BatchSampler(sampler=sampler, batch_size=params.batch_size, drop_last=False)
    self.dataloader = DataLoader(dataset=dataset, batch_sampler=sampler, collate_fn=dataset.batch_sequences)
    self.temperature = params.temperature
    assert self.temperature > 0.
    # Coefficients mixing the individual distillation losses.
    self.alpha_ce = params.alpha_ce
    self.alpha_mlm = params.alpha_mlm
    self.alpha_clm = params.alpha_clm
    self.alpha_mse = params.alpha_mse
    self.alpha_cos = params.alpha_cos
    self.mlm = params.mlm
    if self.mlm:
        logger.info(f'Using MLM loss for LM step.')
        self.mlm_mask_prop = params.mlm_mask_prop
        assert 0.0 <= self.mlm_mask_prop <= 1.0
        # A selected token is either masked, kept, or randomized; the three
        # outcome probabilities must form a distribution.
        assert params.word_mask + params.word_keep + params.word_rand == 1.0
        self.pred_probs = torch.FloatTensor(
            [params.word_mask, params.word_keep, params.word_rand])
        self.pred_probs = self.pred_probs.to(
            f'cuda:{params.local_rank}'
        ) if params.n_gpu > 0 else self.pred_probs
        self.token_probs = token_probs.to(
            f'cuda:{params.local_rank}'
        ) if params.n_gpu > 0 else token_probs
        if self.fp16:
            self.pred_probs = self.pred_probs.half()
            self.token_probs = self.token_probs.half()
    else:
        logger.info(f'Using CLM loss for LM step.')
    # Running counters and last-seen loss values, used for periodic logging.
    self.epoch = 0
    self.n_iter = 0
    self.n_total_iter = 0
    self.n_sequences_epoch = 0
    self.total_loss_epoch = 0
    self.last_loss = 0
    self.last_loss_ce = 0
    self.last_loss_mlm = 0
    self.last_loss_clm = 0
    if self.alpha_mse > 0.:
        self.last_loss_mse = 0
    if self.alpha_cos > 0.:
        self.last_loss_cos = 0
    self.last_log = 0
    # Loss modules; MSE/cosine terms are only built when their weight is > 0.
    self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean')
    self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
    if self.alpha_mse > 0.:
        self.mse_loss_fct = nn.MSELoss(reduction='sum')
    if self.alpha_cos > 0.:
        self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction='mean')
    logger.info('--- Initializing model optimizer')
    assert params.gradient_accumulation_steps >= 1
    self.num_steps_epoch = len(self.dataloader)
    num_train_optimization_steps = int(
        self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
    # BERT-style split: no weight decay on biases and LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in student.named_parameters()
            if not any(nd in n for nd in no_decay) and p.requires_grad
        ],
        'weight_decay': params.weight_decay
    }, {
        'params': [
            p for n, p in student.named_parameters()
            if any(nd in n for nd in no_decay) and p.requires_grad
        ],
        'weight_decay': 0.0
    }]
    logger.info(
        "------ Number of trainable parameters (student): %i" % sum([
            p.numel() for p in self.student.parameters() if p.requires_grad
        ]))
    logger.info("------ Number of parameters (student): %i" %
                sum([p.numel() for p in self.student.parameters()]))
    self.optimizer = AdamW(optimizer_grouped_parameters,
                           lr=params.learning_rate,
                           eps=params.adam_epsilon,
                           betas=(0.9, 0.98))
    warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
    self.scheduler = get_linear_schedule_with_warmup(
        self.optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=num_train_optimization_steps)
    if self.fp16:
        # Mixed precision requires apex; fail loudly if it is missing.
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        logger.info(
            f"Using fp16 training: {self.params.fp16_opt_level} level")
        self.student, self.optimizer = amp.initialize(
            self.student,
            self.optimizer,
            opt_level=self.params.fp16_opt_level)
        # The teacher is inference-only, so it can run entirely in half precision.
        self.teacher = self.teacher.half()
    if self.multi_gpu:
        if self.fp16:
            from apex.parallel import DistributedDataParallel
            logger.info(
                "Using apex.parallel.DistributedDataParallel for distributed training."
            )
            self.student = DistributedDataParallel(self.student)
        else:
            from torch.nn.parallel import DistributedDataParallel
            logger.info(
                "Using nn.parallel.DistributedDataParallel for distributed training."
            )
            self.student = DistributedDataParallel(
                self.student,
                device_ids=[params.local_rank],
                output_device=params.local_rank,
                find_unused_parameters=True)
    self.is_master = params.is_master
    if self.is_master:
        # Only the master process writes Tensorboard logs.
        logger.info('--- Initializing Tensorboard')
        self.tensorboard = SummaryWriter(
            log_dir=os.path.join(self.dump_path, 'log', 'train'))
        self.tensorboard.add_text(tag='config/training',
                                  text_string=str(self.params),
                                  global_step=0)
        self.tensorboard.add_text(tag='config/student',
                                  text_string=str(self.student_config),
                                  global_step=0)
def main(rank, args):
    """Per-process entry point for Wav2Letter CTC training.

    Builds transforms/augmentations, datasets, model (optionally DDP/JIT),
    optimizer, scheduler and loaders; restores a checkpoint if present; then
    runs the train/validate loop, checkpointing the best validation loss.

    Args:
        rank: process rank (used only when `args.distributed`).
        args: parsed command-line namespace.
    """
    # Distributed setup
    if args.distributed:
        setup_distributed(rank, args.world_size)
    not_main_rank = args.distributed and rank != 0
    logging.info("Start time: %s", datetime.now())
    # Explicitly set seed to make sure models created in separate processes
    # start from same random weights and biases
    torch.manual_seed(args.seed)
    # Empty CUDA cache
    torch.cuda.empty_cache()
    # Change backend for flac files
    torchaudio.set_audio_backend("soundfile")
    # Transforms
    melkwargs = {
        "n_fft": args.win_length,
        "n_mels": args.n_bins,
        "hop_length": args.hop_length,
    }
    sample_rate_original = 16000
    if args.type == "mfcc":
        transforms = torch.nn.Sequential(
            torchaudio.transforms.MFCC(
                sample_rate=sample_rate_original,
                n_mfcc=args.n_bins,
                melkwargs=melkwargs,
            ),
        )
        num_features = args.n_bins
    elif args.type == "waveform":
        transforms = torch.nn.Sequential(UnsqueezeFirst())
        num_features = 1
    else:
        raise ValueError("Model type not supported")
    if args.normalize:
        transforms = torch.nn.Sequential(transforms, Normalize())
    # Augmentations are train-time only (applied through the train collate).
    augmentations = torch.nn.Sequential()
    if args.freq_mask:
        augmentations = torch.nn.Sequential(
            augmentations,
            torchaudio.transforms.FrequencyMasking(
                freq_mask_param=args.freq_mask),
        )
    if args.time_mask:
        augmentations = torch.nn.Sequential(
            augmentations,
            torchaudio.transforms.TimeMasking(time_mask_param=args.time_mask),
        )
    # Text preprocessing
    char_blank = "*"
    char_space = " "
    char_apostrophe = "'"
    labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase
    language_model = LanguageModel(labels, char_blank, char_space)
    # Dataset
    training, validation = split_process_librispeech(
        [args.dataset_train, args.dataset_valid],
        [transforms, transforms],
        language_model,
        root=args.dataset_root,
        folder_in_archive=args.dataset_folder_in_archive,
    )
    # Decoder
    if args.decoder == "greedy":
        decoder = GreedyDecoder()
    else:
        raise ValueError("Selected decoder not supported")
    # Model
    model = Wav2Letter(
        num_classes=language_model.length,
        input_type=args.type,
        num_features=num_features,
    )
    if args.jit:
        model = torch.jit.script(model)
    if args.distributed:
        n = torch.cuda.device_count() // args.world_size
        devices = list(range(rank * n, (rank + 1) * n))
        model = model.to(devices[0])
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=devices)
    else:
        devices = ["cuda" if torch.cuda.is_available() else "cpu"]
        model = model.to(devices[0], non_blocking=True)
        model = torch.nn.DataParallel(model)
    n = count_parameters(model)
    logging.info("Number of parameters: %s", n)
    # Optimizer
    if args.optimizer == "adadelta":
        optimizer = Adadelta(
            model.parameters(),
            lr=args.learning_rate,
            weight_decay=args.weight_decay,
            eps=args.eps,
            rho=args.rho,
        )
    elif args.optimizer == "sgd":
        optimizer = SGD(
            model.parameters(),
            lr=args.learning_rate,
            momentum=args.momentum,
            weight_decay=args.weight_decay,
        )
    elif args.optimizer == "adam":
        # FIX: torch.optim.Adam has no 'momentum' parameter; passing it raised
        # TypeError as soon as this branch was taken. Adam's momentum-like
        # behaviour is governed by its betas.
        optimizer = Adam(
            model.parameters(),
            lr=args.learning_rate,
            weight_decay=args.weight_decay,
        )
    elif args.optimizer == "adamw":
        # FIX: same as above — AdamW does not accept 'momentum'.
        optimizer = AdamW(
            model.parameters(),
            lr=args.learning_rate,
            weight_decay=args.weight_decay,
        )
    else:
        raise ValueError("Selected optimizer not supported")
    if args.scheduler == "exponential":
        scheduler = ExponentialLR(optimizer, gamma=args.gamma)
    elif args.scheduler == "reduceonplateau":
        scheduler = ReduceLROnPlateau(optimizer, patience=10, threshold=1e-3)
    else:
        raise ValueError("Selected scheduler not supported")
    criterion = torch.nn.CTCLoss(blank=language_model.mapping[char_blank],
                                 zero_infinity=False)
    # Data Loader
    collate_fn_train = collate_factory(model_length_function, augmentations)
    collate_fn_valid = collate_factory(model_length_function)
    loader_training_params = {
        "num_workers": args.workers,
        "pin_memory": True,
        "shuffle": True,
        "drop_last": True,
    }
    loader_validation_params = loader_training_params.copy()
    loader_validation_params["shuffle"] = False
    loader_training = DataLoader(
        training,
        batch_size=args.batch_size,
        collate_fn=collate_fn_train,
        **loader_training_params,
    )
    loader_validation = DataLoader(
        validation,
        batch_size=args.batch_size,
        collate_fn=collate_fn_valid,
        **loader_validation_params,
    )
    # Setup checkpoint
    best_loss = 1.0
    load_checkpoint = args.checkpoint and os.path.isfile(args.checkpoint)
    if args.distributed:
        torch.distributed.barrier()
    if load_checkpoint:
        logging.info("Checkpoint: loading %s", args.checkpoint)
        checkpoint = torch.load(args.checkpoint)
        args.start_epoch = checkpoint["epoch"]
        best_loss = checkpoint["best_loss"]
        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        scheduler.load_state_dict(checkpoint["scheduler"])
        logging.info("Checkpoint: loaded '%s' at epoch %s", args.checkpoint,
                     checkpoint["epoch"])
    else:
        logging.info("Checkpoint: not found")
        # Save the initial state so a crash before epoch 1 is resumable.
        save_checkpoint(
            {
                "epoch": args.start_epoch,
                "state_dict": model.state_dict(),
                "best_loss": best_loss,
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler.state_dict(),
            },
            False,
            args.checkpoint,
            not_main_rank,
        )
    if args.distributed:
        torch.distributed.barrier()
    torch.autograd.set_detect_anomaly(False)
    for epoch in range(args.start_epoch, args.epochs):
        logging.info("Epoch: %s", epoch)
        train_one_epoch(
            model,
            criterion,
            optimizer,
            scheduler,
            loader_training,
            decoder,
            language_model,
            devices[0],
            epoch,
            args.clip_grad,
            not_main_rank,
            not args.reduce_lr_valid,
        )
        loss = evaluate(
            model,
            criterion,
            loader_validation,
            decoder,
            language_model,
            devices[0],
            epoch,
            not_main_rank,
        )
        if args.reduce_lr_valid and isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(loss)
        is_best = loss < best_loss
        best_loss = min(loss, best_loss)
        save_checkpoint(
            {
                "epoch": epoch + 1,
                "state_dict": model.state_dict(),
                "best_loss": best_loss,
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler.state_dict(),
            },
            is_best,
            args.checkpoint,
            not_main_rank,
        )
    logging.info("End time: %s", datetime.now())
    if args.distributed:
        torch.distributed.destroy_process_group()
def main():
    """Train (or directly evaluate) an MLP classifier on top of a frozen
    pretrained ALBERT encoder.

    When `EXPCONF.infer_now` is false, only the MLP head is trained while the
    encoder stays frozen; checkpoints and submissions are written whenever the
    dev accuracy clears `EXPCONF.savethld`. When true, evaluation/submission
    runs immediately with no classifier head.
    """
    # my dice shows 777 only. period.
    random.seed(EXPCONF.seed)
    np.random.seed(EXPCONF.seed)
    torch.manual_seed(EXPCONF.seed)
    torch.cuda.manual_seed_all(EXPCONF.seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tempconf = EXPCONF.copy()
    tempconf.datamode = 'test'
    testloader, ___, _____ = get_loader(tempconf)
    trainloader, __, _trainds = get_loader(EXPCONF, getdev=False)
    devloader, _, _devds = get_loader(EXPCONF, getdev=True)
    assert len(trainloader) > 0, f"trainloader is empty!"
    assert len(devloader) > 0, f"devloader is empty!"
    # this is disgraceful.... but just specify things below
    model_weight, vocab, trained_condition = loadmodel_info(EXPCONF)
    albertconf = retrieve_conf(trained_condition, vocab)
    albert = AlbertForPreTraining(albertconf)
    albert.load_state_dict(model_weight)
    albert = albert.to(device)
    global_step = 0
    L = len(trainloader)
    # NOTE(review): indexing assumes trainloader supports __getitem__ —
    # a plain torch DataLoader does not; verify what get_loader returns.
    bsz = len(trainloader[0])
    if not EXPCONF.infer_now:
        albert = albert.albert
        albert.eval()  # freeze
        cls = MLP(EXPCONF, albertconf.hidden_size, 2).to(device)
        cls.train()
        for p in cls.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        # huggingface example is doing this for language modeling...
        # https://github.com/huggingface/transformers/blob/v2.6.0/examples/run_language_modeling.py
        optimizer = AdamW(cls.parameters(), lr=EXPCONF.cls_lr)
        # otherwise, use default
        getsch = get_cosine_schedule_with_warmup if EXPCONF.cls_sch == 'cosine' else get_linear_schedule_with_warmup
        scheduler = getsch(optimizer, EXPCONF.cls_warmups, EXPCONF.cls_numsteps)
        ## train cls only!
        while global_step < EXPCONF.cls_numsteps:
            lossep_pp = 0
            accep_pp = 0
            cls.train()
            for i, (b, l, datasetids) in enumerate(
                    tqdm(trainloader, desc="iterations progress"), 1):
                # Encoder is frozen: skip building its autograd graph.
                with torch.no_grad():
                    outputs = albert(**b, return_dict=True)
                global_step += 1
                logits = cls(outputs.pooler_output)
                losspp = F.cross_entropy(logits, l)
                lossppval = losspp.item()
                acc = accuracy(logits.clone().detach(), l)
                wandb.log({
                    'step': global_step,
                    'cls.train_step/learning_rate': get_lr_from_optim(optimizer),
                    'cls.train_step/pp_loss': lossppval,
                    'cls.train_step/pp_acc': acc,
                })
                # FIX: gradients were never computed — the original called
                # optimizer.step() without losspp.backward(), so the MLP
                # head never trained.
                losspp.backward()
                optimizer.step()
                scheduler.step()
                cls.zero_grad()
                lossep_pp += lossppval
                accep_pp += acc
            if global_step % EXPCONF.logevery == 0:
                # NOTE(review): accumulators are averaged but never reset,
                # so later logs mix epochs — confirm whether intentional.
                lossep_pp /= L
                accep_pp /= L
                wandb.log({
                    'cls.train_ep/pp_loss': lossep_pp,
                    'cls.train_ep/pp_acc': accep_pp,
                })
            devpp_loss, devpp_acc = evaldev(EXPCONF, albert, cls, devloader,
                                            global_step)
            if devpp_acc > EXPCONF.savethld:
                savemodel(EXPCONF, albert, cls, vocab, global_step,
                          acc=devpp_acc)
                write_sub(EXPCONF, albert, cls, global_step, acc=devpp_acc,
                          testloader=testloader)
    else:  # infer now
        cls = None
        devpp_loss, devpp_acc = evaldev(EXPCONF, albert, cls, devloader,
                                        global_step,
                                        infernow=EXPCONF.infer_now)
        write_sub(EXPCONF, albert, cls, global_step, acc=devpp_acc,
                  testloader=testloader, infernow=EXPCONF.infer_now)
    return None
# Load the trained teacher weights and share its embedding with the student.
bertlm_t.load_state_dict(torch.load(f'./bert_pytorch/model/model_saved/bert.ep248.mdl'))
bertlm.bert.embedding = bertlm_t.bert.embedding
# Teacher is inference-only: eval mode and no gradients.
bertlm_t.eval()
for param in bertlm_t.parameters():
    param.requires_grad = False
# Resume the student from the previous epoch's KD checkpoint, if any.
if start_epoch != 0:
    try:
        bertlm.load_state_dict(torch.load(f'./bert_pytorch/model/model_saved_kd/bert_kd.ep{start_epoch-1}.mdl'))
    except:
        # NOTE(review): bare except re-raised as a generic Exception hides the
        # original error (missing file vs. shape mismatch) — consider narrowing.
        raise Exception("No File detected")
# Only the student's encoder is optimised; the shared embedding stays fixed
# through the teacher's requires_grad=False (it is the same module object).
optimizer = AdamW(bertlm.bert.encoder.parameters(), lr=1e-4)
criteria = nn.CrossEntropyLoss()
# FSP-matrix distillation between 12 teacher layers and 6 student layers.
kd_criteria = FSPLoss(t_layer=12, s_layer=6, stride=2)
params = filter(lambda p: p.requires_grad, bertlm.parameters())
num_params = sum([np.prod(p.size()) for p in params])
print("# of params:", num_params)
cuda = True
loss_list = []
batch_size = args.batch_size
epoch = args.epoch
# CLI flag arrives as a string; convert to bool.
mask_only = True if args.mask_only=='True' else False
print(f"mask_only {mask_only}")
def get_optimizer(optimizer_name: str, parameters, learning_rate: float, weight_decay=1e-5, **kwargs):
    """Factory returning a configured optimizer selected by name.

    Args:
        optimizer_name: case-insensitive optimizer identifier
            ("sgd", "adam", "rms", "adamw", "radam", "over9000", "lamb",
            "fused_lamb", "fused_adam").
        parameters: iterable of parameters (or param groups) to optimise.
        learning_rate: base learning rate.
        weight_decay: L2 penalty, defaults to 1e-5.
        **kwargs: forwarded verbatim to the optimizer constructor.

    Returns:
        A torch optimizer instance.

    Raises:
        ValueError: if `optimizer_name` matches no known optimizer.
    """
    name = optimizer_name.lower()
    if name == "sgd":
        return SGD(parameters,
                   lr=learning_rate,
                   momentum=0.9,
                   nesterov=True,
                   weight_decay=weight_decay,
                   **kwargs)
    if name == "adam":
        return Adam(parameters,
                    lr=learning_rate,
                    weight_decay=weight_decay,
                    eps=1e-5,
                    **kwargs)  # As Jeremy suggests
    if name == "rms":
        return RMSprop(parameters,
                       lr=learning_rate,
                       weight_decay=weight_decay,
                       **kwargs)
    if name == "adamw":
        return AdamW(parameters,
                     lr=learning_rate,
                     weight_decay=weight_decay,
                     eps=1e-5,
                     **kwargs)
    if name == "radam":
        return RAdam(parameters,
                     lr=learning_rate,
                     weight_decay=weight_decay,
                     eps=1e-5,
                     **kwargs)  # As Jeremy suggests
    if name == "over9000":
        return Over9000(parameters,
                        lr=learning_rate,
                        weight_decay=weight_decay,
                        eps=1e-5,
                        **kwargs)
    if name == "lamb":
        # FIX: the guard for this branch had been commented out while the
        # return stayed live, so every unmatched name fell through to an
        # unconditional `return Lamb(...)`, making the fused_* branches and
        # the ValueError below unreachable.
        return Lamb(parameters, learning_rate, weight_decay=weight_decay,
                    **kwargs)
    if name == "fused_lamb":
        from apex.optimizers import FusedLAMB
        return FusedLAMB(parameters,
                         learning_rate,
                         weight_decay=weight_decay,
                         **kwargs)
    if name == "fused_adam":
        from apex.optimizers import FusedAdam
        return FusedAdam(parameters,
                         learning_rate,
                         eps=1e-5,
                         weight_decay=weight_decay,
                         adam_w_mode=True,
                         **kwargs)
    raise ValueError("Unsupported optimizer name " + optimizer_name)
def main():
    """Entry point for HMM language-model training/evaluation.

    Builds the dataset/iterators, the FactoredHmmLm model, optimizer and
    scheduler from CLI args, then either evaluates an existing checkpoint
    (`--eval-only`) or runs the train/validate loop, logging to wandb.
    """
    global WANDB_STEP
    args = get_args()
    print(args)
    set_seed(args.seed)
    device = th.device("cpu" if args.devid < 0 else f"cuda:{args.devid}")
    args.device = device
    aux_device = th.device(
        "cpu" if args.aux_devid < 0 else f"cuda:{args.aux_devid}")
    args.aux_device = aux_device
    TEXT = torchtext.data.Field(batch_first=True)
    if args.dataset == "ptb":
        Dataset = PennTreebank
    elif args.dataset == "wikitext2":
        Dataset = WikiText2
    train, valid, test = Dataset.splits(
        TEXT,
        newline_eos=True,
    )
    TEXT.build_vocab(train)
    V = TEXT.vocab

    # Batch-size accounting for the bucket iterator: either cap on the
    # longest sequence (tokens) or on the number of sentences.
    def batch_size_tokens(new, count, sofar):
        return max(len(new.text), sofar)

    def batch_size_sents(new, count, sofar):
        return count

    if args.iterator == "bucket":
        train_iter, valid_iter, test_iter = BucketIterator.splits(
            (train, valid, test),
            batch_sizes=[args.bsz, args.eval_bsz, args.eval_bsz],
            device=device,
            sort_key=lambda x: len(x.text),
            batch_size_fn=batch_size_tokens
            if args.bsz_fn == "tokens" else batch_size_sents,
        )
    elif args.iterator == "bptt":
        train_iter, valid_iter, test_iter = BPTTIterator.splits(
            (train, valid, test),
            batch_sizes=[args.bsz, args.eval_bsz, args.eval_bsz],
            device=device,
            bptt_len=args.bptt,
            sort=False,
        )
    else:
        raise ValueError(f"Invalid iterator {args.iterator}")
    if args.no_shuffle_train:
        train_iter.shuffle = False
    name = get_name(args)
    import tempfile
    wandb.init(project="hmm-lm", name=name, config=args, dir=tempfile.mkdtemp())
    args.name = name
    model = None
    from models.factoredhmmlm import FactoredHmmLm
    model = FactoredHmmLm(V, args)
    model.to(device)
    print(model)
    num_params, num_trainable_params = count_params(model)
    print(f"Num params, trainable: {num_params:,}, {num_trainable_params:,}")
    wandb.run.summary["num_params"] = num_params
    if args.eval_only:
        # Evaluate a saved checkpoint on valid + test, then exit.
        model.load_state_dict(th.load(args.eval_only)["model"])
        v_start_time = time.time()
        if args.model == "mshmm" or args.model == "factoredhmm":
            if args.num_classes > 2**15:
                eval_fn = mixed_cached_eval_loop
            else:
                eval_fn = cached_eval_loop
        elif args.model == "hmm":
            eval_fn = cached_eval_loop
        else:
            eval_fn = eval_loop
        valid_losses, valid_n = eval_fn(
            args,
            V,
            valid_iter,
            model,
        )
        report(valid_losses, valid_n, f"Valid perf", v_start_time)
        t_start_time = time.time()
        test_losses, test_n = eval_fn(
            args,
            V,
            test_iter,
            model,
        )
        report(test_losses, test_n, f"Test perf", t_start_time)
        sys.exit()
    parameters = list(model.parameters())
    if args.optimizer == "adamw":
        optimizer = AdamW(
            parameters,
            lr=args.lr,
            betas=(args.beta1, args.beta2),
            weight_decay=args.wd,
        )
    elif args.optimizer == "sgd":
        optimizer = SGD(
            parameters,
            lr=args.lr,
        )
    if args.schedule == "reducelronplateau":
        scheduler = ReduceLROnPlateau(
            optimizer,
            factor=1. / args.decay,
            patience=args.patience,
            verbose=True,
            mode="max",
        )
    elif args.schedule == "noam":
        warmup_steps = args.warmup_steps

        def get_lr(step):
            # FIX: LambdaLR calls the lambda with step 0 on construction;
            # 0 ** (-0.5) raised ZeroDivisionError. Clamp to >= 1.
            step = max(step, 1)
            scale = warmup_steps**0.5 * min(step**(-0.5),
                                            step * warmup_steps**(-1.5))
            return args.lr * scale

        # FIX: dropped the misspelled 'verbse=True' kwarg, which made
        # LambdaLR raise TypeError (unexpected keyword argument).
        scheduler = LambdaLR(
            optimizer,
            get_lr,
            last_epoch=-1,
        )
    else:
        raise ValueError("Invalid schedule options")
    for e in range(args.num_epochs):
        start_time = time.time()
        if args.log_counts > 0 and args.keep_counts > 0:
            # reset at START of epoch
            model.state_counts.fill_(0)
        train_losses, train_n = train_loop(
            args,
            V,
            train_iter,
            model,
            parameters,
            optimizer,
            scheduler,
            valid_iter=valid_iter if not args.overfit else None,
            verbose=True,
        )
        total_time = report(train_losses, train_n, f"Train epoch {e}",
                            start_time)
        v_start_time = time.time()
        if args.model == "mshmm" or args.model == "factoredhmm":
            if args.num_classes > 2**15:
                eval_fn = mixed_cached_eval_loop
            else:
                eval_fn = cached_eval_loop
        elif args.model == "hmm":
            eval_fn = cached_eval_loop
        else:
            eval_fn = eval_loop
        valid_losses, valid_n = eval_fn(args, V, valid_iter, model)
        report(valid_losses, valid_n, f"Valid epoch {e}", v_start_time)
        if args.schedule in valid_schedules:
            scheduler.step(valid_losses.evidence
                           if not args.overfit else train_losses.evidence)
        update_best_valid(valid_losses, valid_n, model, optimizer, scheduler,
                          args.name)
        wandb.log(
            {
                "train_loss": train_losses.evidence / train_n,
                "train_ppl": math.exp(-train_losses.evidence / train_n),
                "epoch_time": total_time,
                "valid_loss": valid_losses.evidence / valid_n,
                "valid_ppl": math.exp(-valid_losses.evidence / valid_n),
                "best_valid_loss": BEST_VALID / valid_n,
                "best_valid_ppl": math.exp(-BEST_VALID / valid_n),
                "epoch": e,
            },
            step=WANDB_STEP)
        if args.log_counts > 0 and args.keep_counts > 0:
            counts = (model.counts / model.counts.sum(0, keepdim=True))[:, 4:]
            c, v = counts.shape
            cg2 = counts > 1e-2
            # state counts
            # log these once per epoch, then set back to zero
            sc0 = (model.state_counts == 0).sum()
            sc1 = (model.state_counts == 1).sum()
            sc2 = (model.state_counts == 2).sum()
            sc3 = (model.state_counts == 3).sum()
            sc4 = (model.state_counts == 4).sum()
            sc5 = (model.state_counts >= 5).sum()
            wandb.log(
                {
                    "avgcounts@1e-2": cg2.sum().item() / float(v),
                    "maxcounts@1e-2": cg2.sum(0).max().item(),
                    "mincounts@1e-2": cg2.sum(0).min().item(),
                    "maxcounts": counts.sum(0).max().item(),
                    "mincounts": counts.sum(0).min().item(),
                    "statecounts=0": sc0,
                    "statecounts=1": sc1,
                    "statecounts=2": sc2,
                    "statecounts=3": sc3,
                    "statecounts=4": sc4,
                    "statecounts>=5": sc5,
                },
                step=WANDB_STEP)
            del cg2
            del counts
    # won't use best model. Rerun with eval_only
    t_start_time = time.time()
    test_losses, test_n = eval_fn(
        args,
        V,
        test_iter,
        model,
    )
    report(test_losses, test_n, f"Test perf", t_start_time)
num_workers=num_workers) valid_loader = DataLoader(valid_dataset, batch_size=bs, shuffle=False, num_workers=num_workers) loaders = {"train": train_loader, "valid": valid_loader} num_epochs = get_dict_value_or_default(config, 'epochs', 100) logdir = "/var/data/bengali" + str(args.fold) + '_config_' + str( args.config) + '_comment_' + args.comment lr = get_dict_value_or_default(dict_=config, key='lr', default_value=args.lr) if config['opt'] == 'adamw': optimizer = AdamW(params=model.parameters(), lr=lr) elif config['opt'] == 'adam': optimizer = Adam(params=model.parameters(), lr=lr) elif config['opt'] == 'sgd': optimizer = SGD(params=model.parameters(), lr=lr, momentum=0.9, nesterov=True) elif config['opt'] == 'rmsprop': optimizer = torch.optim.RMSprop(params=model.parameters(), lr=lr) elif config['opt'] == 'radam': optimizer = RAdam(params=model.parameters(), lr=lr) else: raise Exception(config['opt'] + ' is not supported') scheduler = make_scheduler_from_config(optimizer=optimizer, config=config)
def main():
    """CLI entry point for the BERT tagger-rewriter: train, validate,
    or predict, selected by --mode."""
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('--mode', choices=['train', 'validate', 'predict'], default='train')
    arg('--run_root', default='.')
    arg('--batch-size', type=int, default=16)
    arg('--step', type=int, default=1)
    arg('--workers', type=int, default=0)
    arg('--lr', type=float, default=0.00003)
    arg('--adam_epsilon', type=float, default=1e-8)
    arg('--weight_decay', type=float, default=0.0)
    arg('--fold', type=int, default=0)
    arg('--warmup', type=float, default=0.05)
    arg('--limit', type=int)
    arg('--patience', type=int, default=1)
    arg('--clean', action='store_true')
    arg('--n-epochs', type=int, default=20)
    arg('--vocab-size', type=int, default=13318)
    arg('--multi-gpu', type=int, default=0)
    arg('--print-num', type=int, default=5)
    arg('--temperature', type=float)
    args = parser.parse_args()
    # Corpus columns: two context turns (a, b), the utterance to rewrite
    # (current) and the gold rewrite (label).
    df = pd.read_table('../data/dialog-rewrite/corpus.txt',
                       sep="\t\t",
                       names=['a', 'b', 'current', 'label'],
                       dtype=str)
    df.dropna(how='any', inplace=True)
    # 90/10 split by position (no shuffling).
    train_length = int(len(df) * 0.9)
    train_df = df.iloc[:train_length].iloc[:, :]
    valid_df = df.iloc[train_length:]
    print(valid_df.head())
    if args.mode == 'predict':
        # valid_df['current'] = valid_df['label']
        # In predict mode the validation frame is replaced by the test set.
        valid_df = pd.read_table('../data/dialog-rewrite/test.csv',
                                 sep=",",
                                 names=['a', 'b', 'current', 'label'],
                                 dtype=str)
        print(valid_df.tail())
    # Space-separated characters of the gold rewrite, used for evaluation.
    valid_df['eval_label'] = valid_df['label'].apply(
        lambda x: ' '.join(list(x)))
    if args.limit:
        train_df = train_df.iloc[0:args.limit]
        valid_df = valid_df.iloc[0:args.limit]
    # train_df['len'] = train_df['content'].apply(lambda x: len(x))
    run_root = Path('../experiments/' + args.run_root)
    tokenizer = BertTokenizer.from_pretrained("../rbt3")
    valid_set = TaggerRewriterDataset(valid_df, tokenizer, valid=True)
    # Keep only rows the dataset kept, so frame and dataset stay aligned.
    valid_index = np.array(valid_set.valid_index)
    # np.save('index.npy', valid_index)
    valid_df = valid_df.reset_index().loc[valid_index, :]
    ner_index = np.array(valid_set.label_type) == 1
    valid_loader = DataLoader(valid_set,
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.workers,
                              collate_fn=tagger_collate_fn)
    config = BertConfig.from_json_file('../rbt3/config.json')
    config.num_labels = 5
    # # config.is_decoder = True
    # decoder = BertModel.from_pretrained("../rbt3", config=config)
    # encoder = BertModel.from_pretrained("../rbt3")
    # args.vocab_size = config.vocab_size
    bert_path = '../rbt3'
    model = TaggerRewriteModel(config, bert_path)
    model.cuda()
    if args.mode == 'train':
        if run_root.exists() and args.clean:
            shutil.rmtree(run_root)
        run_root.mkdir(exist_ok=True, parents=True)
        # Persist the run configuration for reproducibility.
        (run_root / 'params.json').write_text(
            json.dumps(vars(args), indent=4, sort_keys=True))
        train_set = TaggerRewriterDataset(train_df, tokenizer)
        # np.save('index.npy', train_set.valid_index)
        train_loader = DataLoader(train_set,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.workers,
                                  collate_fn=tagger_collate_fn)
        # Prepare optimizer and schedule (linear warmup and decay)
        # Biases and LayerNorm weights are excluded from weight decay.
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                args.weight_decay,
            },
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.lr,
                          eps=args.adam_epsilon)
        t_total = int(len(train_df) * args.n_epochs / args.batch_size)
        warmup_steps = int(t_total * args.warmup)
        # scheduler = get_linear_schedule_with_warmup(
        #     optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
        # )
        scheduler = get_constant_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps)
        # apex mixed precision (O2); requires the earlier model.cuda().
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level='O2',
                                          verbosity=0)
        train(args,
              model,
              optimizer,
              scheduler,
              tokenizer,
              ner_index,
              train_loader=train_loader,
              valid_df=valid_df,
              valid_loader=valid_loader,
              epoch_length=len(train_df))
    elif args.mode == 'validate':
        model_path = run_root / ('tagger_model-%d.pt' % args.fold)
        load_model(model, model_path)
        valid_metrics = validate(model,
                                 valid_loader,
                                 valid_df,
                                 args,
                                 tokenizer,
                                 ner_index,
                                 decode_mode='beam_search')
    elif args.mode == 'predict':
        model_path = run_root / ('tagger_model-%d.pt' % args.fold)
        load_model(model, model_path)
        # NOTE(review): unlike the 'validate' branch this call omits
        # `ner_index` — verify against validate()'s signature whether that
        # shifts `decode_mode` into the wrong positional slot.
        valid_metrics = validate(model,
                                 valid_loader,
                                 valid_df,
                                 args,
                                 tokenizer,
                                 decode_mode='beam_search')
top1_acc = validate(val_loader, model) print("Epoch: {} Training Loss: {} Validation Top1 Acc: {}". format(epoch, loss, top1_acc)) # define model model = ViViT( image_size=224, # patch_size=16, # tubelet_temporal_size=2, # num_classes=51, # num_frames=32, # dim=768, # layer_spacial=12, # layer_temporal=4, # heads=12, # dim_head=64, dropout=0., emb_dropout=0, mlp_dim=3072, pretrain=True) parameters = filter(lambda p: p.requires_grad, model.parameters()) parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000 print('Trainable Parameters: %.3fM' % parameters) #define optimizer, criterion criterion = nn.CrossEntropyLoss().to(device) optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=1e-3) train(train_loader, model, criterion, optimizer, 200)
else: F1 = torch.jit.load(pt2_model.ILLUSTRATION2VEC) F2 = torch.jit.load(pt2_model.VGG16) S = pt2_model.Embedding(args.latent_dim) G = pt2_model.Generator(args.latent_dim, args.capacity) D = pt2_model.Discriminator(args.capacity) GP = pt2_model.GradientPenalty(D, λ2) MSE = nn.MSELoss() to_cuda(F1, F2, S, G, D, GP, MSE) to_eval(F1, F2) GS_parameters = list(G.parameters()) + list(S.parameters()) optim_GS = AdamW(GS_parameters, lr=α, betas=β) optim_D = AdamW(D.parameters(), lr=α, betas=β) # =============== # VALIDATION DATA # =============== _, v_composition, v_hints, v_style, v_illustration = dataset[7] c, h, w = v_composition.size() v_composition = v_composition.unsqueeze(0).cuda() v_hints = v_hints.unsqueeze(0).cuda() v_style = v_style.unsqueeze(0).cuda() v_illustration = v_illustration.unsqueeze(0).cuda() v_noise = torch.rand((1, 1, h, w)).cuda() with torch.no_grad():
def get_optimizer(args, model):
    """Build an (optimizer, scheduler) pair for *model* from CLI arguments.

    Side effect: sets ``args.warmup_steps`` from ``args.warmup_prop`` and
    ``args.max_train_steps``.

    Supported ``args.optimizer`` values: 'adamw-bert', 'adamw-torch', 'sgd',
    'adagrad', 'adam', 'adamax'.  The scheduler is ``None`` for the plain
    torch optimizers; the AdamW variants get a linear-warmup schedule.

    Raises:
        Exception: for an unsupported ``args.optimizer`` value.
    """
    args.warmup_steps = math.ceil(args.warmup_prop * args.max_train_steps)
    if args.optimizer == 'adamw-bert':
        # BERT recipe: no weight decay on biases and LayerNorm weights.
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': args.weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        }]
        optimizer = huggingfaceOptim.AdamW(optimizer_grouped_parameters,
                                           lr=args.learning_rate,
                                           eps=args.adam_epsilon,
                                           betas=(args.beta1, args.beta2))
        scheduler = huggingfaceOptim.WarmupLinearSchedule(
            optimizer,
            warmup_steps=args.warmup_steps,
            t_total=args.max_train_steps)
        debug_print('\n - Use Huggingface\'s AdamW Optimizer')
    elif args.optimizer == 'adamw-torch':
        try:
            from torch.optim import AdamW
        except ImportError as e:
            debug_print(f'torch version: {torch.__version__}')
            raise e
        no_decay = ['bias', 'LayerNorm.weight']
        # Same grouping as above, but restricted to trainable parameters.
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay) and p.requires_grad
            ],
            'weight_decay': args.weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay) and p.requires_grad
            ],
            'weight_decay': 0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon,
                          betas=(args.beta1, args.beta2))
        scheduler = huggingfaceOptim.WarmupLinearSchedule(
            optimizer,
            warmup_steps=args.warmup_steps,
            t_total=args.max_train_steps)
    elif args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate)
        scheduler = None
    elif args.optimizer == 'adagrad':
        optimizer = torch.optim.Adagrad(model.parameters(),
                                        lr=args.learning_rate)
        scheduler = None
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.learning_rate,
                                     betas=args.betas,
                                     eps=args.eps,
                                     weight_decay=args.weight_decay)
        scheduler = None
    elif args.optimizer == 'adamax':
        # BUG FIX: this branch tested `args.rnn_optimizer`, so selecting
        # --optimizer adamax fell through to the "Unsupported optimizer"
        # error (or an AttributeError) instead of building Adamax.
        optimizer = torch.optim.Adamax(model.parameters())  # use default lr
        scheduler = None
    else:
        raise Exception("Unsupported optimizer: {}".format(args.optimizer))
    return optimizer, scheduler
train_input_ids, train_attention_mask = tokenizer_data(train_sentences) test_input_ids, test_attention_mask = tokenizer_data(test_sentences) print("inputs id is ready") train_dataset = TensorDataset(torch.tensor(train_input_ids), torch.tensor(train_attention_mask), torch.tensor(train_labels)) train_loader = DataLoader(train_dataset, batch_size=64) test_dataset = TensorDataset(torch.tensor(test_input_ids), torch.tensor(test_attention_mask)) test_loader = DataLoader(test_dataset, batch_size=64) optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8) epochs = 5 print("training") model.to(device) for epoch in range(0, epochs): model.train() for step, batch in enumerate(train_loader): train_input_ids = batch[0].to(device) train_attention_mask = batch[1].to(device) train_labels = batch[2].to(device) model.zero_grad() output = model(input_ids=train_input_ids, attention_mask=train_attention_mask, labels=train_labels)
def main(args):
    """Train a basecalling-style Model on chunked signal data.

    Creates/validates the working directory, loads data, optionally resumes
    weights, trains for args.epochs epochs with AdamW + cosine annealing, and
    appends per-epoch metrics to training.csv.
    """
    workdir = os.path.expanduser(args.training_directory)
    # Refuse to clobber an existing run unless --force is given.
    if os.path.exists(workdir) and not args.force:
        print("[error] %s exists." % workdir)
        exit(1)
    init(args.seed, args.device)
    device = torch.device(args.device)
    print("[loading data]")
    chunks, chunk_lengths, targets, target_lengths = load_data(
        limit=args.chunks, shuffle=True, directory=args.directory)
    # First `split` samples train; the remainder validates.
    split = np.floor(chunks.shape[0] * args.validation_split).astype(np.int32)
    train_dataset = ChunkDataSet(chunks[:split], chunk_lengths[:split],
                                 targets[:split], target_lengths[:split])
    test_dataset = ChunkDataSet(chunks[split:], chunk_lengths[split:],
                                targets[split:], target_lengths[split:])
    train_loader = DataLoader(train_dataset, batch_size=args.batch,
                              shuffle=True, num_workers=4, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=args.batch,
                             num_workers=4, pin_memory=True)
    config = toml.load(args.config)
    argsdict = dict(training=vars(args))
    print("[loading model]")
    model = Model(config)
    # Resume from an existing weights file, if present.
    weights = os.path.join(workdir, 'weights.tar')
    if os.path.exists(weights):
        model.load_state_dict(torch.load(weights))
    model.to(device)
    model.train()
    os.makedirs(workdir, exist_ok=True)
    # Persist the effective config (model config + CLI args) for this run.
    toml.dump({
        **config,
        **argsdict
    }, open(os.path.join(workdir, 'config.toml'), 'w'))
    optimizer = AdamW(model.parameters(), amsgrad=True, lr=args.lr)
    if args.amp:
        try:
            model, optimizer = amp.initialize(model, optimizer,
                                              opt_level="O1", verbosity=0)
        except NameError:
            # `amp` is only bound when apex imported successfully.
            print(
                "[error]: Cannot use AMP: Apex package needs to be installed manually, See https://github.com/NVIDIA/apex"
            )
            exit(1)
    # NOTE(review): T_max here counts *batches* (epochs * len(train_loader))
    # but step() below is called once per *epoch* — confirm this is intended.
    schedular = CosineAnnealingLR(optimizer, args.epochs * len(train_loader))
    for epoch in range(1, args.epochs + 1):
        try:
            train_loss, duration = train(model, device, train_loader,
                                         optimizer, use_amp=args.amp)
            val_loss, val_mean, val_median = test(model, device, test_loader)
        except KeyboardInterrupt:
            # Allow Ctrl-C to stop training cleanly after the current epoch.
            break
        print(
            "[epoch {}] directory={} loss={:.4f} mean_acc={:.3f}% median_acc={:.3f}%"
            .format(epoch, workdir, val_loss, val_mean, val_median))
        # Per-epoch checkpoint.
        torch.save(model.state_dict(),
                   os.path.join(workdir, "weights_%s.tar" % epoch))
        # Append metrics; write the CSV header only on the first epoch.
        with open(os.path.join(workdir, 'training.csv'), 'a',
                  newline='') as csvfile:
            csvw = csv.writer(csvfile, delimiter=',')
            if epoch == 1:
                csvw.writerow([
                    'time', 'duration', 'epoch', 'train_loss',
                    'validation_loss', 'validation_mean', 'validation_median'
                ])
            csvw.writerow([
                datetime.today(),
                int(duration),
                epoch,
                train_loss,
                val_loss,
                val_mean,
                val_median,
            ])
        schedular.step()
def main():
    """Train a GAN (generator + discriminator) on an image folder.

    Per epoch: alternate discriminator and generator AdamW updates with
    one-sided label smoothing and occasional label swapping; every 10 epochs
    save both models and a grid of generated samples.

    Fixes vs. the original:
      * the final progress line was redaction-garbled
        (``print("Time pass:"******"Done.")``) — reconstructed as printing the
        elapsed-time string each epoch and "Done." after training;
      * a duplicated ``D_optimizer.zero_grad()`` (dead call) was removed.
    Comments were translated from Chinese.
    """
    # Use the GPU when available, otherwise fall back to the CPU.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("Use " + str(device))
    # Make sure the sample-output directory exists.
    if not os.path.exists(config.output_path):
        os.mkdir(config.output_path)
    # Collect training image file names from the image directory.
    file_list = None
    for path, dirs, files in os.walk(config.img_path, topdown=False):
        file_list = list(files)
    train_dataset = image_dataset(file_list, config.img_path,
                                  transform=get_transforms(config.img_size))
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=config.batchSize,
                              shuffle=True)
    # Generator G and discriminator D (optionally resumed from saved weights).
    G_model = get_G_model(config.from_old_model, device, config.G_model_path)
    D_model = get_D_model(config.from_old_model, device, config.D_model_path)
    # AdamW optimisers for both networks.
    G_optimizer = AdamW(G_model.parameters(), lr=3e-4, weight_decay=1e-6)
    D_optimizer = AdamW(D_model.parameters(), lr=3e-4, weight_decay=1e-6)
    criterion = config.criterion
    # Optional apex mixed-precision training.
    if config.use_apex:
        G_model, G_optimizer = amp.initialize(G_model, G_optimizer,
                                              opt_level="O1")
        D_model, D_optimizer = amp.initialize(D_model, D_optimizer,
                                              opt_level="O1")
    # Wall-clock reference for progress reporting.
    train_start = time.time()
    for epoch in range(config.epochs):
        print("start epoch "+str(epoch+1)+":")
        # Running totals for this epoch's losses.
        batch_num = len(train_loader)
        D_loss_sum = 0
        G_loss_sum = 0
        count = 0
        for index, images in enumerate(train_loader):
            count += 1
            images = images.to(device)
            # One-sided label smoothing: real labels are 0.9, fake labels 0.
            # (Earlier experiments with random/all-one labels kept as history.)
            # real_labels = (1 - torch.rand(config.batchSize, 1)/10).to(device)
            # real_labels = Variable(torch.ones(config.batchSize, 1)).to(device)
            real_labels = (Variable(torch.ones(config.batchSize, 1))-0.1).to(device)
            fake_labels = Variable(torch.zeros(config.batchSize, 1)).to(device)
            # Generate fake images from random seeds.
            img_seeds = torch.randn(config.batchSize,
                                    config.img_seed_dim).to(device)
            fake_images = G_model(img_seeds)
            # With some probability, swap real/fake labels for the D step only.
            exchange_labels = False
            if random.uniform(0, 1) < config.D_train_label_exchange:
                real_labels, fake_labels = fake_labels, real_labels
                exchange_labels = True
            # ---- Train discriminator D ----
            D_optimizer.zero_grad()
            real_output = D_model(images)
            # The final batch can be shorter than batchSize; trim labels.
            if len(real_labels) > len(real_output):
                D_loss_real = criterion(real_output,
                                        real_labels[:len(real_output)])
            else:
                D_loss_real = criterion(real_output, real_labels)
            fake_output = D_model(fake_images)
            D_loss_fake = criterion(fake_output, fake_labels)
            D_loss = D_loss_real + D_loss_fake
            D_loss_sum += D_loss.item()
            if config.use_apex:
                with amp.scale_loss(D_loss, D_optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                D_loss.backward()
            D_optimizer.step()
            # Undo the label swap before the generator step.
            if exchange_labels:
                real_labels, fake_labels = fake_labels, real_labels
            # ---- Train generator G ----
            img_seeds = torch.randn(config.batchSize,
                                    config.img_seed_dim).to(device)
            fake_images = G_model(img_seeds)
            fake_output = D_model(fake_images)
            # G wants D to call its fakes real.
            G_loss = criterion(fake_output, real_labels)
            G_loss_sum += G_loss.item()
            G_optimizer.zero_grad()
            if config.use_apex:
                with amp.scale_loss(G_loss, G_optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                G_loss.backward()
            G_optimizer.step()
            # Progress report every 200 batches.
            if (index + 1) % 200 == 0:
                print("Epoch: %2d, Batch: %4d / %4d" %
                      (epoch + 1, index + 1, batch_num))
        if (epoch+1) % 10 == 0:
            # Save both models every 10 epochs...
            torch.save(G_model.state_dict(), config.G_model_path)
            torch.save(D_model.state_dict(), config.D_model_path)
            # ...and dump a grid of generated samples to the output folder.
            img_seeds = torch.randn(config.batchSize,
                                    config.img_seed_dim).to(device)
            fake_images = G_model(img_seeds).cuda().data
            # Rescale from [-1, 1] to [0, 1] for save_image.
            fake_images = 0.5 * (fake_images + 1)
            fake_images = fake_images.clamp(0, 1)
            fake_images = fake_images.view(-1, 3, config.img_size,
                                           config.img_size)
            save_image(fake_images,
                       config.output_path+str(epoch+1)+'.png')
        # Epoch summary: mean losses and elapsed time.
        print("D_loss:", round(D_loss_sum / count, 3))
        print("G_loss:", round(G_loss_sum / count, 3))
        current_time = time.time()
        pass_time = int(current_time - train_start)
        time_string = str(pass_time // 3600) + " hours, " + \
            str((pass_time % 3600) // 60) + " minutes, " + \
            str(pass_time % 60) + " seconds."
        # NOTE(review): reconstructed from a redacted source line.
        print("Time pass:", time_string)
    print("Done.")
# model = BertForQuestionAnswering.from_pretrained('bert-base-chinese') model = BertForQuestionAnswering.from_pretrained( 'hfl/chinese-roberta-wwm-ext-large') # model = BertForQuestionAnswering.from_pretrained('roberta_base_lm_finetune') # model = BertForQuestionAnswering.from_pretrained('roberta_large_lm_finetune') ranker = BertForSequenceClassification.from_pretrained( 'hfl/chinese-roberta-wwm-ext-large') # pdb.set_trace() model.to(device) ranker.to(device) optimizer = AdamW(model.parameters(), lr=lr, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False) optimizer_rank = AdamW(ranker.parameters(), lr=lr, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False) optimizer.zero_grad() optimizer_rank.zero_grad() # em_current, f1_current = validate(model) em_dev_best = 0
def main():
    """Train an ESIM sentence-pair model with FocalLoss.

    Parses CLI arguments, builds embeddings/vocab and train/dev loaders,
    optionally resumes from a checkpoint (epoch recovered from the file name),
    then trains with AdamW + gradient clipping, validating and checkpointing
    every 500 steps.
    """
    parser = ArgumentParser()
    parser.add_argument("--epoch", type=int, required=True)
    parser.add_argument("--seed", type=int, required=True)
    parser.add_argument("--emb_file", type=str, required=True)
    parser.add_argument("--checkpoint", type=str, required=True)
    parser.add_argument("--save_dir", type=str, required=True)
    parser.add_argument("--train_file", type=str, required=True)
    parser.add_argument("--log_file", type=str, required=False)
    parser.add_argument("--ratio", type=str, required=True)
    parser.add_argument("--vocab_size", type=int, required=True)
    parser.add_argument("--emb_size", type=int, required=True)
    parser.add_argument("--learning_rate", type=float, required=True)
    parser.add_argument("--batch_size", type=int, required=True)
    parser.add_argument("--max_length", type=int, required=True)
    parser.add_argument("--max_grad_norm", type=int, required=True)
    args = parser.parse_args()
    # "0.9,0.1"-style train/dev split.
    split_ratio = [float(val) for val in args.ratio.split(",")]
    has_cuda = torch.cuda.is_available()
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
    DATE_FORMAT = "%m/%d/%Y %H:%M:%S %p"
    logging.basicConfig(filename=args.log_file,
                        level=logging.INFO,
                        format=LOG_FORMAT,
                        datefmt=DATE_FORMAT)
    logging.info("start preparing data")
    data_preprocessor = DataPreprocess()
    emb, word_idx_map = data_preprocessor.build_emb_vocab(args.emb_file)
    data_preprocessor.load(args.train_file, use_mask=False, is_test=False)
    train_dataset, dev_dataset = data_preprocessor.generate_train_dev_dataset(
        ratio=split_ratio)
    train_dataset, dev_dataset = CompDataSet(
        train_dataset,
        word_idx_map,
        max_len=args.max_length,
        emb_size=args.emb_size), CompDataSet(dev_dataset,
                                             word_idx_map,
                                             max_len=args.max_length,
                                             emb_size=args.emb_size)
    train_dataset = DataLoader(train_dataset,
                               batch_size=args.batch_size,
                               shuffle=True)
    dev_dataset = DataLoader(dev_dataset,
                             batch_size=args.batch_size,
                             shuffle=True)
    logging.info("init model")
    start_epoch = 0
    if args.checkpoint:
        model = torch.load(args.checkpoint)
        # Recover the epoch from a "<epoch>_<step>.pt" checkpoint name.
        # BUG FIX: the pattern was a non-raw string with an invalid `\_`
        # escape and an unescaped `.` — now a raw string with the dot escaped.
        # NOTE(review): findall scans the whole path; digit runs in directory
        # names could in principle match — confirm checkpoint paths are flat.
        start_epoch = re.findall(r"\d+(?=_\d+\.pt)", args.checkpoint)
        start_epoch = int(start_epoch[0]) + 1
    else:
        model = ESIM(args.vocab_size,
                     args.emb_size,
                     emb,
                     max_len=args.max_length)
    optimizer = AdamW(model.parameters(), lr=args.learning_rate)
    criterion = FocalLoss()
    if has_cuda:
        model = model.cuda()
    logging.info("start training")
    # Baseline metrics before any training.
    neg_auc, pos_auc = validate(model, dev_dataset)
    logging.info(f"pre-train neg_auc {str(neg_auc)} pos_auc {str(pos_auc)}")
    for epoch in range(start_epoch, args.epoch):
        running_loss = 0.0
        for step, data in enumerate(train_dataset):
            model.train()
            start_time = time.time()
            optimizer.zero_grad()
            outputs = model(data["premise"], data["premise_mask"],
                            data["hypothese"], data["hypothese_mask"])
            loss = criterion(outputs["probs"], data["label"])
            loss.backward()
            clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            end_time = time.time()
            running_loss += loss.item()
            # Log the mean loss over each 100-step window.
            if step % 100 == 99:
                logging.info(
                    f"epoch: {epoch}, step: {step}, time: {end_time - start_time} loss: {running_loss / 100}"
                )
                running_loss = 0
            # Validate and checkpoint every 500 steps.
            if step % 500 == 499:
                neg_auc, pos_auc = validate(model, dev_dataset)
                logging.info(
                    f"pre-train neg_auc {str(neg_auc)} pos_auc {str(pos_auc)}")
                torch.save(model, Path(args.save_dir) / f"{epoch}_{step}.pt")
def __init__(self, image_encoder, text_encoder, image_mha, bert_model, optimizer='adam', lr=1e-3, l2_regularization=1e-2, margin_loss=1e-2, max_violation=True, cost_style='mean', use_lr_scheduler=False, grad_clip=0, num_training_steps=30000, device='cuda'): self.image_mha = image_mha self.image_encoder = image_encoder self.text_encoder = text_encoder self.bert_model = bert_model self.device = device self.use_lr_scheduler = use_lr_scheduler self.params = [] self.params = list(self.image_mha.parameters()) self.params += list(self.text_encoder.parameters()) self.params += list(self.image_encoder.parameters()) self.params += list(self.bert_model.parameters()) self.grad_clip = grad_clip self.frozen = False if optimizer == 'adamW': self.optimizer = AdamW([{ 'params': list(self.bert_model.parameters()), 'lr': 3e-5 }, { 'params': list(self.image_encoder.parameters()) + list(self.text_encoder.parameters()) + list(self.image_mha.parameters()), 'lr': 1e-4 }]) elif optimizer == 'adam': self.optimizer = torch.optim.Adam([{ 'params': list(self.bert_model.parameters()), 'lr': 3e-5 }, { 'params': list(self.image_encoder.parameters()) + list(self.text_encoder.parameters()) + list(self.image_mha.parameters()), 'lr': 1e-4 }]) # self.optimizer = torch.optim.Adam([{'params':list(self.bert_model.parameters()),'lr':3e-5}, # {'params':list(self.text_encoder.parameters()) + list(self.image_mha.parameters()),'lr':1e-4}]) if self.use_lr_scheduler: self.lr_scheduler = get_linear_schedule_with_warmup( self.optimizer, num_warmup_steps=100, num_training_steps=num_training_steps) self.lr_scheduler_0 = get_constant_schedule(self.optimizer) # loss self.mrl_loss = MarginRankingLoss(margin=margin_loss, max_violation=max_violation, cost_style=cost_style, direction='bidir')
def __init__(self, config, pretrained=True): self.config = config self.model, self.vocab = build_model(config) self.device = config['device'] self.num_iters = config['trainer']['iters'] self.beamsearch = config['predictor']['beamsearch'] self.data_root = config['dataset']['data_root'] self.train_annotation = config['dataset']['train_annotation'] self.valid_annotation = config['dataset']['valid_annotation'] self.dataset_name = config['dataset']['name'] self.batch_size = config['trainer']['batch_size'] self.print_every = config['trainer']['print_every'] self.valid_every = config['trainer']['valid_every'] self.checkpoint = config['trainer']['checkpoint'] self.export_weights = config['trainer']['export'] self.metrics = config['trainer']['metrics'] logger = config['trainer']['log'] if logger: self.logger = Logger(logger) if pretrained: weight_file = download_weights(**config['pretrain'], quiet=config['quiet']) self.load_weights(weight_file) self.iter = 0 self.optimizer = AdamW(self.model.parameters(), betas=(0.9, 0.98), eps=1e-09) self.scheduler = OneCycleLR(self.optimizer, total_steps=self.num_iters, **config['optimizer']) # self.optimizer = ScheduledOptim( # Adam(self.model.parameters(), betas=(0.9, 0.98), eps=1e-09), # #config['transformer']['d_model'], # 512, # **config['optimizer']) self.criterion = LabelSmoothingLoss(len(self.vocab), padding_idx=self.vocab.pad, smoothing=0.1) transforms = ImgAugTransform() self.train_gen = self.data_gen('train_{}'.format(self.dataset_name), self.data_root, self.train_annotation, transform=transforms) if self.valid_annotation: self.valid_gen = self.data_gen( 'valid_{}'.format(self.dataset_name), self.data_root, self.valid_annotation) self.train_losses = []
nn.Linear(768, 1536), nn.Tanh(), nn.Linear(1536, 1536), nn.Tanh(), nn.Linear(1536, len(label_encoder.classes_))) model.classifier = classifier_head print(f"\t- Tokenizing data.") train_ds = tokenize_inputs(train_ds, args.text_col, tokenizer) test_ds = tokenize_inputs(test_ds, args.text_col, tokenizer) print(f"\t- Preparing inputs for training and evaluation.") train_ds = prepare_inputs(train_ds, args.text_col, args.label_col) test_ds = prepare_inputs(test_ds, args.text_col, args.label_col) warmup_steps = math.ceil((len(train_ds) / args.bs) * args.epochs * 0.1) #10% of train data for warm-up train_steps = int(args.epochs * len(train_ds) / args.bs) optimizer = AdamW(model.parameters(), lr=args.lr) scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=train_steps, num_cycles=0.5) training_args = TrainingArguments( output_dir=args.checkpoint_dir, # output directory num_train_epochs=args.epochs, # total number of training epochs per_device_train_batch_size=args. bs, # batch size per device during training per_device_eval_batch_size=args.bs, # batch size for evaluation # warmup_steps=warmup_steps, # number of warmup steps for learning rate scheduler weight_decay=args.wd, # strength of weight decay evaluation_strategy="epoch", # evaluation interval logging_dir=args.checkpoint_dir, # directory for storing logs
def main() -> None:
    """Train a windowed MLP on VCF genotype data.

    Loads VCF data, splits train/validation with label-grouped batch
    samplers, optionally resumes from a checkpoint (validating that the
    checkpoint matches the current model/data), then trains with AdamW,
    tracking and checkpointing the best loss.

    Fix vs. the original: the "no checkpoint found" message formatted
    ``args.resume`` — an attribute that does not exist (the option is
    ``args.resume_path``) — so the miss path raised AttributeError.
    """
    global best_loss
    args = parser.parse_args()
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')
    start_epoch = 0
    vcf_reader = VCFReader(args.train_data, args.classification_map,
                           args.chromosome, args.class_hierarchy)
    vcf_writer = vcf_reader.get_vcf_writer()
    train_dataset, validation_dataset = vcf_reader.get_datasets(
        args.validation_split)
    # Batches are grouped so each batch shares a label.
    train_sampler = BatchByLabelRandomSampler(args.batch_size,
                                              train_dataset.labels)
    train_loader = DataLoader(train_dataset, batch_sampler=train_sampler)
    if args.validation_split != 0:
        validation_sampler = BatchByLabelRandomSampler(
            args.batch_size, validation_dataset.labels)
        validation_loader = DataLoader(validation_dataset,
                                       batch_sampler=validation_sampler)
    # Model hyper-parameters are kept as a dict so they can be compared
    # against (and stored in) checkpoints.
    kwargs = {
        'total_size': vcf_reader.positions.shape[0],
        'window_size': args.window_size,
        'num_layers': args.layers,
        'num_classes': len(vcf_reader.label_encoder.classes_),
        'num_super_classes': len(vcf_reader.super_label_encoder.classes_)
    }
    model = WindowedMLP(**kwargs)
    model.to(get_device(args))
    optimizer = AdamW(model.parameters(), lr=args.learning_rate)
    #######
    if args.resume_path is not None:
        if os.path.isfile(args.resume_path):
            print("=> loading checkpoint '{}'".format(args.resume_path))
            checkpoint = torch.load(args.resume_path)
            # Refuse checkpoints built for a different architecture or data.
            if kwargs != checkpoint['model_kwargs']:
                raise ValueError(
                    'The checkpoint\'s kwargs don\'t match the ones used to initialize the model'
                )
            if vcf_reader.snps.shape[0] != checkpoint['vcf_writer'].snps.shape[
                    0]:
                raise ValueError(
                    'The data on which the checkpoint was trained had a different number of snp positions'
                )
            start_epoch = checkpoint['epoch']
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume_path, checkpoint['epoch']))
        else:
            # BUG FIX: was args.resume (nonexistent attribute).
            print("=> no checkpoint found at '{}'".format(args.resume_path))
    #############
    if args.validate:
        # Validation-only mode: evaluate and exit.
        validate(validation_loader, model,
                 nn.functional.binary_cross_entropy_with_logits,
                 len(vcf_reader.label_encoder.classes_),
                 len(vcf_reader.super_label_encoder.classes_), vcf_reader.maf,
                 args)
        return
    for epoch in range(start_epoch, args.epochs + start_epoch):
        loss = train(train_loader, model,
                     nn.functional.binary_cross_entropy_with_logits,
                     optimizer, len(vcf_reader.label_encoder.classes_),
                     len(vcf_reader.super_label_encoder.classes_),
                     vcf_reader.maf, epoch, args)
        # Checkpoint periodically and on the final epoch.
        if epoch % args.save_freq == 0 or epoch == args.epochs + start_epoch - 1:
            if args.validation_split != 0:
                validation_loss = validate(
                    validation_loader, model,
                    nn.functional.binary_cross_entropy_with_logits,
                    len(vcf_reader.label_encoder.classes_),
                    len(vcf_reader.super_label_encoder.classes_),
                    vcf_reader.maf, args)
                is_best = validation_loss < best_loss
                best_loss = min(validation_loss, best_loss)
            else:
                # No validation split: fall back to the training loss.
                is_best = loss < best_loss
                best_loss = min(loss, best_loss)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'model_kwargs': kwargs,
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict(),
                    'vcf_writer': vcf_writer,
                    'label_encoder': vcf_reader.label_encoder,
                    'super_label_encoder': vcf_reader.super_label_encoder,
                    'maf': vcf_reader.maf
                }, is_best, args.chromosome, args.model_name, args.model_dir)
pin_memory=True) model = InceptionI3d(157, in_channels=3, output_method='avg_pool') model.load_state_dict( torch.load('../../data/external_models/i3d_rgb_charades.pt')) model.replace_logits(2) #model = nn.DataParallel(model) model = model.to(DEVICE) criterion_train = MixupBCELoss() criterion_test = torch.nn.CrossEntropyLoss() lr = 1e-3 optimizer = AdamW((p for p in model.parameters() if p.requires_grad), lr=lr) scheduler = torch.optim.lr_scheduler.OneCycleLR( optimizer, max_lr=lr, steps_per_epoch=len(dataloader_train), epochs=EPOCHS) for epoch in range(EPOCHS): print(f"\nEPOCH {epoch + 1} of {EPOCHS}") # TRAIN model.train() losses, accuracy = Am(), Am() for i, (x, y_true) in enumerate(dataloader_train): x = x.to(DEVICE) t = x.size(2) y_true = y_true.to(DEVICE) x, index, lam = cutmix_apply(x, CUTMIX_ALPHA)
def __init__(self, input_size, output_size): super().__init__() self.layer1 = Mem(input_size, 10) self.layer2 = Mem(10, 10) self.layer3 = Mem(10, output_size) def forward(self, x): x = self.layer1(x) x = self.layer2(x) x = self.layer3(x) return x placeholders_ = [[col.empty() for col in st.beta_columns(1)] for x in range(5)] placeholders = [[col.empty() for col in st.beta_columns(1)] for x in range(5)] net = Net(12, 12) optimizer = AdamW(net.parameters(), lr=0.001) criterion = nn.MSELoss() x_ = [torch.tensor([ [0, 1, 0, 0], [1, 1, 1, 1], [0, 1, 0, 0], ], dtype=float), torch.tensor([ [0, 0, 1, 0], [1, 1, 1, 1], [0, 0, 1, 0], ], dtype=float)] y_ = [torch.tensor([ [1, 0.5, 0.5, 1], [1, 0, 0, 0], [1, 0, 0, 0],
def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictionary save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, freeze = \ Path(opt.save_dir), opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \ opt.resume, opt.noval, opt.nosave, opt.workers, opt.freeze callbacks.run('on_pretrain_routine_start') # Directories w = save_dir / 'weights' # weights dir (w.parent if evolve else w).mkdir(parents=True, exist_ok=True) # make dir last, best = w / 'last.pt', w / 'best.pt' # Hyperparameters if isinstance(hyp, str): with open(hyp, errors='ignore') as f: hyp = yaml.safe_load(f) # load hyps dict LOGGER.info( colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items())) # Save run settings if not evolve: with open(save_dir / 'hyp.yaml', 'w') as f: yaml.safe_dump(hyp, f, sort_keys=False) with open(save_dir / 'opt.yaml', 'w') as f: yaml.safe_dump(vars(opt), f, sort_keys=False) # Loggers data_dict = None if RANK in [-1, 0]: loggers = Loggers(save_dir, weights, opt, hyp, LOGGER) # loggers instance if loggers.wandb: data_dict = loggers.wandb.data_dict if resume: weights, epochs, hyp, batch_size = opt.weights, opt.epochs, opt.hyp, opt.batch_size # Register actions for k in methods(loggers): callbacks.register_action(k, callback=getattr(loggers, k)) # Config plots = not evolve # create plots cuda = device.type != 'cpu' init_seeds(1 + RANK) with torch_distributed_zero_first(LOCAL_RANK): data_dict = data_dict or check_dataset(data) # check if None train_path, val_path = data_dict['train'], data_dict['val'] nc = 1 if single_cls else int(data_dict['nc']) # number of classes names = ['item'] if single_cls and len( data_dict['names']) != 1 else data_dict['names'] # class names assert len( names ) == nc, f'{len(names)} names found for nc={nc} dataset in {data}' # check is_coco = isinstance(val_path, str) and val_path.endswith( 'coco/val2017.txt') # COCO dataset # 
Model check_suffix(weights, '.pt') # check weights pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(LOCAL_RANK): weights = attempt_download( weights) # download if not found locally ckpt = torch.load(weights, map_location='cpu' ) # load checkpoint to CPU to avoid CUDA memory leak model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create exclude = [ 'anchor' ] if (cfg or hyp.get('anchors')) and not resume else [] # exclude keys csd = ckpt['model'].float().state_dict( ) # checkpoint state_dict as FP32 csd = intersect_dicts(csd, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(csd, strict=False) # load LOGGER.info( f'Transferred {len(csd)}/{len(model.state_dict())} items from {weights}' ) # report else: model = Model(cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create # Freeze freeze = [ f'model.{x}.' for x in (freeze if len(freeze) > 1 else range(freeze[0])) ] # layers to freeze for k, v in model.named_parameters(): v.requires_grad = True # train all layers if any(x in k for x in freeze): LOGGER.info(f'freezing {k}') v.requires_grad = False # Image size gs = max(int(model.stride.max()), 32) # grid size (max stride) imgsz = check_img_size(opt.imgsz, gs, floor=gs * 2) # verify imgsz is gs-multiple # Batch size if RANK == -1 and batch_size == -1: # single-GPU only, estimate best batch size batch_size = check_train_batch_size(model, imgsz) loggers.on_params_update({"batch_size": batch_size}) # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= batch_size * accumulate / nbs # scale weight_decay LOGGER.info(f"Scaled weight_decay = {hyp['weight_decay']}") g0, g1, g2 = [], [], [] # optimizer parameter groups for v in model.modules(): if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): # bias g2.append(v.bias) if isinstance(v, nn.BatchNorm2d): # weight (no 
decay) g0.append(v.weight) elif hasattr(v, 'weight') and isinstance( v.weight, nn.Parameter): # weight (with decay) g1.append(v.weight) if opt.optimizer == 'Adam': optimizer = Adam(g0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum elif opt.optimizer == 'AdamW': optimizer = AdamW(g0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = SGD(g0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': g1, 'weight_decay': hyp['weight_decay'] }) # add g1 with weight_decay optimizer.add_param_group({'params': g2}) # add g2 (biases) LOGGER.info( f"{colorstr('optimizer:')} {type(optimizer).__name__} with parameter groups " f"{len(g0)} weight (no decay), {len(g1)} weight, {len(g2)} bias") del g0, g1, g2 # Scheduler if opt.cos_lr: lf = one_cycle(1, hyp['lrf'], epochs) # cosine 1->hyp['lrf'] else: lf = lambda x: (1 - x / epochs) * (1.0 - hyp['lrf']) + hyp['lrf' ] # linear scheduler = lr_scheduler.LambdaLR( optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # EMA ema = ModelEMA(model) if RANK in [-1, 0] else None # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # EMA if ema and ckpt.get('ema'): ema.ema.load_state_dict(ckpt['ema'].float().state_dict()) ema.updates = ckpt['updates'] # Epochs start_epoch = ckpt['epoch'] + 1 if resume: assert start_epoch > 0, f'{weights} training to {epochs} epochs is finished, nothing to resume.' if epochs < start_epoch: LOGGER.info( f"{weights} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {epochs} more epochs." 
) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, csd # DP mode if cuda and RANK == -1 and torch.cuda.device_count() > 1: LOGGER.warning( 'WARNING: DP not recommended, use torch.distributed.run for best DDP Multi-GPU results.\n' 'See Multi-GPU Tutorial at https://github.com/ultralytics/yolov5/issues/475 to get started.' ) model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and RANK != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) LOGGER.info('Using SyncBatchNorm()') # Trainloader train_loader, dataset = create_dataloader( train_path, imgsz, batch_size // WORLD_SIZE, gs, single_cls, hyp=hyp, augment=True, cache=None if opt.cache == 'val' else opt.cache, rect=opt.rect, rank=LOCAL_RANK, workers=workers, image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr('train: '), shuffle=True) mlc = int(np.concatenate(dataset.labels, 0)[:, 0].max()) # max label class nb = len(train_loader) # number of batches assert mlc < nc, f'Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}' # Process 0 if RANK in [-1, 0]: val_loader = create_dataloader(val_path, imgsz, batch_size // WORLD_SIZE * 2, gs, single_cls, hyp=hyp, cache=None if noval else opt.cache, rect=True, rank=-1, workers=workers * 2, pad=0.5, prefix=colorstr('val: '))[0] if not resume: labels = np.concatenate(dataset.labels, 0) # c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. 
# frequency # model._initialize_biases(cf.to(device)) if plots: plot_labels(labels, names, save_dir) # Anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) model.half().float() # pre-reduce anchor precision callbacks.run('on_pretrain_routine_end') # DDP mode if cuda and RANK != -1: model = DDP(model, device_ids=[LOCAL_RANK], output_device=LOCAL_RANK) # Model attributes nl = de_parallel( model).model[-1].nl # number of detection layers (to scale hyps) hyp['box'] *= 3 / nl # scale to layers hyp['cls'] *= nc / 80 * 3 / nl # scale to classes and layers hyp['obj'] *= (imgsz / 640)**2 * 3 / nl # scale to image size and layers hyp['label_smoothing'] = opt.label_smoothing model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.class_weights = labels_to_class_weights( dataset.labels, nc).to(device) * nc # attach class weights model.names = names # Start training t0 = time.time() nw = max(round(hyp['warmup_epochs'] * nb), 100) # number of warmup iterations, max(3 epochs, 100 iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training last_opt_step = -1 maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0 ) # P, R, [email protected], [email protected], val_loss(box, obj, cls) scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) stopper = EarlyStopping(patience=opt.patience) compute_loss = ComputeLoss(model) # init loss class callbacks.run('on_train_start') LOGGER.info( f'Image sizes {imgsz} train, {imgsz} val\n' f'Using {train_loader.num_workers * WORLD_SIZE} dataloader workers\n' f"Logging results to {colorstr('bold', save_dir)}\n" f'Starting training for {epochs} epochs...') for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ callbacks.run('on_train_epoch_start') model.train() # Update image weights (optional, single-GPU 
only) if opt.image_weights: cw = model.class_weights.cpu().numpy() * ( 1 - maps)**2 / nc # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Update mosaic border (optional) # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(3, device=device) # mean losses if RANK != -1: train_loader.sampler.set_epoch(epoch) pbar = enumerate(train_loader) LOGGER.info( ('\n' + '%10s' * 7) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'labels', 'img_size')) if RANK in (-1, 0): pbar = tqdm( pbar, total=nb, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- callbacks.run('on_train_batch_start') ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # compute_loss.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [ hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch) ]) if 'momentum' in x: x['momentum'] = np.interp( ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = nn.functional.interpolate(imgs, size=ns, mode='bilinear', 
align_corners=False) # Forward with amp.autocast(enabled=cuda): pred = model(imgs) # forward loss, loss_items = compute_loss( pred, targets.to(device)) # loss scaled by batch_size if RANK != -1: loss *= WORLD_SIZE # gradient averaged between devices in DDP mode if opt.quad: loss *= 4. # Backward scaler.scale(loss).backward() # Optimize if ni - last_opt_step >= accumulate: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) last_opt_step = ni # Log if RANK in (-1, 0): mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G' # (GB) pbar.set_description(('%10s' * 2 + '%10.4g' * 5) % (f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1])) callbacks.run('on_train_batch_end', ni, model, imgs, targets, paths, plots, opt.sync_bn) if callbacks.stop_training: return # end batch ------------------------------------------------------------------------------------------------ # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for loggers scheduler.step() if RANK in (-1, 0): # mAP callbacks.run('on_train_epoch_end', epoch=epoch) ema.update_attr(model, include=[ 'yaml', 'nc', 'hyp', 'names', 'stride', 'class_weights' ]) final_epoch = (epoch + 1 == epochs) or stopper.possible_stop if not noval or final_epoch: # Calculate mAP results, maps, _ = val.run(data_dict, batch_size=batch_size // WORLD_SIZE * 2, imgsz=imgsz, model=ema.ema, single_cls=single_cls, dataloader=val_loader, save_dir=save_dir, plots=False, callbacks=callbacks, compute_loss=compute_loss) # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # weighted combination of [P, R, [email protected], [email protected]] if fi > best_fitness: best_fitness = fi log_vals = list(mloss) + list(results) + lr callbacks.run('on_fit_epoch_end', log_vals, epoch, best_fitness, fi) # Save model if (not nosave) or (final_epoch and not evolve): # if save 
ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'model': deepcopy(de_parallel(model)).half(), 'ema': deepcopy(ema.ema).half(), 'updates': ema.updates, 'optimizer': optimizer.state_dict(), 'wandb_id': loggers.wandb.wandb_run.id if loggers.wandb else None, 'date': datetime.now().isoformat() } # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) if (epoch > 0) and (opt.save_period > 0) and (epoch % opt.save_period == 0): torch.save(ckpt, w / f'epoch{epoch}.pt') del ckpt callbacks.run('on_model_save', last, epoch, final_epoch, best_fitness, fi) # Stop Single-GPU if RANK == -1 and stopper(epoch=epoch, fitness=fi): break # Stop DDP TODO: known issues shttps://github.com/ultralytics/yolov5/pull/4576 # stop = stopper(epoch=epoch, fitness=fi) # if RANK == 0: # dist.broadcast_object_list([stop], 0) # broadcast 'stop' to all ranks # Stop DPP # with torch_distributed_zero_first(RANK): # if stop: # break # must break all DDP ranks # end epoch ---------------------------------------------------------------------------------------------------- # end training ----------------------------------------------------------------------------------------------------- if RANK in (-1, 0): LOGGER.info( f'\n{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.' 
) for f in last, best: if f.exists(): strip_optimizer(f) # strip optimizers if f is best: LOGGER.info(f'\nValidating {f}...') results, _, _ = val.run( data_dict, batch_size=batch_size // WORLD_SIZE * 2, imgsz=imgsz, model=attempt_load(f, device).half(), iou_thres=0.65 if is_coco else 0.60, # best pycocotools results at 0.65 single_cls=single_cls, dataloader=val_loader, save_dir=save_dir, save_json=is_coco, verbose=True, plots=True, callbacks=callbacks, compute_loss=compute_loss) # val best model with plots if is_coco: callbacks.run('on_fit_epoch_end', list(mloss) + list(results) + lr, epoch, best_fitness, fi) callbacks.run('on_train_end', last, best, plots, epoch, results) LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}") torch.cuda.empty_cache() return results
def main():
    """Train a wind-speed regression model end-to-end.

    Reads train/val splits from CSV (optionally a k-fold split), builds
    datasets/loaders, constructs the model and optimizer from CLI ``args``,
    optionally resumes from a checkpoint, runs a short frozen warm-up, then
    trains for ``args.epoch`` epochs while logging losses to a text file and
    a JSON plot file and saving a checkpoint every epoch.

    Fix over original: the checkpoint-loading fallback used a bare
    ``except:``, which also swallows ``KeyboardInterrupt``/``SystemExit``;
    narrowed to ``except Exception``.
    """
    anchors = [30, 54, 95]  # class anchors used by val_epoch in 'exp' (classification+regression) mode
    shuffle = not (args.no_shuffle)
    exp = args.exp  # True -> joint classification/regression head, False -> plain regression
    warm_up_epoch = 3

    # ---- Load and process data -------------------------------------------
    if args.fold:
        df_train = pd.read_csv(args.data_path + 'k_fold/official_train_fold%d.csv' % (args.fold))
        df_val = pd.read_csv(args.data_path + 'k_fold/official_val_fold%d.csv' % (args.fold))
    else:
        df_train = pd.read_csv(args.data_path + 'official_train.csv')
        df_val = pd.read_csv(args.data_path + 'official_val.csv')
    train = df_train.image_path.to_list()
    val = df_val.image_path.to_list()
    if exp:
        # 'exp' mode: classify anchor bucket + regress residual wind speed.
        y_train = df_train.anchor.to_list()
        y_val = df_val.anchor.to_list()
        reg_train_gt = df_train.exp_wind.to_list()
        reg_val_gt = df_val.exp_wind.to_list()
    else:
        y_train = df_train.wind_speed.to_list()
        y_val = df_val.wind_speed.to_list()

    train_transform, val_transform = get_transform(args.image_size)
    train_dataset = WindDataset(image_list=train,
                                target=y_train,
                                exp_target=reg_train_gt if exp else None,
                                transform=train_transform)
    val_dataset = WindDataset(image_list=val,
                              target=y_val,
                              exp_target=reg_val_gt if exp else None,
                              transform=val_transform)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=shuffle,
                              num_workers=args.num_workers,
                              drop_last=True)
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.num_workers,
                            drop_last=True)
    # Warm-up loader uses a much larger batch: only the last layer is
    # trainable during warm-up, so memory headroom allows it.
    warm_loader = DataLoader(dataset=train_dataset,
                             batch_size=args.batch_size * 14,
                             shuffle=shuffle,
                             num_workers=args.num_workers,
                             drop_last=True)

    # ---- Load model -------------------------------------------------------
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    last_epoch = 0
    # model = ResNet50_BN_idea()
    if not exp:
        model = Effnet_Wind_B7()
        # model = Effnet_Wind_B5()
    else:
        model = Effnet_Wind_B5_exp_6()
    # model = ResNetExample()
    # if not exp:
    #     model = Seresnext_Wind()
    # else:
    #     model = Seresnext_Wind_Exp()

    # ---- Optimizer ---------------------------------------------------------
    if args.opt == 'radam':
        optimizer = RAdam(
            model.parameters(),
            lr=args.lr,
            betas=(0.9, 0.999),
            eps=1e-8,
            weight_decay=args.weight_decay,
        )
    elif args.opt == 'adamw':
        optimizer = AdamW(model.parameters(), args.lr)
    elif args.opt == 'adam':
        optimizer = Adam(model.parameters(), args.lr, weight_decay=args.weight_decay)
    else:
        optimizer = SGD(model.parameters(), args.lr, momentum=0.9, nesterov=True,
                        weight_decay=args.weight_decay)

    if args.weights:
        # Resume: prefer the full checkpoint dict (model + optimizer state);
        # fall back to a raw state_dict file for older checkpoints.
        # model.load_state_dict(torch.load(args.weights))
        last_epoch = extract_number(args.weights)
        try:
            checkpoint = torch.load(args.weights)
            model.load_state_dict(checkpoint['model_state_dict'])
            if checkpoint['pre_opt'] == args.opt:
                # Only restore optimizer state if the optimizer type matches.
                optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
                print(optimizer)
        except Exception:  # was bare `except:`; keep the best-effort fallback
            model.load_state_dict(torch.load(args.weights))
    else:
        model.apply(reset_m_batchnorm)
    model.to(device)

    # ---- Loss function ------------------------------------------------------
    if exp:
        criterion = JointLoss2()
    else:
        criterion = RMSELoss()

    # ---- generate log and visualization -------------------------------------
    save_path = args.save_path
    log_cache = (args.batch_size, args.image_size, shuffle, exp)
    write_log(args.save_path, model, optimizer, criterion, log_cache)
    plot_dict = {'train': list(), 'val': list()}
    log_train_path = save_path + 'training_log.txt'
    plot_train_path = save_path + 'log.json'
    write_mode = 'w'
    if os.path.exists(log_train_path) and os.path.exists(plot_train_path):
        # Resuming: append to the text log and truncate the plot history to
        # the resumed epoch so re-run epochs are not duplicated.
        write_mode = 'a'
        with open(plot_train_path, 'r') as j:
            plot_dict = json.load(j)
        plot_dict['train'] = plot_dict['train'][:last_epoch]
        plot_dict['val'] = plot_dict['val'][:last_epoch]

    # ---- Training ------------------------------------------------------------
    print('Start warm up')
    model.freeze_except_last()
    for epoch in range(warm_up_epoch):
        warm_up(
            model=model,
            dataloader=warm_loader,
            optimizer=optimizer,
            criterion=criterion,
            device=device,
        )
    model.unfreeze()
    with open(log_train_path, write_mode) as f:
        for epoch in range(1, args.epoch + 1):
            print('Epoch:', epoch + last_epoch)
            f.write('Epoch: %d\n' % (epoch + last_epoch))
            loss = train_epoch(model=model,
                               dataloader=train_loader,
                               optimizer=optimizer,
                               criterion=criterion,
                               device=device,
                               exp=exp)
            RMSE = val_epoch(model=model,
                             dataloader=val_loader,
                             device=device,
                             exp=exp,
                             anchors=anchors)
            if not exp:
                f.write('Training loss: %.4f\n' % (loss))
                f.write('RMSE val: %.4f\n' % (RMSE))
                print('RMSE loss: %.4f' % (loss))
                print('RMSE val: %.4f' % (RMSE))
            else:
                # In 'exp' mode the train/val helpers return tuples.
                loss, classify, regress = loss
                RMSE, accuracy = RMSE
                f.write('Training loss: %.4f\n' % (loss))
                f.write('Classification loss: %.4f\n' % (classify))
                f.write('Regression loss: %.4f\n' % (regress))
                f.write('Accuracy val: %.4f\n' % (accuracy))
                f.write('RMSE val: %.4f\n' % (RMSE))
                print('Training loss: %.4f' % (loss))
                print('Classification loss: %.4f' % (classify))
                print('Regression loss: %.4f' % (regress))
                print('Accuracy val: %.4f' % (accuracy))
                print('RMSE val: %.4f' % (RMSE))
            # torch.save(model.state_dict(), save_path + 'epoch%d.pth'%(epoch+last_epoch))
            save_name = save_path + 'epoch%d.pth' % (epoch + last_epoch)
            save_pth(save_name, epoch + last_epoch, model, optimizer, args.opt)
            plot_dict['train'].append(loss)
            plot_dict['val'].append(RMSE)
            # Rewrite the plot file every epoch so progress survives crashes.
            with open(plot_train_path, 'w') as j:
                json.dump(plot_dict, j)
def main():
    """Fine-tune a BERT regression model for the Jigsaw toxicity-ranking task.

    Trains ``BertRegress`` on a pickled balanced comment dataset for 10
    epochs, and after each epoch scores the paired ``less_toxic`` /
    ``more_toxic`` validation texts; ranking accuracy is the fraction of
    pairs where the less-toxic text gets the lower score. A checkpoint named
    with the epoch and accuracy is saved every epoch.
    """
    # Log to stdout and to a timestamped file under logs/.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout), logging.FileHandler('logs/log'+date_now()+'.log')],
    )
    logger.setLevel(logging.INFO)
    epoch = 10       # number of training epochs
    batch_size = 64

    data = pd.read_pickle(os.path.join(COMMENT_DIR, 'comment_continue_train_balance.pkl'))
    val_data = pd.read_csv('/ai/223/person/lichunyu/datasets/kaggle/jigsaw/rate/validation_data.csv')
    tokenizer = BertTokenizer.from_pretrained('/ai/223/person/lichunyu/pretrain-models/bert-base-uncased')
    model = BertRegress()
    dataset = JigsawDataset(data, tokenizer)
    # Same validation rows tokenized twice: once per paired text column.
    less_val_dataset = JigsawValDataset(val_data, 'less_toxic', tokenizer)
    more_val_dataset = JigsawValDataset(val_data, 'more_toxic', tokenizer)
    train_dataloader = DataLoader(dataset, batch_size=batch_size)
    less_val_dataloader = DataLoader(less_val_dataset, batch_size=batch_size)
    more_val_dataloader = DataLoader(more_val_dataset, batch_size=batch_size)

    # Discriminative learning rates: small LR for the pretrained BERT body,
    # 10x larger for the freshly initialized regression head.
    # optimizer = SGD(model.parameters(), lr=4e-4, weight_decay=2)
    optimizer = AdamW(
        [
            {'params': model.bert.parameters()},
            {'params': model.regress.parameters(), 'lr':5e-4}
        ],
        lr=5e-5,
    )
    model.cuda()

    for e in range(epoch):
        # ---- training pass -------------------------------------------------
        model.train()
        train_total_loss = 0
        step = 0
        for n, batch in enumerate(tqdm(train_dataloader)):
            # zero_grad on the model covers all params the optimizer holds
            model.zero_grad()
            step += 1
            input_ids = batch[0].cuda()
            attention_mask = batch[1].cuda()
            y = batch[2].cuda()
            # Passing the target makes the model return a 'loss' entry.
            model_output = model(input_ids, attention_mask, y)
            loss = model_output['loss']
            train_total_loss += loss.item()
            if (n % 50) == 0:
                logger.info(f'the loss of batch {n} is {loss.item()}')
            loss.backward()
            optimizer.step()
        logger.info('train step loss is {}'.format(train_total_loss/step))

        # ---- validation pass: score both halves of each pair ----------------
        model.eval()
        less_toxic_scores = np.array([])
        more_toxic_scores = np.array([])
        for batch in tqdm(less_val_dataloader):
            input_ids = batch[0].cuda()
            attention_mask = batch[1].cuda()
            with torch.no_grad():
                model_output = model(input_ids, attention_mask)
            score = model_output['output']
            score = score.detach().clone().cpu().numpy().flatten()
            less_toxic_scores = np.append(less_toxic_scores, score)
        for batch in tqdm(more_val_dataloader):
            input_ids = batch[0].cuda()
            attention_mask = batch[1].cuda()
            with torch.no_grad():
                model_output = model(input_ids, attention_mask)
            score = model_output['output']
            score = score.detach().clone().cpu().numpy().flatten()
            more_toxic_scores = np.append(more_toxic_scores, score)

        # Pairwise ranking accuracy: less-toxic text should score lower.
        acc_item = (less_toxic_scores < more_toxic_scores).sum()
        logger.info(f'~~~~~~ Acc item is {acc_item} ~~~~~~~')
        acc = acc_item / len(less_toxic_scores)
        logger.info(f'~~~~~~ Acc score is {acc} ~~~~~~~')
        # Checkpoint every epoch, tagged with epoch index and accuracy.
        current_ckpt = os.path.join(COMMENT_MODEL_DIR, f'bert-epoch-{e}-acc-{acc}.pth')
        torch.save(model.state_dict(), current_ckpt)
# --- Qualitative sanity check: reconstruct a noised sample image -----------
# `dog`, `image_transforms`, `model`, `lr`, and `forward_only` are defined
# earlier in the script (outside this view).
dog_noised = image_transforms["non_shape_transforms"](dog)  # apply noise/color transforms (shape-preserving)
dog = image_transforms["crop_224"](dog)
dog_noised = image_transforms["crop_224"](dog_noised)
dog_pt = image_transforms["to_pytorch"](dog)                # PIL -> tensor
dog_noised_pt = image_transforms["to_pytorch"](dog_noised)
with torch.no_grad():
    # Student takes (noised, clean) batches; unsqueeze adds the batch dim.
    reconstructed = model.student(dog_noised_pt.unsqueeze(0), dog_pt.unsqueeze(0))["reconstruct"].squeeze(0)
reconstructed = image_transforms["from_pytorch"](reconstructed)  # tensor -> PIL
# Show original, noised input, and the model's reconstruction side by side.
dog.show()
dog_noised.show()
reconstructed.show()

# --- Optimizer / AMP / mode setup -------------------------------------------
# Only optimize parameters that are not frozen.
all_params = list(filter(lambda p: p.requires_grad, model.student.parameters()))
optimizer = AdamW(all_params, lr=lr, eps=1e-6, weight_decay=1e-2)
torch.autograd.set_detect_anomaly(True)  # NOTE(review): slows training; presumably left on for debugging
optimizer.zero_grad()
try:
    # Mixed-precision support is optional (older torch builds lack amp).
    from torch.cuda.amp import GradScaler, autocast
    scaler = GradScaler()
except:
    pass
if forward_only:
    _ = model.eval()
else:
    _ = model.train()

def get_unused_params(model):
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) params = model.parameters() if args.svo: params = [p for p in params if p.requires_grad] if args.fused_adam: args.opt_level = "O1" args.loss_scale = None args.keep_batchnorm_fp32 = None optim = FusedAdam(params, lr=args.learning_rate, eps=args.adam_epsilon) else: args.keep_batchnorm_fp32 = None optim = AdamW(params, lr=args.learning_rate, eps=args.adam_epsilon) if args.loss_scale == 0: args.loss_scale = None if args.opt_level == "O1": args.keep_batchnorm_fp32 = None args.loss_scale = "dynamic" model, optim = amp.initialize( model, optim, opt_level=args.opt_level, loss_scale=args.loss_scale, keep_batchnorm_fp32=args.keep_batchnorm_fp32) else: