import os

from hydra.utils import get_original_cwd


def get_cwd():
    # Resolve the project root: prefer Hydra's original cwd when a Hydra app
    # is running, and fall back to the process cwd otherwise. Hydra raises
    # ValueError when it is not initialized (see the test below); depending on
    # the import style, an AttributeError can surface instead.
    try:
        return get_original_cwd()
    except (AttributeError, ValueError):
        return os.getcwd()
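A minimal sketch of how this fallback is typically consumed (the `load_data_path` helper and the `data/train.csv` argument are illustrative, not from the source): since Hydra moves the process into its run directory, paths written relative to the project root must be resolved against the original cwd.

def load_data_path(relative_path):
    # Project-relative paths are joined onto the root recovered by get_cwd(),
    # which works both inside and outside a Hydra app.
    return os.path.join(get_cwd(), relative_path)


# e.g. load_data_path("data/train.csv")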
def __ddp_script_mode_setup(self):
    assert self.trainer.global_rank == 0
    self._check_can_spawn_children()
    self._has_spawned_children = True

    os.environ['MASTER_ADDR'] = os.environ.get('MASTER_ADDR', '127.0.0.1')
    os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port()))

    # allow the user to pass the node rank
    node_rank = '0'
    node_rank = os.environ.get('NODE_RANK', node_rank)
    node_rank = os.environ.get('GROUP_RANK', node_rank)
    os.environ['NODE_RANK'] = node_rank
    os.environ['LOCAL_RANK'] = '0'

    # when the user is using hydra, find the absolute path
    path_lib = abspath if not HYDRA_AVAILABLE else to_absolute_path

    # pull out the command used to run the script and resolve the absolute file path
    command = sys.argv
    try:
        full_path = path_lib(command[0])
    except Exception:
        full_path = abspath(command[0])

    command[0] = full_path
    # use the same python interpreter as the one actually running
    command = [sys.executable] + command

    # the visible devices tell us how many GPUs we want to use.
    # when the trainer script was called, the devices had already been scoped by the time
    # code reaches this point. so, to call the scripts, we need to leave CUDA_VISIBLE_DEVICES alone,
    # but forward the GPUs selected via environment variables
    gpu_ids = os.environ.get('CUDA_VISIBLE_DEVICES', '')
    if len(gpu_ids) == 1:
        gpu_ids = f'{gpu_ids},'

    num_gpus = max(1, len(gpu_ids.split(',')))

    # set the flag for ddp scripts
    os.environ['PL_TRAINER_GPUS'] = gpu_ids
    os.environ['WORLD_SIZE'] = f'{num_gpus * self.trainer.num_nodes}'

    self.trainer.interactive_ddp_procs = []
    for local_rank in range(1, self.trainer.num_processes):
        env_copy = os.environ.copy()
        env_copy['LOCAL_RANK'] = f'{local_rank}'

        # start process
        # if hydra is available and initialized, make sure to set the cwd correctly
        cwd: Optional[str] = None
        if HYDRA_AVAILABLE:
            if HydraConfig.initialized():
                cwd = get_original_cwd()
        proc = subprocess.Popen(command, env=env_copy, cwd=cwd)
        self.trainer.interactive_ddp_procs.append(proc)

        # starting all processes at once can cause issues
        # with dataloaders: delay between 1-10 seconds
        delay = np.random.uniform(1, 5, 1)[0]
        sleep(delay)

    self.task_idx = 0
import hydra
from utils import wrap_continuous_env, wrap_discrete_env, get_latest
from omegaconf import DictConfig, OmegaConf
from hydra.utils import instantiate, call, get_original_cwd

OmegaConf.register_new_resolver("parse_string", lambda input: input.lower().replace(" ", "_"))
OmegaConf.register_new_resolver("get_wrapper_func", lambda continuous: wrap_continuous_env if continuous else wrap_discrete_env)
OmegaConf.register_new_resolver("original_dir", lambda relative_path: get_original_cwd() + relative_path)
OmegaConf.register_new_resolver("get_latest", get_latest)
OmegaConf.register_new_resolver("wandb_mode", lambda save: "online" if save else "disabled")


@hydra.main(config_path="../config", config_name="config")
def main(config: DictConfig):
    env = call(config.environment)
    callbacks = list(instantiate(config.callbacks)["callbacks"])
    model = call(config.model, env)
    model.learn(total_timesteps=config.run["max_timesteps"], callback=callbacks)
    if config.run["save"]:
        model.save(config.model["save_path"])


if __name__ == "__main__":
    main()
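A hedged illustration (the config keys here are hypothetical, not from the source) of how two of the registered resolvers expand when a value is accessed; `parse_string` lowercases and underscores its argument, and `wandb_mode` switches on a boolean:

from omegaconf import OmegaConf

# Assumes the register_new_resolver calls above have already run.
example = OmegaConf.create({
    "save": True,
    "run_name": "${parse_string:'My Run Name'}",
    "wandb": "${wandb_mode:${save}}",
})
print(example.run_name)  # my_run_name
print(example.wandb)     # online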
def _call_children_scripts(self):
    # bookkeeping of spawned processes
    assert self.global_rank == 0
    self._check_can_spawn_children()
    self._has_spawned_children = True

    # DDP environment variables
    os.environ["MASTER_ADDR"] = os.environ.get("MASTER_ADDR", "127.0.0.1")
    os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", str(find_free_network_port()))

    # allow the user to pass the node rank
    node_rank = "0"
    node_rank = os.environ.get("NODE_RANK", node_rank)
    node_rank = os.environ.get("GROUP_RANK", node_rank)
    os.environ["NODE_RANK"] = node_rank
    os.environ["LOCAL_RANK"] = "0"

    # when the user is using hydra, find the absolute path
    path_lib = os.path.abspath if not _HYDRA_AVAILABLE else to_absolute_path

    # pull out the command used to run the script and resolve the absolute file path
    command = sys.argv
    try:
        full_path = path_lib(command[0])
    except Exception:
        full_path = os.path.abspath(command[0])

    command[0] = full_path
    # use the same python interpreter as the one actually running
    command = [sys.executable] + command

    # the visible devices tell us how many GPUs we want to use.
    # when the trainer script was called, the devices had already been scoped by the time
    # code reaches this point. so, to call the scripts, we need to leave CUDA_VISIBLE_DEVICES alone,
    # but forward the GPUs selected via environment variables
    if self.parallel_devices is None:
        raise MisconfigurationException(
            "you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)"
        )

    os.environ["PL_TRAINER_GPUS"] = ",".join([str(device.index) for device in self.parallel_devices])
    os.environ["PL_IN_DDP_SUBPROCESS"] = "1"

    if self.lightning_module.logger is not None:
        os.environ["PL_EXP_VERSION"] = str(self.lightning_module.logger.version)

    num_gpus = len(self.parallel_devices)
    os.environ["WORLD_SIZE"] = f"{num_gpus * self.num_nodes}"

    self.interactive_ddp_procs = []
    for local_rank in range(1, self.num_processes):
        env_copy = os.environ.copy()
        env_copy["LOCAL_RANK"] = f"{local_rank}"

        # remove env var if global seed not set
        if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy:
            del env_copy["PL_GLOBAL_SEED"]

        # start process
        # if hydra is available and initialized, make sure to set the cwd correctly
        cwd: Optional[str] = None
        if _HYDRA_AVAILABLE:
            if HydraConfig.initialized():
                cwd = get_original_cwd()
                os_cwd = f'"{os.getcwd()}"'
                command += [f'hydra.run.dir={os_cwd}', f'hydra.job.name=train_ddp_process_{local_rank}']
        proc = subprocess.Popen(command, env=env_copy, cwd=cwd)
        self.interactive_ddp_procs.append(proc)

        # starting all processes at once can cause issues
        # with dataloaders: delay between 1-10 seconds
        delay = np.random.uniform(1, 5, 1)[0]
        sleep(delay)
def main(cfg):
    cwd = utils.get_original_cwd()
    # cwd = cwd[0:-5]
    cfg.cwd = cwd
    cfg.pos_size = 2 * cfg.pos_limit + 2
    print(cfg.pretty())

    # get the instance to predict on
    instance = _get_predict_instance(cfg)
    data = [instance]

    # preprocess data
    data, rels = _preprocess_data(data, cfg)

    # model
    __Model__ = {
        'cnn': models.PCNN,
        'rnn': models.BiLSTM,
        'transformer': models.Transformer,
        'gcn': models.GCN,
        'capsule': models.Capsule,
        'lm': models.LM,
    }

    # prediction is best done on the CPU
    cfg.use_gpu = False
    if cfg.use_gpu and torch.cuda.is_available():
        device = torch.device('cuda', cfg.gpu_id)
    else:
        device = torch.device('cpu')
    logger.info(f'device: {device}')

    model = __Model__[cfg.model_name](cfg)
    logger.info(f'model name: {cfg.model_name}')
    logger.info(f'\n {model}')
    model.load(cfg.fp, device=device)
    model.to(device)
    model.eval()

    x = dict()
    x['word'], x['lens'] = torch.tensor([data[0]['token2idx']]), torch.tensor([data[0]['seq_len']])

    if cfg.model_name != 'lm':
        x['entity_pos'], x['attribute_value_pos'] = torch.tensor([data[0]['entity_pos']]), torch.tensor([data[0]['attribute_value_pos']])
        if cfg.model_name == 'cnn':
            if cfg.use_pcnn:
                x['pcnn_mask'] = torch.tensor([data[0]['entities_pos']])
        if cfg.model_name == 'gcn':
            # no suitable parsing-tree tool was found, so initialize the adjacency matrix randomly for now
            adj = torch.empty(1, data[0]['seq_len'], data[0]['seq_len']).random_(2)
            x['adj'] = adj

    for key in x.keys():
        x[key] = x[key].to(device)

    with torch.no_grad():
        y_pred = model(x)
        y_pred = torch.softmax(y_pred, dim=-1)[0]
        prob = y_pred.max().item()
        prob_att = list(rels.keys())[y_pred.argmax().item()]
        logger.info(f"the attribute between \"{data[0]['entity']}\" and \"{data[0]['attribute_value']}\" in the sentence is \"{prob_att}\", with confidence {prob:.2f}.")

    if cfg.predict_plot:
        plt.rcParams["font.family"] = 'Arial Unicode MS'
        x = list(rels.keys())
        height = list(y_pred.cpu().numpy())
        plt.bar(x, height)
        for x, y in zip(x, height):
            plt.text(x, y, '%.2f' % y, ha="center", va="bottom")
        plt.xlabel('relation')
        plt.ylabel('confidence')
        plt.xticks(rotation=315)
        plt.show()
def test_get_original_cwd_without_hydra(hydra_restore_singletons: Any) -> None:
    with pytest.raises(ValueError):
        utils.get_original_cwd()
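Several of the DDP launchers in this collection avoid exactly this ValueError by checking `HydraConfig.initialized()` before calling `get_original_cwd()`; a minimal sketch of that guard (the `original_cwd_or_none` name is hypothetical):

from typing import Optional

from hydra.core.hydra_config import HydraConfig
from hydra.utils import get_original_cwd


def original_cwd_or_none() -> Optional[str]:
    # Mirrors the guard used by the _call_children_scripts variants here:
    # only ask Hydra for the original cwd when a Hydra app is actually running.
    if HydraConfig.initialized():
        return get_original_cwd()
    return None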
def _call_children_scripts(self):
    assert self.trainer.global_rank == 0
    self._check_can_spawn_children()
    self._has_spawned_children = True

    os.environ['MASTER_ADDR'] = os.environ.get('MASTER_ADDR', '127.0.0.1')
    os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port()))

    # allow the user to pass the node rank
    node_rank = '0'
    node_rank = os.environ.get('NODE_RANK', node_rank)
    node_rank = os.environ.get('GROUP_RANK', node_rank)
    os.environ['NODE_RANK'] = node_rank
    os.environ['LOCAL_RANK'] = '0'

    # when the user is using hydra, find the absolute path
    path_lib = abspath if not HYDRA_AVAILABLE else to_absolute_path

    # pull out the command used to run the script and resolve the absolute file path
    command = sys.argv
    try:
        full_path = path_lib(command[0])
    except Exception:
        full_path = abspath(command[0])

    command[0] = full_path
    # use the same python interpreter as the one actually running
    command = [sys.executable] + command

    # the visible devices tell us how many GPUs we want to use.
    # when the trainer script was called, the devices had already been scoped by the time
    # code reaches this point. so, to call the scripts, we need to leave CUDA_VISIBLE_DEVICES alone,
    # but forward the GPUs selected via environment variables
    if self.trainer.data_parallel_device_ids is None:
        raise MisconfigurationException('you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)')

    os.environ['PL_TRAINER_GPUS'] = ','.join([str(i) for i in self.trainer.data_parallel_device_ids])
    os.environ['PL_IN_DDP_SUBPROCESS'] = '1'

    if self.trainer.logger is not None:
        os.environ['PL_EXP_VERSION'] = str(self.trainer.logger.version)

    num_gpus = len(self.trainer.data_parallel_device_ids)
    os.environ['WORLD_SIZE'] = f'{num_gpus * self.trainer.num_nodes}'

    self.interactive_ddp_procs = []
    for local_rank in range(1, self.trainer.num_processes):
        env_copy = os.environ.copy()
        env_copy['LOCAL_RANK'] = f'{local_rank}'

        # remove env var if global seed not set
        if os.environ.get('PL_GLOBAL_SEED') is None and 'PL_GLOBAL_SEED' in env_copy:
            del env_copy['PL_GLOBAL_SEED']

        # start process
        # if hydra is available and initialized, make sure to set the cwd correctly
        cwd: Optional[str] = None
        if HYDRA_AVAILABLE:
            if HydraConfig.initialized():
                cwd = get_original_cwd()
        proc = subprocess.Popen(command, env=env_copy, cwd=cwd)
        self.interactive_ddp_procs.append(proc)

        # starting all processes at once can cause issues
        # with dataloaders: delay between 1-10 seconds
        delay = np.random.uniform(1, 5, 1)[0]
        sleep(delay)
def main(cfg):
    cwd = utils.get_original_cwd()
    cfg.cwd = cwd
    print(cfg)
    data_path = DATA_PATH[cfg.dataset_name]
    for mode, path in data_path.items():
        data_path[mode] = os.path.join(cfg.cwd, path)
    dataset_class, data_process = DATASET_CLASS[cfg.dataset_name], DATA_PROCESS[cfg.dataset_name]
    mapping = MAPPING[cfg.dataset_name]

    set_seed(cfg.seed)  # set seed, default is 1

    if cfg.save_path is not None:  # make save_path dir
        cfg.save_path = os.path.join(
            cfg.save_path,
            cfg.dataset_name + "_" + str(cfg.batch_size) + "_" + str(cfg.learning_rate) + cfg.notes)
        if not os.path.exists(cfg.save_path):
            os.makedirs(cfg.save_path, exist_ok=True)

    process = data_process(data_path=data_path, mapping=mapping,
                           bart_name=cfg.bart_name, learn_weights=cfg.learn_weights)
    train_dataset = dataset_class(data_processor=process, mode='train')
    train_dataloader = DataLoader(train_dataset, collate_fn=train_dataset.collate_fn,
                                  batch_size=cfg.batch_size, num_workers=4)

    dev_dataset = dataset_class(data_processor=process, mode='dev')
    dev_dataloader = DataLoader(dev_dataset, collate_fn=dev_dataset.collate_fn,
                                batch_size=cfg.batch_size, num_workers=4)

    label_ids = list(process.mapping2id.values())

    prompt_model = PromptBartModel(tokenizer=process.tokenizer, label_ids=label_ids, args=cfg)
    model = PromptGeneratorModel(prompt_model=prompt_model, bos_token_id=0,
                                 eos_token_id=1,
                                 max_length=cfg.tgt_max_len, max_len_a=cfg.src_seq_ratio,
                                 num_beams=cfg.num_beams, do_sample=False,
                                 repetition_penalty=1, length_penalty=cfg.length_penalty,
                                 pad_token_id=1,
                                 restricter=None)
    metrics = Seq2SeqSpanMetric(eos_token_id=1, num_labels=len(label_ids), target_type='word')
    loss = get_loss

    trainer = Trainer(train_data=train_dataloader, dev_data=dev_dataloader, test_data=None,
                      model=model, args=cfg, logger=logger, loss=loss,
                      metrics=metrics, writer=writer)
    trainer.train()

    writer.close()
def main(cfg):
    # Use gpu or not
    if cfg.use_gpu and torch.cuda.is_available():
        device = torch.device('cuda', cfg.gpu_id)
    else:
        device = torch.device('cpu')

    if cfg.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                cfg.gradient_accumulation_steps))

    cfg.train_batch_size = cfg.train_batch_size // cfg.gradient_accumulation_steps

    random.seed(cfg.seed)
    np.random.seed(cfg.seed)
    torch.manual_seed(cfg.seed)

    if not cfg.do_train and not cfg.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    # Checkpoints
    if os.path.exists(utils.get_original_cwd() + '/' + cfg.output_dir) \
            and os.listdir(utils.get_original_cwd() + '/' + cfg.output_dir) and cfg.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(
            utils.get_original_cwd() + '/' + cfg.output_dir))
    if not os.path.exists(utils.get_original_cwd() + '/' + cfg.output_dir):
        os.makedirs(utils.get_original_cwd() + '/' + cfg.output_dir)

    # Preprocess the input dataset
    processor = NerProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    # Prepare the model
    tokenizer = BertTokenizer.from_pretrained(cfg.bert_model, do_lower_case=cfg.do_lower_case)

    train_examples = None
    num_train_optimization_steps = 0
    if cfg.do_train:
        train_examples = processor.get_train_examples(utils.get_original_cwd() + '/' + cfg.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / cfg.train_batch_size / cfg.gradient_accumulation_steps) * cfg.num_train_epochs

    config = BertConfig.from_pretrained(cfg.bert_model, num_labels=num_labels, finetuning_task=cfg.task_name)
    model = TrainNer.from_pretrained(cfg.bert_model, from_tf=False, config=config)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': cfg.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    warmup_steps = int(cfg.warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters, lr=cfg.learning_rate, eps=cfg.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps)
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    label_map = {i: label for i, label in enumerate(label_list, 1)}

    if cfg.do_train:
        train_features = convert_examples_to_features(train_examples, label_list, cfg.max_seq_length, tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_label_ids, all_valid_ids, all_lmask_ids)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=cfg.train_batch_size)

        model.train()
        for _ in trange(int(cfg.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids, valid_ids, l_mask)
                if cfg.gradient_accumulation_steps > 1:
                    loss = loss / cfg.gradient_accumulation_steps

                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % cfg.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        model_to_save.save_pretrained(utils.get_original_cwd() + '/' + cfg.output_dir)
        tokenizer.save_pretrained(utils.get_original_cwd() + '/' + cfg.output_dir)
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        model_config = {
            "bert_model": cfg.bert_model,
            "do_lower": cfg.do_lower_case,
            "max_seq_length": cfg.max_seq_length,
            "num_labels": len(label_list) + 1,
            "label_map": label_map
        }
        json.dump(model_config,
                  open(os.path.join(utils.get_original_cwd() + '/' + cfg.output_dir, "model_config.json"), "w"))
        # Load a trained model and config that you have fine-tuned
    else:
        # Load a trained model and vocabulary that you have fine-tuned
        model = Ner.from_pretrained(utils.get_original_cwd() + '/' + cfg.output_dir)
        tokenizer = BertTokenizer.from_pretrained(utils.get_original_cwd() + '/' + cfg.output_dir,
                                                  do_lower_case=cfg.do_lower_case)

    model.to(device)

    if cfg.do_eval:
        if cfg.eval_on == "dev":
            eval_examples = processor.get_dev_examples(utils.get_original_cwd() + '/' + cfg.data_dir)
        elif cfg.eval_on == "test":
            eval_examples = processor.get_test_examples(utils.get_original_cwd() + '/' + cfg.data_dir)
        else:
            raise ValueError("eval on dev or test set only")
        eval_features = convert_examples_to_features(eval_examples, label_list, cfg.max_seq_length, tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in eval_features], dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                  all_label_ids, all_valid_ids, all_lmask_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=cfg.eval_batch_size)
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        y_true = []
        y_pred = []
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            valid_ids = valid_ids.to(device)
            label_ids = label_ids.to(device)
            l_mask = l_mask.to(device)

            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask,
                               valid_ids=valid_ids, attention_mask_label=l_mask)

            logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            input_mask = input_mask.to('cpu').numpy()

            for i, label in enumerate(label_ids):
                temp_1 = []
                temp_2 = []
                for j, m in enumerate(label):
                    if j == 0:
                        continue
                    elif label_ids[i][j] == len(label_map):
                        y_true.append(temp_1)
                        y_pred.append(temp_2)
                        break
                    else:
                        temp_1.append(label_map[label_ids[i][j]])
                        temp_2.append(label_map[logits[i][j]])

        report = classification_report(y_true, y_pred, digits=4)
        logger.info("\n%s", report)
        output_eval_file = os.path.join(utils.get_original_cwd() + '/' + cfg.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", report)
            writer.write(report)
def main(cfg: DictConfig):
    cwd = Path(get_original_cwd())

    # overwrite config if continuing training from a checkpoint
    resume_cfg = None
    if "resume" in cfg:
        cfg_path = cwd / cfg.resume / ".hydra/config.yaml"
        print(f"Continue from: {cfg.resume}")
        # Overwrite everything except device
        # TODO config merger (perhaps continue training with the same optimizer but other lrs?)
        resume_cfg = OmegaConf.load(cfg_path)
        cfg.model = resume_cfg.model
        if cfg.train.num_epochs == 0:
            cfg.data.scale_factor = resume_cfg.data.scale_factor
        OmegaConf.save(cfg, ".hydra/config.yaml")

    print(OmegaConf.to_yaml(cfg))

    device = set_device_id(cfg.device)
    set_seed(cfg.seed, device=device)

    # Augmentations
    if cfg.data.aug == "auto":
        transforms = albu.load(cwd / "autoalbument/autoconfig.json")
    else:
        transforms = D.get_training_augmentations()

    if OmegaConf.is_missing(cfg.model, "convert_bottleneck"):
        cfg.model.convert_bottleneck = (0, 0, 0)

    # Model
    print(f"Setup model {cfg.model.arch} {cfg.model.encoder_name} "
          f"convert_bn={cfg.model.convert_bn} "
          f"convert_bottleneck={cfg.model.convert_bottleneck} ")
    model = get_segmentation_model(
        arch=cfg.model.arch,
        encoder_name=cfg.model.encoder_name,
        encoder_weights=cfg.model.encoder_weights,
        classes=1,
        convert_bn=cfg.model.convert_bn,
        convert_bottleneck=cfg.model.convert_bottleneck,
        # decoder_attention_type="scse",  # TODO to config
    )
    model = model.to(device)
    model.train()
    print(model)

    # Optimization
    # Reduce LR for pretrained encoder
    layerwise_params = {
        "encoder*": dict(lr=cfg.optim.lr_encoder, weight_decay=cfg.optim.wd_encoder)
    }
    model_params = cutils.process_model_params(model, layerwise_params=layerwise_params)

    # Select optimizer
    optimizer = get_optimizer(
        name=cfg.optim.name,
        model_params=model_params,
        lr=cfg.optim.lr,
        wd=cfg.optim.wd,
        lookahead=cfg.optim.lookahead,
    )

    criterion = {
        "dice": DiceLoss(),
        # "dice": SoftDiceLoss(mode="binary", smooth=1e-7),
        "iou": IoULoss(),
        "bce": nn.BCEWithLogitsLoss(),
        "lovasz": LovaszLossBinary(),
        "focal_tversky": FocalTverskyLoss(eps=1e-7, alpha=0.7, gamma=0.75),
    }

    # Load states if resuming training
    if "resume" in cfg:
        checkpoint_path = (cwd / cfg.resume / cfg.train.logdir / "checkpoints/best_full.pth")
        if checkpoint_path.exists():
            print(f"\nLoading checkpoint {str(checkpoint_path)}")
            checkpoint = cutils.load_checkpoint(checkpoint_path)
            cutils.unpack_checkpoint(
                checkpoint=checkpoint,
                model=model,
                optimizer=optimizer if resume_cfg.optim.name == cfg.optim.name else None,
                criterion=criterion,
            )
        else:
            raise ValueError("Nothing to resume, checkpoint missing")

    # We may want only to validate on resume; in that case skip the training routine
    best_th = 0.5
    stats = None
    if cfg.data.stats:
        print(f"Use statistics from file: {cfg.data.stats}")
        stats = cwd / cfg.data.stats

    if cfg.train.num_epochs is not None:
        callbacks = [
            # Each criterion is calculated separately.
            CriterionCallback(input_key="mask", prefix="loss_dice", criterion_key="dice"),
            CriterionCallback(input_key="mask", prefix="loss_iou", criterion_key="iou"),
            CriterionCallback(input_key="mask", prefix="loss_bce", criterion_key="bce"),
            CriterionCallback(input_key="mask", prefix="loss_lovasz", criterion_key="lovasz"),
            CriterionCallback(
                input_key="mask",
                prefix="loss_focal_tversky",
                criterion_key="focal_tversky",
            ),
            # And only then we aggregate everything into one loss.
            MetricAggregationCallback(
                prefix="loss",
                mode="weighted_sum",  # can be "sum", "weighted_sum" or "mean"
                # because we want a weighted sum, we need to add a scale for each loss
                metrics={
                    "loss_dice": cfg.loss.dice,
                    "loss_iou": cfg.loss.iou,
                    "loss_bce": cfg.loss.bce,
                    "loss_lovasz": cfg.loss.lovasz,
                    "loss_focal_tversky": cfg.loss.focal_tversky,
                },
            ),
            # metrics
            DiceCallback(input_key="mask"),
            IouCallback(input_key="mask"),
            # gradient accumulation
            OptimizerCallback(accumulation_steps=cfg.optim.accumulate),
            # early stopping
            SchedulerCallback(reduced_metric="loss_dice", mode=cfg.scheduler.mode),
            EarlyStoppingCallback(**cfg.scheduler.early_stopping, minimize=False),
            # TODO WandbLogger works poorly with multistage right now
            WandbLogger(project=cfg.project, config=dict(cfg)),
            # CheckpointCallback(save_n_best=cfg.checkpoint.save_n_best),
        ]

        # Training
        runner = SupervisedRunner(device=device, input_key="image", input_target_key="mask")

        # TODO Scheduler does not work now, every stage restarts from base lr
        scheduler_warm_restart = optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=[1, 2],
            gamma=10,
        )

        for i, (size, num_epochs) in enumerate(zip(cfg.data.sizes, cfg.train.num_epochs)):
            scale = size / 1024
            print(f"Training stage {i}, scale {scale}, size {size}, epochs {num_epochs}")

            # Datasets
            (
                train_ds,
                valid_ds,
                train_images,
                val_images,
            ) = D.get_train_valid_datasets_from_path(
                # path=(cwd / cfg.data.path),
                path=(cwd / f"data/hubmap-{size}x{size}/"),
                train_ids=cfg.data.train_ids,
                valid_ids=cfg.data.valid_ids,
                seed=cfg.seed,
                valid_split=cfg.data.valid_split,
                mean=cfg.data.mean,
                std=cfg.data.std,
                transforms=transforms,
                stats=stats,
            )

            train_bs = int(cfg.loader.train_bs / (scale**2))
            valid_bs = int(cfg.loader.valid_bs / (scale**2))
            print(
                f"train: {len(train_ds)}; bs {train_bs}",
                f"valid: {len(valid_ds)}, bs {valid_bs}",
            )

            # Data loaders
            data_loaders = D.get_data_loaders(
                train_ds=train_ds,
                valid_ds=valid_ds,
                train_bs=train_bs,
                valid_bs=valid_bs,
                num_workers=cfg.loader.num_workers,
            )

            # Select scheduler
            scheduler = get_scheduler(
                name=cfg.scheduler.type,
                optimizer=optimizer,
                num_epochs=num_epochs * (len(data_loaders["train"]) if cfg.scheduler.mode == "batch" else 1),
                eta_min=scheduler_warm_restart.get_last_lr()[0] / cfg.scheduler.eta_min_factor,
                plateau=cfg.scheduler.plateau,
            )

            runner.train(
                model=model,
                criterion=criterion,
                optimizer=optimizer,
                scheduler=scheduler,
                callbacks=callbacks,
                logdir=cfg.train.logdir,
                loaders=data_loaders,
                num_epochs=num_epochs,
                verbose=True,
                main_metric=cfg.train.main_metric,
                load_best_on_end=True,
                minimize_metric=False,
                check=cfg.check,
                fp16=dict(amp=cfg.amp),
            )

            # Set new initial LR for the optimizer after restart
            scheduler_warm_restart.step()
            print(f"New LR for warm restart {scheduler_warm_restart.get_last_lr()[0]}")

            # Find optimal threshold for dice score
            model.eval()
            best_th, dices = find_dice_threshold(model, data_loaders["valid"])
            print("Best dice threshold", best_th, np.max(dices[1]))
            np.save(f"dices_{size}.npy", dices)
    else:
        print("Validation only")
        # Datasets
        size = cfg.data.sizes[-1]
        train_ds, valid_ds = D.get_train_valid_datasets_from_path(
            # path=(cwd / cfg.data.path),
            path=(cwd / f"data/hubmap-{size}x{size}/"),
            train_ids=cfg.data.train_ids,
            valid_ids=cfg.data.valid_ids,
            seed=cfg.seed,
            valid_split=cfg.data.valid_split,
            mean=cfg.data.mean,
            std=cfg.data.std,
            transforms=transforms,
            stats=stats,
        )
        train_bs = int(cfg.loader.train_bs / (cfg.data.scale_factor**2))
        valid_bs = int(cfg.loader.valid_bs / (cfg.data.scale_factor**2))
        print(
            f"train: {len(train_ds)}; bs {train_bs}",
            f"valid: {len(valid_ds)}, bs {valid_bs}",
        )

        # Data loaders
        data_loaders = D.get_data_loaders(
            train_ds=train_ds,
            valid_ds=valid_ds,
            train_bs=train_bs,
            valid_bs=valid_bs,
            num_workers=cfg.loader.num_workers,
        )

        # Find optimal threshold for dice score
        model.eval()
        best_th, dices = find_dice_threshold(model, data_loaders["valid"])
        print("Best dice threshold", best_th, np.max(dices[1]))
        np.save("dices_val.npy", dices)

    # # Load best checkpoint
    # checkpoint_path = Path(cfg.train.logdir) / "checkpoints/best.pth"
    # if checkpoint_path.exists():
    #     print(f"\nLoading checkpoint {str(checkpoint_path)}")
    #     state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"))[
    #         "model_state_dict"
    #     ]
    #     model.load_state_dict(state_dict)
    #     del state_dict
    # model = model.to(device)

    # Load config for updating with threshold and metric
    # (otherwise loading does not work)
    cfg = OmegaConf.load(".hydra/config.yaml")
    cfg.threshold = float(best_th)

    # Evaluate on full-size images if valid_ids is non-empty
    df_train = pd.read_csv(cwd / "data/train.csv")
    df_train = {r["id"]: r["encoding"] for r in df_train.to_dict(orient="records")}

    dices = []
    unique_ids = sorted(
        set(
            str(p).split("/")[-1].split("_")[0]
            for p in (cwd / cfg.data.path / "train").iterdir()))
    size = cfg.data.sizes[-1]
    scale = size / 1024
    for image_id in cfg.data.valid_ids:
        image_name = unique_ids[image_id]
        print(f"\nValidate for {image_name}")

        rle_pred, shape = inference_one(
            image_path=(cwd / f"data/train/{image_name}.tiff"),
            target_path=Path("."),
            cfg=cfg,
            model=model,
            scale_factor=scale,
            tile_size=cfg.data.tile_size,
            tile_step=cfg.data.tile_step,
            threshold=best_th,
            save_raw=False,
            tta_mode=None,
            weight="pyramid",
            device=device,
            filter_crops="tissue",
            stats=stats,
        )

        print("Predict", shape)
        pred = rle_decode(rle_pred["predicted"], shape)
        mask = rle_decode(df_train[image_name], shape)
        assert pred.shape == mask.shape, f"pred {pred.shape}, mask {mask.shape}"
        assert pred.shape == shape, f"pred {pred.shape}, expected {shape}"
        dices.append(
            dice(
                torch.from_numpy(pred).type(torch.uint8),
                torch.from_numpy(mask).type(torch.uint8),
                threshold=None,
                activation="none",
            ))
    print("Full image dice:", np.mean(dices))
    OmegaConf.save(cfg, ".hydra/config.yaml")
    return
def test_get_original_cwd():
    orig = "/foo/bar"
    cfg = OmegaConf.create({"hydra": {"runtime": {"cwd": orig}}})
    HydraConfig().set_config(cfg)
    assert utils.get_original_cwd() == orig
def __init__(self, hydra_cfg, logger):
    self.logger = logger
    self.hydra_cfg = hydra_cfg
    self.seed = hydra_cfg['parameters']['seed']
    self.metric = hydra_cfg['parameters']['metric']
    self.device = torch.device(
        'cuda:{}'.format(hydra_cfg['parameters']['gpu_id'])
        if torch.cuda.is_available() else 'cpu')

    working_dir = utils.get_original_cwd() + '/'
    training_path = working_dir + hydra_cfg['dataset']['path'] + hydra_cfg['dataset']['train_fname']
    is_replaced_OOV = hydra_cfg['parameters']['replace_OOV'] > 0

    # load embeddings
    pretrained_path = hydra_cfg['parameters']['pre_trained']
    pretrained_vocab = {}
    if pretrained_path:
        pretrained_path = working_dir + hydra_cfg['parameters']['pre_trained']
        self.logger.info('Loading pre-trained word embeddings {}\n'.format(pretrained_path))
        pretrained_w2v = KeyedVectors.load_word2vec_format(fname=pretrained_path)
        pretrained_vocab = set(pretrained_w2v.vocab.keys())
        # pre-trained vectors are only supported for unigram models
        assert hydra_cfg['parameters']['ngram'] == 1

    self.dictionary = SupervisedDictionary(
        replace_OOV_word=is_replaced_OOV,
        min_count=hydra_cfg['parameters']['min_count'],
        replace_word='<OOV>',
        size_word_n_gram=hydra_cfg['parameters']['ngram'],
        word_n_gram_min_count=hydra_cfg['parameters']['word_n_gram_min_count'],
        label_separator=hydra_cfg['parameters']['label_separator'],
        line_break_word='')

    self.logger.info('Use {}\n'.format(self.device))
    self.dictionary.fit(training_path)

    if pretrained_vocab:
        self.dictionary.update_vocab_from_word_set(pretrained_vocab)

    self.train_set, self.val_set = get_datasets(
        cfg=hydra_cfg,
        dictionary=self.dictionary,
        working_dir=working_dir,
        training_path=training_path,
        include_test=False)

    pretrained_word_vectors = None
    dim = self.hydra_cfg['parameters']['dim']
    self.pooling = self.hydra_cfg['parameters']['pooling']
    OOV_initialized_method = self.hydra_cfg['parameters']['initialize_oov']
    self.is_freeze = self.hydra_cfg['parameters']['freeze'] > 0

    if pretrained_vocab:
        # initialize the embedding matrix from the loaded pre-trained vectors
        pretrained_word_vectors = initialise_word_embeddigns_from_pretrained_embeddings(
            pretrained_w2v, self.dictionary, OOV_initialized_method,
            rnd=np.random.RandomState(self.seed))
        dim = pretrained_word_vectors.shape[1]

    self.pretrained_word_vectors = pretrained_word_vectors
    self.dim = dim

    self.logger.info('#training_data: {}, #val_data: {}\n'.format(
        len(self.train_set), len(self.val_set)))
    self.logger.info(
        'In training data, the size of word vocab: {} ngram vocab: {}, total: {} \n'.format(
            self.dictionary.size_word_vocab,
            self.dictionary.size_ngram_vocab,
            self.dictionary.size_total_vocab))
def main(cfg):
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    stream_handler.terminator = ''
    logger.addHandler(stream_handler)

    check_hydra_conf(cfg)

    metric = cfg['parameters']['metric']
    if metric == 'loss':
        direction = 'minimize'
    else:
        direction = 'maximize'

    sampler = TPESampler(seed=cfg['parameters']['seed'], n_startup_trials=2)
    pruner = optuna.pruners.HyperbandPruner(max_resource=cfg['parameters']['epochs'])
    study = optuna.create_study(direction=direction, sampler=sampler, pruner=pruner)
    objective = Objective(cfg, logger)
    study.optimize(objective, n_trials=cfg['optuna']['num_trials'], n_jobs=cfg['optuna']['n_jobs'])

    # logging_file
    trial = study.best_trial
    logger.info('\nVal. loss: {:.4f}, Val acc.: {:.1f}%'.format(
        trial.user_attrs['val_loss'], trial.user_attrs['val_acc'] * 100))
    for key, value in trial.params.items():
        logger.info(' {}: {}'.format(key, value))

    # remove poor models
    target = Path(trial.user_attrs['model_path'])
    for path in target.parent.glob('*.pt'):
        if path != target:
            path.unlink()

    # evaluation
    # load test data loader
    working_dir = utils.get_original_cwd() + '/'
    test_set = SentenceDataset(
        *objective.dictionary.transform(working_dir + cfg['dataset']['path'] + cfg['dataset']['test_fname']),
        objective.dictionary.size_word_vocab,
        train=False)
    test_data_loader = torch.utils.data.dataloader.DataLoader(
        test_set, batch_size=1, shuffle=False, num_workers=1)

    device = objective.device

    # init model
    model = SupervisedFastText(
        V=objective.dictionary.size_total_vocab,
        num_classes=len(objective.dictionary.label_vocab),
        embedding_dim=objective.dim,
        pretrained_emb=None,
        freeze=True,
        pooling=objective.pooling).to(device)

    # load model
    model.load_state_dict(torch.load(target, map_location=device))
    model = model.to(device)

    loss, acc = evaluation(model, device, test_data_loader, divide_by_num_data=True)

    results = trial.user_attrs
    results['test_loss'] = loss
    results['test_acc'] = acc

    output_path_fname = os.getcwd() + '/' + cfg['parameters']['logging_file']
    logger.info('Saving training history and evaluation scores in {}'.format(output_path_fname))
    with open(output_path_fname, 'w') as log_file:
        json.dump(results, log_file)
def main(cfg):
    logger.info("=" * 20)

    # Current working directory
    cwd = utils.get_original_cwd()
    cfg.cwd = cwd

    # Model dictionary
    __MODEL__ = {
        'transformer': models.Transformer,
        'rnn': models.RNN,
        'cnn': models.CNN,
        'bert': models.Bert
    }

    # Device
    if cfg.use_GPU and torch.cuda.is_available():
        device = torch.device('cuda', cfg.gpu_id)
    else:
        device = torch.device('cpu')
    logger.info("Device: {}".format(device))

    # Preprocess raw data
    if cfg.preprocess:
        logger.info("Preprocessing...")
        preprocess(cfg)  # once is quite enough

    # Make datasets
    logger.info("Making Dataset...")
    train_dataset = makeDataset(cfg, train_flag=True)  # len: 100000
    test_dataset = makeDataset(cfg, train_flag=False)  # len: 25000

    # Model
    logger.info("Model:{}".format(cfg.model_name))
    model = __MODEL__[cfg.model_name](cfg).to(device)
    optimizer = optim.Adam(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)
    criterion = nn.CrossEntropyLoss()

    # Train
    if not cfg.test_only:
        logger.info("=" * 5 + " Start %d-fold training! " % cfg.k_fold + "=" * 5)
        k_fold_trainer = KFoldTrainer(cfg, train_dataset, model, device, optimizer, criterion)
        best_epoch, best_acc_v = -1, -1
        for epoch in range(cfg.EPOCH):
            k_fold_trainer.set_epoch(epoch)
            loss_t, acc_t, loss_v, acc_v = k_fold_trainer.kFoldTrain()
            if acc_v > best_acc_v:
                best_epoch = epoch
                best_acc_v = acc_v
                torch.save(model, os.path.join(cwd, "{}model_{}.pt".format(cfg.model_dir, cfg.model_name)))
                logger.info("Model Updated!")
            logger.info(
                "Current Epoch(%d):\nTrain loss: %.6f Train accuracy: %.2f%% Valid loss: %.6f Valid Accuracy: %.2f%%"
                % (epoch, loss_t, acc_t * 100, loss_v, acc_v * 100))
        logger.info("Best Epoch(%d):\n Accuracy: %.2f%%" % (best_epoch, best_acc_v * 100))
        logger.info("=" * 5 + " Training finished. " + "=" * 5)

    # Test
    logger.info("=" * 5 + " Start testing! " + "=" * 5)
    tester = Tester(cfg, test_dataset, model, device)
    tester.test()
    logger.info("Saving results to {}. ".format(os.path.join(cwd, "{}".format(cfg.result_file))))
    logger.info("=" * 5 + " Test finished. " + "=" * 5)
def test_get_original_cwd() -> None:
    orig = "/foo/bar"
    cfg = OmegaConf.create({"hydra": {"runtime": {"cwd": orig}}})
    assert isinstance(cfg, DictConfig)
    HydraConfig().set_config(cfg)
    assert utils.get_original_cwd() == orig
def setup(config, device, collate_fn=collate_fn_default):
    """Setup for training

    Args:
        config (dict): configuration for training
        device (torch.device): device to use for training
        collate_fn (callable, optional): collate function. Defaults to collate_fn_default.

    Returns:
        (tuple): tuple containing model, optimizer, learning rate scheduler,
            data loaders, tensorboard writer, logger, and scalers.
    """
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))
    logger.info(f"PyTorch version: {torch.__version__}")

    if torch.cuda.is_available():
        from torch.backends import cudnn

        cudnn.benchmark = config.train.cudnn.benchmark
        cudnn.deterministic = config.train.cudnn.deterministic
        logger.info(f"cudnn.deterministic: {cudnn.deterministic}")
        logger.info(f"cudnn.benchmark: {cudnn.benchmark}")
        if torch.backends.cudnn.version() is not None:
            logger.info(f"cuDNN version: {torch.backends.cudnn.version()}")

    logger.info(f"Random seed: {config.seed}")
    init_seed(config.seed)

    if config.train.use_detect_anomaly:
        torch.autograd.set_detect_anomaly(True)
        logger.info("Set to use torch.autograd.detect_anomaly")

    if "use_amp" in config.train and config.train.use_amp:
        logger.info("Use mixed precision training")
        grad_scaler = GradScaler()
    else:
        grad_scaler = None

    # Model
    model = hydra.utils.instantiate(config.model.netG).to(device)
    logger.info("Number of trainable params: {:.3f} million".format(
        num_trainable_params(model) / 1000000.0))
    logger.info(model)

    # Optimizer
    optimizer_class = getattr(optim, config.train.optim.optimizer.name)
    optimizer = optimizer_class(model.parameters(), **config.train.optim.optimizer.params)

    # Scheduler
    lr_scheduler_class = getattr(optim.lr_scheduler, config.train.optim.lr_scheduler.name)
    lr_scheduler = lr_scheduler_class(optimizer, **config.train.optim.lr_scheduler.params)

    # DataLoader
    data_loaders = get_data_loaders(config.data, collate_fn, logger)

    set_epochs_based_on_max_steps_(config.train, len(data_loaders["train_no_dev"]), logger)

    # Resume
    if (config.train.resume.checkpoint is not None and len(config.train.resume.checkpoint) > 0):
        logger.info("Load weights from %s", config.train.resume.checkpoint)
        checkpoint = torch.load(to_absolute_path(config.train.resume.checkpoint))
        model.load_state_dict(checkpoint["state_dict"])
        if config.train.resume.load_optimizer:
            logger.info("Load optimizer state")
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state"])

    if config.data_parallel:
        model = nn.DataParallel(model)

    # Mlflow
    if config.mlflow.enabled:
        mlflow.set_tracking_uri("file://" + get_original_cwd() + "/mlruns")
        mlflow.set_experiment(config.mlflow.experiment)
        # NOTE: disable tensorboard if mlflow is enabled
        writer = None
        logger.info("Using mlflow instead of tensorboard")
    else:
        # Tensorboard
        writer = SummaryWriter(to_absolute_path(config.train.log_dir))

    # Scalers
    if "in_scaler_path" in config.data and config.data.in_scaler_path is not None:
        in_scaler = joblib.load(to_absolute_path(config.data.in_scaler_path))
        in_scaler = MinMaxScaler(in_scaler.min_, in_scaler.scale_,
                                 in_scaler.data_min_, in_scaler.data_max_)
    else:
        in_scaler = None
    if "out_scaler_path" in config.data and config.data.out_scaler_path is not None:
        out_scaler = joblib.load(to_absolute_path(config.data.out_scaler_path))
        out_scaler = StandardScaler(out_scaler.mean_, out_scaler.var_, out_scaler.scale_)
    else:
        out_scaler = None

    return (
        model,
        optimizer,
        lr_scheduler,
        grad_scaler,
        data_loaders,
        writer,
        logger,
        in_scaler,
        out_scaler,
    )
def test_get_original_cwd(hydra_restore_singletons: Any) -> None:
    orig = "/foo/AClass"
    cfg = OmegaConf.create({"hydra": HydraConf(runtime=RuntimeConf(cwd=orig))})
    assert isinstance(cfg, DictConfig)
    HydraConfig.instance().set_config(cfg)
    assert utils.get_original_cwd() == orig
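A hedged companion sketch in the same style (not from the source): `to_absolute_path` resolves relative paths against the same `hydra.runtime.cwd`, so under a config like the one above it should behave as follows on POSIX systems.

def test_to_absolute_path_relative(hydra_restore_singletons: Any) -> None:
    cfg = OmegaConf.create({"hydra": {"runtime": {"cwd": "/foo/bar"}}})
    HydraConfig.instance().set_config(cfg)
    # Relative inputs are joined onto the original cwd; absolute inputs pass through.
    assert utils.to_absolute_path("data/x.csv") == "/foo/bar/data/x.csv"
    assert utils.to_absolute_path("/abs/x.csv") == "/abs/x.csv"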
def setup_cyclegan(config, device, collate_fn=collate_fn_default):
    """Setup for training CycleGAN

    Args:
        config (dict): configuration for training
        device (torch.device): device to use for training
        collate_fn (callable, optional): collate function. Defaults to collate_fn_default.

    Returns:
        (tuple): tuple containing model, optimizer, learning rate scheduler,
            data loaders, tensorboard writer, logger, and scalers.
    """
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))
    logger.info(f"PyTorch version: {torch.__version__}")

    if torch.cuda.is_available():
        from torch.backends import cudnn

        cudnn.benchmark = config.train.cudnn.benchmark
        cudnn.deterministic = config.train.cudnn.deterministic
        logger.info(f"cudnn.deterministic: {cudnn.deterministic}")
        logger.info(f"cudnn.benchmark: {cudnn.benchmark}")
        if torch.backends.cudnn.version() is not None:
            logger.info(f"cuDNN version: {torch.backends.cudnn.version()}")

    logger.info(f"Random seed: {config.seed}")
    init_seed(config.seed)

    if config.train.use_detect_anomaly:
        torch.autograd.set_detect_anomaly(True)
        logger.info("Set to use torch.autograd.detect_anomaly")

    if "use_amp" in config.train and config.train.use_amp:
        logger.info("Use mixed precision training")
        grad_scaler = GradScaler()
    else:
        grad_scaler = None

    # Model G
    netG_A2B = hydra.utils.instantiate(config.model.netG).to(device)
    netG_B2A = hydra.utils.instantiate(config.model.netG).to(device)
    logger.info("[Generator] Number of trainable params: {:.3f} million".format(
        num_trainable_params(netG_A2B) / 1000000.0))
    logger.info(netG_A2B)

    # Optimizer and LR scheduler for G
    optG, schedulerG = _instantiate_optim_cyclegan(config.train.optim.netG, netG_A2B, netG_B2A)

    # Model D
    netD_A = hydra.utils.instantiate(config.model.netD).to(device)
    netD_B = hydra.utils.instantiate(config.model.netD).to(device)
    logger.info("[Discriminator] Number of trainable params: {:.3f} million".format(
        num_trainable_params(netD_A) / 1000000.0))
    logger.info(netD_A)

    # Optimizer and LR scheduler for D
    optD, schedulerD = _instantiate_optim_cyclegan(config.train.optim.netD, netD_A, netD_B)

    # DataLoader
    data_loaders = get_data_loaders(config.data, collate_fn, logger)

    set_epochs_based_on_max_steps_(config.train, len(data_loaders["train_no_dev"]), logger)

    # Resume
    # TODO
    # _resume(logger, config.train.resume.netG, netG, optG, schedulerG)
    # _resume(logger, config.train.resume.netD, netD, optD, schedulerD)

    if config.data_parallel:
        netG_A2B = nn.DataParallel(netG_A2B)
        netG_B2A = nn.DataParallel(netG_B2A)
        netD_A = nn.DataParallel(netD_A)
        netD_B = nn.DataParallel(netD_B)

    # Mlflow
    if config.mlflow.enabled:
        mlflow.set_tracking_uri("file://" + get_original_cwd() + "/mlruns")
        mlflow.set_experiment(config.mlflow.experiment)
        # NOTE: disable tensorboard if mlflow is enabled
        writer = None
        logger.info("Using mlflow instead of tensorboard")
    else:
        # Tensorboard
        writer = SummaryWriter(to_absolute_path(config.train.log_dir))

    # Scalers
    if "in_scaler_path" in config.data and config.data.in_scaler_path is not None:
        in_scaler = joblib.load(to_absolute_path(config.data.in_scaler_path))
        if isinstance(in_scaler, SKMinMaxScaler):
            in_scaler = MinMaxScaler(
                in_scaler.min_,
                in_scaler.scale_,
                in_scaler.data_min_,
                in_scaler.data_max_,
            )
    else:
        in_scaler = None
    if "out_scaler_path" in config.data and config.data.out_scaler_path is not None:
        out_scaler = joblib.load(to_absolute_path(config.data.out_scaler_path))
        out_scaler = StandardScaler(out_scaler.mean_, out_scaler.var_, out_scaler.scale_)
    else:
        out_scaler = None

    return (
        (netG_A2B, netG_B2A, optG, schedulerG),
        (netD_A, netD_B, optD, schedulerD),
        grad_scaler,
        data_loaders,
        writer,
        logger,
        in_scaler,
        out_scaler,
    )
def main(cfg):
    cwd = utils.get_original_cwd()
    cfg.cwd = cwd
    cfg.pos_size = 2 * cfg.pos_limit + 2
    logger.info(f'\n{cfg.pretty()}')

    __Model__ = {
        'cnn': models.PCNN,
        'rnn': models.BiLSTM,
        'transformer': models.Transformer,
        'gcn': models.GCN,
        'capsule': models.Capsule,
        'lm': models.LM,
    }

    # device
    if cfg.use_gpu and torch.cuda.is_available():
        device = torch.device('cuda', cfg.gpu_id)
    else:
        device = torch.device('cpu')
    logger.info(f'device: {device}')

    # unless the preprocessing itself changes, this step is best left commented out,
    # so the data is not re-preprocessed on every run
    if cfg.preprocess:
        preprocess(cfg)

    train_data_path = os.path.join(cfg.cwd, cfg.out_path, 'train.pkl')
    valid_data_path = os.path.join(cfg.cwd, cfg.out_path, 'valid.pkl')
    test_data_path = os.path.join(cfg.cwd, cfg.out_path, 'test.pkl')
    vocab_path = os.path.join(cfg.cwd, cfg.out_path, 'vocab.pkl')

    if cfg.model_name == 'lm':
        vocab_size = None
    else:
        vocab = load_pkl(vocab_path)
        vocab_size = vocab.count
    cfg.vocab_size = vocab_size

    train_dataset = CustomDataset(train_data_path)
    valid_dataset = CustomDataset(valid_data_path)
    test_dataset = CustomDataset(test_data_path)

    train_dataloader = DataLoader(train_dataset, batch_size=cfg.batch_size, shuffle=True, collate_fn=collate_fn(cfg))
    valid_dataloader = DataLoader(valid_dataset, batch_size=cfg.batch_size, shuffle=True, collate_fn=collate_fn(cfg))
    test_dataloader = DataLoader(test_dataset, batch_size=cfg.batch_size, shuffle=True, collate_fn=collate_fn(cfg))

    model = __Model__[cfg.model_name](cfg)
    model.to(device)
    logger.info(f'\n {model}')

    optimizer = optim.Adam(model.parameters(), lr=cfg.learning_rate, weight_decay=cfg.weight_decay)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=cfg.lr_factor, patience=cfg.lr_patience)
    criterion = nn.CrossEntropyLoss()

    best_f1, best_epoch = -1, 0
    es_loss, es_f1, es_epoch, es_patience, best_es_epoch, best_es_f1, es_path, best_es_path = 1e8, -1, 0, 0, 0, -1, '', ''
    train_losses, valid_losses = [], []

    if cfg.show_plot and cfg.plot_utils == 'tensorboard':
        writer = SummaryWriter('tensorboard')
    else:
        writer = None

    logger.info('=' * 10 + ' Start training ' + '=' * 10)

    for epoch in range(1, cfg.epoch + 1):
        manual_seed(cfg.seed + epoch)
        train_loss = train(epoch, model, train_dataloader, optimizer, criterion, device, writer, cfg)
        valid_f1, valid_loss = validate(epoch, model, valid_dataloader, criterion, device, cfg)
        scheduler.step(valid_loss)
        model_path = model.save(epoch, cfg)
        # logger.info(model_path)

        train_losses.append(train_loss)
        valid_losses.append(valid_loss)
        if best_f1 < valid_f1:
            best_f1 = valid_f1
            best_epoch = epoch
        # use the valid loss as the early-stopping criterion
        if es_loss > valid_loss:
            es_loss = valid_loss
            es_f1 = valid_f1
            es_epoch = epoch
            es_patience = 0
            es_path = model_path
        else:
            es_patience += 1
            if es_patience >= cfg.early_stopping_patience:
                best_es_epoch = es_epoch
                best_es_f1 = es_f1
                best_es_path = es_path

    if cfg.show_plot:
        if cfg.plot_utils == 'matplot':
            plt.plot(train_losses, 'x-')
            plt.plot(valid_losses, '+-')
            plt.legend(['train', 'valid'])
            plt.title('train/valid comparison loss')
            plt.show()

        if cfg.plot_utils == 'tensorboard':
            for i in range(len(train_losses)):
                writer.add_scalars('train/valid_comparison_loss', {
                    'train': train_losses[i],
                    'valid': valid_losses[i]
                }, i)
            writer.close()

    logger.info(f'best(valid loss quota) early stopping epoch: {best_es_epoch}, '
                f'this epoch macro f1: {best_es_f1:0.4f}')
    logger.info(f'this model save path: {best_es_path}')
    logger.info(f'total {cfg.epoch} epochs, best(valid macro f1) epoch: {best_epoch}, '
                f'this epoch macro f1: {best_f1:.4f}')

    validate(-1, model, test_dataloader, criterion, device, cfg)
def train(cfg):
    logger = get_logger("./log/")
    # lossutil = LossUtil("./log/")
    logger.info(cfg)

    ROOT = Path(utils.get_original_cwd()).parent
    print(ROOT)
    data_path = ROOT.joinpath("datasets/hsimg/data.tiff")
    himg = imread(str(data_path))
    himg = himg / 2**16
    # restrict the spectral image's channel range to the range of the light source
    himg = himg[:, :, :44]
    himg = np.where(himg < 0, 0, himg)

    nhimg = np.empty((512, 512, himg.shape[2]))
    for i in range(himg.shape[2]):
        logger.info("resize %d channel..." % i)
        ch = himg[:, :, i]
        nhimg[:, :, i] = cv2.resize(255 * ch, (512, 512), interpolation=cv2.INTER_LANCZOS4)
        logger.info("resize %d channel... done!" % i)
    himg = nhimg / 255

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    if cfg.image.type == "denoise":
        sigma = 0.1
        himg = get_noisy_img(himg, sigma)
        mask = None
    elif "inpaint" in cfg.image.type:
        mask = make_mask_himg(cfg, himg)
        logger.info(mask.dtype)
        mask = torch.from_numpy(mask)
        mask = mask[None, :].permute(0, 3, 1, 2).to(device)
        logger.info(mask.shape)
        logger.info(mask.dtype)
    else:
        raise NotImplementedError('[%s] is not Implemented' % cfg.image.type)

    transform = transforms.Compose([transforms.ToTensor()])
    himg = transform(himg)
    himg = himg[None, :].float().to(device)

    dist_path = ROOT.joinpath("datasets/csvs/D65.csv")
    cmf_path = ROOT.joinpath("datasets/csvs/CIE1964-10deg-XYZ.csv")
    tiff2rgb = Tiff2rgb(dist_path, cmf_path)

    # save the target image
    if cfg.image.type == "denoise":
        img = tiff2rgb.tiff_to_rgb(himg[0].permute(1, 2, 0))
    else:
        tmp = himg * mask
        logger.info(tmp.shape)
        logger.info(tmp.dtype)
        img = tiff2rgb.tiff_to_rgb(tmp[0].permute(1, 2, 0))
    result_dir = Path("./result_imgs/")
    if not result_dir.exists():
        Path(result_dir).mkdir(parents=True)
    img.save(result_dir.joinpath("target.png"))

    logger.info(himg.shape)
    logger.info(himg.dtype)

    np.random.seed(cfg.base_options.seed)
    torch.manual_seed(cfg.base_options.seed)
    torch.cuda.manual_seed_all(cfg.base_options.seed)
    torch.backends.cudnn.benchmark = False

    model = DIP(cfg, himg, mask)
    logger.info(model.generator)
    if cfg.base_options.debugging:
        summary(model.generator, (1, 512, 512))
        # summary(model.generator, (3, 256, 256))
        print('Debugging: exiting partway through!!!')
        exit(0)

    input_noise = torch.randn(1, 1, himg.shape[2], himg.shape[3], device=device)
    logger.info(input_noise.dtype)

    for epoch in range(cfg.base_options.epochs):
        if epoch % cfg.base_options.print_freq == 0:
            epoch_start_time = time.time()
        if not cfg.base_options.do_fix_noise:
            input_noise = torch.randn(1, 1, himg.shape[2], himg.shape[3],
                                      dtype=torch.float32, device=device)
        model.forward(input_noise)
        # logger.info(model.gimg.shape)
        # logger.info(model.gimg.dtype)
        model.optimize_parameters()
        # model.update_learning_rate()
        losses = model.get_current_losses()

        if epoch % cfg.base_options.print_freq == 0:
            num = cfg.base_options.print_freq
            t1 = (time.time() - epoch_start_time)
            t2 = float(t1) / num
            print("time for %d epochs: %.3f, average: %.3f" % (num, t1, t2))
            print_losses(epoch, losses)
        if cfg.base_options.save_model_freq != -1 and epoch % cfg.base_options.save_model_freq == 0:
            model.save_networks(epoch)
            model.save_losses()
        if epoch % cfg.base_options.save_img_freq == 0:
            new_img = model.gimg.detach()[0]
            # CHW -> HWC
            new_img = new_img.permute(1, 2, 0)
            # print(new_img.shape, new_img.device, new_img.dtype)
            img = tiff2rgb.tiff_to_rgb(new_img)
            img.save(result_dir.joinpath("%05d.png" % epoch))

    if cfg.base_options.save_model_freq != -1:
        model.save_networks("finish")
        model.save_losses()

    # video of the experiment results
    freq = cfg.base_options.save_img_freq
    epochs = cfg.base_options.epochs
    result_imgs = ["./result_imgs/%05d.png" % (epoch) for epoch in range(epochs) if epoch % freq == 0]
    make_video("./", result_imgs, width=himg.shape[3], height=himg.shape[3])
def main(cfg):
    cwd = utils.get_original_cwd()
    cfg.cwd = cwd
    cfg.pos_size = 2 * cfg.pos_limit + 2
    print(cfg.pretty())

    with open('res_sample0227_v1.csv', 'w') as csvf:
        writer = csv.writer(csvf)
        writer.writerow(["sentence", "relation", "head", "head_offset", "tail", "tail_offset",
                         "prob", "stock_name", "stock_code", "industry_code"])

    path = '/data/prj2020/EnterpriseSpider/news/enty0126_all.csv'
    # enty_df = pd.read_csv(path).iloc[33376:, :]
    enty_df = pd.read_csv(path)
    all_data = []
    all_rels = []
    all_pred = []
    all_prob = []
    oom_times = 0
    for index, row in enty_df.iterrows():
        # get the instance to predict on
        print(row)
        instance = {
            'sentence': row['sentence'],
            'head': row['head'],
            'tail': row['tail'],
            'head_type': '企业',  # '企业' = "enterprise"
            'tail_type': '企业'
        }
        data = [instance]
        # preprocess data
        data, rels = _preprocess_data(data, cfg)

        # model
        __Model__ = {
            'cnn': models.PCNN,
            'rnn': models.BiLSTM,
            'transformer': models.Transformer,
            'gcn': models.GCN,
            'capsule': models.Capsule,
            'lm': models.LM,
        }

        # prediction is best done on the CPU
        # cfg.use_gpu = False
        if cfg.use_gpu and torch.cuda.is_available():
            device = torch.device('cuda', cfg.gpu_id)
        else:
            device = torch.device('cpu')
        # logger.info(f'device: {device}')

        model = __Model__[cfg.model_name](cfg)
        # logger.info(f'model name: {cfg.model_name}')
        # logger.info(f'\n {model}')
        model.load(cfg.fp, device=device)
        model.to(device)
        model.eval()

        all_data.append(data)
        all_rels.append(rels)

        x = dict()
        x['word'], x['lens'] = torch.tensor([data[0]['token2idx']]), torch.tensor([data[0]['seq_len']])
        if cfg.model_name != 'lm':
            x['head_pos'], x['tail_pos'] = torch.tensor([data[0]['head_pos']]), torch.tensor([data[0]['tail_pos']])
            if cfg.model_name == 'cnn':
                if cfg.use_pcnn:
                    x['pcnn_mask'] = torch.tensor([data[0]['entities_pos']])

        for key in x.keys():
            x[key] = x[key].to(device)

        try:
            with torch.no_grad():
                y_pred = model(x)
                y_pred = torch.softmax(y_pred, dim=-1)[0]
                prob = y_pred.max().item()
                prob_rel = list(rels.keys())[y_pred.argmax().item()]
                all_pred.append(prob_rel)
                all_prob.append(prob)
                logger.info(f"the relation between \"{data[0]['head']}\" and \"{data[0]['tail']}\" in the sentence is \"{prob_rel}\", with confidence {prob:.2f}.")
                with open('res_sample0227_v1.csv', 'a+') as csvf:
                    writer = csv.writer(csvf)
                    writer.writerow([row['sentence'], prob_rel, row['head'], row['head_offset'],
                                     row['tail'], row['tail_offset'], prob,
                                     row['stock_name'], row['stock_code'], row['industry_code']])
        except RuntimeError as exception:
            if "out of memory" in str(exception):
                oom_times += 1
                logger.info("WARNING: ran out of memory, times: {}".format(oom_times))
                if hasattr(torch.cuda, 'empty_cache'):
                    torch.cuda.empty_cache()
            else:
                logger.info(str(exception))
                raise exception

    all_res = {
        'sentence': enty_df['sentence'],
        'relation': all_pred,
        'head': enty_df['head'],
        'head_pos': enty_df['head_offset'],
        'tail': enty_df['tail'],
        'tail_pos': enty_df['tail_offset'],
        'prob': all_prob,
        "stock_name": enty_df['stock_name'],
        "stock_code": enty_df['stock_code'],
        "industry_code": enty_df['industry_code']
    }
    all_res_df = pd.DataFrame(all_res)
    all_res_df.to_csv('res_sample0227_v101.csv', index=False, header=1)
def train(dataset_cfg, model_cfg, training_cfg):
    image_root_folder = os.path.join(utils.get_original_cwd(), dataset_cfg.image_root_folder)
    img_transform = transforms.Compose([
        transforms.Resize((dataset_cfg.img_size, dataset_cfg.img_size)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    dataset = CustomImageFolder(image_root_folder, transform=img_transform,
                                sample_size=dataset_cfg.sample_size)

    model = models.__dict__[model_cfg.name](num_classes=training_cfg.n_clusters,
                                            initialize=model_cfg.initialize)
    model, already_trained_epoch = checkpoint_utils.load_latest_checkpoint(
        model, training_cfg.checkpoint, use_gpu)
    if use_gpu:
        model = model.cuda()

    deep_kmeans = DeepKmeans(n_clusters=training_cfg.n_clusters)
    model.train()
    criterion = nn.CrossEntropyLoss()

    if training_cfg.optimizer.name == 'sgd':
        optimizer = torch.optim.SGD(model.module.parameters(),
                                    lr=training_cfg.optimizer.lr,
                                    momentum=training_cfg.optimizer.momentum,
                                    weight_decay=10**training_cfg.optimizer.wd)
    else:
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=training_cfg.optimizer.lr,
                                     weight_decay=10**training_cfg.optimizer.wd)

    losses = AverageMeter()
    os.makedirs(os.path.dirname(training_cfg.log_file), exist_ok=True)

    for epoch in range(already_trained_epoch, training_cfg.num_epochs):
        features = extract_features(model, dataset, batch_size=training_cfg.batch_size)
        features = apply_dimensionality_reduction(
            features, pca_components=training_cfg.pca.component_size)
        pseudo_labels, kmeans_loss, nmi_previous = deep_kmeans.cluster(features)
        if not training_cfg.use_original_labels:
            dataset.set_pseudo_labels(pseudo_labels)

        nmi_orj = normalized_mutual_info_score(dataset.ori_labels, dataset.targets)
        logger.info('NMI against original assignment: {0:.3f}'.format(nmi_orj))
        acc, informational_acc, category_mapping = calculate_accuracy(
            dataset.ori_labels, dataset.targets)
        logger.info('Classification Acc:%s\tInformational Acc:%s\n' % (acc, informational_acc))

        if training_cfg.reinitialize:
            logger.debug('Reinitializing FC')
            model.reinitialize_fc()

        optimizer_tl = torch.optim.SGD(model.fc.parameters(),
                                       lr=training_cfg.optimizer.lr,
                                       momentum=training_cfg.optimizer.momentum,
                                       weight_decay=10**training_cfg.optimizer.wd)
        if use_gpu:
            model.cuda()
        model.train()

        sampler = UnifLabelSampler(N=int(len(dataset) * training_cfg.reassign),
                                   images_lists=dataset.targets,
                                   cluster_size=training_cfg.n_clusters)
        dataloader = DataLoader(dataset, batch_size=training_cfg.batch_size,
                                shuffle=False, num_workers=4, drop_last=False,
                                sampler=sampler)

        logger.info('Epoch [{}/{}] started'.format(epoch, training_cfg.num_epochs))
        for data in tqdm(dataloader,
                         total=int(len(dataset) * training_cfg.reassign / training_cfg.batch_size)):
            img, y, _ = data
            if use_gpu:
                img = Variable(img).cuda(non_blocking=True)
                y = y.cuda(non_blocking=True)

            # ===================forward=====================
            y_hat = model(img)
            loss = criterion(y_hat, y)

            # record loss
            losses.add({'loss_%s' % epoch: loss.item() / img.size(0)})

            # ===================backward====================
            optimizer.zero_grad()
            optimizer_tl.zero_grad()
            loss.backward()
            optimizer.step()
            optimizer_tl.step()

        # ===================log========================
        correct = 0
        total = 0
        with torch.no_grad():
            dataloader = DataLoader(dataset, batch_size=training_cfg.batch_size,
                                    shuffle=False, num_workers=4, drop_last=True)
            for data in dataloader:
                img, y, _ = data
                if use_gpu:
                    img = Variable(img).cuda(non_blocking=True)
                    y = y.cuda(non_blocking=True)
                y_hat = model(img)
                _, predicted = torch.max(y_hat.data, 1)
                total += y.size(0)
                correct += (predicted == y).sum().item()

        log = 'Epoch [%s/%s],\tLoss:%s,\tKmeans loss:%s\t' \
              'Acc:%s\tInformational acc:%s\tNetwork Acc:%s\tNMI Orj:%s\tNMI Previous:%s\n' % (
                  epoch, training_cfg.num_epochs, losses.get('loss_%s' % epoch), kmeans_loss,
                  acc, informational_acc, (100 * correct / total), nmi_orj, nmi_previous)
        logger.info(log)
        with open(training_cfg.log_file, mode='a') as f:
            f.write(log)

        if epoch % 5 == 0:
            checkpoint_utils.save_checkpoint(model, training_cfg.checkpoint, epoch)
        if use_gpu:
            torch.cuda.empty_cache()
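# --- Added sketch (not from the snippet above): a minimal, hypothetical Hydra
# entry point showing why the training code resolves its image folder against
# get_original_cwd(). Hydra changes the working directory to the per-run
# directory, so relative paths coming from the config must be re-anchored to the
# launch directory. The config group names (dataset/model/training) and the
# config_path/config_name values are assumptions for illustration only.
import os
import hydra
from hydra.utils import get_original_cwd
from omegaconf import DictConfig


@hydra.main(config_path="config", config_name="config")
def run(cfg: DictConfig) -> None:
    # resolve the dataset root against the directory the script was launched from
    image_root = os.path.join(get_original_cwd(), cfg.dataset.image_root_folder)
    print(f"resolved image root: {image_root}")
    # train(cfg.dataset, cfg.model, cfg.training) would then be called here


if __name__ == "__main__":
    run()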
def _call_children_scripts(self):
    # bookkeeping of spawned processes
    self._check_can_spawn_children()

    # DDP Environment variables
    os.environ["MASTER_ADDR"] = self.cluster_environment.main_address
    os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port)

    # allow the user to pass the node rank
    os.environ["NODE_RANK"] = str(self.cluster_environment.node_rank())
    os.environ["LOCAL_RANK"] = str(self.cluster_environment.local_rank())

    # Check if the current calling command looked like `python a/b/c.py` or `python -m a.b.c`
    # See https://docs.python.org/3/reference/import.html#main-spec
    if __main__.__spec__ is None:  # pragma: no-cover
        # Script called as `python a/b/c.py`
        # when user is using hydra find the absolute path
        path_lib = os.path.abspath if not _HYDRA_AVAILABLE else to_absolute_path

        # pull out the commands used to run the script and resolve the abs file path
        command = sys.argv
        try:
            full_path = path_lib(command[0])
        except Exception:
            full_path = os.path.abspath(command[0])

        command[0] = full_path
        # use the same python interpreter and actually running
        command = [sys.executable] + command
    else:  # Script called as `python -m a.b.c`
        command = [sys.executable, "-m", __main__.__spec__.name] + sys.argv[1:]

    # the visible devices tell us how many GPUs we want to use.
    # when the trainer script was called the device has already been scoped by the time
    # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone
    # but forward the GPUs selected via environment variables
    if self.parallel_devices is None:
        raise MisconfigurationException(
            "you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)")

    os.environ["WORLD_SIZE"] = f"{self.num_processes * self.num_nodes}"

    self.interactive_ddp_procs = []

    for local_rank in range(1, self.num_processes):
        env_copy = os.environ.copy()
        env_copy["LOCAL_RANK"] = f"{local_rank}"

        # remove env var if global seed not set
        if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy:
            del env_copy["PL_GLOBAL_SEED"]

        # start process
        # if hydra is available and initialized, make sure to set the cwd correctly
        cwd: Optional[str] = None
        if _HYDRA_AVAILABLE:
            if HydraConfig.initialized():
                cwd = get_original_cwd()
                os_cwd = f'"{os.getcwd()}"'
                command += [
                    f"hydra.run.dir={os_cwd}",
                    f"hydra.job.name=train_ddp_process_{local_rank}"
                ]
        proc = subprocess.Popen(command, env=env_copy, cwd=cwd)
        self.interactive_ddp_procs.append(proc)

        # starting all processes at once can cause issues
        # with dataloaders delay between 1-10 seconds
        delay = np.random.uniform(1, 5, 1)[0]
        sleep(delay)

    self._rank_0_has_called_call_children_scripts = True
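# --- Added sketch (illustrative, not part of the launcher above): the
# `__main__.__spec__` check is how the launcher distinguishes a script invocation
# (`python path/to/train.py`) from a module invocation (`python -m package.train`),
# so that child processes can be re-spawned with the same style. The helper name
# below is an assumption.
import sys
import __main__


def rebuild_launch_command() -> list:
    """Reconstruct the command used to launch the current process."""
    if __main__.__spec__ is None:
        # launched as `python path/to/script.py`; keep the script path as argv[0]
        return [sys.executable] + sys.argv
    # launched as `python -m package.module`; re-launch through the module name
    return [sys.executable, "-m", __main__.__spec__.name] + sys.argv[1:]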
def main(cfg: DictConfig) -> None:
    print("Params: \n")
    print(OmegaConf.to_yaml(cfg))
    time.sleep(10)

    best_acc = 0
    start_epoch = 0

    working_dir = os.path.join(get_original_cwd(), cfg.output_dir, cfg.train_id)
    os.makedirs(working_dir, exist_ok=True)
    writer = SummaryWriter(working_dir)

    # Setup data.
    # --------------------
    print('=> Preparing data..')
    trainloader, testloader = utils.get_dataloaders(dataset=cfg.dataset.name,
                                                    batch_size=cfg.dataset.batch_size,
                                                    data_root=cfg.dataset.data_root)

    net = setup_network(cfg.dataset.name, cfg.dataset.arch)
    net = tweak_network(net, bit=cfg.quantizer.bit, train_conf=cfg.train_conf,
                        quant_mode=cfg.quant_mode, arch=cfg.dataset.arch, cfg=cfg)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    print(net)
    print("Number of learnable parameters: ",
          sum(p.numel() for p in net.parameters() if p.requires_grad) / 1e6, "M")
    time.sleep(5)

    load_checkpoint(net, init_from=cfg.dataset.init_from)

    params = create_train_params(model=net, main_wd=cfg.quantizer.wd, delta_wd=0,
                                 skip_keys=['.delta', '.alpha'], verbose=cfg.verbose)
    criterion = nn.CrossEntropyLoss()

    # Setup optimizer
    # ----------------------------
    if cfg.quantizer.optimizer == 'sgd':
        print("=> Use SGD optimizer")
        optimizer = optim.SGD(params, lr=cfg.quantizer.lr, momentum=0.9,
                              weight_decay=cfg.quantizer.wd)
    elif cfg.quantizer.optimizer == 'adam':
        print("=> Use Adam optimizer")
        optimizer = optim.Adam(params, lr=cfg.quantizer.lr, weight_decay=cfg.quantizer.wd)

    lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=cfg.dataset.epochs)

    if cfg.evaluate:
        print("==> Start evaluating ...")
        test(net, testloader, criterion, -1)
        exit()

    # -----------------------------------------------
    # Reset to 'warmup_lr' if we are using warmup strategy.
    if cfg.quantizer.enable_warmup:
        assert cfg.quantizer.bit == 1
        for param_group in optimizer.param_groups:
            param_group['lr'] = cfg.quantizer.warmup_lr

    # Initialization
    # ------------------------------------------------
    if cfg.quantizer.bit != 32 and "quan" in cfg.train_conf:
        simple_initialization(net, trainloader,
                              num_batches=cfg.dataset.num_calibration_batches,
                              train_conf=cfg.train_conf)

    # Training
    # -----------------------------------------------
    save_checkpoint_epochs = list(range(10))
    for epoch in range(start_epoch, cfg.dataset.epochs):
        train_loss, train_acc1 = train(net, optimizer, trainloader, criterion, epoch, cfg=cfg)
        test_loss, test_acc1, curr_acc = test(net, testloader, criterion, epoch)

        # Save checkpoint.
        if curr_acc > best_acc:
            best_acc = curr_acc
            utils.save_checkpoint(net, lr_scheduler, optimizer, curr_acc, epoch,
                                  filename=os.path.join(working_dir, 'ckpt_best.pth'))
            print('Saving..')
            print('Best accuracy: ', best_acc)

        if lr_scheduler is not None:
            lr_scheduler.step()

        write_metrics(writer, epoch, net, optimizer, train_loss, train_acc1,
                      test_loss, test_acc1, prefix="Standard_Training")

    print('Best accuracy: ', best_acc)
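# --- Added sketch: an illustrative (not authoritative) skeleton of the config this
# entry point expects, reconstructed only from the attributes it reads; every value
# below is a placeholder assumption.
from omegaconf import OmegaConf

cfg_skeleton = OmegaConf.create({
    "output_dir": "outputs",
    "train_id": "example_run",
    "evaluate": False,
    "verbose": False,
    "quant_mode": "example",
    "train_conf": "quan",
    "dataset": {
        "name": "cifar10", "arch": "resnet20", "batch_size": 128,
        "data_root": "./data", "epochs": 200, "init_from": "",
        "num_calibration_batches": 10,
    },
    "quantizer": {
        "bit": 4, "lr": 0.01, "wd": 1e-4, "optimizer": "sgd",
        "enable_warmup": False, "warmup_lr": 0.001,
    },
})
print(OmegaConf.to_yaml(cfg_skeleton))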
def spawn_ddp_children(self, model):
    port = os.environ['MASTER_PORT']
    master_address = '127.0.0.1' if 'MASTER_ADDR' not in os.environ else os.environ['MASTER_ADDR']
    os.environ['MASTER_PORT'] = f'{port}'
    os.environ['MASTER_ADDR'] = f'{master_address}'

    # allow the user to pass the node rank
    node_rank = '0'
    if 'NODE_RANK' in os.environ:
        node_rank = os.environ['NODE_RANK']
    if 'GROUP_RANK' in os.environ:
        node_rank = os.environ['GROUP_RANK']

    os.environ['NODE_RANK'] = node_rank
    os.environ['LOCAL_RANK'] = '0'

    # when user is using hydra find the absolute path
    path_lib = abspath if not HYDRA_AVAILABLE else to_absolute_path

    # pull out the commands used to run the script and resolve the abs file path
    command = sys.argv
    try:
        full_path = path_lib(command[0])
    except Exception:
        full_path = abspath(command[0])

    command[0] = full_path

    # use the same python interpreter and actually running
    command = [sys.executable] + command

    # since this script sets the visible devices we replace the gpus flag with a number
    num_gpus = len(os.environ['CUDA_VISIBLE_DEVICES'].split(','))

    if '--gpus' in command:
        gpu_flag_idx = command.index('--gpus')
        command[gpu_flag_idx + 1] = f'{num_gpus}'

    os.environ['WORLD_SIZE'] = f'{num_gpus * self.trainer.num_nodes}'

    self.trainer.interactive_ddp_procs = []
    for local_rank in range(1, self.trainer.num_processes):
        env_copy = os.environ.copy()
        env_copy['LOCAL_RANK'] = f'{local_rank}'

        # start process
        # if hydra is available and initialized, make sure to set the cwd correctly
        cwd: Optional[str] = None
        if HYDRA_AVAILABLE:
            if HydraConfig.initialized():
                cwd = get_original_cwd()
        proc = subprocess.Popen(command, env=env_copy, cwd=cwd)
        self.trainer.interactive_ddp_procs.append(proc)

        # starting all processes at once can cause issues
        # with dataloaders delay between 1-10 seconds
        delay = np.random.uniform(1, 5, 1)[0]
        sleep(delay)

    local_rank = 0
    results = self.ddp_train(local_rank, mp_queue=None, model=model, is_master=True)
    del os.environ['WORLD_SIZE']
    return results
def my_app(_: DictConfig) -> None:
    run_dir = str(Path.cwd().relative_to(get_original_cwd()))
    time.sleep(2)
    run_dir_after_sleep = str(Path(HydraConfig.get().run.dir))
    assert run_dir == run_dir_after_sleep
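# --- Added context (a hypothetical wiring, not taken from the source): the check in
# `my_app` only holds inside a Hydra run with a relative `hydra.run.dir`, i.e. when
# the function is registered as the Hydra entry point. Path.cwd() is then the Hydra
# run directory, get_original_cwd() the launch directory, and their relative
# difference matches the configured run dir.
if __name__ == "__main__":
    import hydra
    hydra.main(config_path=None)(my_app)()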
def main(cfg):
    cwd = get_original_cwd()
    os.chdir(cwd)
    if not os.path.exists(f"data/{cfg.model_name_or_path}.pt"):
        get_label_word(cfg)
    if not os.path.exists(cfg.data_dir):
        generate_k_shot(cfg.data_dir)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    data = REDataset(cfg)
    data_config = data.get_data_config()

    config = AutoConfig.from_pretrained(cfg.model_name_or_path)
    config.num_labels = data_config["num_labels"]

    model = AutoModelForMaskedLM.from_pretrained(cfg.model_name_or_path, config=config)
    # if torch.cuda.device_count() > 1:
    #     print("Let's use", torch.cuda.device_count(), "GPUs!")
    #     model = torch.nn.DataParallel(model, device_ids=list(range(torch.cuda.device_count())))
    model.to(device)

    lit_model = BertLitModel(args=cfg, model=model, tokenizer=data.tokenizer)
    data.setup()

    if cfg.train_from_saved_model != '':
        model.load_state_dict(torch.load(cfg.train_from_saved_model)["checkpoint"])
        print("load saved model from {}.".format(cfg.train_from_saved_model))
        lit_model.best_f1 = torch.load(cfg.train_from_saved_model)["best_f1"]

    # data.tokenizer.save_pretrained('test')
    optimizer = lit_model.configure_optimizers()
    if cfg.train_from_saved_model != '':
        optimizer.load_state_dict(torch.load(cfg.train_from_saved_model)["optimizer"])
        print("load saved optimizer from {}.".format(cfg.train_from_saved_model))

    num_training_steps = len(data.train_dataloader()) \
        // cfg.gradient_accumulation_steps * cfg.num_train_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=num_training_steps * 0.1,
                                                num_training_steps=num_training_steps)
    log_step = 100

    logging(cfg.log_dir, '-' * 89, print_=False)
    logging(cfg.log_dir,
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' INFO : START TO TRAIN ',
            print_=False)
    logging(cfg.log_dir, '-' * 89, print_=False)

    for epoch in range(cfg.num_train_epochs):
        model.train()
        num_batch = len(data.train_dataloader())
        total_loss = 0
        log_loss = 0
        for index, train_batch in enumerate(tqdm(data.train_dataloader())):
            loss = lit_model.training_step(train_batch, index)
            total_loss += loss.item()
            log_loss += loss.item()
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            if log_step > 0 and (index + 1) % log_step == 0:
                cur_loss = log_loss / log_step
                logging(cfg.log_dir,
                        '| epoch {:2d} | step {:4d} | lr {} | train loss {:5.3f}'.format(
                            epoch, (index + 1), scheduler.get_last_lr(), cur_loss * 1000),
                        print_=False)
                log_loss = 0

        avrg_loss = total_loss / num_batch
        logging(cfg.log_dir, '| epoch {:2d} | train loss {:5.3f}'.format(epoch, avrg_loss * 1000))

        model.eval()
        with torch.no_grad():
            val_loss = []
            for val_index, val_batch in enumerate(tqdm(data.val_dataloader())):
                loss = lit_model.validation_step(val_batch, val_index)
                val_loss.append(loss)
            f1, best, best_f1 = lit_model.validation_epoch_end(val_loss)
            logging(cfg.log_dir, '-' * 89)
            logging(cfg.log_dir, '| epoch {:2d} | dev_result: {}'.format(epoch, f1))
            logging(cfg.log_dir, '-' * 89)
            logging(cfg.log_dir, '| best_f1: {}'.format(best_f1))
            logging(cfg.log_dir, '-' * 89)
            if cfg.save_path != "" and best != -1:
                save_path = cfg.save_path
                torch.save({
                    'epoch': epoch,
                    'checkpoint': model.state_dict(),
                    'best_f1': best_f1,
                    'optimizer': optimizer.state_dict()
                }, save_path, _use_new_zipfile_serialization=False)
                logging(cfg.log_dir, '| successfully save model at: {}'.format(save_path))
                logging(cfg.log_dir, '-' * 89)
def train(args):
    os.chdir(get_original_cwd())
    run_name = 'each' if args.each else 'agg'
    run_name += '_submit' if args.submit else '_cv'
    logging.info('start ' + run_name)
    seed_everything(args.seed)

    if args.each:
        v_sales_dict = joblib.load('../data/05_preprocess/each_item/v_sales_dict.joblib')
        data_count = joblib.load('../data/05_preprocess/each_item/data_count.joblib')
        dims = joblib.load('../data/05_preprocess/each_item/dims.joblib')
        weight = joblib.load('../data/06_weight/weight_each.joblib')
        te = joblib.load('../data/07_te/each_te.joblib')
    else:
        v_sales_dict = joblib.load('../data/05_preprocess/agg_item/v_sales_dict.joblib')
        data_count = joblib.load('../data/05_preprocess/agg_item/data_count.joblib')
        dims = joblib.load('../data/05_preprocess/agg_item/dims.joblib')
        weight = joblib.load('../data/06_weight/weight_agg.joblib')
        te = joblib.load('../data/07_te/agg_te.joblib')

    v_sales = next(iter(v_sales_dict.values()))
    drop_columns = [
        'sort_key', 'id', 'cat_id', 'd', 'release_date', 'date', 'weekday',
        'year', 'week_of_month', 'holidy'
    ]
    if not args.use_prices:
        drop_columns += [
            'release_ago', 'sell_price', 'diff_price', 'price_max', 'price_min',
            'price_std', 'price_mean', 'price_trend', 'price_norm', 'diff_price_norm',
            'price_nunique', 'dept_max', 'dept_min', 'dept_std', 'dept_mean',
            'price_in_dept', 'mean_in_dept', 'cat_max', 'cat_min', 'cat_std',
            'cat_mean', 'price_in_cat', 'mean_in_cat', 'price_in_month', 'price_in_year',
        ]
    cat_columns = [
        'aggregation_level', 'item_id', 'dept_id', 'store_id', 'state_id', 'month',
        'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'day_of_week'
    ]
    features = [col for col in v_sales.columns if col not in drop_columns + [TARGET]]
    is_cats = [col in cat_columns for col in features]
    cat_dims = []
    emb_dims = []
    for col in features:
        if col in cat_columns:
            cat_dims.append(dims['cat_dims'][col])
            emb_dims.append(dims['emb_dims'][col])
    dims = pd.DataFrame({'cat_dims': cat_dims, 'emb_dims': emb_dims})
    logging.info('data loaded')

    if args.submit:
        logging.info('train for submit')
        # train model for submission
        index = 1 if args.useval else 2
        valid_term = 2
        train_index = index if args.patience == 0 else (index + valid_term)
        trainset = M5Dataset(v_sales_dict, data_count, features, weight, te,
                             remove_last4w=train_index, min_data_4w=0,
                             over_sample=args.over_sample)
        validset = M5ValidationDataset(trainset.data_dict, weight, te,
                                       remove_last4w=index, term=valid_term)
        train_loader = torch.utils.data.DataLoader(
            trainset, batch_size=args.batch_size, shuffle=True,
            num_workers=args.num_workers, worker_init_fn=get_worker_init_fn(args.seed))
        valid_loader = torch.utils.data.DataLoader(
            validset, batch_size=args.batch_size, shuffle=False,
            num_workers=args.num_workers)
        model = M5MLPLSTMModel(is_cats, dims, n_hidden=args.n_hidden,
                               dropout=args.dropout, use_te=args.use_te)
        criterion = M5Distribution(dist=args.dist, df=args.df)
        module = M5LightningModule(model, criterion, train_loader, valid_loader, None, args)
        trainer = M5Trainer(args.experiment, run_name, args.max_epochs,
                            args.min_epochs, args.patience, args.val_check)
        trainer.fit(module)
        trainer.logger.experiment.log_artifact(trainer.logger.run_id,
                                               trainer.checkpoint_callback.kth_best_model)

        logging.info('predict')
        module.load_state_dict(
            torch.load(trainer.checkpoint_callback.kth_best_model)['state_dict'])

        # for reproducibility
        dmp_filename = '../data/cuda_rng_state_each.dmp' if args.each else '../data/cuda_rng_state_agg.dmp'
        torch.save(torch.cuda.get_rng_state(), dmp_filename)
        trainer.logger.experiment.log_artifact(trainer.logger.run_id, dmp_filename)

        val_acc, val_unc = predict(args, module, criterion, trainset.data_dict,
                                   weight, te, evaluation=False)
        eva_acc, eva_unc = predict(args, module, criterion, trainset.data_dict,
                                   weight, te, evaluation=True)
        submission_accuracy = pd.concat([val_acc, eva_acc])
        submission_uncertainty = pd.concat([val_unc, eva_unc])
        dump(submission_accuracy, submission_uncertainty, run_name)
    else:
        # local CV
        folds = list(range(3, -1, -1))  # [3, 2, 1, 0]
        for fold in folds:
            logging.info(f'train FOLD [{4-fold}/{len(folds)}]')
            valid_term = 2
            if args.patience == 0:
                train_index = (fold + 1) * valid_term + 1
                valid_index = (fold + 1) * valid_term + 1
                test_index = fold * valid_term + 1
            else:
                train_index = (fold + 2) * valid_term + 1
                valid_index = (fold + 1) * valid_term + 1
                test_index = fold * valid_term + 1
            trainset = M5Dataset(v_sales_dict, data_count, features, weight, te,
                                 remove_last4w=train_index, over_sample=args.over_sample)
            validset = M5ValidationDataset(trainset.data_dict, weight, te,
                                           remove_last4w=valid_index, term=valid_term)
            testset = M5TestDataset(trainset.data_dict, weight, te,
                                    remove_last4w=test_index, term=valid_term)
            train_loader = torch.utils.data.DataLoader(
                trainset, batch_size=args.batch_size, shuffle=True,
                num_workers=args.num_workers, worker_init_fn=get_worker_init_fn(args.seed))
            valid_loader = torch.utils.data.DataLoader(
                validset, batch_size=args.batch_size, shuffle=False,
                num_workers=args.num_workers)
            test_loader = torch.utils.data.DataLoader(
                testset, batch_size=args.batch_size, shuffle=False,
                num_workers=args.num_workers)
            model = M5MLPLSTMModel(is_cats, dims, n_hidden=args.n_hidden,
                                   dropout=args.dropout, use_te=args.use_te)
            criterion = M5Distribution(dist=args.dist, df=args.df)
            module = M5LightningModule(model, criterion, train_loader, valid_loader,
                                       test_loader, args)
            fold_name = f'_{4-fold}-{len(folds)}'
            trainer = M5Trainer(args.experiment, run_name + fold_name, args.max_epochs,
                                args.min_epochs, args.patience, args.val_check)
            trainer.fit(module)
            trainer.logger.experiment.log_artifact(trainer.logger.run_id,
                                                   trainer.checkpoint_callback.kth_best_model)

            logging.info(f'test FOLD [{4-fold}/{len(folds)}]')
            module.load_state_dict(
                torch.load(trainer.checkpoint_callback.kth_best_model)['state_dict'])
            trainer.test()

            del trainset, validset, testset, train_loader, valid_loader, test_loader, \
                model, criterion, module, trainer
            gc.collect()
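# --- Added note (illustrative sketch, not from the source): `train` begins with
# os.chdir(get_original_cwd()) so that the relative '../data/...' paths above resolve
# against the launch directory rather than Hydra's per-run working directory. An
# alternative, chdir-free pattern uses to_absolute_path for each path; the helper
# name below is hypothetical.
import joblib
from hydra.utils import to_absolute_path


def load_relative(path: str):
    """Load a joblib artifact whose path is written relative to the launch directory."""
    return joblib.load(to_absolute_path(path))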