def re_train_and_evaluate(config):
    NSML_SESSION = "team_6/19_tcls_qa/258"  # NOTE: needs to be hard-coded
    NSML_CHECKPOINT = "1"  # NOTE: needs to be hard-coded

    assert NSML_CHECKPOINT is not None, "You must insert the NSML session's checkpoint to submit"
    assert NSML_SESSION is not None, "You must insert the NSML session's name to submit"

    token_makers = create_by_factory(TokenMakersFactory, config.token)
    tokenizers = token_makers["tokenizers"]
    del token_makers["tokenizers"]
    config.data_reader.tokenizers = tokenizers

    if nsml.IS_ON_NSML:
        config.data_reader.train_file_path = os.path.join(
            DATASET_PATH, "train", "train_data", config.data_reader.train_file_path)
        config.data_reader.valid_file_path = os.path.join(
            DATASET_PATH, "train", "train_data", config.data_reader.valid_file_path)

    data_reader = create_by_factory(DataReaderFactory, config.data_reader)
    datas, helpers = data_reader.read()

    # Vocab & Indexing
    text_handler = TextHandler(token_makers, lazy_indexing=True)
    texts = data_reader.filter_texts(datas)
    token_counters = text_handler.make_token_counters(texts)
    text_handler.build_vocabs(token_counters)
    text_handler.index(datas, data_reader.text_columns)

    def bind_load_vocabs(config, token_makers):
        CHECKPOINT_FNAME = "checkpoint.bin"

        def load(dir_path):
            checkpoint_path = os.path.join(dir_path, CHECKPOINT_FNAME)
            checkpoint = torch.load(checkpoint_path)

            vocabs = {}
            token_config = config.token
            for token_name in token_config.names:
                token = getattr(token_config, token_name, {})
                vocab_config = getattr(token, "vocab", {})
                texts = checkpoint["vocab_texts"][token_name]
                if type(vocab_config) != dict:
                    vocab_config = vars(vocab_config)
                vocabs[token_name] = Vocab(token_name, **vocab_config).from_texts(texts)

            for token_name, token_maker in token_makers.items():
                token_maker.set_vocab(vocabs[token_name])
            return token_makers

        nsml.bind(load=load)

    bind_load_vocabs(config, token_makers)
    nsml.load(checkpoint=NSML_CHECKPOINT, session=NSML_SESSION)

    # Raw to Tensor Function
    # NOTE: `device` is expected to be a module-level global defined elsewhere in this file.
    text_handler = TextHandler(token_makers, lazy_indexing=False)
    raw_to_tensor_fn = text_handler.raw_to_tensor_fn(
        data_reader,
        cuda_device=device,
    )

    # Iterator
    datasets = data_reader.convert_to_dataset(datas, helpers=helpers)
    train_loader = create_data_loader(
        datasets["train"],
        batch_size=config.iterator.batch_size,
        shuffle=True,
        cuda_device_id=device)
    valid_loader = create_data_loader(
        datasets["valid"],
        batch_size=config.iterator.batch_size,
        shuffle=False,
        cuda_device_id=device)

    # Model & Optimizer
    model = create_model(token_makers, ModelFactory, config.model, device, helpers=helpers)
    model_parameters = [param for param in model.parameters() if param.requires_grad]
    optimizer = get_optimizer_by_name("adam")(model_parameters)

    def bind_load_model(config, model, **kwargs):
        CHECKPOINT_FNAME = "checkpoint.bin"

        def load(dir_path):
            checkpoint_path = os.path.join(dir_path, CHECKPOINT_FNAME)
            checkpoint = torch.load(checkpoint_path)

            model.load_state_dict(checkpoint["weights"])
            model.config = checkpoint["config"]
            model.metrics = checkpoint["metrics"]
            model.init_params = checkpoint["init_params"]
            model.predict_helper = checkpoint["predict_helper"]
            model.train_counter = TrainCounter(display_unit=1000)
            # model.vocabs = load_vocabs(checkpoint)

            if "optimizer" in kwargs:
                kwargs["optimizer"].load_state_dict(checkpoint["optimizer"][0])
            print(f"Model reload checkpoints...!\n{checkpoint_path}")

        nsml.bind(load=load)

    bind_load_model(config, model, optimizer=optimizer)
    nsml.load(checkpoint=NSML_CHECKPOINT, session=NSML_SESSION)

    if nsml.IS_ON_NSML:
        bind_nsml(model, optimizer=optimizer)

    # Trainer
    trainer_config = vars(config.trainer)
    trainer_config["model"] = model

    trainer = Trainer(**trainer_config)
    trainer.train_and_evaluate(train_loader, valid_loader, optimizer)
class Experiment:
    """
    Experiment settings with config.

    * Args:
        mode: Mode (e.g. TRAIN, EVAL, INFER_EVAL, PREDICT)
        config: (NestedNamespace) Argument config according to mode
    """

    def __init__(self, mode, config):
        common_utils.set_logging_config(mode, config)

        self.argument = config  # self.config (experiment overall config) / config (argument according to mode)
        self.config = config
        self.mode = mode

        self.common_setting(mode, config)
        if mode != Mode.TRAIN:  # evaluate and predict
            self.load_setting()

        # Set evaluation config
        if mode.endswith(Mode.EVAL):
            self.config.data_reader.train_file_path = ""
            self.config.data_reader.valid_file_path = self.argument.data_file_path

            self.config.cuda_devices = self.argument.cuda_devices
            self.config.iterator.cuda_devices = self.argument.cuda_devices
            if getattr(self.argument, "inference_latency", None):
                self.config.max_latency = self.argument.inference_latency

        self.predict_settings = None

    def common_setting(self, mode, config):
        """ Common Setting - experiment config, use_gpu and cuda_device_ids """
        self.config_dict = convert_config2dict(config)

        cuda_devices = self._get_cuda_devices()
        self.config.cuda_devices = cuda_devices
        self.config.slack_url = getattr(self.config, "slack_url", False)

    def _get_cuda_devices(self):
        if getattr(self.config, "use_gpu", None) is None:
            self.config.use_gpu = torch.cuda.is_available() or nsml.IS_ON_NSML

        if self.config.use_gpu:
            if nsml.IS_ON_NSML:
                return list(range(self.config.gpu_num))
            else:
                return self.config.cuda_devices
        else:
            return None

    def load_setting(self):
        """ Load Setting - for cases that need to load a checkpoint (e.g. evaluate and predict) """
        cuda_devices = self.argument.cuda_devices
        checkpoint_path = self.argument.checkpoint_path
        prev_cuda_device_id = getattr(self.argument, "prev_cuda_device_id", None)

        self.model_checkpoint = self._read_checkpoint(
            cuda_devices, checkpoint_path, prev_cuda_device_id=prev_cuda_device_id)
        self._set_saved_config(cuda_devices)

    def _read_checkpoint(self, cuda_devices, checkpoint_path, prev_cuda_device_id=None):
        if cuda_devices == "cpu":
            return torch.load(checkpoint_path, map_location="cpu")  # use CPU

        if torch.cuda.is_available():
            checkpoint = torch.load(
                checkpoint_path,
                map_location={
                    f"cuda:{prev_cuda_device_id}": f"cuda:{cuda_devices[0]}"
                },  # different cuda_device id case (save/load)
            )
        else:
            checkpoint = torch.load(checkpoint_path, map_location="cpu")  # use CPU
        return checkpoint

    def _set_saved_config(self, cuda_devices):
        saved_config_dict = self.model_checkpoint["config"]
        self.config_dict = saved_config_dict

        logger.info("Load saved_config ...")
        logger.info(pretty_json_dumps(saved_config_dict))

        saved_config = NestedNamespace()
        saved_config.load_from_json(saved_config_dict)

        is_use_gpu = self.config.use_gpu
        self.config = saved_config
        self.config.use_gpu = is_use_gpu
        self.config.cuda_devices = cuda_devices
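
    # Note on _read_checkpoint: torch.load's map_location can remap storages
    # saved on one device onto another, e.g.
    #     torch.load(path, map_location={"cuda:0": "cuda:1"})
    # moves tensors serialized on cuda:0 onto cuda:1 (illustrative device ids).
    # That is why prev_cuda_device_id is threaded through from the arguments:
    # a checkpoint saved under one GPU id can be restored under a different one.
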
    def __call__(self):
        """ Run Trainer """
        set_global_seed(self.config.seed_num)  # for reproducibility

        if self.mode == Mode.TRAIN:
            # exit trigger: slack notification
            if self.config.slack_url:
                atexit.register(utils.send_message_to_slack)

            train_loader, valid_loader, optimizer = self.set_train_mode()

            assert train_loader is not None
            assert optimizer is not None

            if valid_loader is None:
                self.trainer.train(train_loader, optimizer)
            else:
                self.trainer.train_and_evaluate(train_loader, valid_loader, optimizer)

            self._summary_experiments()

        elif self.mode == Mode.EVAL:
            valid_loader = self.set_eval_mode()
            assert valid_loader is not None

            return self.trainer.evaluate(valid_loader)

        elif self.mode == Mode.INFER_EVAL:
            raw_examples, raw_to_tensor_fn = self.set_eval_inference_latency_mode()
            assert raw_examples is not None
            assert raw_to_tensor_fn is not None

            return self.trainer.evaluate_inference_latency(
                raw_examples, raw_to_tensor_fn, max_latency=self.config.max_latency)

        elif self.mode.endswith(Mode.PREDICT):
            raw_features, raw_to_tensor_fn, arguments = self.set_predict_mode()
            assert raw_features is not None
            assert raw_to_tensor_fn is not None

            return self.trainer.predict(
                raw_features,
                raw_to_tensor_fn,
                arguments,
                interactive=arguments.get("interactive", False),
            )
        else:
            raise ValueError(f"unknown mode: {self.mode}")

    def set_train_mode(self):
        """
        Training Mode - Pipeline

        1. read raw_data (DataReader)
        2. build vocabs (DataReader, Token)
        3. indexing tokens (DataReader, Token)
        4. convert to DataSet (DataReader)
        5. create DataLoader (DataLoader)
        6. define model and optimizer
        7. run!
        """
        logger.info("Config. \n" + pretty_json_dumps(self.config_dict) + "\n")

        data_reader, token_makers = self._create_data_and_token_makers()
        datas, helpers = data_reader.read()

        # Token & Vocab
        text_handler = TextHandler(token_makers, lazy_indexing=True)
        texts = data_reader.filter_texts(datas)
        token_counters = text_handler.make_token_counters(texts, config=self.config)
        text_handler.build_vocabs(token_counters)
        text_handler.index(datas, data_reader.text_columns)

        # iterator
        datasets = data_reader.convert_to_dataset(datas, helpers=helpers)  # with name
        self.config.iterator.cuda_devices = self.config.cuda_devices
        train_loader, valid_loader, test_loader = self._create_by_factory(
            DataLoaderFactory, self.config.iterator, param={"datasets": datasets})

        # calculate 'num_train_steps'
        num_train_steps = self._get_num_train_steps(train_loader)
        self.config.optimizer.num_train_steps = num_train_steps

        checkpoint_dir = Path(self.config.trainer.log_dir) / "checkpoint"
        checkpoints = None
        if checkpoint_dir.exists():
            checkpoints = self._load_exist_checkpoints(checkpoint_dir)  # contains model and optimizer

        if checkpoints is None:
            model = self._create_model(token_makers, helpers=helpers)
            op_dict = self._create_by_factory(
                OptimizerFactory, self.config.optimizer, param={"model": model})
        else:
            model = self._create_model(token_makers, checkpoint=checkpoints)
            op_dict = self._create_by_factory(
                OptimizerFactory, self.config.optimizer, param={"model": model})
            utils.load_optimizer_checkpoint(op_dict["optimizer"], checkpoints)

        self.set_trainer(model, op_dict=op_dict)
        return train_loader, valid_loader, op_dict["optimizer"]

    def _create_data_and_token_makers(self):
        token_makers = self._create_by_factory(TokenMakersFactory, self.config.token)
        tokenizers = token_makers["tokenizers"]
        del token_makers["tokenizers"]

        self.config.data_reader.tokenizers = tokenizers
        data_reader = self._create_by_factory(DataReaderFactory, self.config.data_reader)
        return data_reader, token_makers

    def _create_by_factory(self, factory, item_config, param={}):
        return factory(item_config).create(**param)

    def _get_num_train_steps(self, train_loader):
        train_set_size = len(train_loader.dataset)
        batch_size = self.config.iterator.batch_size
        gradient_accumulation_steps = getattr(
            self.config.optimizer, "gradient_accumulation_steps", 1)
        num_epochs = self.config.trainer.num_epochs

        one_epoch_steps = int(train_set_size / batch_size / gradient_accumulation_steps)
        if one_epoch_steps == 0:
            one_epoch_steps = 1
        num_train_steps = one_epoch_steps * num_epochs
        return num_train_steps
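
    # Worked example for _get_num_train_steps (illustrative numbers, not from a
    # real run): with train_set_size=100_000, batch_size=32,
    # gradient_accumulation_steps=2, and num_epochs=3:
    #     one_epoch_steps = int(100_000 / 32 / 2) = 1562
    #     num_train_steps = 1562 * 3 = 4686
    # The floor to 1 guards tiny datasets where a single optimizer step would
    # span the whole epoch (batch_size * accumulation exceeding the dataset size).
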
    def _load_exist_checkpoints(self, checkpoint_dir):  # pragma: no cover
        checkpoints = utils.get_sorted_path(checkpoint_dir, both_exist=True)
        train_counts = list(checkpoints.keys())
        if not train_counts:
            return None

        separator = "-" * 50
        message = (
            f"{separator}\n"
            f" !! Found existing checkpoints {train_counts}.\n"
            " If you want to recover, input a train_count from the list.\n"
            " If you don't want to recover, input 0.\n"
            f"{separator}"
        )
        selected_train_count = common_utils.get_user_input(message)
        if selected_train_count == 0:
            return None

        model_path = checkpoints[selected_train_count]["model"]
        model_checkpoint = self._read_checkpoint(self.config.cuda_devices, model_path)

        optimizer_path = checkpoints[selected_train_count]["optimizer"]
        optimizer_checkpoint = self._read_checkpoint("cpu", optimizer_path)

        checkpoints = {}
        checkpoints.update(model_checkpoint)
        checkpoints.update(optimizer_checkpoint)
        return checkpoints

    def _create_model(self, token_makers, checkpoint=None, helpers=None):
        if checkpoint is None:
            assert helpers is not None
            first_key = next(iter(helpers))
            helper = helpers[first_key]  # get first helper
            model_init_params = helper.get("model", {})
            predict_helper = helper.get("predict_helper", {})
        else:
            model_init_params = checkpoint.get("init_params", {})
            predict_helper = checkpoint.get("predict_helper", {})

        model_params = {"token_makers": token_makers}
        model_params.update(model_init_params)

        model = self._create_by_factory(ModelFactory, self.config.model, param=model_params)

        # Save params
        model.init_params = model_init_params
        model.predict_helper = predict_helper

        if checkpoint is not None:
            model = utils.load_model_checkpoint(model, checkpoint)

        model = self._set_gpu_env(model)
        return model

    def _set_gpu_env(self, model):
        if self.config.use_gpu:
            cuda_devices = self._get_cuda_devices()
            num_gpu = len(cuda_devices)

            use_multi_gpu = num_gpu > 1
            if use_multi_gpu:
                model = torch.nn.DataParallel(model, device_ids=cuda_devices)
            model.cuda()
        else:
            num_gpu = 0

        num_gpu_state = str(num_gpu)
        if num_gpu > 1:
            num_gpu_state += " (Multi-GPU)"
        logger.info(
            f"use_gpu: {self.config.use_gpu}, num_gpu: {num_gpu_state}, "
            "distributed training: False, 16-bits training: False")
        return model

    def set_trainer(self, model, op_dict={}, save_params={}):
        trainer_config = vars(self.config.trainer)
        trainer_config["config"] = self.config_dict
        trainer_config["model"] = model
        trainer_config["learning_rate_scheduler"] = op_dict.get("learning_rate_scheduler", None)
        trainer_config["exponential_moving_average"] = op_dict.get("exponential_moving_average", None)

        self.trainer = Trainer(**trainer_config)

        # Set NSML
        if nsml.IS_ON_NSML:
            utils.bind_nsml(model, optimizer=op_dict.get("optimizer", None))
            if getattr(self.config.nsml, "pause", None):
                nsml.paused(scope=locals())

    def _summary_experiments(self):
        hr_text = "-" * 50
        summary_logs = f"\n\n\nExperiment Summary.\n{nsml.SESSION_NAME}\n{hr_text}\n"
        summary_logs += f"Config.\n{pretty_json_dumps(self.config_dict)}\n{hr_text}\n"
        summary_logs += f"Training Logs.\n{pretty_json_dumps(self.trainer.training_logs)}\n{hr_text}\n"
        summary_logs += f"Metric Logs.\n{pretty_json_dumps(self.trainer.metric_logs)}"
        logger.info(summary_logs)

        if self.config.slack_url:  # pragma: no cover
            simple_summary_title = f"Session Name: {nsml.SESSION_NAME} "
            if getattr(self.config, "base_config", None):
                simple_summary_title += f"({self.config.base_config})"

            simple_summary_logs = f" - Dataset: {self.config.data_reader.dataset} \n"
            simple_summary_logs += f" - Model: {self.config.model.name}\n"

            best_metrics = {"epoch": self.trainer.metric_logs["best_epoch"]}
            best_metrics.update(self.trainer.metric_logs["best"])
            simple_summary_logs += f" - Best metrics.\n {pretty_json_dumps(best_metrics)} "

            utils.send_message_to_slack(
                self.config.slack_url,
                title=simple_summary_title,
                message=simple_summary_logs)

    def set_eval_mode(self):
        """
        Evaluate Mode - Pipeline

        1. read raw_data (DataReader)
        2. load vocabs from checkpoint (DataReader, Token)
        3. indexing tokens (DataReader, Token)
        4. convert to DataSet (DataReader)
        5. create DataLoader (DataLoader)
        6. define and load model
        7. run!
        """
        data_reader, token_makers = self._create_data_and_token_makers()

        # DataReader
        datas, helpers = data_reader.read()

        # Token & Vocab
        vocabs = utils.load_vocabs(self.model_checkpoint)
        for token_name, token_maker in token_makers.items():
            token_maker.set_vocab(vocabs[token_name])

        text_handler = TextHandler(token_makers, lazy_indexing=False)
        text_handler.index(datas, data_reader.text_columns)

        # iterator
        datasets = data_reader.convert_to_dataset(datas, helpers=helpers)  # with name
        self.config.iterator.cuda_devices = self.config.cuda_devices
        _, valid_loader, _ = self._create_by_factory(
            DataLoaderFactory, self.config.iterator, param={"datasets": datasets})

        # Model
        model = self._create_model(token_makers, checkpoint=self.model_checkpoint)

        self.set_trainer(model)
        return valid_loader

    def set_eval_inference_latency_mode(self):
        """
        Evaluate Inference Latency Mode - Pipeline

        1. read raw_data (DataReader)
        2. load vocabs from checkpoint (DataReader, Token)
        3. define raw_to_tensor_fn (DataReader, Token)
        4. define and load model
        5. run!
        """
        data_reader, token_makers = self._create_data_and_token_makers()

        # Token & Vocab
        vocabs = utils.load_vocabs(self.model_checkpoint)
        for token_name, token_maker in token_makers.items():
            token_maker.set_vocab(vocabs[token_name])

        text_handler = TextHandler(token_makers, lazy_indexing=False)

        _, helpers = data_reader.read()
        raw_examples = helpers["valid"]["examples"]

        cuda_device = self.config.cuda_devices[0] if self.config.use_gpu else None
        raw_to_tensor_fn = text_handler.raw_to_tensor_fn(data_reader, cuda_device=cuda_device)

        # Model
        model = self._create_model(token_makers, checkpoint=self.model_checkpoint)

        self.set_trainer(model)
        return raw_examples, raw_to_tensor_fn

    def predict(self, raw_features):
        if self.predict_settings is None:
            raise ValueError(
                "To use 'predict()', you must first call 'set_predict_mode()' "
                "with the preload=True parameter")

        raw_to_tensor_fn = self.predict_settings["raw_to_tensor_fn"]
        arguments = self.predict_settings["arguments"]
        arguments.update(raw_features)

        assert raw_features is not None
        assert raw_to_tensor_fn is not None

        return self.trainer.predict(
            raw_features,
            raw_to_tensor_fn,
            arguments,
            interactive=arguments.get("interactive", False),
        )

    def set_predict_mode(self, preload=False):
        """
        Predict Mode - Pipeline

        1. read raw_data (Argument)
        2. load vocabs from checkpoint (DataReader, Token)
        3. define raw_to_tensor_fn (DataReader, Token)
        4. define and load model
        5. run!
        """
        data_reader, token_makers = self._create_data_and_token_makers()

        # Token & Vocab
        vocabs = utils.load_vocabs(self.model_checkpoint)
        for token_name, token_maker in token_makers.items():
            token_maker.set_vocab(vocabs[token_name])

        text_handler = TextHandler(token_makers, lazy_indexing=False)

        # Set predict config
        if self.argument.interactive:
            raw_features = {feature_name: "" for feature_name in data_reader.text_columns}
        else:
            raw_features = {}
            for feature_name in data_reader.text_columns:
                feature = getattr(self.argument, feature_name, None)
                # if feature is None:
                #     raise ValueError(f"--{feature_name} argument is required!")
                raw_features[feature_name] = feature

        cuda_device = self.config.cuda_devices[0] if self.config.use_gpu else None
        raw_to_tensor_fn = text_handler.raw_to_tensor_fn(
            data_reader,
            cuda_device=cuda_device,
            helper=self.model_checkpoint.get("predict_helper", {}))

        # Model
        model = self._create_model(token_makers, checkpoint=self.model_checkpoint)
        self.set_trainer(model)

        arguments = vars(self.argument)
        if preload:
            self.predict_settings = {
                "raw_to_tensor_fn": raw_to_tensor_fn,
                "arguments": arguments,
            }
        else:
            return raw_features, raw_to_tensor_fn, arguments
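
# A minimal usage sketch, assuming a NestedNamespace config prepared by the
# argument-parsing layer (not shown in this file). The config file name and the
# "question"/"context" feature keys are illustrative assumptions; everything
# else is the Experiment/Mode API defined above.
if __name__ == "__main__":
    import json

    config = NestedNamespace()
    with open("base_config.json") as f:  # hypothetical config file
        config.load_from_json(json.load(f))  # load_from_json takes a dict, as in _set_saved_config

    experiment = Experiment(Mode.TRAIN, config)
    experiment()  # train, and evaluate when a valid_loader is available

    # Prediction with preloaded settings (see predict/set_predict_mode):
    #     predict_experiment = Experiment(Mode.PREDICT, predict_config)
    #     predict_experiment.set_predict_mode(preload=True)
    #     output = predict_experiment.predict({"question": "...", "context": "..."})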