def set_eval_inference_latency_mode(self):
    """
    Evaluate Inference Latency Mode

    Pipeline:
      1. create the data reader and the token makers
      2. restore vocabs from the model checkpoint and attach them
      3. read the raw data and keep the "valid" examples
      4. build raw_to_tensor_fn for the selected device
      5. create the model from the checkpoint and pass it to set_trainer

    Returns:
        (raw_examples, raw_to_tensor_fn)
    """
    data_reader, token_makers = self._create_data_and_token_makers()

    # Token & Vocab: every token maker gets the vocab stored under its name.
    restored_vocabs = utils.load_vocabs(self.model_checkpoint)
    for name, maker in token_makers.items():
        maker.set_vocab(restored_vocabs[name])
    text_handler = TextHandler(token_makers, lazy_indexing=False)

    # Only the helpers are needed here; the first element of read() is unused.
    _, helpers = data_reader.read()
    raw_examples = helpers["valid"]["examples"]

    # First configured CUDA device when GPU use is enabled, otherwise None.
    if self.config.use_gpu:
        cuda_device = self.config.cuda_devices[0]
    else:
        cuda_device = None
    raw_to_tensor_fn = text_handler.raw_to_tensor_fn(
        data_reader, cuda_device=cuda_device)

    # Model
    model = self._create_model(token_makers, checkpoint=self.model_checkpoint)
    self.set_trainer(model)

    return raw_examples, raw_to_tensor_fn
def test(config):
    """
    Test (submit) Mode - Pipeline

      1. create token makers and the data reader from config
      2. bind a vocab loader to nsml and load the hard-coded session checkpoint
      3. define raw_to_tensor_fn
      4. create the model and a Trainer (metric_key="f1")
      5. on NSML, bind model/trainer/raw_to_tensor_fn; pause if configured

    Args:
        config: configuration object; this function reads `seed_num`, `token`,
            `data_reader`, `model` and `nsml.pause`, and sets
            `data_reader.tokenizers` on it.

    NOTE: the names of the outer local variables are left untouched on purpose
    (including the spelling of NSML_SESSEION): `locals()` is handed to
    `nsml.paused`, so the names themselves are visible to nsml.
    """
    NSML_SESSEION = 'team_6/19_tcls_qa/80'  # NOTE: need to hard code
    NSML_CHECKPOINT = '13800'  # NOTE: need to hard code

    assert NSML_CHECKPOINT is not None, "You must insert NSML Session's checkpoint for submit"
    assert NSML_SESSEION is not None, "You must insert NSML Session's name for submit"

    set_global_seed(config.seed_num)

    token_makers = create_by_factory(TokenMakersFactory, config.token)
    # "tokenizers" is not a token maker itself; it is handed to the data reader.
    tokenizers = token_makers["tokenizers"]
    del token_makers["tokenizers"]

    config.data_reader.tokenizers = tokenizers
    data_reader = create_by_factory(DataReaderFactory, config.data_reader)

    def bind_load_vocabs(config, token_makers):
        """Register with nsml a loader that restores vocabs into token_makers."""
        CHECKPOINT_FNAME = "checkpoint.bin"

        def load(dir_path):
            # Bound through nsml.bind; presumably invoked by nsml.load with the
            # directory holding the checkpoint file.
            checkpoint_path = os.path.join(dir_path, CHECKPOINT_FNAME)
            checkpoint = torch.load(checkpoint_path)

            vocabs = {}
            token_config = config.token
            for token_name in token_config.names:
                token = getattr(token_config, token_name, {})
                vocab_config = getattr(token, "vocab", {})

                texts = checkpoint["vocab_texts"][token_name]
                # Anything that is not a mapping is treated as a namespace-like
                # object. isinstance (rather than an exact type comparison)
                # keeps dict subclasses intact instead of replacing them with
                # their attribute dict.
                if not isinstance(vocab_config, dict):
                    vocab_config = vars(vocab_config)
                vocabs[token_name] = Vocab(token_name, **vocab_config).from_texts(texts)

            for token_name, token_maker in token_makers.items():
                token_maker.set_vocab(vocabs[token_name])
            return token_makers

        nsml.bind(load=load)

    bind_load_vocabs(config, token_makers)
    nsml.load(checkpoint=NSML_CHECKPOINT, session=NSML_SESSEION)

    # Raw to Tensor Function
    text_handler = TextHandler(token_makers, lazy_indexing=False)
    raw_to_tensor_fn = text_handler.raw_to_tensor_fn(
        data_reader,
        cuda_device=device,
    )

    # Model & Optimizer
    model = create_model(token_makers, ModelFactory, config.model, device)
    trainer = Trainer(model, metric_key="f1")

    if nsml.IS_ON_NSML:
        bind_nsml(model, trainer=trainer, raw_to_tensor_fn=raw_to_tensor_fn)

    # NOTE(review): confirm this check is meant to run independently of
    # nsml.IS_ON_NSML.
    if config.nsml.pause:
        nsml.paused(scope=locals())
def set_predict_mode(self, preload=False):
    """
    Predict Mode

    Pipeline:
      1. create the data reader and the token makers
      2. restore vocabs from the model checkpoint and attach them
      3. collect raw features (blank strings in interactive mode, otherwise
         the values found on the parsed arguments)
      4. build raw_to_tensor_fn, passing the checkpoint's "predict_helper"
      5. create the model from the checkpoint and pass it to set_trainer

    Args:
        preload: when truthy, the result is stored on `self.predict_settings`
            (keys "raw_to_tensor_fn" and "arguments") and nothing is returned.

    Returns:
        (raw_features, raw_to_tensor_fn, arguments) when `preload` is falsy,
        otherwise None.
    """
    data_reader, token_makers = self._create_data_and_token_makers()

    # Token & Vocab: every token maker gets the vocab stored under its name.
    checkpoint_vocabs = utils.load_vocabs(self.model_checkpoint)
    for name, maker in token_makers.items():
        maker.set_vocab(checkpoint_vocabs[name])
    text_handler = TextHandler(token_makers, lazy_indexing=False)

    # Set predict config
    if self.argument.interactive:
        # Interactive mode starts with an empty string for every text column.
        raw_features = dict.fromkeys(data_reader.text_columns, "")
    else:
        # A column without a matching argument is kept as None; no error is
        # raised for missing arguments.
        raw_features = {
            column: getattr(self.argument, column, None)
            for column in data_reader.text_columns
        }

    # First configured CUDA device when GPU use is enabled, otherwise None.
    if self.config.use_gpu:
        cuda_device = self.config.cuda_devices[0]
    else:
        cuda_device = None
    raw_to_tensor_fn = text_handler.raw_to_tensor_fn(
        data_reader,
        cuda_device=cuda_device,
        helper=self.model_checkpoint.get("predict_helper", {}),
    )

    # Model
    model = self._create_model(token_makers, checkpoint=self.model_checkpoint)
    self.set_trainer(model)

    arguments = vars(self.argument)

    if not preload:
        return raw_features, raw_to_tensor_fn, arguments

    self.predict_settings = {
        "raw_to_tensor_fn": raw_to_tensor_fn,
        "arguments": arguments
    }
def re_train_and_evaluate(config):
    """
    Re-Train and Evaluate Mode - Pipeline

      1. create token makers and the data reader, then read the data
      2. build vocabs from the data and index it
      3. bind a vocab loader to nsml and load the hard-coded session checkpoint
      4. define raw_to_tensor_fn and the train/valid data loaders
      5. create the model and an "adam" optimizer over its trainable parameters
      6. bind a model loader to nsml and load the same checkpoint again
      7. train and evaluate with Trainer

    Args:
        config: configuration object; this function reads `token`,
            `data_reader`, `iterator.batch_size`, `model` and `trainer`, and
            sets `data_reader.tokenizers` (plus the train/valid file paths
            when running on NSML).
    """
    NSML_SESSEION = 'team_6/19_tcls_qa/258'  # NOTE: need to hard code
    NSML_CHECKPOINT = '1'  # NOTE: need to hard code

    assert NSML_CHECKPOINT is not None, "You must insert NSML Session's checkpoint for submit"
    assert NSML_SESSEION is not None, "You must insert NSML Session's name for submit"

    token_makers = create_by_factory(TokenMakersFactory, config.token)
    # "tokenizers" is not a token maker itself; it is handed to the data reader.
    tokenizers = token_makers["tokenizers"]
    del token_makers["tokenizers"]

    config.data_reader.tokenizers = tokenizers
    if nsml.IS_ON_NSML:
        # On NSML the configured file paths are resolved under the dataset dir.
        config.data_reader.train_file_path = os.path.join(
            DATASET_PATH, "train", "train_data", config.data_reader.train_file_path)
        config.data_reader.valid_file_path = os.path.join(
            DATASET_PATH, "train", "train_data", config.data_reader.valid_file_path)

    data_reader = create_by_factory(DataReaderFactory, config.data_reader)
    datas, helpers = data_reader.read()

    # Vocab & Indexing
    text_handler = TextHandler(token_makers, lazy_indexing=True)
    texts = data_reader.filter_texts(datas)

    token_counters = text_handler.make_token_counters(texts)
    text_handler.build_vocabs(token_counters)
    text_handler.index(datas, data_reader.text_columns)

    def bind_load_vocabs(config, token_makers):
        """Register with nsml a loader that restores vocabs into token_makers."""
        CHECKPOINT_FNAME = "checkpoint.bin"

        def load(dir_path):
            checkpoint_path = os.path.join(dir_path, CHECKPOINT_FNAME)
            checkpoint = torch.load(checkpoint_path)

            vocabs = {}
            token_config = config.token
            for token_name in token_config.names:
                token = getattr(token_config, token_name, {})
                vocab_config = getattr(token, "vocab", {})

                texts = checkpoint["vocab_texts"][token_name]
                # Anything that is not a mapping is treated as a namespace-like
                # object. isinstance (rather than an exact type comparison)
                # keeps dict subclasses intact instead of replacing them with
                # their attribute dict.
                if not isinstance(vocab_config, dict):
                    vocab_config = vars(vocab_config)
                vocabs[token_name] = Vocab(token_name, **vocab_config).from_texts(texts)

            for token_name, token_maker in token_makers.items():
                token_maker.set_vocab(vocabs[token_name])
            return token_makers

        nsml.bind(load=load)

    # NOTE(review): `datas` was indexed above with the freshly built vocabs;
    # the checkpoint vocabs replace them on the token makers from here on.
    # Presumably both are identical - verify.
    bind_load_vocabs(config, token_makers)
    nsml.load(checkpoint=NSML_CHECKPOINT, session=NSML_SESSEION)

    # Raw to Tensor Function
    text_handler = TextHandler(token_makers, lazy_indexing=False)
    raw_to_tensor_fn = text_handler.raw_to_tensor_fn(
        data_reader,
        cuda_device=device,
    )

    # Iterator
    datasets = data_reader.convert_to_dataset(datas, helpers=helpers)
    train_loader = create_data_loader(datasets["train"],
                                      batch_size=config.iterator.batch_size,
                                      shuffle=True,
                                      cuda_device_id=device)
    valid_loader = create_data_loader(datasets["valid"],
                                      batch_size=config.iterator.batch_size,
                                      shuffle=False,
                                      cuda_device_id=device)

    # Model & Optimizer
    model = create_model(token_makers, ModelFactory, config.model, device, helpers=helpers)
    # Only parameters that require gradients are handed to the optimizer.
    model_parameters = [
        param for param in model.parameters() if param.requires_grad
    ]
    optimizer = get_optimizer_by_name("adam")(model_parameters)

    def bind_load_model(config, model, **kwargs):
        """Register with nsml a loader that restores model (and optimizer) state."""
        CHECKPOINT_FNAME = "checkpoint.bin"

        def load(dir_path):
            checkpoint_path = os.path.join(dir_path, CHECKPOINT_FNAME)
            checkpoint = torch.load(checkpoint_path)

            model.load_state_dict(checkpoint["weights"])
            model.config = checkpoint["config"]
            model.metrics = checkpoint["metrics"]
            # Plain assignments: a trailing comma here would wrap each value
            # in a one-element tuple.
            model.init_params = checkpoint["init_params"]
            model.predict_helper = checkpoint["predict_helper"]
            # The train counter is not restored; a fresh one is created.
            model.train_counter = TrainCounter(display_unit=1000)

            if "optimizer" in kwargs:
                # NOTE(review): the stored optimizer state is indexed with [0];
                # confirm against the code that writes the checkpoint.
                kwargs["optimizer"].load_state_dict(checkpoint["optimizer"][0])
            print(f"Model reload checkpoints...! {checkpoint_path}")

        nsml.bind(load=load)

    bind_load_model(config, model, optimizer=optimizer)
    nsml.load(checkpoint=NSML_CHECKPOINT, session=NSML_SESSEION)

    # Same flag as the check near the top of this function.
    if nsml.IS_ON_NSML:
        bind_nsml(model, optimizer=optimizer)

    # Trainer
    # NOTE(review): vars() is not copied, so if it returns the object's live
    # attribute dict, adding "model" also sets config.trainer.model.
    trainer_config = vars(config.trainer)
    trainer_config["model"] = model
    trainer = Trainer(**trainer_config)
    trainer.train_and_evaluate(train_loader, valid_loader, optimizer)