def main(args):
    set_seed(args.seed)

    elapsed, context_predictions = context_selection(args, args.context_config_json)
    logger.info(
        f"Finished context selection, got {len(context_predictions)} paragraphs "
        f"({elapsed:.2f}s elapsed)")

    elapsed, qa_predictions = question_answering(args, args.qa_config_json,
                                                 context_predictions)
    logger.info(
        f"Finished question answering, got {len(qa_predictions)} answer spans "
        f"({elapsed:.2f}s elapsed)")

    predictions = {
        d["id"]: d["answer"]
        for d in map(postprocess, qa_predictions)
    }
    json_dump(predictions, args.predict_json, ensure_ascii=False)

    if args.tmp_dir:
        args.tmp_dir.mkdir(parents=True, exist_ok=True)
        pickle_dump(context_predictions, args.tmp_dir / "context_predictions.pkl")
        pickle_dump(qa_predictions, args.tmp_dir / "qa_predictions.pkl")
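# Illustrative sketch only: `postprocess` is called by main() above but is not
# defined in this snippet. A minimal hypothetical version that yields the
# {"id", "answer"} fields consumed by the dict comprehension might look like
# this; the cleanup rules and any field names beyond "id"/"answer" are assumptions.
def postprocess(prediction: dict) -> dict:
    # Strip surrounding whitespace and stray WordPiece markers from the span.
    answer = prediction.get("answer", "").strip().replace("##", "")
    return {"id": prediction["id"], "answer": answer}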
def saveReconstructions(reconstructions, fn):
    """
    :type reconstructions: list[scripts.myopensfm.types.Reconstruction]
    :type fn: str
    """
    # with open(fn, 'wb') as f:
    with open(fn, 'w') as f:  # for python3
        mylogger.logger.info("saving " + fn)
        obj = io.reconstructions_to_json(reconstructions)
        io.json_dump(obj, f)
    mylogger.logger.info("Done")
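# Companion sketch (assumption): a loader that inverts saveReconstructions().
# It presumes the same `io` module also exposes `json_load` and
# `reconstructions_from_json`, as OpenSfM's io module does; adjust the calls
# if the local scripts.myopensfm fork names them differently.
def loadReconstructions(fn):
    """
    :type fn: str
    :rtype: list[scripts.myopensfm.types.Reconstruction]
    """
    with open(fn, 'r') as f:
        mylogger.logger.info("loading " + fn)
        obj = io.json_load(f)
    return io.reconstructions_from_json(obj)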
def main(args):
    set_seed(args.seed)

    logger.info(
        f"Loading training data from {args.dataset_dir / 'train.json'}...")
    all_data = json_load(args.dataset_dir / "train.json")

    logger.info("Randomly shuffling the data...")
    random.shuffle(all_data)

    train_size = int(args.train_ratio * len(all_data))
    val_size = len(all_data) - train_size
    logger.info(f"Splitting the dataset into [{train_size}, {val_size}] sizes")
    train_data, val_data = all_data[:train_size], all_data[train_size:]

    json_dump(train_data, args.dataset_dir / "train_splitted.json")
    json_dump(val_data, args.dataset_dir / "val_splitted.json")
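# Worked example of the split arithmetic above (numbers are assumptions):
# with train_ratio = 0.8 and 10000 shuffled examples,
#   train_size = int(0.8 * 10000) = 8000
#   val_size   = 10000 - 8000     = 2000
# so each example lands in exactly one of train_splitted.json / val_splitted.json.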
def train(self, train_dataloader, val_dataloader):
    logger.info(f"Training model for {self.total_epochs} epochs...")
    training_log = []

    for epoch in range(self.cur_epoch, self.total_epochs + 1):
        self.cur_epoch = epoch
        logger.info(f"Epoch {epoch:2d} / {self.total_epochs:2d}")

        train_time, (train_loss, train_metrics) = self.run_epoch(train_dataloader,
                                                                 split="train",
                                                                 train=True,
                                                                 epoch=epoch)
        logger.info(
            f"Train | {train_time:8.3f}s | loss: {train_loss:.3f} | "
            f"{self.format_metrics(train_metrics)}")

        val_time, (val_loss, val_metrics) = self.run_epoch(val_dataloader,
                                                           split="val",
                                                           epoch=epoch)
        logger.info(f"Val | {val_time:8.3f}s | loss: {val_loss:.3f} | "
                    f"{self.format_metrics(val_metrics)}")

        if epoch % self.checkpoint_freq == 0:
            try:
                # In order to bypass meow1 / meow2 disk full issue
                self.save_checkpoint(self.checkpoint_dir /
                                     f"checkpoint_{epoch:03d}.pt")
            except OSError as e:
                logger.warning(
                    f"Trying to save a checkpoint, but '{e}' occurred")

        training_log.append({
            "epoch": epoch,
            "train_time": train_time,
            "train_loss": train_loss,
            "train_metrics": train_metrics,
            "val_time": val_time,
            "val_loss": val_loss,
            "val_metrics": val_metrics,
        })

    self.model.save_weights(self.checkpoint_dir / "model_weights.pt")
    json_dump(training_log, self.checkpoint_dir / "train_log.json")
def __init__(
    self,
    contexts: List[str],
    data: List[dict],
    tokenizer: Optional[BertTokenizer] = None,
    test: bool = False,
    include_nonrelevant=0,
    split_name: str = "no_name",
    cache_dir: Optional[Path] = None,
    skip_preprocess: Optional[bool] = False,
):
    super().__init__()
    self._contexts = contexts
    self._raw_data = data
    self.tokenizer = tokenizer
    self.test = test
    self.split_name = split_name

    if skip_preprocess:
        return

    cache_path = ((cache_dir /
                   f"_{split_name}_preprocessed_{include_nonrelevant}.json")
                  if cache_dir and split_name else None)
    if cache_path and cache_path.is_file():
        logger.info(
            f"Loading cached preprocessed dataset from {cache_path}...")
        self.data = json_load(cache_path)
    else:
        self.data = self.preprocess_dataset(
            self.tokenizer,
            contexts,
            data,
            include_nonrelevant=include_nonrelevant,
            test=self.test,
        )
        if cache_path:
            logger.info(
                f"Saving cached preprocessed dataset to {cache_path}...")
            json_dump(self.data, cache_path)
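# Usage sketch for the constructor above. The class name "ContextDataset", the
# file names, and the "bert-base-chinese" checkpoint are illustrative
# assumptions; only the keyword arguments mirror the __init__ signature.
from pathlib import Path
from transformers import BertTokenizer

dataset_dir = Path("dataset")
contexts = json_load(dataset_dir / "context.json")
train_data = json_load(dataset_dir / "train_splitted.json")
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

train_set = ContextDataset(
    contexts,
    train_data,
    tokenizer=tokenizer,
    include_nonrelevant=2,
    split_name="train",
    cache_dir=dataset_dir / "cache",
)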
def train(self, train_dataloader, val_dataloader):
    logger.info(f"Training model for {self.total_epochs} epochs...")
    training_log = []

    for epoch in range(self.cur_epoch, self.total_epochs + 1):
        self.cur_epoch = epoch
        logger.info(f"Epoch {epoch:2d} / {self.total_epochs:2d}")

        train_time, (train_loss, train_metrics) = self.run_epoch(
            train_dataloader, split="train", train=True, epoch=epoch
        )
        logger.info(
            f"Train | {train_time:7.3f}s | loss: {train_loss:.3f} | "
            f"{self.format_metrics(train_metrics)}"
        )

        val_time, (val_loss, val_metrics) = self.run_epoch(
            val_dataloader, split="val", epoch=epoch
        )
        logger.info(
            f"Val | {val_time:7.3f}s | loss: {val_loss:.3f} | "
            f"{self.format_metrics(val_metrics)}"
        )

        if epoch % self.checkpoint_freq == 0:
            self.save_checkpoint(self.checkpoint_dir / f"checkpoint_{epoch:03d}.pt")

        training_log.append(
            {
                "epoch": epoch,
                "train_time": train_time,
                "train_loss": train_loss,
                "train_metrics": train_metrics,
                "val_time": val_time,
                "val_loss": val_loss,
                "val_metrics": val_metrics,
            }
        )

    self.model.save_weights(self.checkpoint_dir / "model_weights.pt")
    json_dump(training_log, self.checkpoint_dir / "train_log.json")
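# `format_metrics` is used by both train() loops above but is not shown here.
# A minimal hypothetical implementation consistent with those log lines
# (pipe-separated "name: value" pairs) could be:
def format_metrics(self, metrics: dict) -> str:
    return " | ".join(f"{name}: {value:.3f}" for name, value in metrics.items())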