import glob
import hashlib
import os
import time
from datetime import timedelta
from pathlib import Path
from typing import List, Optional

import pandas as pd
from rouge_score import rouge_scorer
from tqdm import tqdm

tqdm.pandas()  # registers DataFrame.progress_apply, used in score_model below

# Project-internal names used below and assumed to be defined elsewhere in
# this package: CACHE_BASE_PATH, Dataset, TextSummarizationModel,
# PredictionInfo, ScoreInfo, root_directory, load_validation_as_df.


def _load_target_and_source(key: str, source_base_path: str,
                            dataset: str) -> pd.DataFrame:
    """Loads the `.source` and `.target` files of a dataset and joins them
    column-wise on the row index."""
    root = root_directory()
    df_source = pd.read_csv(f"{root}/{source_base_path}/{key}/{dataset}.source")
    df_target = pd.read_csv(f"{root}/{source_base_path}/{key}/{dataset}.target")
    return df_source.join(df_target)
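# Expected on-disk layout for _load_target_and_source (paths illustrative):
#
#   {root}/{source_base_path}/{key}/{dataset}.source   # input texts
#   {root}/{source_base_path}/{key}/{dataset}.target   # reference summaries
#
# Both CSVs are read into frames of equal length and joined column-wise on
# the row index, so row i of the result pairs a text with its summary.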
def predict_model(dataset: Dataset,
                  model: TextSummarizationModel,
                  limit: Optional[int] = None,
                  cache_base_path: str = CACHE_BASE_PATH) -> pd.DataFrame:
    """
    Returns predictions for a dataset based on its validation set.

    :param dataset: The dataset to be used
    :param model: The model to be used to make predictions
    :param limit: The maximal number of records to predict
    :param cache_base_path: The base directory for the cache
    :return: A data frame joining each validation record with its prediction
    """
    key = dataset.id
    root = root_directory()

    # Predictions are cached per dataset under a hash of the model key
    model_key = model.to_string()
    model_key_hashed = hashlib.md5(model_key.encode()).hexdigest()

    target_directory = f"{root}/{cache_base_path}/{key}"
    cached_file_path = f"{target_directory}/{model_key_hashed}.predictions"
    cached_file_info_path = f"{target_directory}/{model_key_hashed}.info"

    Path(target_directory).mkdir(parents=True, exist_ok=True)

    df = load_validation_as_df(key)
    if limit is not None:
        df = df[:limit]

    predictions = None

    # Load cached predictions if available
    if os.path.isfile(cached_file_path):
        predictions = pd.read_csv(cached_file_path)

    # Execute prediction if necessary (no cache, or the cache covers fewer
    # records than requested)
    if predictions is None or len(predictions.index) < len(df.index):
        start = time.time()
        predictions = model.predict_all(df)
        end = time.time()

        # Cache predictions together with an info file describing the run
        predictions.to_csv(cached_file_path, index=False, header=True)
        with open(cached_file_info_path, 'w') as info_file:
            info = PredictionInfo(model=model.get_id(),
                                  dataset=dataset.id,
                                  count=predictions.shape[0],
                                  started=start,
                                  finished=end,
                                  elapsed=str(timedelta(seconds=end - start)))
            info_file.write(info.json())

    # Join predictions to text and reference summary
    df = df.join(predictions[:len(df.index)])
    return df
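# Usage sketch for predict_model. `SampleDataset` and `Seq2SeqModel` are
# hypothetical stand-ins for concrete `Dataset` / `TextSummarizationModel`
# implementations from this project; the first call runs `model.predict_all`,
# the second is served entirely from the cache:
#
#   dataset = SampleDataset()
#   model = Seq2SeqModel()
#   df = predict_model(dataset, model, limit=100)  # computes and caches
#   df = predict_model(dataset, model, limit=100)  # reads from the cache
#   print(df[['text', 'summary', 'summary_predicted']].head())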
def list_scores(cache_base_path: str = CACHE_BASE_PATH) -> List[ScoreInfo]:
    """
    Reads the cached scores and returns them.

    Args:
        cache_base_path: The base directory for the cache

    Returns:
        The cached scores
    """
    root = root_directory()
    target_directory = f"{root}/{cache_base_path}"

    return [
        ScoreInfo.parse_file(f)
        for f in glob.iglob(target_directory + '/**/*.info', recursive=True)
    ]
def list_predictions(
        cache_base_path: str = CACHE_BASE_PATH) -> List[PredictionInfo]:
    """
    Reads the cached predictions and returns the info for all predictions.

    Args:
        cache_base_path: The base directory for the cache

    Returns:
        The prediction infos.
    """
    root = root_directory()
    target_directory = f"{root}/{cache_base_path}"

    return [
        PredictionInfo.parse_file(f)
        for f in glob.iglob(target_directory + '/**/*.info', recursive=True)
    ]
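# Example (a minimal sketch): both list functions glob the same `*.info`
# files, one parsed as ScoreInfo, one as PredictionInfo. Assuming these are
# pydantic models (they expose `parse_file`), the infos can be flattened
# into an overview frame via `.dict()`:
#
#   overview = pd.DataFrame([p.dict() for p in list_predictions()])
#   print(overview[['model', 'dataset', 'count', 'elapsed']])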
def score_model(dataset: Dataset,
                model: TextSummarizationModel,
                limit: Optional[int] = None,
                rouge_n: int = 3,
                cache_base_path: str = CACHE_BASE_PATH) -> pd.DataFrame:
    """
    Scores a summarization model against a dataset.

    :param dataset: The dataset for which the score should be calculated
    :param model: The model which should be scored
    :param limit: Optional count of rows to analyze from the dataset
    :param rouge_n: The longest n-gram for which ROUGE should be calculated
    :param cache_base_path: The base directory for the cache
    :return: A data frame containing the `text`, `summary`, `summary_predicted`
             and the related scores.
    """
    dataset_key = dataset.id

    # ROUGE variants to compute: rouge1 .. rouge{rouge_n} plus rougeL
    rouge_variants = [f"rouge{n}" for n in list(range(1, rouge_n + 1)) + ['L']]
    scorer = rouge_scorer.RougeScorer(rouge_variants)

    def score_row(row) -> pd.Series:
        # Adds precision/recall/f-measure columns (e.g. r1p, r1r, r1f) per
        # ROUGE variant to the row
        row_scores = scorer.score(row['summary'], row['summary_predicted'])
        for r in rouge_variants:
            prefix = r.replace('rouge', 'r')
            row[f"{prefix}p"] = row_scores[r].precision
            row[f"{prefix}r"] = row_scores[r].recall
            row[f"{prefix}f"] = row_scores[r].fmeasure
        return row

    root = root_directory()

    model_key = model.to_string()
    model_key_hashed = hashlib.md5(model_key.encode()).hexdigest()

    target_directory = f"{root}/{cache_base_path}/{dataset_key}"
    cached_file_path = f"{target_directory}/{model_key_hashed}.scores"
    # Note: this reuses the `.info` path written by predict_model, so the
    # score info overwrites the prediction info of the same model/dataset pair
    cached_file_info_path = f"{target_directory}/{model_key_hashed}.info"

    Path(target_directory).mkdir(parents=True, exist_ok=True)

    scores = None
    predictions = predict_model(dataset, model, limit)

    # Load cached scores if available. The cached file also contains `text`,
    # `summary` and `summary_predicted`, so it is read back as-is to match
    # the documented return value.
    if os.path.isfile(cached_file_path):
        scores = pd.read_csv(cached_file_path)

    if scores is None or len(scores.index) < len(predictions.index):
        start = time.time()
        print(f"Scoring {len(predictions.index)} predictions of dataset "
              f"`{dataset_key}` with model `{model.get_label()}` ...")
        scores = predictions.progress_apply(score_row, axis=1)
        end = time.time()

        # Cache scores together with an info file summarizing the run
        scores.to_csv(cached_file_path, index=False, header=True)
        with open(cached_file_info_path, 'w') as info_file:
            # Mean precision/recall/f-measure per ROUGE variant, rounded
            scores_summary = {}
            for r in rouge_variants:
                prefix = r.replace('rouge', 'r')
                scores_summary[f"{prefix}p"] = round(
                    scores[f"{prefix}p"].mean(), 4)
                scores_summary[f"{prefix}r"] = round(
                    scores[f"{prefix}r"].mean(), 4)
                scores_summary[f"{prefix}f"] = round(
                    scores[f"{prefix}f"].mean(), 4)

            info = ScoreInfo(model=model.get_id(),
                             dataset=dataset.id,
                             count=scores.shape[0],
                             started=start,
                             finished=end,
                             elapsed=str(timedelta(seconds=end - start)),
                             scores=scores_summary)
            info_file.write(info.json())

    return scores
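# Usage sketch for score_model, reusing the hypothetical classes from above.
# With rouge_n=2 the score columns are r1p/r1r/r1f, r2p/r2r/r2f and
# rLp/rLr/rLf (precision, recall and f-measure per ROUGE variant):
#
#   scores = score_model(SampleDataset(), Seq2SeqModel(), limit=100, rouge_n=2)
#   print(scores[['r1f', 'r2f', 'rLf']].mean())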