import glob
import hashlib
import os
import time
from datetime import timedelta
from pathlib import Path
from typing import List, Optional

import pandas as pd
from rouge_score import rouge_scorer
from tqdm import tqdm

# Registers `DataFrame.progress_apply`, used in `score_model` below.
tqdm.pandas()

# Project-level helpers such as `root_directory`, `load_validation_as_df`,
# `Dataset`, `TextSummarizationModel`, `PredictionInfo`, `ScoreInfo` and
# `CACHE_BASE_PATH` are assumed to be defined elsewhere in this package.


def _load_target_and_source(key: str, source_base_path: str,
                            dataset: str) -> pd.DataFrame:
    root = root_directory()
    df_source = pd.read_csv(
        f"{root}/{source_base_path}/{key}/{dataset}.source")
    df_target = pd.read_csv(
        f"{root}/{source_base_path}/{key}/{dataset}.target")

    return df_source.join(df_target)
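
# A minimal usage sketch for the helper above. The split name "validation" is
# an assumption about how the files are named on disk; the helper only expects
# `{root}/{source_base_path}/{key}/{dataset}.source` and the matching
# `.target` file, which it joins on the row index.
def _load_validation_split(key: str, source_base_path: str) -> pd.DataFrame:
    """Returns the joined source/target frame for the validation split."""
    return _load_target_and_source(key, source_base_path, "validation")
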
def predict_model(dataset: Dataset,
                  model: TextSummarizationModel,
                  limit: Optional[int] = None,
                  cache_base_path: str = CACHE_BASE_PATH) -> pd.DataFrame:
    """
    Returns predictions for a dataset based on the validation set.

    :param dataset: The dataset to be used
    :param model: The model to be used to make predictions
    :param limit: The number of maximal predicted records
    :param cache_base_path The base directory for the cache
    :return:
    """
    key = dataset.id
    root = root_directory()
    model_key = model.to_string()
    model_key_hashed = hashlib.md5(model_key.encode()).hexdigest()
    target_directory = f"{root}/{cache_base_path}/{key}"
    cached_file_path = f"{target_directory}/{model_key_hashed}.predictions"
    cached_file_info_path = f"{target_directory}/{model_key_hashed}.info"
    Path(target_directory).mkdir(parents=True, exist_ok=True)

    df = load_validation_as_df(key)

    if limit is not None:
        df = df[:limit]

    predictions = None

    # Load cached predictions if available
    if os.path.isfile(cached_file_path):
        predictions = pd.read_csv(cached_file_path)

    # Execute prediction if necessary (no cache available or cache incomplete)
    if predictions is None or len(predictions.index) < len(df.index):
        start = time.time()
        predictions = model.predict_all(df)
        end = time.time()

        # Cache predictions
        predictions.to_csv(cached_file_path, index=False, header=True)
        with open(cached_file_info_path, 'w') as info_file:
            info = PredictionInfo(model=model.get_id(),
                                  dataset=dataset.id,
                                  count=predictions.shape[0],
                                  started=start,
                                  finished=end,
                                  elapsed=str(timedelta(seconds=end - start)))
            info = info.json()
            info_file.write(info)

    # Join predictions to text and reference summary
    df = df.join(predictions[:len(df.index)])

    return df
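
# Minimal usage sketch: compare several models on the same dataset by running
# `predict_model` for each and collecting the predicted summaries side by side.
# The `models` argument is a hypothetical list of `TextSummarizationModel`
# instances supplied by the caller; the `summary_predicted` column name matches
# what `score_model` below expects from the prediction frame.
def predict_with_models(dataset: Dataset,
                        models: List[TextSummarizationModel],
                        limit: Optional[int] = None) -> pd.DataFrame:
    """Returns the validation records with one prediction column per model."""
    df = load_validation_as_df(dataset.id)
    if limit is not None:
        df = df[:limit]
    for m in models:
        predictions = predict_model(dataset, m, limit)
        df[f"summary_{m.get_id()}"] = predictions['summary_predicted']
    return df
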
def list_scores(cache_base_path: str = CACHE_BASE_PATH) -> List[ScoreInfo]:
    """
    Reads the cached scores and returns them.

    Args:
        cache_base_path: The base directory for the cache

    Returns:
        The cached scores
    """
    root = root_directory()
    target_directory = f"{root}/{cache_base_path}"
    return [
        ScoreInfo.parse_file(f)
        for f in glob.iglob(target_directory + '/**/*.info', recursive=True)
    ]
def list_predictions(
        cache_base_path: str = CACHE_BASE_PATH) -> List[PredictionInfo]:
    """
    Reads the cached predictions and returns the info for all predictions.

    Args:
        cache_base_path: The base directory for the cache

    Returns:
        The cached prediction infos.
    """
    root = root_directory()
    target_directory = f"{root}/{cache_base_path}"
    return [
        PredictionInfo.parse_file(f)
        for f in glob.iglob(target_directory + '/**/*.info', recursive=True)
    ]
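
# Minimal usage sketch: the two list functions above return info objects, so
# the cached runs can be summarized as a data frame. `info.dict()` assumes the
# same pydantic-style interface as the `.json()` / `.parse_file()` calls used
# elsewhere in this module.
def predictions_overview(cache_base_path: str = CACHE_BASE_PATH) -> pd.DataFrame:
    """Returns one row per cached prediction run (model, dataset, count, ...)."""
    return pd.DataFrame(
        [info.dict() for info in list_predictions(cache_base_path)])
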
def score_model(dataset: Dataset,
                model: TextSummarizationModel,
                limit: Optional[int] = None,
                rouge_n: int = 3,
                cache_base_path: str = CACHE_BASE_PATH) -> pd.DataFrame:
    """
    Scores a dataset against a summarization model.

    :param dataset: The dataset for which the score should be calculated
    :param model: The model which should be scored
    :param limit: Optional count of rows which should be analyzed from the dataset
    :param rouge_n: The longest n-gram for which ROUGE should be calculated
    :param cache_base_path: The base directory for the cache
    :return: A data frame containing the `text`, `summary`, `summary_predicted` and the related scores.
    """

    dataset_key = dataset.id
    rouge_variants = list(
        map(lambda n: f"rouge{n}",
            list(range(1, rouge_n + 1)) + ['L']))
    scorer = rouge_scorer.RougeScorer(rouge_variants)

    def score_row(row) -> dict:
        row_scores = scorer.score(row['summary'], row['summary_predicted'])

        for r in rouge_variants:
            prefix = r.replace('rouge', 'r')
            row[f"{prefix}p"] = row_scores[r].precision
            row[f"{prefix}r"] = row_scores[r].recall
            row[f"{prefix}f"] = row_scores[r].fmeasure

        return row

    root = root_directory()
    model_key = model.to_string()
    model_key_hashed = hashlib.md5(model_key.encode()).hexdigest()
    target_directory = f"{root}/{cache_base_path}/{dataset_key}"
    cached_file_path = f"{target_directory}/{model_key_hashed}.scores"
    cached_file_info_path = f"{target_directory}/{model_key_hashed}.info"
    Path(target_directory).mkdir(parents=True, exist_ok=True)

    scores = None
    predictions = predict_model(dataset, model, limit)

    if os.path.isfile(cached_file_path):
        scores = pd.read_csv(cached_file_path).drop(
            ['text', 'summary', 'summary_predicted'], axis=1)

    if scores is None or len(scores.index) < len(predictions.index):
        start = time.time()
        print(
            f"Scoring {len(predictions.index)} predictions of dataset `{dataset_key}` with model `{model.get_label()}` ..."
        )
        scores = predictions.progress_apply(score_row, axis=1)
        end = time.time()

        # Cache scores and write the score info
        scores.to_csv(cached_file_path, index=False, header=True)
        with open(cached_file_info_path, 'w') as info_file:
            scores_summary = {}

            for r in rouge_variants:
                prefix = r.replace('rouge', 'r')
                scores_summary[f"{prefix}p"] = round(
                    scores[f"{prefix}p"].mean(), 4)
                scores_summary[f"{prefix}r"] = round(
                    scores[f"{prefix}r"].mean(), 4)
                scores_summary[f"{prefix}f"] = round(
                    scores[f"{prefix}f"].mean(), 4)

            info = ScoreInfo(model=model.get_id(),
                             dataset=dataset.id,
                             count=scores.shape[0],
                             started=start,
                             finished=end,
                             elapsed=str(timedelta(seconds=end - start)),
                             scores=scores_summary)
            info = info.json()
            info_file.write(info)

    return scores
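
# Minimal usage sketch: score a model and report the mean F-measure per ROUGE
# variant. `dataset` and `model` are hypothetical instances of the project's
# `Dataset` / `TextSummarizationModel` abstractions; the `r{n}f` column names
# follow the prefixes produced in `score_row` above.
def mean_rouge_f(dataset: Dataset,
                 model: TextSummarizationModel,
                 limit: Optional[int] = None,
                 rouge_n: int = 3) -> dict:
    """Returns the mean ROUGE F-measure per variant, rounded to 4 digits."""
    scores = score_model(dataset, model, limit=limit, rouge_n=rouge_n)
    variants = [f"r{n}f" for n in list(range(1, rouge_n + 1)) + ['L']]
    return {v: round(scores[v].mean(), 4) for v in variants}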