Пример #1
0
def cli_download(
    from_file: click.File, url_list: Tuple[str, ...], verbose: bool = False
) -> None:
    """Collect URLs from the arguments and an optional file, then download them.

    Args:
        from_file: Optional open text file holding one URL per line. Blank
            lines are ignored and surrounding whitespace is removed.
        url_list: URLs given directly by the caller.
        verbose: Forwarded unchanged to ``download``.
    """
    # Each URL is paired with ``None`` because ``download`` is handed a list
    # of 2-tuples; the meaning of the second slot is defined by ``download``.
    urls = [(url, None) for url in url_list]
    if from_file:
        # Skip empty lines (e.g. a trailing newline at the end of the file)
        # so that no empty-string URL is passed on.
        stripped_lines = (line.strip() for line in from_file)
        urls += [(url, None) for url in stripped_lines if url]
    download(urls, verbose)
Пример #2
0
def cli_simple(
    from_file: click.File,
    author: str,
    title: str,
    url_list: Tuple[str, ...],
    verbose: bool = False,
) -> None:
    """Build an ``HTMLStory`` from a list of chapter URLs and run it.

    Args:
        from_file: Optional open text file holding one URL per line. Blank
            lines are ignored and surrounding whitespace is removed.
        author: Passed to ``HTMLStory`` as ``author``.
        title: Passed to ``HTMLStory`` as ``title``.
        url_list: Chapter URLs given directly by the caller.
        verbose: Passed to ``HTMLStory`` as ``verbose``.
    """
    urls = list(url_list)
    if from_file:
        # Drop empty lines; otherwise a file made only of blank lines would
        # yield [""] and slip past the "no URLs" guard below.
        stripped_lines = (line.strip() for line in from_file)
        urls += [url for url in stripped_lines if url]
    if not urls:
        click.echo("You must provide at least one URL to download.")
        return
    story = HTMLStory(
        chapters=urls,
        author=author,
        title=title,
        # NOTE(review): this fixed httpbin URL looks like a placeholder for
        # the story URL -- confirm against HTMLStory's requirements.
        url=furl("http://httpbin.org/status/200"),
        verbose=verbose,
    )
    story.run()
Пример #3
0
def readlines(ctx, param, file: click.File) -> List[str]:
    """Return every line of *file* with surrounding whitespace removed.

    ``ctx`` and ``param`` are accepted but not used by this function.
    """
    stripped_lines = []
    for raw_line in file.readlines():
        stripped_lines.append(raw_line.strip())
    return stripped_lines
Пример #4
0
def compare(
    source: click.File,
    system_x: click.File,
    system_y: click.File,
    reference: click.File,
    language: str,
    metric: Union[Tuple[str], str],
    filter: Union[Tuple[str], str],
    length_min_val: float,
    length_max_val: float,
    seg_metric: str,
    output_folder: str,
    bootstrap: bool,
    num_splits: int,
    sample_ratio: float,
):
    """Score two system outputs against each other and report the results.

    Builds a ``PairwiseTestset`` from the four input files, optionally applies
    filters to it, runs a pairwise comparison for every requested metric,
    prints the resulting table and, when ``output_folder`` is set, writes
    ``results.json`` and the comparison plots there.

    Args:
        source: Open file read line by line (whitespace-stripped) as ``src``.
        system_x: Open file read line by line as the ``system_x`` segments.
        system_y: Open file read line by line as the ``system_y`` segments.
        reference: Open file read line by line as the ``ref`` segments.
        language: Appended to ``"X-"`` to form the testset's language pair.
        metric: Metric name(s); each must be a key of ``available_metrics``.
        filter: Filter name(s); each must be a key of ``available_filters``.
            ``"length"`` is built with the two length bounds below.
        length_min_val: Multiplied by 100 and truncated to ``int`` before
            being handed to the length filter.
        length_max_val: Same treatment as ``length_min_val``.
        seg_metric: Metric whose result feeds the plots; it is always
            evaluated and always placed first.
        output_folder: Destination folder; an empty string disables writing.
        bootstrap: Whether to run bootstrap resampling for every metric.
        num_splits: Forwarded to ``bootstrap_resampling``.
        sample_ratio: Forwarded to ``bootstrap_resampling``.
    """
    # The annotations allow a bare string. Wrap it so that iteration and
    # membership tests work on whole names instead of single characters.
    if isinstance(metric, str):
        metric = (metric,) if metric else ()
    if isinstance(filter, str):
        filter = (filter,) if filter else ()

    testset = PairwiseTestset(
        src=[line.strip() for line in source.readlines()],
        system_x=[line.strip() for line in system_x.readlines()],
        system_y=[line.strip() for line in system_y.readlines()],
        ref=[line.strip() for line in reference.readlines()],
        language_pair="X-" + language,
        filenames=[source.name, system_x.name, system_y.name, reference.name],
    )
    corpus_size = len(testset)
    if filter:
        # All filters are constructed against the unfiltered testset before
        # any of them is applied; the length filter goes last.
        filters = [
            available_filters[name](testset)
            for name in filter
            if name != "length"
        ]
        if "length" in filter:
            filters.append(
                available_filters["length"](
                    testset,
                    int(length_min_val * 100),
                    int(length_max_val * 100),
                )
            )

        for corpus_filter in filters:
            testset.apply_filter(corpus_filter)

        # Nothing left to compare: report and stop. Checking the length
        # directly avoids a float equality test and a division by zero.
        if len(testset) == 0:
            click.secho("The current filters reduce the Corpus on 100%!",
                        fg="red")
            return

        click.secho(
            "Filters Successfully applied. Corpus reduced in {:.2f}%.".format(
                (1 - (len(testset) / corpus_size)) * 100),
            fg="green",
        )

    # Put the segment-level metric first and make sure it appears only once.
    metric = (seg_metric,) + tuple(m for m in metric if m != seg_metric)

    results = {
        m: available_metrics[m](
            language=testset.target_language).pairwise_comparison(testset)
        for m in metric
    }

    results_df = PairwiseResult.results_to_dataframe(list(results.values()))
    if bootstrap:
        # Run the resampling exactly once per metric and keep only its stats.
        bootstrap_stats = [
            available_metrics[m].bootstrap_resampling(
                testset, num_splits, sample_ratio, results[m]).stats
            for m in metric
        ]
        # Turn the per-metric stat dicts into one column per stat name.
        for stat_name in bootstrap_stats[0]:
            results_df[stat_name] = [
                stats[stat_name] for stats in bootstrap_stats
            ]

    click.secho(str(results_df), fg="yellow")
    if output_folder != "":
        # The plot helpers receive the folder as a string, so it is kept as
        # a string with a trailing slash.
        if not output_folder.endswith("/"):
            output_folder += "/"
        results_df.to_json(output_folder + "results.json",
                           orient="index",
                           indent=4)
        plot_segment_comparison(results[seg_metric], output_folder)
        plot_pairwise_distributions(results[seg_metric], output_folder)
        plot_bucket_comparison(results[seg_metric], output_folder)