Пример #1
0
def view_stats(
    sources: PathOrPathsOrDictOfStrList,
    references: PathOrPathsOrDictOfStrList,
    tags: Optional[PathOrPathsOrDictOfStrList] = None,
):
    """Display corpus statistics and sentence-length histograms.

    Renders the ``ipynb_stats.html`` template with the computed statistics
    and shows it via ``display``; then draws one normalized length histogram
    per entry of the sources' ``text_indices`` followed by one per entry of
    the references' ``text_indices``. A red vertical line marks the mean
    length on each histogram.

    Args:
        sources: Source data, loaded with ``text_merged=True``.
        references: Reference data, loaded with ``text_merged=True``.
        tags: Optional tag data, loaded with ``text_merged=True``.

    Returns:
        None. Output is produced through ``display`` and ``plt.show``.
    """
    src_data = VizSeqDataSources(sources, text_merged=True)
    ref_data = VizSeqDataSources(references, text_merged=True)
    tag_data = None
    if tags is not None:
        tag_data = VizSeqDataSources(tags, text_merged=True)
    stats = VizSeqStats.get(src_data, ref_data, tag_data)

    template = env.get_template('ipynb_stats.html')
    html = template.render(
        stats=stats.to_dict(formatting=True),
        enum_src_names_and_types=VizSeqDataPageView.get_enum(
            zip(src_data.names,
                [dtype.name.title() for dtype in src_data.data_types])
        ),
        enum_ref_names=VizSeqDataPageView.get_enum(ref_data.names),
    )
    display(HTML(html))

    n_src_plots = len(src_data.text_indices)
    n_plots = n_src_plots + ref_data.n_sources
    _fig, axes = plt.subplots(
        nrows=1, ncols=n_plots, figsize=(7 * n_plots, 5)
    )

    def _axis_at(position):
        # With a single column plt.subplots returns a bare Axes, not an array.
        return axes if n_plots == 1 else axes[position]

    def _draw_length_hist(axis, lengths, title):
        # Normalized histogram plus a red marker at the mean length.
        axis.hist(lengths, density=True, bins=25)
        axis.axvline(x=np.mean(lengths), color='red', linewidth=3)
        axis.set_title(title)

    for pos, src_idx in enumerate(src_data.text_indices):
        axis = _axis_at(pos)
        src_name = src_data.names[src_idx]
        _draw_length_hist(
            axis, stats.src_lens[src_name], f'Source {src_name} Length'
        )

    # Reference histograms are placed after all source histograms.
    for pos, ref_idx in enumerate(ref_data.text_indices):
        axis = _axis_at(n_src_plots + pos)
        ref_name = ref_data.names[ref_idx]
        _draw_length_hist(
            axis, stats.ref_lens[ref_name], f'Reference {ref_name} Length'
        )
    plt.show()
Пример #2
0
def view_scores(references: PathOrPathsOrDictOfStrList,
                hypothesis: Optional[PathOrPathsOrDictOfStrList],
                metrics: List[str],
                tags: Optional[PathOrPathsOrDictOfStrList] = None):
    """Compute corpus- and group-level scores and render them as HTML.

    Args:
        references: Reference data passed to ``VizSeqDataSources``.
        hypothesis: Model outputs passed to ``VizSeqDataSources``; each of
            its ``names`` is treated as one model.
        metrics: Requested scorer IDs. IDs not returned by
            ``get_scorer_ids()`` are skipped with a logged warning.
        tags: Optional tag data. When given, scores are additionally broken
            down by each unique tag.

    Returns:
        An ``HTML`` object built from the ``ipynb_scores.html`` template.
    """
    _ref = VizSeqDataSources(references)
    _hypo = VizSeqDataSources(hypothesis)
    _tags, tag_set = None, []
    if tags is not None:
        tag_sources = VizSeqDataSources(tags, text_merged=True)
        tag_set = sorted(tag_sources.unique())
        _tags = tag_sources.text
    models = _hypo.names
    all_metrics = get_scorer_ids()
    _metrics = []
    for s in metrics:
        if s in all_metrics:
            _metrics.append(s)
        else:
            # Logger.warn is a deprecated alias; warning() is the supported
            # spelling.
            logger.warning(f'"{s}" is not a valid metric.')

    # scores[metric][model] -> score object exposing corpus_score and
    # group_scores.
    scores = {
        s: {
            m: get_scorer(s)(corpus_level=True,
                             sent_level=False).score(_hypo.data[i].text,
                                                     _ref.text,
                                                     tags=_tags)
            for i, m in enumerate(models)
        }
        for s in _metrics
    }

    corpus_scores = {
        s: {m: scores[s][m].corpus_score
            for m in models}
        for s in _metrics
    }
    # group_scores[metric][tag][model]; empty per metric when no tags given.
    group_scores = {
        s: {
            t: {m: scores[s][m].group_scores[t]
                for m in models}
            for t in tag_set
        }
        for s in _metrics
    }

    metrics_and_names = [[s, get_scorer_name(s)] for s in _metrics]
    html = env.get_template('ipynb_scores.html').render(
        metrics_and_names=metrics_and_names,
        models=models,
        tag_set=tag_set,
        corpus_scores=corpus_scores,
        group_scores=group_scores,
        corpus_and_group_score_latex=VizSeqWebView.latex_corpus_group_scores(
            corpus_scores, group_scores),
        corpus_and_group_score_csv=VizSeqWebView.csv_corpus_group_scores(
            corpus_scores, group_scores),
    )
    return HTML(html)
Пример #3
0
def view_examples(
    sources: PathOrPathsOrDictOfStrList,
    references: PathOrPathsOrDictOfStrList,
    hypothesis: Optional[PathOrPathsOrDictOfStrList] = None,
    metrics: Optional[List[str]] = None,
    query: str = '',
    page_sz: int = DEFAULT_PAGE_SIZE,
    page_no: int = DEFAULT_PAGE_NO,
    sorting: VizSeqSortingType = VizSeqSortingType.original,
    need_g_translate: bool = False,
    disable_alignment: bool = False,
):
    """Render one page of examples as notebook HTML.

    Args:
        sources: Source data.
        references: Reference data; must have the same length as the sources.
        hypothesis: Optional model outputs; when present must have the same
            length as the references.
        metrics: Scorer IDs for sentence-level scores. Ignored (reset to
            ``None``) when there are no hypotheses.
        query: Filter string forwarded to the page view.
        page_sz: Page size forwarded to the page view.
        page_no: Page number forwarded to the page view.
        sorting: Sorting mode; its ``value`` is forwarded to the page view.
        need_g_translate: Request translations of the source text; only
            effective when the sources contain text.
        disable_alignment: Forwarded to the page view.

    Returns:
        An ``HTML`` object built from the ``ipynb_view.html`` template.

    Raises:
        AssertionError: If the source/reference/hypothesis lengths disagree.
    """
    src_data = VizSeqDataSources(sources)
    ref_data = VizSeqDataSources(references)
    hypo_data = VizSeqDataSources(hypothesis)

    # Without hypotheses there is nothing to score.
    if hypo_data.n_sources == 0:
        metrics = None
    assert len(src_data) == len(ref_data)
    assert hypo_data.n_sources == 0 or len(ref_data) == len(hypo_data)

    use_g_translate = need_g_translate and src_data.has_text
    view = VizSeqDataPageView.get(
        src_data,
        ref_data,
        hypo_data,
        page_sz,
        page_no,
        metrics=metrics,
        query=query,
        sorting=sorting.value,
        need_lang_tags=use_g_translate,
        disable_alignment=disable_alignment,
    )

    google_translation = []
    if use_g_translate:
        google_translation = [
            get_g_translate(text, view.trg_lang[pos])
            for pos, text in enumerate(view.cur_src_text)
        ]

    template = env.get_template('ipynb_view.html')
    context = {
        'enum_metrics': VizSeqDataPageView.get_enum(metrics),
        'enum_models': VizSeqDataPageView.get_enum(hypo_data.text_names),
        'cur_idx': view.cur_idx,
        'src': view.viz_src,
        'ref': view.viz_ref,
        'hypo': view.viz_hypo,
        'enum_src_names_and_types': VizSeqDataPageView.get_enum(
            zip(src_data.names,
                [dtype.name for dtype in src_data.data_types])
        ),
        'enum_ref_names': list(enumerate(ref_data.names)),
        'sent_scores': view.viz_sent_scores,
        'google_translation': google_translation,
        'span_highlight_js': SPAN_HIGHTLIGHT_JS,
        'total_examples': view.total_examples,
        'n_samples': view.n_samples,
        'n_cur_samples': view.n_cur_samples,
    }
    return HTML(template.render(**context))
Пример #4
0
def view_n_grams(data: PathOrPathsOrDictOfStrList, k: int = 64):
    """Render the n-grams extracted from ``data`` as notebook HTML.

    Args:
        data: Data loaded with ``text_merged=True``.
        k: Forwarded to ``VizSeqNGrams.extract`` as ``k``.

    Returns:
        An ``HTML`` object built from the ``ipynb_n_grams.html`` template.
    """
    merged = VizSeqDataSources(data, text_merged=True)
    n_grams = VizSeqNGrams.extract(merged, k=k)
    template = env.get_template('ipynb_n_grams.html')
    rendered = template.render(n=list(n_grams.keys()), n_grams=n_grams)
    return HTML(rendered)
Пример #5
0
def get_scores(
    sources: PathOrPathsOrDictOfStrList,
    references: PathOrPathsOrDictOfStrList,
    model_to_hypotheses: PathOrPathsOrDictOfStrList,
    metrics: List[str],
    tags: Optional[PathOrPathsOrDictOfStrList] = None,
    verbose: bool = False,
    problem: str = None,
) -> Tuple[Dict, Dict]:
    """Score every model's hypotheses with every valid requested metric.

    Args:
        sources: Source data; its text is passed to each scorer as ``sources``.
        references: Reference data.
        model_to_hypotheses: Model outputs; each of its ``names`` is a model.
        metrics: Requested scorer IDs. IDs not returned by
            ``get_scorer_ids()`` are skipped with a logged warning.
        tags: Optional tag data used for per-tag (group) scores.
        verbose: Forwarded to each scorer's constructor.
        problem: Passed as ``extra_args={"problem": problem}`` to a fixed set
            of scorers (see ``scorer_kwargs``).

    Returns:
        ``(corpus_scores, group_scores)`` where ``corpus_scores`` is indexed
        ``[metric][model]`` and ``group_scores`` is indexed
        ``[metric][tag][model]``.
    """
    # Copyright (c) Facebook, Inc. and its affiliates.
    # The code in this function is licensed under the MIT license.
    src_data = VizSeqDataSources(sources)
    ref_data = VizSeqDataSources(references)
    hypo_data = VizSeqDataSources(model_to_hypotheses)

    tag_text, tag_set = None, []
    if tags is not None:
        tag_data = VizSeqDataSources(tags, text_merged=True)
        tag_set = sorted(tag_data.unique())
        tag_text = tag_data.text

    models = hypo_data.names
    known_metrics = get_scorer_ids()
    valid_metrics = []
    for metric in metrics:
        if s_is_known := (metric in known_metrics):
            valid_metrics.append(metric)
        if not s_is_known:
            logger.warning(f'"{metric}" is not a valid metric.')

    def scorer_kwargs(s):
        # Constructor arguments shared by all scorers.
        kwargs = {"corpus_level": True, "sent_level": False, "verbose": verbose}
        needs_problem = s in (
            "kendall_task_ranking",
            "req_cov",
            "essential_req_cov",
            "achievement",
            "granularity",
        )
        if needs_problem:
            # ProcGenScorer's
            kwargs["extra_args"] = {"problem": problem}
        return kwargs

    # scores[metric][model] -> score object; a fresh scorer is built for
    # every (metric, model) pair.
    scores = {}
    for metric in valid_metrics:
        per_model = {}
        for pos, model in enumerate(models):
            scorer = get_scorer(metric)(**scorer_kwargs(metric))
            per_model[model] = scorer.score(
                hypo_data.data[pos].text,
                ref_data.text,
                tags=tag_text,
                sources=src_data.text,
            )
        scores[metric] = per_model

    corpus_scores = {
        metric: {model: scores[metric][model].corpus_score for model in models}
        for metric in valid_metrics
    }
    group_scores = {
        metric: {
            tag: {
                model: scores[metric][model].group_scores[tag]
                for model in models
            }
            for tag in tag_set
        }
        for metric in valid_metrics
    }

    return corpus_scores, group_scores
Пример #6
0
def __get_hypo(dir_path: str, models: str):
    """Load hypothesis files ``pred_<model>.txt`` from ``dir_path``.

    Args:
        dir_path: Directory containing the prediction files.
        models: Comma-separated model names. When empty, every file matching
            ``pred_*.txt`` in ``dir_path`` is used.

    Returns:
        A ``VizSeqDataSources`` built from the selected paths. Explicitly
        listed models keep the order given in ``models``; discovered files
        are ordered by sorted path.
    """
    if models:
        paths = [op.join(dir_path, f'pred_{m}.txt') for m in models.split(',')]
    else:
        # glob() returns matches in filesystem-dependent order; sort so the
        # model order is deterministic, as the sibling _get_* loaders do.
        paths = sorted(glob(op.join(dir_path, 'pred_*.txt')))
    return VizSeqDataSources(paths)
Пример #7
0
def _get_tag(dir_path: str):
    """Load all ``tag_*.txt`` files in ``dir_path`` (sorted, text merged)."""
    pattern = op.join(dir_path, 'tag_*.txt')
    tag_paths = sorted(glob(pattern))
    return VizSeqDataSources(tag_paths, text_merged=True)
Пример #8
0
def _get_ref(dir_path: str):
    """Load all ``ref_*.txt`` files in ``dir_path``, ordered by sorted path."""
    pattern = op.join(dir_path, 'ref_*.txt')
    ref_paths = sorted(glob(pattern))
    return VizSeqDataSources(ref_paths)
Пример #9
0
def _get_src(dir_path: str):
    """Load all ``src_*.*`` files in ``dir_path``, ordered by sorted path."""
    pattern = op.join(dir_path, 'src_*.*')
    src_paths = sorted(glob(pattern))
    return VizSeqDataSources(src_paths)
Пример #10
0
    def get(
        cls,
        src: VizSeqDataSources,
        ref: VizSeqDataSources,
        hypo: VizSeqDataSources,
        page_sz: int,
        page_no: int,
        metrics: Optional[List[str]] = None,
        query: str = '',
        sorting: int = 0,
        sorting_metric: str = '',
        need_lang_tags: bool = False,
        disable_alignment: bool = False,
    ) -> VizSeqPageData:
        """Build the data for one page of examples.

        The pipeline is: filter example indices by ``query``, reorder them
        according to ``sorting``, cut out the requested page, compute
        sentence-level scores for that page, and produce the visualized
        source/reference/hypothesis views.

        Args:
            src: Source data.
            ref: Reference data.
            hypo: Hypothesis data; its ``text_names`` are used as model names.
            page_sz: Requested page size (> 0); capped at ``MAX_PAGE_SZ``.
            page_no: Requested page number (> 0).
            metrics: Scorer IDs for sentence-level scores; ``None`` means
                none.
            query: Filter string. Applied to the source text if the source
                has text, otherwise to the reference text if the reference
                has text; otherwise no filtering happens.
            sorting: Integer ``value`` of a ``VizSeqSortingType`` member.
            sorting_metric: Scorer ID used when sorting by metric; the order
                is left unchanged if it is not in ``get_scorer_ids()``.
            need_lang_tags: When true, language-tag each sentence of the
                first reference and return the result as ``trg_lang``.
            disable_alignment: When true, skip the source/reference/
                hypothesis visualizers and return the raw page data instead.

        Returns:
            A ``VizSeqPageData`` for the requested page.

        Raises:
            AssertionError: If ``page_no`` or ``page_sz`` is not positive, or
                ``sorting`` does not match any ``VizSeqSortingType`` value.
        """
        assert page_no > 0 and page_sz > 0
        page_sz = min(page_sz, MAX_PAGE_SZ)
        metrics = [] if metrics is None else metrics
        models = hypo.text_names
        # Query: start from all example indices, then narrow them down with
        # the filter on source text (preferred) or reference text.
        cur_idx = list(range(len(src)))
        if src.has_text:
            cur_idx = VizSeqFilter.filter(src.text, query)
        elif ref.has_text:
            cur_idx = VizSeqFilter.filter(ref.text, query)
        # Number of examples after filtering, before pagination.
        n_samples = len(cur_idx)

        # Sorting: map the integer back to its enum member; sorting types
        # without a branch below leave the order unchanged.
        sorting = {e.value: e for e in VizSeqSortingType}.get(sorting, None)
        assert sorting is not None
        if sorting == VizSeqSortingType.random:
            cur_idx = VizSeqRandomSorter.sort(cur_idx)
        elif sorting == VizSeqSortingType.ref_len:
            cur_idx = VizSeqByLenSorter.sort(ref.main_text, cur_idx)
        elif sorting == VizSeqSortingType.ref_alphabetical:
            cur_idx = VizSeqByStrOrderSorter.sort(ref.main_text, cur_idx)
        elif sorting == VizSeqSortingType.src_len:
            # No-op when the source has no text.
            if src.has_text:
                cur_idx = VizSeqByLenSorter.sort(src.main_text, cur_idx)
        elif sorting == VizSeqSortingType.src_alphabetical:
            # No-op when the source has no text.
            if src.has_text:
                cur_idx = VizSeqByStrOrderSorter.sort(src.main_text, cur_idx)
        elif sorting == VizSeqSortingType.metric:
            # Unknown sorting metrics are silently ignored.
            if sorting_metric in get_scorer_ids():
                _cur_ref = [_select(t, cur_idx) for t in ref.text]
                # Sentence-level scores per model over the filtered examples.
                scores = {
                    m: get_scorer(sorting_metric)(corpus_level=False,
                                                  sent_level=True).score(
                                                      _select(t, cur_idx),
                                                      _cur_ref).sent_scores
                    for m, t in zip(models, hypo.text)
                }
                # Transpose to one {model: score} dict per example.
                scores = [{m: scores[m][i]
                           for m in models} for i in range(len(cur_idx))]
                cur_idx = VizSeqByMetricSorter.sort(scores, cur_idx)

        # Pagination: the slice includes position end_idx (hence the + 1).
        start_idx, end_idx = _get_start_end_idx(len(cur_idx), page_sz, page_no)
        cur_idx = cur_idx[start_idx:end_idx + 1]
        n_cur_samples = len(cur_idx)

        # Page data: restrict sources, references and hypotheses to the page.
        cur_src = src.cached(cur_idx)
        cur_src_text = _select(src.main_text,
                               cur_idx) if src.has_text else None
        cur_ref = [_select(t, cur_idx) for t in ref.text]
        cur_hypo = {n: _select(t, cur_idx) for n, t in zip(models, hypo.text)}

        # Sentence scores for the page, indexed [metric][model], rounded to
        # two decimals.
        cur_sent_scores = {
            s: {
                m: np.round(get_scorer(s)(corpus_level=False,
                                          sent_level=True).score(
                                              hh, cur_ref).sent_scores,
                            decimals=2)
                for m, hh in cur_hypo.items()
            }
            for s in metrics
        }

        # Rendering: fall back to the raw page data when alignment is
        # disabled (and, for references, when there is no source text).
        viz_src = cur_src
        if not disable_alignment:
            viz_src = VizSeqSrcVisualizer.visualize(cur_src, src.text_indices)
        viz_ref = cur_ref
        if not disable_alignment and cur_src_text is not None:
            viz_ref = VizSeqRefVisualizer.visualize(cur_src_text, cur_ref,
                                                    src.main_text_idx)
        viz_hypo = cur_hypo
        if not disable_alignment:
            # Hypotheses are visualized against the first reference only.
            viz_hypo = VizSeqHypoVisualizer.visualize(cur_ref[0], cur_hypo, 0)
        # One {metric: visualized {model: score}} dict per example on the page.
        viz_sent_scores = [{
            s: VizSeqDictVisualizer.visualize(
                {m: cur_sent_scores[s][m][i]
                 for m in models})
            for s in metrics
        } for i in range(n_cur_samples)]

        trg_lang = None
        if need_lang_tags:
            # Language tags are computed from the first reference's sentences.
            trg_lang = [VizSeqLanguageTagger.tag_lang(r) for r in cur_ref[0]]

        return VizSeqPageData(
            viz_src=viz_src,
            viz_ref=viz_ref,
            viz_hypo=viz_hypo,
            cur_src=cur_src,
            cur_src_text=cur_src_text,
            cur_ref=cur_ref,
            cur_idx=cur_idx,
            viz_sent_scores=viz_sent_scores,
            trg_lang=trg_lang,
            n_cur_samples=n_cur_samples,
            n_samples=n_samples,
            total_examples=len(src),
        )