Example #1
def align_bert_tokens_to_corpus_tokens(
        spans_df: pd.DataFrame, corpus_toks_df: pd.DataFrame) -> pd.DataFrame:
    """
    Expand entity matches from a BERT-based model so that they align
    with the corpus's original tokenization.

    :param spans_df: DataFrame of extracted entities. Must contain two
     columns: "span" and "ent_type". Other columns ignored.
    :param corpus_toks_df: DataFrame of the corpus's original tokenization,
     one row per token.
     Must contain a column "span" with character-based spans of
     the tokens.

    :returns: A new DataFrame with schema ["span", "ent_type"],
     where the "span" column contains token-based spans based off
     the *corpus* tokenization in `corpus_toks_df["span"]`.
    """
    if len(spans_df.index) == 0:
        return spans_df.copy()
    overlaps_df = (spanner.overlap_join(spans_df["span"],
                                        corpus_toks_df["span"], "span",
                                        "corpus_token").merge(spans_df))
    agg_df = (overlaps_df.groupby("span").aggregate({
        "corpus_token": "sum",
        "ent_type": "first"
    }).reset_index())
    cons_df = (spanner.consolidate(agg_df, "corpus_token")[[
        "corpus_token", "ent_type"
    ]].rename(columns={"corpus_token": "span"}))
    cons_df["span"] = TokenSpanArray.align_to_tokens(corpus_toks_df["span"],
                                                     cons_df["span"])
    return cons_df
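A minimal usage sketch for the function above. The toy document, the span offsets, and the assumption that text_extensions_for_pandas is importable as `tp` with a `SpanArray(text, begins, ends)` constructor are illustrative, not part of the original example:

import pandas as pd
import text_extensions_for_pandas as tp

doc_text = "George Washington lived in Mount Vernon."

# Corpus tokenization: one row per original token, as character spans.
corpus_toks_df = pd.DataFrame({
    "span": tp.SpanArray(doc_text,
                         [0, 7, 18, 24, 27, 33, 39],
                         [6, 17, 23, 26, 32, 39, 40])
})

# Entity mentions produced over BERT tokens, expressed as character spans.
spans_df = pd.DataFrame({
    "span": tp.SpanArray(doc_text, [0, 27], [17, 39]),
    "ent_type": ["PER", "LOC"],
})

# Snap the entity spans onto the corpus tokenization.
aligned_df = align_bert_tokens_to_corpus_tokens(spans_df, corpus_toks_df)
print(aligned_df[["span", "ent_type"]])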
Example #2
def lemmatize(spans: Union[pd.Series, SpanArray, Iterable[Span]],
              token_features: pd.DataFrame,
              lemma_col_name: str = "lemma",
              token_span_col_name: str = "span") -> List[str]:
    """
    Convert spans to their normal form using lemma information in a token
    features table.

    :param spans: Spans to be normalized. Each may represent zero or more
    tokens.

    :param token_features: DataFrame of token metadata. Index must be aligned
    with the token indices in `spans`.

    :param lemma_col_name: Optional custom name for the DataFrame column
    containing the lemmatized form of each token.

    :param token_span_col_name: Optional custom name for the DataFrame column
    containing the span of each token.

    :return: A list containing normalized versions of the tokens
     in `spans`, with each token separated by a single space character.
    """
    char_spans = SpanArray.make_array(spans)
    token_spans = TokenSpanArray.align_to_tokens(token_features[token_span_col_name],
                                                 char_spans)
    ret = []  # type: List[str]
    # TODO: Vectorize this loop
    for i in range(len(token_spans)):
        lemmas = token_features[lemma_col_name][
                 token_spans.begin_token[i]:token_spans.end_token[i]
                 ]
        ret.append(" ".join(lemmas))
    return ret
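A small usage sketch for lemmatize, assuming the same text_extensions_for_pandas import and `SpanArray(text, begins, ends)` constructor as in the sketch under Example #1; the lemmas here are hand-written for illustration:

import pandas as pd
import text_extensions_for_pandas as tp

doc_text = "The cats were sleeping."

# One row per token: character span plus its lemma.
token_features = pd.DataFrame({
    "span": tp.SpanArray(doc_text, [0, 4, 9, 14, 22], [3, 8, 13, 22, 23]),
    "lemma": ["the", "cat", "be", "sleep", "."],
})

# A span covering the tokens "cats were sleeping".
phrase = tp.SpanArray(doc_text, [4], [22])
print(lemmatize(phrase, token_features))  # expected: ['cat be sleep']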
Example #3
def align_bert_tokens_to_corpus_tokens(
    spans_df: pd.DataFrame,
    corpus_toks_df: pd.DataFrame,
    spans_df_token_col: str = "span",
    corpus_df_token_col: str = "span",
    entity_type_col: str = "ent_type",
) -> pd.DataFrame:
    """
    Expand entity matches from a BERT-based model so that they align
    with the corpus's original tokenization.

    :param spans_df: DataFrame of extracted entities. Must contain two
     columns with span and entity type information, respectively. Other columns ignored.
    :param corpus_toks_df: DataFrame of the corpus's original tokenization,
     one row per token.
     Must contain a column with character-based spans of
     the tokens.
    :param spans_df_token_col: the name of the column in ``spans_df``
     containing its tokenization. By default, ``'span'``.
    :param corpus_df_token_col: the name of the column in ``corpus_toks_df``
     that contains its tokenization. By default, ``'span'``.
    :param entity_type_col: the name of the column in ``spans_df`` that
     contains the entity type of each element.

    :returns: A new DataFrame with schema ``["span", "ent_type"]``,
     where the "span" column contains token-based spans based off
     the *corpus* tokenization in ``corpus_toks_df["span"]``.
    """
    if len(spans_df.index) == 0:
        return spans_df.copy()

    overlaps_df = spanner.overlap_join(
        spans_df[spans_df_token_col],
        corpus_toks_df[corpus_df_token_col],
        "span",
        "corpus_token",
    ).merge(spans_df, left_on="span", right_on=spans_df_token_col)
    agg_df = (overlaps_df.groupby("span").aggregate({
        "corpus_token": "sum",
        entity_type_col: "first"
    }).reset_index())
    cons_df = spanner.consolidate(agg_df, "corpus_token")[[
        "corpus_token", entity_type_col
    ]].rename(columns={"corpus_token": "span"})
    cons_df["span"] = TokenSpanArray.align_to_tokens(
        corpus_toks_df[corpus_df_token_col], cons_df["span"])
    return cons_df
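The same kind of call as in Example #1, but exercising the non-default column names; the renamed columns below are hypothetical and exist only to show the extra parameters:

# Reusing the toy DataFrames from the sketch under Example #1, with the
# columns renamed so that the extra parameters are actually needed.
entities_df = spans_df.rename(columns={"span": "bert_span", "ent_type": "label"})
corpus_df = corpus_toks_df.rename(columns={"span": "tok_span"})

aligned_df = align_bert_tokens_to_corpus_tokens(
    entities_df, corpus_df,
    spans_df_token_col="bert_span",
    corpus_df_token_col="tok_span",
    entity_type_col="label",
)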
Example #4
def conll_to_bert(
    df: pd.DataFrame,
    tokenizer: Any,
    bert: Any,
    token_class_dtype: pd.CategoricalDtype,
    compute_embeddings: bool = True,
    overlap: int = 32,
    non_overlap: int = 64,
) -> pd.DataFrame:
    """
    :param df: One DataFrame from the :func:`conll_2003_to_dataframes` function,
     representing the tokens of a single document in the original tokenization.
    :param tokenizer: BERT tokenizer instance from the `transformers` library
    :param bert: PyTorch-based BERT model from the `transformers` library
    :param token_class_dtype: Pandas categorical type for representing
     token class labels, as returned by :func:`make_iob_tag_categories`
    :param compute_embeddings: True to generate BERT embeddings at each token
     position and add a column "embedding" to the returned DataFrame with
     the embeddings
    :param overlap: (optional) how much overlap there should be between adjacent
     windows for embeddings
    :param non_overlap: (optional) how much non-overlapping content there should
     be in the middle of each window, between the overlapping regions

    :returns: A version of the same DataFrame, but with BERT tokens, BERT
     embeddings for each token (if ``compute_embeddings`` is ``True``),
     and token class labels.
    """
    spans_df = conll.iob_to_spans(df)
    bert_toks_df = make_bert_tokens(df["span"].values[0].target_text,
                                    tokenizer)
    bert_token_spans = TokenSpanArray.align_to_tokens(bert_toks_df["span"],
                                                      spans_df["span"])
    bert_toks_df[["ent_iob",
                  "ent_type"]] = conll.spans_to_iob(bert_token_spans,
                                                    spans_df["ent_type"])
    bert_toks_df = conll.add_token_classes(bert_toks_df, token_class_dtype)
    if compute_embeddings:
        bert_toks_df = add_embeddings(bert_toks_df, bert, overlap, non_overlap)
    return bert_toks_df
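A driving sketch for conll_to_bert, assuming a Hugging Face `transformers` checkpoint; `token_class_dtype` and `corpus_dfs` stand in for the outputs of make_iob_tag_categories and conll_2003_to_dataframes mentioned in the docstring:

from transformers import BertModel, BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
bert = BertModel.from_pretrained("bert-base-uncased")

# token_class_dtype: categorical dtype from make_iob_tag_categories(...)
# corpus_dfs: list of per-document DataFrames from conll_2003_to_dataframes(...)
bert_docs = [
    conll_to_bert(doc_df, tokenizer, bert, token_class_dtype,
                  compute_embeddings=True, overlap=32, non_overlap=64)
    for doc_df in corpus_dfs
]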
Example #5
def _make_syntax_dataframes(syntax_response, original_text):
    tokens = syntax_response.get("tokens", [])
    sentence = syntax_response.get("sentences", [])

    if len(tokens) > 0:
        token_table = util.make_table(tokens)
        location_col, location_name = util.find_column(token_table, "location")
        text_col, text_name = util.find_column(token_table, "text")
        char_span = util.make_char_span(location_col, text_col, original_text)

        # Drop the location and text columns, which are duplicated in char_span
        token_table = token_table.drop([location_name, text_name])

        # Add the span columns to the DataFrames
        token_df = token_table.to_pandas()
        token_df['span'] = char_span
    else:
        char_span = None
        token_df = pd.DataFrame()

    if len(sentence) > 0:
        sentence_table = util.make_table(sentence)
        sentence_df = sentence_table.to_pandas()
        if char_span is not None:
            location_col, _ = util.find_column(sentence_table, "location")
            text_col, _ = util.find_column(sentence_table, "text")
            sentence_char_span = util.make_char_span(location_col, text_col,
                                                     original_text)
            sentence_span = TokenSpanArray.align_to_tokens(
                char_span, sentence_char_span)
            sentence_df['span'] = sentence_char_span
            sentence_df['sentence_span'] = sentence_span
    else:
        sentence_df = pd.DataFrame()

    return token_df, sentence_df
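For reference, a sketch of the kind of payload this private helper consumes: a Watson Natural Language Understanding syntax result whose tokens and sentences carry "text" and "location" (begin/end offset) fields. The literal values below are made up:

original_text = "Hello world. Goodbye."
syntax_response = {
    "tokens": [
        {"text": "Hello",   "location": [0, 5]},
        {"text": "world",   "location": [6, 11]},
        {"text": ".",       "location": [11, 12]},
        {"text": "Goodbye", "location": [13, 20]},
        {"text": ".",       "location": [20, 21]},
    ],
    "sentences": [
        {"text": "Hello world.", "location": [0, 12]},
        {"text": "Goodbye.",     "location": [13, 21]},
    ],
}

token_df, sentence_df = _make_syntax_dataframes(syntax_response, original_text)
# token_df has one row per token with a "span" column; sentence_df adds
# "span" (character spans) and "sentence_span" (token-aligned spans).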