示例#1
0
def arrow_to_token_span(extension_array: pa.ExtensionArray) -> TokenSpanArray:
    """
    Build a TokenSpanArray from an Arrow extension array of type
    ArrowTokenSpanType.

    A `pyarrow.ChunkedArray` is also accepted as long as it does not hold
    more than one chunk; its first chunk is then used.

    :param extension_array: pyarrow.ExtensionArray with type ArrowTokenSpanType
    :return: TokenSpanArray
    :raises ValueError: if a ChunkedArray with more than one chunk is passed
    """
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError("Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    storage = extension_array.storage
    assert pa.types.is_struct(storage.type)

    # The target text travels in the metadata of the token "begins" field;
    # it may come back as bytes, in which case it is decoded to str.
    begins_field = storage.type[ArrowTokenSpanType.BEGINS_NAME]
    target_text = begins_field.metadata[ArrowCharSpanType.TARGET_TEXT_KEY]
    if isinstance(target_text, bytes):
        target_text = target_text.decode()

    # Token-level begin/end offsets
    token_begins_array = storage.field(ArrowTokenSpanType.BEGINS_NAME)
    token_ends_array = storage.field(ArrowTokenSpanType.ENDS_NAME)

    # Character-level begin/end offsets. When the char spans were split
    # across several numbered fields, stitch the pieces back together.
    split_count = extension_array.type.num_char_span_splits
    if split_count > 0:
        char_begins_array = pa.concat_arrays([
            storage.field(ArrowCharSpanType.BEGINS_NAME + "_{}".format(k))
            for k in range(split_count)
        ])
        char_ends_array = pa.concat_arrays([
            storage.field(ArrowCharSpanType.ENDS_NAME + "_{}".format(k))
            for k in range(split_count)
        ])
    else:
        char_begins_array = storage.field(ArrowCharSpanType.BEGINS_NAME)
        char_ends_array = storage.field(ArrowCharSpanType.ENDS_NAME)

    # Nulls are treated as trailing padding and sliced off the end.
    if char_begins_array.null_count > 0:
        char_begins_array = char_begins_array[:-char_begins_array.null_count]
        char_ends_array = char_ends_array[:-char_ends_array.null_count]

    # Wrap the numpy views of the Arrow data: first the CharSpanArray over
    # the target text, then the TokenSpanArray on top of it.
    char_span = CharSpanArray(
        target_text,
        char_begins_array.to_numpy(),
        char_ends_array.to_numpy(),
    )
    return TokenSpanArray(
        char_span,
        token_begins_array.to_numpy(),
        token_ends_array.to_numpy(),
    )
示例#2
0
def _make_sentences_series(spacy_doc, tokens: CharSpanArray):
    """
    Subroutine of `make_tokens_and_features()`

    :param spacy_doc: parsed document (`spacy.tokens.doc.Doc`) from a spaCy language
     model

    :param tokens: Token information for the current document as a
    `CharSpanArray` object. Must contain the same tokens as `spacy_doc`.

    :return: a Pandas Series with one entry per token, containing the token
    span of the (single) sentence that the token is in. Tokens that are not
    covered by any sentence keep the placeholder offsets `[-1, -1)`.
    """
    num_toks = len(spacy_doc)
    # Generate the [begin, end) intervals that make up a series of spans.
    # Use the builtin `int` as dtype: the `np.int` alias it replaces was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24.
    begin_tokens = np.full(shape=num_toks, fill_value=-1, dtype=int)
    end_tokens = np.full(shape=num_toks, fill_value=-1, dtype=int)
    for sent in spacy_doc.sents:
        # Every token of the sentence points at the whole sentence span.
        begin_tokens[sent.start:sent.end] = sent.start
        end_tokens[sent.start:sent.end] = sent.end
    return pd.Series(TokenSpanArray(tokens, begin_tokens, end_tokens))
def spans_to_iob(
    token_spans: Union[TokenSpanArray, List[TokenSpan], pd.Series]
) -> pd.Series:
    """
    Convert a series of `TokenSpan`s of entities to token tags in
    Inside–Outside–Beginning (IOB2) format. See
    https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)
    for more information on IOB2 format.

    :param token_spans: An object that can be converted to a `TokenSpanArray` via
        `TokenSpanArray.make_array()`. Should contain `TokenSpan`s aligned with the
        target tokenization.
        Usually you create this array by calling `TokenSpanArray.align_to_tokens()`.
    :return: A `pd.Series` of IOB2 tags as strings and a series name of "ent_iob".
        The series is categorical with categories "O", "B", "I". If `token_spans`
        is empty, an empty series with the same dtype and name is returned.
    """
    token_spans = TokenSpanArray.make_array(token_spans)

    # Define the IOB categorical type with "O" == 0, "B"==1, "I"==2
    iob2_dtype = pd.CategoricalDtype(["O", "B", "I"], ordered=False)

    # Handle an empty token span array. The name is set here as well so that
    # the empty result honors the same contract as the non-empty one.
    # NOTE(review): this returns zero rows even if the underlying tokenization
    # has tokens -- confirm that callers do not expect an all-"O" series.
    if len(token_spans) == 0:
        return pd.Series(dtype=iob2_dtype, name="ent_iob")

    # Initialize an IOB series with all 'O' entities (code 0), one per token
    iob_data = np.zeros_like(token_spans.tokens.begin, dtype=np.int64)
    iob_tags = pd.Categorical.from_codes(codes=iob_data, dtype=iob2_dtype)

    # Assign the begin entities
    iob_tags[token_spans.begin_token] = "B"

    # Fill in the remaining inside entities. Only spans longer than one
    # token have an "inside" part, which covers [begin + 1, end).
    i_lengths = token_spans.end_token - (token_spans.begin_token + 1)
    i_mask = i_lengths > 0
    i_begins = token_spans.begin_token[i_mask] + 1
    i_ends = token_spans.end_token[i_mask]
    for begin, end in zip(i_begins, i_ends):
        iob_tags[begin:end] = "I"

    return pd.Series(iob_tags, name="ent_iob")
def iob_to_spans(
    token_features: pd.DataFrame,
    iob_col_name: str = "ent_iob",
    char_span_col_name: str = "char_span",
    entity_type_col_name: str = "ent_type",
):
    """
    Convert token tags in Inside–Outside–Beginning (IOB2) format to a series of
    `TokenSpan`s of entities. See https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)
    for more information on IOB2 format.

    :param token_features: DataFrame of token features in the format returned by
     `make_tokens_and_features`.
    :param iob_col_name: Name of a column in `token_features` that contains the
     IOB2 tags as strings, "I", "O", or "B".
    :param char_span_col_name: Name of a column in `token_features` that
     contains the tokens as a `CharSpanArray`.
    :param entity_type_col_name: Optional name of a column in `token_features`
     that contains entity type information; or `None` if no such column exists.
    :return: A `pd.DataFrame` with the following columns:
    * `token_span`: Span (with token offsets) of each entity
    * `<value of entity_type_col_name>`: (optional) Entity type
    """
    # Start out with 1-token prefixes of all entities.
    # NOTE(review): the index values of the "B" rows are used as positional
    # token offsets below, which assumes `token_features` has a default
    # 0..n-1 index -- confirm against callers.
    begin_mask = token_features[iob_col_name] == "B"
    first_tokens = token_features[begin_mask].index
    if entity_type_col_name is None:
        # Placeholder values; this column is dropped from the result below.
        entity_types = np.zeros(len(first_tokens))
    else:
        entity_types = token_features[begin_mask][entity_type_col_name]

    # Add an extra "O" tag to the end of the IOB column to simplify the logic
    # for handling the case where the document ends with an entity.
    # `pd.concat` is used because `Series.append` was removed in pandas 2.0;
    # `ignore_index=True` renumbers the result 0..n like the old
    # `.append(...).reset_index(drop=True)` chain did.
    iob_series = pd.concat(
        [token_features[iob_col_name], pd.Series(["O"])], ignore_index=True
    )

    entity_prefixes = pd.DataFrame(
        {
            "ent_type": entity_types,
            "begin": first_tokens,  # Inclusive
            "end": first_tokens + 1,  # Exclusive
            "next_tag": iob_series.iloc[first_tokens + 1].values,
        }
    )

    df_list = []  # Type: pd.DataFrame

    if len(entity_prefixes.index) == 0:
        # Code below needs at least one element in the list for schema
        df_list = [entity_prefixes]

    # Iteratively expand the prefixes: each pass retires the entities whose
    # next tag is not "I" and grows every remaining entity by one token.
    # The trailing "O" sentinel guarantees that every entity is retired.
    while len(entity_prefixes.index) > 0:
        complete_mask = entity_prefixes["next_tag"].isin(["O", "B"])
        complete_entities = entity_prefixes[complete_mask]
        incomplete_entities = entity_prefixes[~complete_mask].copy()
        incomplete_entities["end"] = incomplete_entities["end"] + 1
        incomplete_entities["next_tag"] = iob_series.iloc[
            incomplete_entities["end"]
        ].values
        df_list.append(complete_entities)
        entity_prefixes = incomplete_entities
    all_entities = pd.concat(df_list)

    # Sort spans by location, not length.
    all_entities.sort_values("begin", inplace=True)

    # Convert [begin, end) pairs to spans
    entity_spans_array = TokenSpanArray(
        token_features[char_span_col_name].values,
        all_entities["begin"].values,
        all_entities["end"].values,
    )
    if entity_type_col_name is None:
        return pd.DataFrame({"token_span": entity_spans_array})
    else:
        return pd.DataFrame(
            {
                "token_span": entity_spans_array,
                entity_type_col_name: all_entities["ent_type"].values,
            }
        )
def _doc_to_df(doc: List[Dict[str, List[str]]],
               space_before_punct: bool) -> pd.DataFrame:
    """
    Convert the "Python objects" representation of a document from a
    CoNLL-2003 file into a `pd.DataFrame` of token metadata.

    :param doc: Tree of Python objects that represents the document,
     List with one dictionary per sentence. Each dictionary is read under
     the keys "token", "iob" and "entity".
    :param space_before_punct: If `True`, add whitespace before
     punctuation characters when reconstructing the text of the document.
    :return: DataFrame with five columns:
    * `char_span`: Span of each token, with character offsets.
      Backed by the concatenation of the tokens in the document into
      a single string with one sentence per line.
    * `token_span`: Span of each token, with token offsets.
      Backed by the contents of the `char_span` column.
    * `ent_iob`: IOB2-format tags of tokens, exactly as they appeared
      in the original file, with no corrections applied.
    * `ent_type`: Entity type names for tokens tagged "I" or "B" in
      the `ent_iob` column; `None` everywhere else.
    * `sentence`: Token-offset span of the sentence containing each token.
    """
    begins_list = []  # Type: List[np.ndarray]
    ends_list = []  # Type: List[np.ndarray]
    sentences_list = []  # Type: List[str]
    iobs_list = []  # Type: List[np.ndarray]
    entities_list = []  # Type: List[np.ndarray]
    sentence_begins_list = []  # Type: List[np.ndarray]
    sentence_ends_list = []  # Type: List[np.ndarray]

    char_position = 0
    token_position = 0
    for sentence in doc:
        tokens = sentence["token"]

        # Don't put spaces before punctuation in the reconstituted string.
        # The builtin `bool` is used as dtype because the `np.bool` alias
        # was deprecated in NumPy 1.20 and removed in NumPy 1.24.
        no_space_mask = (
            np.zeros(len(tokens), dtype=bool) if space_before_punct
            else _PUNCT_MATCH_FN(tokens))
        no_space_mask[0] = True  # No space before first token
        prefixes = np.where(no_space_mask, "", " ")
        # Column-major ravel interleaves the two rows:
        # prefix0, token0, prefix1, token1, ...
        string_parts = np.ravel((prefixes, tokens), order='F')
        sentence_text = "".join(string_parts)
        sentences_list.append(sentence_text)

        lengths = np.array([len(t) for t in tokens])
        prefix_lengths = np.array([len(p) for p in prefixes])

        # Begin and end offsets, accounting for which tokens have spaces
        # before them.
        e = np.cumsum(lengths + prefix_lengths)
        b = e - lengths
        iobs = np.array(sentence["iob"])
        entities = np.array(sentence["entity"])
        # Every token of the sentence carries the same sentence span.
        sentence_begin_token = token_position
        sentence_end_token = token_position + len(e)
        sentence_begins = np.repeat(sentence_begin_token, len(e))
        sentence_ends = np.repeat(sentence_end_token, len(e))

        # Offsets are relative to the sentence; shift them into document
        # coordinates.
        begins_list.append(b + char_position)
        ends_list.append(e + char_position)
        iobs_list.append(iobs)
        entities_list.append(entities)
        sentence_begins_list.append(sentence_begins)
        sentence_ends_list.append(sentence_ends)

        char_position += e[-1] + 1  # "+ 1" to account for newline
        token_position += len(e)

    begins = np.concatenate(begins_list)
    ends = np.concatenate(ends_list)
    doc_text = "\n".join(sentences_list)
    char_spans = CharSpanArray(doc_text, begins, ends)
    token_begins = np.arange(len(begins))
    token_spans = TokenSpanArray(char_spans, token_begins, token_begins + 1)
    sentence_spans = TokenSpanArray(char_spans,
                                    np.concatenate(sentence_begins_list),
                                    np.concatenate(sentence_ends_list))
    return pd.DataFrame(
        {"char_span": char_spans,
         "token_span": token_spans,
         "ent_iob": np.concatenate(iobs_list),
         "ent_type": np.concatenate(entities_list),
         "sentence": sentence_spans})
def spans_to_iob(
    token_spans: Union[TokenSpanArray, List[TokenSpan], pd.Series],
    span_ent_types: Union[str, Iterable, np.ndarray, pd.Series] = None
) -> pd.DataFrame:
    """
    Turn entity `TokenSpan`s into per-token tags in
    Inside–Outside–Beginning (IOB2) format, plus per-token entity types. See
    https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)
    for background on IOB2.

    :param token_spans: Anything `TokenSpanArray.make_array()` can convert
        to a `TokenSpanArray`. The spans should be aligned with the target
        tokenization, e.g. via `TokenSpanArray.align_to_tokens()`.
    :param span_ent_types: Entity type for each element of `token_spans`.
        A single string is applied to every span; `None` means no entity
        type for any span.
    :return: A `pd.DataFrame` with two columns:
      * "ent_iob": categorical IOB2 tags ("O", "B", "I")
      * "ent_type": entity type strings (null where no type applies or
        `span_ent_types` is `None`)
      If `token_spans` is empty, the dataframe has no rows.
    """
    spans = TokenSpanArray.make_array(token_spans)
    num_spans = len(spans)

    # Bring the entity types into a per-span sequence.
    if isinstance(span_ent_types, pd.Series):
        types_per_span = span_ent_types.values
    elif span_ent_types is None or isinstance(span_ent_types, str):
        # One value (a type name or None) broadcast over all spans
        types_per_span = [span_ent_types] * num_spans
    else:
        types_per_span = span_ent_types

    # Categorical codes: "O" == 0, "B" == 1, "I" == 2
    tag_dtype = pd.CategoricalDtype(["O", "B", "I"], ordered=False)

    if num_spans == 0:
        # No spans: return an empty frame with the right schema.
        return pd.DataFrame({
            "ent_iob": pd.Series(dtype=tag_dtype),
            "ent_type": pd.Series(dtype="string")
        })

    span_begins = spans.begin_token
    span_ends = spans.end_token

    # Every token starts out tagged "O" (code 0).
    all_outside = np.zeros_like(spans.tokens.begin, dtype=np.int64)
    iob_tags = pd.Categorical.from_codes(codes=all_outside, dtype=tag_dtype)

    # First token of every span is "B" ...
    iob_tags[span_begins] = "B"

    # ... and the rest of each multi-token span, [begin + 1, end), is "I".
    for first, last in zip(span_begins, span_ends):
        if last - (first + 1) > 0:
            iob_tags[first + 1:last] = "I"

    # Entity types: stamp each span's type over all of its tokens.
    type_tags = np.full(len(spans.tokens), None, dtype=object)
    for type_name, first, last in zip(types_per_span, span_begins, span_ends):
        type_tags[first:last] = type_name

    return pd.DataFrame({
        "ent_iob": iob_tags,
        "ent_type": pd.Series(type_tags, dtype="string")
    })
def _doc_to_df(doc: List[_SentenceData], column_names: List[str],
               iob_columns: List[bool],
               space_before_punct: bool) -> pd.DataFrame:
    """
    Convert the "Python objects" representation of a document from a
    CoNLL-2003 file into a `pd.DataFrame` of token metadata.

    :param doc: List of Python objects that represents the document,
     one object per sentence.
    :param column_names: Names for the metadata columns that come after the
     token text. These names will be used to generate the names of the dataframe
     that this function returns.
    :param iob_columns: Mask indicating which of the metadata columns after the
     token text should be treated as being in IOB format. If a column is in IOB format,
     the returned dataframe will contain *two* columns, holding IOB2 tags and
     entity type tags, respectively. For example, an input column "ent" will turn into
     output columns "ent_iob" and "ent_type".

    :param space_before_punct: If `True`, add whitespace before
     punctuation characters (and after left parentheses)
     when reconstructing the text of the document.
    :return: DataFrame with the following columns:
    * `char_span`: Span of each token, with character offsets.
      Backed by the concatenation of the tokens in the document into
      a single string with one sentence per line.
    * `token_span`: Span of each token, with token offsets.
      Backed by the contents of the `char_span` column.
    * One column per key produced by `_make_empty_meta_values(column_names,
      iob_columns)`, holding the token metadata exactly as stored on the
      sentences, with no corrections applied (e.g. `ent_iob` and `ent_type`
      for an IOB-format input column "ent").
    * `sentence`: Token-offset span of the sentence containing each token.
    """

    # Character offsets of tokens in the reconstructed document
    begins_list = []  # Type: List[np.ndarray]
    ends_list = []  # Type: List[np.ndarray]

    # Reconstructed text of each sentence
    sentences_list = []  # Type: List[str]

    # Token offsets of sentences containing each token in the document.
    sentence_begins_list = []  # Type: List[np.ndarray]
    sentence_ends_list = []  # Type: List[np.ndarray]

    # Token metadata column values. Key is column name, value is metadata for
    # each token.
    meta_lists = _make_empty_meta_values(column_names, iob_columns)

    char_position = 0
    token_position = 0
    for sentence in doc:
        tokens = sentence.tokens

        # Don't put spaces before punctuation in the reconstituted string.
        # The builtin `bool` is used as dtype because the `np.bool` alias
        # was deprecated in NumPy 1.20 and removed in NumPy 1.24.
        no_space_before_mask = (np.zeros(len(tokens), dtype=bool)
                                if space_before_punct else
                                _SPACE_BEFORE_MATCH_FN(tokens))
        no_space_after_mask = (np.zeros(len(tokens), dtype=bool)
                               if space_before_punct else
                               _SPACE_AFTER_MATCH_FN(tokens))
        no_space_before_mask[0] = True  # No space before first token
        no_space_after_mask[-1] = True  # No space after last token
        # Shift by one so that "no space after token i" becomes
        # "no space before token i + 1".
        shifted_no_space_after_mask = np.roll(no_space_after_mask, 1)
        prefixes = np.where(
            np.logical_or(no_space_before_mask, shifted_no_space_after_mask),
            "", " ")
        # Column-major ravel interleaves the two rows:
        # prefix0, token0, prefix1, token1, ...
        string_parts = np.ravel((prefixes, tokens), order="F")
        sentence_text = "".join(string_parts)
        sentences_list.append(sentence_text)

        lengths = np.array([len(t) for t in tokens])
        prefix_lengths = np.array([len(p) for p in prefixes])

        # Begin and end offsets, accounting for which tokens have spaces
        # before them. Offsets are relative to the sentence, so shift them
        # into document coordinates before storing.
        e = np.cumsum(lengths + prefix_lengths)
        b = e - lengths
        begins_list.append(b + char_position)
        ends_list.append(e + char_position)

        # Every token of the sentence carries the same sentence span.
        sentence_begin_token = token_position
        sentence_end_token = token_position + len(e)
        sentence_begins = np.repeat(sentence_begin_token, len(e))
        sentence_ends = np.repeat(sentence_end_token, len(e))
        sentence_begins_list.append(sentence_begins)
        sentence_ends_list.append(sentence_ends)

        for k, values in sentence.token_metadata.items():
            meta_lists[k].extend(values)

        char_position += e[-1] + 1  # "+ 1" to account for newline
        token_position += len(e)

    begins = np.concatenate(begins_list)
    ends = np.concatenate(ends_list)
    doc_text = "\n".join(sentences_list)
    char_spans = CharSpanArray(doc_text, begins, ends)
    token_begins = np.arange(len(begins))
    token_spans = TokenSpanArray(char_spans, token_begins, token_begins + 1)
    sentence_spans = TokenSpanArray(char_spans,
                                    np.concatenate(sentence_begins_list),
                                    np.concatenate(sentence_ends_list))

    ret = pd.DataFrame({"char_span": char_spans, "token_span": token_spans})
    for k, v in meta_lists.items():
        ret[k] = v
    ret["sentence"] = sentence_spans
    return ret
示例#8
0
def make_tokens_and_features(
    target_text: str,
    language_model,
    add_left_and_right=False,
) -> pd.DataFrame:
    """
    Run a spaCy language model over a text and tabulate the resulting tokens.

    :param target_text: Text to analyze

    :param language_model: Preconfigured spaCy language model (`spacy.language.Language`)
     object

    :param add_left_and_right: If `True`, add columns "left" and "right"
    containing references to previous and next tokens.

    :return: A single `pd.DataFrame` with one row per token, holding the
    token's spans ("char_span", "token_span"), the linguistic features that
    the language model generates ("lemma", "pos", "tag", "dep", "head",
    "shape", "ent_iob", "ent_type", "is_alpha", "is_stop") and the span of
    the enclosing sentence ("sentence").
    """
    doc = language_model(target_text)
    num_tokens_range = range(len(doc))

    # TODO: Performance tuning of the translation code that follows
    # Character spans of the tokens
    begins = np.array([tok.idx for tok in doc])
    ends = np.array([tok.idx + len(tok) for tok in doc])
    char_spans = CharSpanArray(target_text, begins, ends)
    char_span_series = pd.Series(char_spans)

    # Single-token token-based spans, as building blocks for larger
    # token-based spans.
    single_token_spans = TokenSpanArray.from_char_offsets(
        char_span_series.values)

    # spaCy identifies a token by the character offset of its first
    # character (`idx`). Map those offsets to dense 0..n-1 IDs that line up
    # with the index of the returned DataFrame.
    offset_to_row = {doc[row].idx: row for row in num_tokens_range}
    head_rows = np.array([offset_to_row[tok.head.idx] for tok in doc])

    # IOB categorical type with "O" == 0, "B" == 1, "I" == 2
    iob_categories = pd.CategoricalDtype(["O", "B", "I"], ordered=False)
    iob_tags = pd.Categorical([str(tok.ent_iob_) for tok in doc],
                              dtype=iob_categories)

    columns = {
        "id": range(len(begins)),
        "char_span": char_span_series,
        "token_span": single_token_spans,
        "lemma": [tok.lemma_ for tok in doc],
        "pos": pd.Categorical([str(tok.pos_) for tok in doc]),
        "tag": pd.Categorical([str(tok.tag_) for tok in doc]),
        "dep": pd.Categorical([str(tok.dep_) for tok in doc]),
        "head": head_rows,
        "shape": pd.Categorical([tok.shape_ for tok in doc]),
        "ent_iob": iob_tags,
        "ent_type": pd.Categorical([str(tok.ent_type_) for tok in doc]),
        "is_alpha": np.array([tok.is_alpha for tok in doc]),
        "is_stop": np.array([tok.is_stop for tok in doc]),
        "sentence": _make_sentences_series(doc, char_spans),
    }

    if add_left_and_right:
        # Nullable integers: the first token has no left neighbor and the
        # last token has no right neighbor.
        nullable_int = pd.Int32Dtype()
        columns["left"] = pd.array(
            [None] + list(range(len(begins) - 1)), dtype=nullable_int)
        columns["right"] = pd.array(
            list(range(1, len(begins))) + [None], dtype=nullable_int)

    return pd.DataFrame(columns)
def make_bert_tokens(target_text: str, tokenizer) -> pd.DataFrame:
    """
    Tokenize the indicated text for BERT embeddings and return a DataFrame
    with one row per token.

    :param target_text: string to tokenize
    :param tokenizer: A tokenizer that is a subclass of huggingface transformers
                      PreTrainedTokenizerFast which supports `encode_plus` with
                      return_offsets_mapping=True.

    :returns: A `pd.DataFrame` with the following columns:
     * "id": unique integer ID for each token
     * "char_span": span of the token with character offsets
     * "token_span": span of the token with token offsets
     * "input_id": integer ID suitable for input to a BERT embedding model
     * "token_type_id": list of token type ids to be fed to a model
     * "attention_mask": list of indices specifying which tokens should be
                         attended to by the model
     * "special_tokens_mask": `True` if the token is a zero-length special token
       such as "start of document"
    :raises TypeError: if `tokenizer` is not a `PreTrainedTokenizerFast`
    """
    # Prefer the public top-level export; fall back to the module path that
    # this code originally imported from, for older transformers releases.
    try:
        from transformers import PreTrainedTokenizerFast
    except ImportError:
        from transformers.tokenization_utils import PreTrainedTokenizerFast

    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise TypeError("Tokenizer must be an instance of "
                        "transformers.PreTrainedTokenizerFast that supports "
                        "encode_plus with return_offsets_mapping=True.")
    tokenized_result = tokenizer.encode_plus(target_text,
                                             return_special_tokens_mask=True,
                                             return_offsets_mapping=True)

    # Get offset mapping from tokenizer
    offsets = tokenized_result["offset_mapping"]

    # Init any special tokens at beginning: entries reported as `None` get a
    # zero-length span at offset 0. The bounds check keeps the scan from
    # running off the end if every entry is `None`.
    i = 0
    while i < len(offsets) and offsets[i] is None:
        offsets[i] = (0, 0)
        i += 1

    # Make a DataFrame to unzip (begin, end) offsets
    offset_df = pd.DataFrame(offsets, columns=["begin", "end"])

    # Convert special tokens mask to boolean
    special_tokens_mask = pd.Series(
        tokenized_result["special_tokens_mask"]).astype("bool")

    # Fill remaining special tokens to zero-length spans: a missing end is
    # forward-filled from the previous token, and the begin of every special
    # token is set to its end. `Series.ffill()` replaces
    # `fillna(method="ffill")`, which is deprecated in recent pandas.
    ends = offset_df["end"].ffill().astype("int32")
    begins = offset_df["begin"].mask(special_tokens_mask,
                                     other=ends).astype("int32")

    # Create char and token span arrays; each token span covers exactly
    # one token, [k, k + 1).
    char_spans = CharSpanArray(target_text, begins, ends)
    token_spans = TokenSpanArray(char_spans, np.arange(len(char_spans)),
                                 np.arange(1,
                                           len(char_spans) + 1))

    token_features = pd.DataFrame({
        "id":
        special_tokens_mask.index,
        # Use values instead of series because different indexes
        "char_span":
        pd.Series(char_spans).values,
        "token_span":
        pd.Series(token_spans).values,
        "input_id":
        tokenized_result["input_ids"],
        "token_type_id":
        tokenized_result["token_type_ids"],
        "attention_mask":
        tokenized_result["attention_mask"],
        "special_tokens_mask":
        special_tokens_mask,
    })

    return token_features