# Example #1
def extract_split(
        doc_text: str, split_points: Union[Sequence[int], np.ndarray,
                                           SpanArray]) -> SpanArray:
    """
    Split a document into spans along a specified set of split points.

    :param doc_text: Text of the document; will be the target text of the returned spans.

    :param split_points: A series of offsets into ``doc_text``, expressed as either:
      * A sequence of integers (split at certain locations and return a set of splits that
        covers every character in the document) as a list or 1-d Numpy array
      * A sequence of spans (split around the indicated locations, but discard the parts
        of the document that are within a split point)

    :returns: A ``SpanArray`` that splits the document in the specified way.

    :raises TypeError: If ``split_points`` is neither a sequence, a Numpy array,
      nor a ``SpanArray``.
    """
    if isinstance(split_points, (collections.abc.Sequence, np.ndarray)):
        # Single-integer split points ==> zero-length spans
        split_points = SpanArray(doc_text, split_points, split_points)
    elif not isinstance(split_points, SpanArray):
        raise TypeError(
            f"Split points are of type {type(split_points)}. Expected a "
            f"sequence of integers or a SpanArray.")

    # Make sure split points are in order
    sorted_indices = split_points.argsort()
    sorted_split_points = split_points[sorted_indices]

    # Break out the split points.
    split_begins = sorted_split_points.begin.tolist()  # type: List[int]
    split_ends = sorted_split_points.end.tolist()  # type: List[int]

    # Tack on an additional split point at the very end to simplify the logic below.
    doc_len = len(doc_text)
    split_begins.append(doc_len)
    split_ends.append(doc_len)

    # Walk through the document, generating the begin and end offsets of spans
    begins = []
    ends = []
    begin = 0
    for split_begin, split_end in zip(split_begins, split_ends):
        end = split_begin
        if end > begin:  # Ignore zero-length and negative-length chunks
            begins.append(begin)
            ends.append(end)
        # Never move backwards: when split spans overlap or nest, the text
        # covered by an earlier, longer split point must stay discarded.
        begin = max(begin, split_end)

    return SpanArray(doc_text, begins, ends)
def arrow_to_span(extension_array: pa.ExtensionArray) -> SpanArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowSpanType to
    a SpanArray.

    :param extension_array: pyarrow.ExtensionArray with type ArrowSpanType
    :return: SpanArray
    :raises ValueError: If a ``pa.ChunkedArray`` with more than one chunk is passed.
    """
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError(
                "Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    assert pa.types.is_struct(extension_array.storage.type)

    # Get target text from the begins field metadata and decode string
    metadata = extension_array.storage.type[ArrowSpanType.BEGINS_NAME].metadata
    target_text = metadata[ArrowSpanType.TARGET_TEXT_KEY]
    if isinstance(target_text, bytes):
        target_text = target_text.decode()

    # Get the begins/ends pyarrow arrays
    begins_array = extension_array.storage.field(ArrowSpanType.BEGINS_NAME)
    ends_array = extension_array.storage.field(ArrowSpanType.ENDS_NAME)

    # Zero-copy convert arrays to numpy
    begins = begins_array.to_numpy()
    ends = ends_array.to_numpy()

    return SpanArray(target_text, begins, ends)
# Example #3
def make_tokens_and_features(
    target_text: str, language_model, add_left_and_right=False,
) -> pd.DataFrame:
    """
    Run a spaCy language model over a text and return its tokens, together
    with the per-token linguistic features the model generates, as a single
    DataFrame.

    :param target_text: Text to analyze
    :param language_model: Preconfigured spaCy language model (`spacy.language.Language`)
    :param add_left_and_right: If ``True``, add columns "left" and "right"
     containing references to previous and next tokens.

    :return: A `pd.DataFrame` with one row per token and the columns
     "id", "span", "lemma", "pos", "tag", "dep", "head", "shape", "ent_iob",
     "ent_type", "is_alpha", "is_stop" and "sentence", plus the nullable
     integer columns "left" and "right" when ``add_left_and_right`` is ``True``.
    """
    spacy_doc = language_model(target_text)

    # TODO: Performance tuning of the translation code that follows
    # Represent the character spans of the tokens.
    # The explicit integer dtype keeps an empty document from producing
    # NumPy's default float64 empty array.
    tok_begins = np.array([t.idx for t in spacy_doc], dtype=int)
    tok_ends = np.array([t.idx + len(t) for t in spacy_doc], dtype=int)
    tokens_array = SpanArray(target_text, tok_begins, tok_ends)
    tokens_series = pd.Series(tokens_array)
    # Also build single-token token-based spans to make it easier to build
    # larger token-based spans.
    token_spans = TokenSpanArray.from_char_offsets(tokens_series.array)
    # spaCy identifies tokens by semi-arbitrary integer "indexes" (in practice,
    # the offset of the first character in the token). Translate from these
    # to a dense range of integer IDs that will correspond to the index of our
    # returned DataFrame.
    idx_to_id = {tok.idx: i for i, tok in enumerate(spacy_doc)}
    # Define the IOB categorical type with "O" == 0, "B"==1, "I"==2
    iob2_dtype = pd.CategoricalDtype(["O", "B", "I"], ordered=False)
    num_tokens = len(tok_begins)
    df_cols = {
        "id": range(num_tokens),
        "span": tokens_series,
        "lemma": [t.lemma_ for t in spacy_doc],
        "pos": pd.Categorical([str(t.pos_) for t in spacy_doc]),
        "tag": pd.Categorical([str(t.tag_) for t in spacy_doc]),
        "dep": pd.Categorical([str(t.dep_) for t in spacy_doc]),
        "head": np.array([idx_to_id[t.head.idx] for t in spacy_doc]),
        "shape": pd.Categorical([t.shape_ for t in spacy_doc]),
        "ent_iob": pd.Categorical([str(t.ent_iob_) for t in spacy_doc],
                                  dtype=iob2_dtype),
        "ent_type": pd.Categorical([str(t.ent_type_) for t in spacy_doc]),
        "is_alpha": np.array([t.is_alpha for t in spacy_doc]),
        "is_stop": np.array([t.is_stop for t in spacy_doc]),
        "sentence": _make_sentences_series(spacy_doc, tokens_array),
    }
    if add_left_and_right:
        # Use nullable int type because these columns contain nulls.
        # Building one entry per token (instead of prepending/appending a
        # single None) keeps both columns empty for an empty document, so
        # their length always matches the other columns.
        df_cols["left"] = pd.array(
            [i - 1 if i > 0 else None for i in range(num_tokens)],
            dtype=pd.Int32Dtype(),
        )
        df_cols["right"] = pd.array(
            [i + 1 if i < num_tokens - 1 else None for i in range(num_tokens)],
            dtype=pd.Int32Dtype(),
        )
    return pd.DataFrame(df_cols)
# Example #4
def make_tokens(target_text: str, tokenizer) -> pd.Series:
    """
    Tokenize a text and return the character spans of its tokens.

    :param target_text: Text to tokenize
    :param tokenizer: Preconfigured `spacy.tokenizer.Tokenizer` object
    :return: The tokens (and underlying text) as a Pandas Series wrapped around
        a `SpanArray` value.
    """
    spacy_doc = tokenizer(target_text)
    # Explicit integer dtype: without it an empty document would yield
    # NumPy's default float64 empty arrays.
    tok_begins = np.array([t.idx for t in spacy_doc], dtype=int)
    tok_ends = np.array([t.idx + len(t) for t in spacy_doc], dtype=int)
    return pd.Series(SpanArray(target_text, tok_begins, tok_ends))
# Example #5
def _make_entity_mentions_dataframe(
        entities: List, original_text: str,
        apply_standard_schema: bool) -> pd.DataFrame:
    """
    Unroll the records of the "mentions" element of NLU entities into a flat
    DataFrame. Schema of this DataFrame is `_entity_mentions_schema`

    :param entities: The "entities" section of a parsed NLU response
    :param original_text: Text of the document.  This argument must be provided if there
     are entity mention spans.
    :param apply_standard_schema: Value of the eponymous argument from `parse_response`.
    :return: DataFrame with one row per entity mention, passed through
     `util.apply_schema` with `_entity_mentions_schema`.
    :raises ValueError: If there are mentions to unroll but ``original_text`` is ``None``.
    """
    if not entities or "mentions" not in entities[0]:
        # No mentions to unroll. Return an empty DataFrame.
        return util.apply_schema(
            pd.DataFrame(columns=[e[0] for e in _entity_mentions_schema]),
            _entity_mentions_schema, apply_standard_schema)
    if original_text is None:
        raise ValueError(
            "Unable to construct target text for converting entity mentions to spans"
        )
    # Explode out the nested relations containing entity location information.
    # If there was a version of DataFrame.explode() that could handle structs,
    # we would be able to vectorize this operation.
    # Instead we build up the values one row at a time.
    # Some columns come from "parent" entity records, and some columns come from the
    # "child" entity mention records.
    num_parent_cols = len(_entity_mentions_parent_elems)
    parent_cols = [[] for _ in range(num_parent_cols)]
    begins = []
    ends = []
    confidences = []
    for e in entities:
        for m in e["mentions"]:
            for i in range(num_parent_cols):
                parent_elem = e[_entity_mentions_parent_names[i]]
                parent_cols[i].append(parent_elem)
            # NOTE(review): assumes each mention carries a "location" entry of
            # the form [begin, end] in character offsets, as in the Watson NLU
            # response format -- confirm against the parsed response.
            begins.append(m["location"][0])
            ends.append(m["location"][1])
            confidences.append(
                m["confidence"])  # N.B. confidence of mention, not entity
    # Construct columns, then convert to a DataFrame
    df_cols = {
        _entity_mentions_parent_names[i]: parent_cols[i]
        for i in range(len(_entity_mentions_parent_names))
    }
    df_cols["span"] = SpanArray(original_text, begins, ends)
    df_cols["confidence"] = confidences
    return util.apply_schema(pd.DataFrame(df_cols), _entity_mentions_schema,
                             apply_standard_schema)
# Example #6
def make_tokens(
    target_text: str, tokenizer: "spacy.tokenizer.Tokenizer" = None
) -> pd.Series:
    """
    Tokenize a text and return the character spans of its tokens.

    :param target_text: Text to tokenize
    :param tokenizer: Preconfigured `spacy.tokenizer.Tokenizer` object, or None
     to use the tokenizer returned by :func:`simple_tokenizer()`
    :return: The tokens (and underlying text) as a Pandas Series wrapped around
        a `SpanArray` value.
    """
    if tokenizer is None:
        tokenizer = simple_tokenizer()
    spacy_doc = tokenizer(target_text)
    # Explicit integer dtype: without it an empty document would yield
    # NumPy's default float64 empty arrays.
    tok_begins = np.array([t.idx for t in spacy_doc], dtype=int)
    tok_ends = np.array([t.idx + len(t) for t in spacy_doc], dtype=int)
    return pd.Series(SpanArray(target_text, tok_begins, tok_ends))
# Example #7
def arrow_to_span(extension_array: pa.ExtensionArray) -> SpanArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowSpanType to
    a SpanArray.

    ..NOTE: Only supported with PyArrow >= 2.0.0

    :param extension_array: pyarrow.ExtensionArray with type ArrowSpanType
    :return: SpanArray
    :raises NotImplementedError: If the installed PyArrow version is older than 2.0.0.
    :raises ValueError: If a ``pa.ChunkedArray`` with more than one chunk is passed.
    """
    # NOTE(review): if LooseVersion comes from distutils.version, be aware that
    # distutils is deprecated and was removed from the standard library in
    # Python 3.12 -- confirm the import at the top of the file.
    if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
        raise NotImplementedError(
            "Arrow serialization for SpanArray is not supported with "
            "PyArrow versions < 2.0.0")
    # A ChunkedArray with more than one chunk is rejected; otherwise chunk 0
    # is unwrapped and processed as a plain array.
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError(
                "Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    # NOTE: workaround for bug in parquet reading
    # When the array arrives as a bare struct, it is re-wrapped as an
    # extension array of type ArrowSpanType.
    # NOTE(review): the two calls below are cut off in this excerpt -- the
    # argument of `extension_array.field(` and the second argument of
    # `from_storage(` are not visible.
    if pa.types.is_struct(extension_array.type):
        index_dtype = extension_array.field(ArrowSpanType.BEGINS_NAME).type
        target_text_dict_dtype = extension_array.field(
        extension_array = pa.ExtensionArray.from_storage(
            ArrowSpanType(index_dtype, target_text_dict_dtype),

    assert pa.types.is_struct(extension_array.storage.type)

    # Create target text StringTable and text_ids from dictionary array
    # NOTE(review): the field name passed to `storage.field(` is not visible
    # in this excerpt.
    target_text_dict_array = extension_array.storage.field(
    table_texts = target_text_dict_array.dictionary.to_pylist()
    string_table = StringTable.from_things(table_texts)
    text_ids = target_text_dict_array.indices.to_numpy()

    # Get the begins/ends pyarrow arrays
    begins_array = extension_array.storage.field(ArrowSpanType.BEGINS_NAME)
    ends_array = extension_array.storage.field(ArrowSpanType.ENDS_NAME)

    # Zero-copy convert arrays to numpy
    begins = begins_array.to_numpy()
    ends = ends_array.to_numpy()

    # The target text is handed to SpanArray as a (string table, text ids) pair.
    return SpanArray((string_table, text_ids), begins, ends)
# Example #8
def extract_regex(
    doc_text: str,
    compiled_regex: "re.Pattern"  # Double quotes for Python 3.6 compatibility
):
    """
    Identify all non-overlapping matches of a regular expression, as returned by
    ``re.Pattern.finditer()``, and return those locations as an array of spans.

    :param doc_text: Text of the document; will be the target text of the returned spans.

    :param compiled_regex: Regular expression to evaluate, compiled with either the ``re``
      or the ``regex`` package.

    :returns: A ``SpanArray`` containing a span for each match of the regex.
    """
    begins = []
    ends = []
    for match in compiled_regex.finditer(doc_text):
        # Record the character offsets covered by each match.
        begins.append(match.start())
        ends.append(match.end())

    return SpanArray(doc_text, begins, ends)
def arrow_to_token_span(extension_array: pa.ExtensionArray) -> TokenSpanArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowTokenSpanType to
    a TokenSpanArray.

    :param extension_array: pyarrow.ExtensionArray with type ArrowTokenSpanType
    :return: TokenSpanArray
    :raises ValueError: If a ``pa.ChunkedArray`` with more than one chunk is passed.
    """
    # A ChunkedArray with more than one chunk is rejected; otherwise chunk 0
    # is unwrapped and processed as a plain array.
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError(
                "Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    assert pa.types.is_struct(extension_array.storage.type)

    # Get target text from the begins field metadata and decode string
    # NOTE(review): the subscript on `storage.type[` is cut off in this
    # excerpt; the field name and the `.metadata` access are not visible.
    metadata = extension_array.storage.type[
    target_text = metadata[ArrowSpanType.TARGET_TEXT_KEY]
    if isinstance(target_text, bytes):
        target_text = target_text.decode()

    # Get the begins/ends pyarrow arrays
    # NOTE(review): the field names passed to both `storage.field(` calls are
    # not visible in this excerpt.
    token_begins_array = extension_array.storage.field(
    token_ends_array = extension_array.storage.field(

    # Check if CharSpans have been split
    # When the type reports a positive number of splits, the per-split
    # character begin/end arrays are gathered and concatenated back together.
    # NOTE(review): the loop body is cut off -- the statements that add each
    # split to `char_begins_splits` / `char_ends_splits`, and the suffix added
    # to BEGINS_NAME / ENDS_NAME, are not visible. The two assignments after
    # `pa.concat_arrays` look like they belong to a missing `else:` branch.
    num_char_span_splits = extension_array.type.num_char_span_splits
    if num_char_span_splits > 0:
        char_begins_splits = []
        char_ends_splits = []
        for i in range(num_char_span_splits):
                extension_array.storage.field(ArrowSpanType.BEGINS_NAME +
                extension_array.storage.field(ArrowSpanType.ENDS_NAME +
        char_begins_array = pa.concat_arrays(char_begins_splits)
        char_ends_array = pa.concat_arrays(char_ends_splits)
        char_begins_array = extension_array.storage.field(
        char_ends_array = extension_array.storage.field(

    # Remove any trailing padding
    # Each array is shortened by its own null count; the check is made on the
    # begins array only.
    if char_begins_array.null_count > 0:
        char_begins_array = char_begins_array[:-char_begins_array.null_count]
        char_ends_array = char_ends_array[:-char_ends_array.null_count]

    # Zero-copy convert arrays to numpy
    token_begins = token_begins_array.to_numpy()
    token_ends = token_ends_array.to_numpy()
    char_begins = char_begins_array.to_numpy()
    char_ends = char_ends_array.to_numpy()

    # Create the SpanArray, then the TokenSpanArray
    char_span = SpanArray(target_text, char_begins, char_ends)
    return TokenSpanArray(char_span, token_begins, token_ends)
# Example #10
            return Span.__lt__(self, other)

    def tokens(self):
        """Return the value stored in ``self._tokens``."""
        return self._tokens

    def begin_token(self):
        """Return the value stored in ``self._begin_token``."""
        return self._begin_token

    def end_token(self):
        """Return the value stored in ``self._end_token``."""
        return self._end_token

# Module-level SpanArray built from an empty target text and empty
# begin/end offset lists.
_EMPTY_SPAN_ARRAY_SINGLETON = SpanArray("", [], [])


class TokenSpanDtype(SpanDtype):
    """
    Pandas datatype for a span that represents a range of tokens within a
    target string.
    """
    # NOTE(review): the body of `type` (and any decorator on it) is not
    # visible in this excerpt; only its leading comment survives.
    def type(self):
        # The type for a single row of a column of type TokenSpan
# Example #11
def make_bert_tokens(target_text: str, tokenizer) -> pd.DataFrame:
    """
    Tokenize the indicated text for BERT embeddings and return a DataFrame
    with one row per token.

    :param target_text: string to tokenize
    :param tokenizer: A tokenizer that is an instance of huggingface transformers
                      PreTrainedTokenizerFast which supports `encode_plus` with
                      `return_offsets_mapping=True`.

    :returns: ``pd.DataFrame`` with following columns:

              * "id": unique integer ID for each token
              * "span": span of the token (with offsets measured in characters)
              * "input_id": integer ID suitable for input to a BERT embedding model
              * "token_type_id": list of token type ids to be fed to a model
              * "attention_mask": list of indices specifying which tokens should be
                attended to by the model
              * "special_tokens_mask": `True` if the token is a zero-length special token
                such as "start of document"

    :raises TypeError: If ``tokenizer`` is not a ``PreTrainedTokenizerFast`` instance.
    """

    # Imported inside the function, so `transformers` is only required when
    # this function is called.
    # noinspection PyPackageRequirements
    from transformers import PreTrainedTokenizerFast

    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise TypeError("Tokenizer must be an instance of "
                        "transformers.PreTrainedTokenizerFast that supports "
                        "encode_plus with return_offsets_mapping=True.")
    # NOTE(review): the remaining arguments of `encode_plus(` are cut off in
    # this excerpt.
    tokenized_result = tokenizer.encode_plus(target_text,

    # Get offset mapping from tokenizer
    offsets = tokenized_result["offset_mapping"]

    # Init any special tokens at beginning
    # Leading `None` offsets are replaced in place with the zero-length pair (0, 0).
    # NOTE(review): the loop has no upper bound on `i`; it would raise
    # IndexError if `offsets` were empty or contained only `None`.
    i = 0
    while offsets[i] is None:
        offsets[i] = (0, 0)
        i += 1

    # Make a DataFrame to unzip (begin, end) offsets
    offset_df = pd.DataFrame(offsets, columns=["begin", "end"])

    # Convert special tokens mask to boolean
    # NOTE(review): the arguments of `pd.Series(` are cut off in this excerpt.
    special_tokens_mask = pd.Series(

    # Fill remaining special tokens to zero-length spans
    # Missing end offsets are forward-filled from the previous token's end.
    # NOTE(review): `fillna(method="ffill")` is deprecated in recent pandas
    # releases in favour of `.ffill()`. The second argument of `.mask(` is cut
    # off in this excerpt.
    ends = offset_df["end"].fillna(method="ffill").astype("int32")
    begins = offset_df["begin"].mask(special_tokens_mask,

    spans = SpanArray(target_text, begins, ends)

    # NOTE(review): the contents of the DataFrame literal are cut off in this
    # excerpt.
    token_features = pd.DataFrame({

    return token_features
# Example #12
def _doc_to_df(doc: List[_SentenceData], column_names: List[str],
               iob_columns: List[bool],
               space_before_punct: bool) -> pd.DataFrame:
    """
    Convert the "Python objects" representation of a document from a
    CoNLL-2003 file into a `pd.DataFrame` of token metadata.

    :param doc: List of Python objects that represents the document.
    :param column_names: Names for the metadata columns that come after the
     token text. These names will be used to generate the names of the dataframe
     that this function returns.
    :param iob_columns: Mask indicating which of the metadata columns after the
     token text should be treated as being in IOB format. If a column is in IOB format,
     the returned dataframe will contain *two* columns, holding IOB2 tags and
     entity type tags, respectively. For example, an input column "ent" will turn into
     output columns "ent_iob" and "ent_type".
    :param space_before_punct: If `True`, add whitespace before
     punctuation characters (and after left parentheses)
     when reconstructing the text of the document.
    :return: DataFrame with the following columns:
    * `span`: Span of each token, with character offsets.
      Backed by the concatenation of the tokens in the document into
      a single string with one sentence per line.
    * One column per entry of the token metadata (for example, for an IOB
      input column "ent"):
      * `ent_iob`: IOB2-format tags of tokens, exactly as they appeared
        in the original file, with no corrections applied.
      * `ent_type`: Entity type names for tokens tagged "I" or "B" in
        the `ent_iob` column; `None` everywhere else.
    * `sentence`: Token-based span assigned from `sentence_spans`.
    * `line_num`: line number of each token in the parsed file
    """

    # Character offsets of tokens in the reconstructed document
    begins_list = []  # Type: List[np.ndarray]
    ends_list = []  # Type: List[np.ndarray]

    # Reconstructed text of each sentence
    # (joined with "\n" below, so the elements are expected to be strings)
    sentences_list = []  # Type: List[str]

    # Token offsets of sentences containing each token in the document.
    sentence_begins_list = []  # Type: List[np.ndarray]
    sentence_ends_list = []  # Type: List[np.ndarray]

    # Token metadata column values. Key is column name, value is metadata for
    # each token.
    meta_lists = _make_empty_meta_values(column_names, iob_columns)

    # Line numbers of the parsed file for each token in the doc
    doc_line_nums = []

    # Running offsets of the current sentence's first character and first
    # token within the reconstructed document.
    char_position = 0
    token_position = 0
    for sentence_num in range(len(doc)):
        sentence = doc[sentence_num]
        tokens = sentence.tokens

        # Don't put spaces before punctuation in the reconstituted string.
        # NOTE(review): both conditional expressions below are cut off in this
        # excerpt -- the `else` operand is not visible.
        # NOTE(review): `np.bool` is unavailable on NumPy 1.24-1.26 (the
        # deprecated alias was removed); `np.bool_` or the builtin `bool`
        # works across versions.
        no_space_before_mask = (np.zeros(len(tokens), dtype=np.bool)
                                if space_before_punct else
        no_space_after_mask = (np.zeros(len(tokens), dtype=np.bool)
                               if space_before_punct else
        no_space_before_mask[0] = True  # No space before first token
        no_space_after_mask[-1] = True  # No space after last token
        # Shift the "no space after" mask right by one so that it lines up
        # with the token that follows each flagged token.
        shifted_no_space_after_mask = np.roll(no_space_after_mask, 1)
        prefixes = np.where(
            np.logical_or(no_space_before_mask, shifted_no_space_after_mask),
            "", " ")
        # Column-major ravel interleaves the two rows: prefix, token, prefix, token, ...
        string_parts = np.ravel((prefixes, tokens), order="F")
        sentence_text = "".join(string_parts)

        lengths = np.array([len(t) for t in tokens])
        prefix_lengths = np.array([len(p) for p in prefixes])

        # Begin and end offsets, accounting for which tokens have spaces
        # before them.
        e = np.cumsum(lengths + prefix_lengths)
        b = e - lengths
        begins_list.append(b + char_position)
        ends_list.append(e + char_position)

        # Every token of the sentence gets the same sentence begin/end token offsets.
        sentence_begin_token = token_position
        sentence_end_token = token_position + len(e)
        sentence_begins = np.repeat(sentence_begin_token, len(e))
        sentence_ends = np.repeat(sentence_end_token, len(e))

        # NOTE(review): the body of this loop is not visible in this excerpt,
        # nor are the statements that add to `sentences_list`,
        # `sentence_begins_list`, `sentence_ends_list` and `doc_line_nums`.
        for k in sentence.token_metadata.keys():

        char_position += e[-1] + 1  # "+ 1" to account for newline
        token_position += len(e)


    begins = np.concatenate(begins_list)
    ends = np.concatenate(ends_list)
    doc_text = "\n".join(sentences_list)
    char_spans = SpanArray(doc_text, begins, ends)
    # NOTE(review): the remaining arguments of `TokenSpanArray(` are cut off
    # in this excerpt.
    sentence_spans = TokenSpanArray(char_spans,

    ret = pd.DataFrame({"span": char_spans})
    for k, v in meta_lists.items():
        ret[k] = v
    ret["sentence"] = sentence_spans
    ret["line_num"] = pd.Series(doc_line_nums)
    return ret
 def tokens(self) -> SpanArray:
     """Build a ``SpanArray`` from this object's ``_text``, ``_begins`` and ``_ends`` attributes."""
     return SpanArray(self._text, self._begins, self._ends)