Example #1
def extract_split(
        doc_text: str, split_points: Union[Sequence[int], np.ndarray,
                                           SpanArray]) -> SpanArray:
    """
    Split a document into spans along a specified set of split points.

    :param doc_text: Text of the document; will be the target text of the returned spans.

    :param split_points: A series of offsets into ``doc_text``, expressed as either:
      * A sequence of integers (as a list or 1-d Numpy array): split at the indicated
        locations and return a set of spans that covers every character in the document
      * A sequence of spans: split around the indicated regions and discard the parts
        of the document that fall within a split point

    :returns: A ``SpanArray`` that splits the document in the specified way.
    """
    if isinstance(split_points, (collections.abc.Sequence, np.ndarray)):
        # Single-integer split points ==> zero-length spans
        split_points = SpanArray(doc_text, split_points, split_points)
    elif not isinstance(split_points, SpanArray):
        raise TypeError(
            f"Split points are of type {type(split_points)}. Expected a "
            f"sequence of integers or a SpanArray.")

    # Make sure split points are in order
    sorted_indices = split_points.argsort()
    sorted_split_points = split_points[sorted_indices]

    # Break out the split points.
    split_begins = sorted_split_points.begin.tolist()  # type: List[int]
    split_ends = sorted_split_points.end.tolist()  # type: List[int]

    # Tack on an additional split point at the very end to simplify the logic below.
    split_begins.append(len(doc_text))
    split_ends.append(len(doc_text))

    # Walk through the document, generating the begin and end offsets of spans
    begins = []
    ends = []
    begin = 0
    for split_begin, split_end in zip(split_begins, split_ends):
        end = split_begin
        if end > begin:  # Ignore zero-length and negative-length chunks
            begins.append(begin)
            ends.append(end)
        begin = split_end

    return SpanArray(doc_text, begins, ends)
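
A minimal usage sketch for ``extract_split``, assuming ``SpanArray`` is in scope as above; the document text and offsets are made up for illustration.

doc_text = "one two three"
pieces = extract_split(doc_text, [3, 7])
# Integer split points become zero-length spans, so the result covers every
# character of the document: spans [0, 3), [3, 7), and [7, 13).
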
def lemmatize(spans: Union[pd.Series, SpanArray, Iterable[Span]],
              token_features: pd.DataFrame,
              lemma_col_name: str = "lemma",
              token_span_col_name: str = "span") -> List[str]:
    """
    Convert spans to their normal form using lemma information in a token
    features table.

    :param spans: Spans to be normalized. Each may represent zero or more
    tokens.

    :param token_features: DataFrame of token metadata. Index must be aligned
    with the token indices in `spans`.

    :param lemma_col_name: Optional custom name for the DataFrame column
    containing the lemmatized form of each token.

    :param token_span_col_name: Optional custom name for the DataFrame column
    containing the span of each token.

    :return: A list containing the normalized form of each span in `spans`,
    with the lemmas of the span's tokens separated by single space characters.
    """
    char_spans = SpanArray.make_array(spans)
    token_spans = TokenSpanArray.align_to_tokens(token_features[token_span_col_name],
                                                 char_spans)
    ret = []  # type: List[str]
    # TODO: Vectorize this loop
    for i in range(len(token_spans)):
        lemmas = token_features[lemma_col_name][
                 token_spans.begin_token[i]:token_spans.end_token[i]
                 ]
        ret.append(" ".join(lemmas))
    return ret
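
A hedged usage sketch for ``lemmatize``: the token table below is built by hand with illustrative offsets and lemmas, and ``SpanArray`` / ``TokenSpanArray`` are assumed to be in scope as in the snippets above.

import pandas as pd

text = "The cats were running ."
tokens = SpanArray(text, [0, 4, 9, 14, 22], [3, 8, 13, 21, 23])
token_features = pd.DataFrame({
    "span": tokens,
    "lemma": ["the", "cat", "be", "run", "."],
})
# Normalize a single span that covers tokens 1 through 3 ("cats were running")
phrase = SpanArray(text, [4], [21])
print(lemmatize(phrase, token_features))  # expected: ['cat be run']
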
def arrow_to_span(extension_array: pa.ExtensionArray) -> SpanArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowSpanType to
    a SpanArray.

    :param extension_array: pyarrow.ExtensionArray with type ArrowSpanType
    :return: SpanArray
    """
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError(
                "Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    assert pa.types.is_struct(extension_array.storage.type)

    # Get target text from the begins field metadata and decode string
    metadata = extension_array.storage.type[ArrowSpanType.BEGINS_NAME].metadata
    target_text = metadata[ArrowSpanType.TARGET_TEXT_KEY]
    if isinstance(target_text, bytes):
        target_text = target_text.decode()

    # Get the begins/ends pyarrow arrays
    begins_array = extension_array.storage.field(ArrowSpanType.BEGINS_NAME)
    ends_array = extension_array.storage.field(ArrowSpanType.ENDS_NAME)

    # Zero-copy convert arrays to numpy
    begins = begins_array.to_numpy()
    ends = ends_array.to_numpy()

    return SpanArray(target_text, begins, ends)
Example #4
def make_tokens_and_features(
    target_text: str, language_model, add_left_and_right=False,
) -> pd.DataFrame:
    """
    :param target_text: Text to analyze
    :param language_model: Preconfigured spaCy language model (`spacy.language.Language`)
     object
    :param add_left_and_right: If ``True``, add columns "left" and "right"
     containing references to previous and next tokens.

    :return: A `pd.DataFrame` with one row per token, containing the tokens of the
             text plus the additional linguistic features that the language model
             generates.
    """
    spacy_doc = language_model(target_text)

    # TODO: Performance tuning of the translation code that follows
    # Represent the character spans of the tokens
    tok_begins = np.array([t.idx for t in spacy_doc])
    tok_ends = np.array([t.idx + len(t) for t in spacy_doc])
    tokens_array = SpanArray(target_text, tok_begins, tok_ends)
    tokens_series = pd.Series(tokens_array)
    # Also build single-token token-based spans to make it easier to build
    # larger token-based spans.
    token_spans = TokenSpanArray.from_char_offsets(tokens_series.array)
    # spaCy identifies tokens by semi-arbitrary integer "indexes" (in practice,
    # the offset of the first character in the token). Translate from these
    # to a dense range of integer IDs that will correspond to the index of our
    # returned DataFrame.
    idx_to_id = {spacy_doc[i].idx: i for i in range(len(spacy_doc))}
    # Define the IOB categorical type with "O" == 0, "B"==1, "I"==2
    iob2_dtype = pd.CategoricalDtype(["O", "B", "I"], ordered=False)
    df_cols = {
        "id": range(len(tok_begins)),
        "span": tokens_series,
        "lemma": [t.lemma_ for t in spacy_doc],
        "pos": pd.Categorical([str(t.pos_) for t in spacy_doc]),
        "tag": pd.Categorical([str(t.tag_) for t in spacy_doc]),
        "dep": pd.Categorical([str(t.dep_) for t in spacy_doc]),
        "head": np.array([idx_to_id[t.head.idx] for t in spacy_doc]),
        "shape": pd.Categorical([t.shape_ for t in spacy_doc]),
        "ent_iob": pd.Categorical([str(t.ent_iob_) for t in spacy_doc],
                                  dtype=iob2_dtype),
        "ent_type": pd.Categorical([str(t.ent_type_) for t in spacy_doc]),
        "is_alpha": np.array([t.is_alpha for t in spacy_doc]),
        "is_stop": np.array([t.is_stop for t in spacy_doc]),
        "sentence": _make_sentences_series(spacy_doc, tokens_array),
    }
    if add_left_and_right:
        # Use nullable int type because these columns contain nulls
        df_cols["left"] = pd.array(
            [None] + list(range(len(tok_begins) - 1)), dtype=pd.Int32Dtype()
        )
        df_cols["right"] = pd.array(
            list(range(1, len(tok_begins))) + [None], dtype=pd.Int32Dtype()
        )
    return pd.DataFrame(df_cols)
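
A hedged usage sketch; it assumes spaCy and its ``en_core_web_sm`` model are installed.

import spacy

nlp = spacy.load("en_core_web_sm")
tokens_df = make_tokens_and_features("Mary flew to Boston.", nlp, add_left_and_right=True)
# One row per token, with spans, lemmas, part-of-speech tags, IOB entity tags, etc.
print(tokens_df[["id", "span", "lemma", "pos", "ent_iob", "ent_type"]])
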
    def __eq__(self, other):
        """
        Pandas/Numpy-style array/series comparison function.

        :param other: Second operand of a Pandas "==" comparison with the series
        that wraps this TokenSpanArray.

        :return: Returns a boolean mask indicating which rows match `other`.
        """
        if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
            # Rely on pandas to unbox and dispatch to us.
            return NotImplemented
        if isinstance(other, TokenSpan) and self.tokens.equals(other.tokens):
            mask = np.full(len(self), True, dtype=bool)
            mask[self.begin_token != other.begin_token] = False
            mask[self.end_token != other.end_token] = False
            return mask
        elif isinstance(other, TokenSpanArray) and self.tokens.equals(
                other.tokens):
            if len(self) != len(other):
                raise ValueError("Can't compare arrays of differing lengths "
                                 "{} and {}".format(len(self), len(other)))
            return np.logical_and(self.begin_token == other.begin_token,
                                  self.end_token == other.end_token)
        else:
            # Different tokens, no tokens, unexpected type ==> fall back on superclass
            return SpanArray.__eq__(self, other)
Example #6
def make_tokens(target_text: str, tokenizer) -> pd.Series:
    """
    :param target_text: Text to tokenize
    :param tokenizer: Preconfigured `spacy.tokenizer.Tokenizer` object
    :return: The tokens (and underlying text) as a Pandas Series wrapped around
        a `SpanArray` value.
    """
    spacy_doc = tokenizer(target_text)
    tok_begins = np.array([t.idx for t in spacy_doc])
    tok_ends = np.array([t.idx + len(t) for t in spacy_doc])
    return pd.Series(SpanArray(target_text, tok_begins, tok_ends))
Example #7
    def __init__(self, tokens: Any, begin_token: int, end_token: int):
        """
        :param tokens: Tokenization information about the document, including
        the target text. Must be a type that :func:`SpanArray.make_array()`
        can convert to a `SpanArray`.

        :param begin_token: Begin offset (inclusive) within the tokenized text.

        :param end_token: End offset; exclusive, one past the last token.
        """
        tokens = SpanArray.make_array(tokens)
        if TokenSpan.NULL_OFFSET_VALUE != begin_token and begin_token < 0:
            raise ValueError(
                f"Begin token offset must be NULL_OFFSET_VALUE or "
                f"greater than or equal to zero (got {begin_token})")
        if TokenSpan.NULL_OFFSET_VALUE != begin_token and end_token < begin_token:
            raise ValueError(f"End must be >= begin (got {begin_token} and "
                             f"{end_token})")
        if begin_token > len(tokens):
            raise ValueError(
                f"Begin token offset of {begin_token} larger than "
                f"number of tokens ({len(tokens)})")
        if end_token > len(tokens) + 1:
            raise ValueError(f"End token offset of {end_token} larger than "
                             f"number of tokens + 1 ({len(tokens)} + 1)")
        if len(tokens) == 0 and begin_token != TokenSpan.NULL_OFFSET_VALUE:
            raise ValueError(
                f"Tried to create a non-null TokenSpan over an empty list of tokens."
            )
        if TokenSpan.NULL_OFFSET_VALUE == begin_token:
            if TokenSpan.NULL_OFFSET_VALUE != end_token:
                raise ValueError(
                    f"Begin offset with special 'null' value "
                    f"{TokenSpan.NULL_OFFSET_VALUE} must be paired with an "
                    f"end offset of {TokenSpan.NULL_OFFSET_VALUE}"
                )
            begin_char_off = end_char_off = Span.NULL_OFFSET_VALUE
        else:
            begin_char_off = tokens.begin[begin_token]
            end_char_off = (begin_char_off if begin_token == end_token else
                            tokens.end[end_token - 1])
        if len(tokens) == 0:
            doc_text = None
        elif not tokens.is_single_document:
            raise ValueError("Tokens must be from exactly one document.")
        else:
            doc_text = tokens.document_text

        super().__init__(doc_text, begin_char_off, end_char_off)
        self._tokens = tokens
        self._begin_token = begin_token
        self._end_token = end_token
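
A small construction sketch for ``TokenSpan``, assuming ``SpanArray`` and ``TokenSpan`` are in scope; the tokenization is written out by hand.

text = "red fish blue fish"
tokens = SpanArray(text, [0, 4, 9, 14], [3, 8, 13, 18])
span = TokenSpan(tokens, 1, 3)  # tokens 1 and 2 ("fish" and "blue")
# The character offsets are derived from the token offsets: [4, 13)
null_span = TokenSpan(tokens, TokenSpan.NULL_OFFSET_VALUE,
                      TokenSpan.NULL_OFFSET_VALUE)  # a null/NA token span
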
Example #8
def _make_entity_mentions_dataframe(
        entities: List, original_text: str,
        apply_standard_schema: bool) -> pd.DataFrame:
    """
    Unroll the records of the "mentions" element of NLU entities into a flat
    DataFrame. The schema of this DataFrame is `_entity_mentions_schema`,
    defined above.
    :param entities: The "entities" section of a parsed NLU response
    :param original_text: Text of the document.  This argument must be provided if there
     are entity mention spans.
    :param apply_standard_schema: Value of the eponymous argument from `parse_response`.
    """
    if 0 == len(entities) or "mentions" not in entities[0].keys():
        # No mentions to unroll. Return an empty DataFrame.
        return util.apply_schema(
            pd.DataFrame(columns=[e[0] for e in _entity_mentions_schema]),
            _entity_mentions_schema, apply_standard_schema)
    if original_text is None:
        raise ValueError(
            "Unable to construct target text for converting entity mentions to spans"
        )
    # Explode out the nested relations containing entity location information.
    # If there was a version of DataFrame.explode() that could handle structs,
    # we would be able to vectorize this operation.
    # Instead we build up the values one row at a time.
    # Some columns come from "parent" entity records, and some columns come from the
    # "child" entity mention records.
    num_parent_cols = len(_entity_mentions_parent_elems)
    parent_cols = [[] for i in range(num_parent_cols)]
    begins = []
    ends = []
    confidences = []
    for e in entities:
        for m in e["mentions"]:
            for i in range(num_parent_cols):
                parent_elem = e[_entity_mentions_parent_names[i]]
                parent_cols[i].append(parent_elem)
            begins.append(m["location"][0])
            ends.append(m["location"][1])
            confidences.append(
                m["confidence"])  # N.B. confidence of mention, not entity
    # Construct columns, then convert to a DataFrame
    df_cols = {
        _entity_mentions_parent_names[i]: parent_cols[i]
        for i in range(len(_entity_mentions_parent_names))
    }
    df_cols["span"] = SpanArray(original_text, begins, ends)
    df_cols["confidence"] = confidences
    return util.apply_schema(pd.DataFrame(df_cols), _entity_mentions_schema,
                             apply_standard_schema)
Example #9
    def from_char_offsets(tokens: Any) -> "TokenSpanArray":
        """
        Convenience factory method for wrapping the character-level spans of a
        series of tokens into single-token token-based spans.

        :param tokens: character-based offsets of the tokens, as any type that
         :func:`SpanArray.make_array()` understands.

        :return: A TokenSpanArray containing single-token spans for each of the
        tokens in `tokens`.
        """
        begin_tokens = np.arange(len(tokens))
        tokens_array = SpanArray.make_array(tokens)
        return TokenSpanArray(tokens_array, begin_tokens, begin_tokens + 1)
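
A brief sketch of how ``from_char_offsets`` is typically used (compare its call in ``make_tokens_and_features`` above); the token offsets are illustrative.

text = "Is it just me ?"
char_tokens = SpanArray(text, [0, 3, 6, 11, 14], [2, 5, 10, 13, 15])
token_spans = TokenSpanArray.from_char_offsets(char_tokens)
# token_spans[i] is the token-based span [i, i + 1) covering token i
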
Example #10
def make_tokens(target_text: str, tokenizer: "spacy.tokenizer.Tokenizer" = None) \
    -> pd.Series:
    """
    :param target_text: Text to tokenize
    :param tokenizer: Preconfigured `spacy.tokenizer.Tokenizer` object, or None
     to use the tokenizer returned by :func:`simple_tokenizer()`
    :return: The tokens (and underlying text) as a Pandas Series wrapped around
        a `SpanArray` value.
    """
    if tokenizer is None:
        tokenizer = simple_tokenizer()
    spacy_doc = tokenizer(target_text)
    tok_begins = np.array([t.idx for t in spacy_doc])
    tok_ends = np.array([t.idx + len(t) for t in spacy_doc])
    return pd.Series(SpanArray(target_text, tok_begins, tok_ends))
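
A hedged usage sketch; it assumes spaCy is installed and that ``simple_tokenizer()`` is defined in the same module.

token_series = make_tokens("Bring me a shrubbery!")
# token_series is a pd.Series backed by a SpanArray of token spans
print(token_series)
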
Example #11
def extract_regex_tok(
    tokens: Union[SpanArray, pd.Series],
    compiled_regex: regex.Regex,
    min_len=1,
    max_len=1,
    output_col_name: str = "match",
):
    """
    Identify all (possibly overlapping) matches of a regular expression
    that start and end on token boundaries.

    :param tokens: ``SpanArray`` of token information, optionally wrapped in a
    `pd.Series`.

    :param compiled_regex: Regular expression to evaluate.

    :param min_len: Minimum match length in tokens

    :param max_len: Maximum match length (inclusive) in tokens

    :param output_col_name: (optional) name of column of matching spans in the
    returned DataFrame

    :returns: A single-column DataFrame containing a span for each match of the
    regex.
    """
    tokens = SpanArray.make_array(tokens)

    num_tokens = len(tokens)
    matches_regex_f = np.vectorize(
        lambda s: compiled_regex.fullmatch(s) is not None)

    # The built-in regex functionality of Pandas/Python does not have
    # an optimized single-pass RegexTok, so generate all the places
    # where there might be a match and run them through regex.fullmatch().
    # Note that this approach is asymptotically inefficient if max_len is large.
    # TODO: Performance tuning for both small and large max_len
    matches_list = []
    for cur_len in range(min_len, max_len + 1):
        window_begin_toks = np.arange(0, num_tokens - cur_len + 1)
        window_end_toks = window_begin_toks + cur_len

        window_tok_spans = TokenSpanArray(tokens, window_begin_toks,
                                          window_end_toks)
        matches_list.append(
            pd.Series(window_tok_spans[matches_regex_f(
                window_tok_spans.covered_text)]))
    return pd.DataFrame({output_col_name: pd.concat(matches_list)})
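
A usage sketch for ``extract_regex_tok`` with a hand-built token array. The type hint suggests the ``regex`` package, but any compiled pattern exposing ``fullmatch`` (including one from the standard ``re`` module, as here) should work.

import re

text = "big red dog"
tokens = SpanArray(text, [0, 4, 8], [3, 7, 11])
pattern = re.compile(r"(big|red)( (big|red))*")
matches = extract_regex_tok(tokens, pattern, min_len=1, max_len=2)
# matches["match"] contains the spans "big", "red", and "big red"
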
Example #12
def arrow_to_span(extension_array: pa.ExtensionArray) -> SpanArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowSpanType to
    a SpanArray.

    .. note:: Only supported with PyArrow >= 2.0.0

    :param extension_array: pyarrow.ExtensionArray with type ArrowSpanType
    :return: SpanArray
    """
    if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
        raise NotImplementedError(
            "Arrow serialization for SpanArray is not supported with "
            "PyArrow versions < 2.0.0")
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError(
                "Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    # NOTE: workaround for bug in parquet reading
    if pa.types.is_struct(extension_array.type):
        index_dtype = extension_array.field(ArrowSpanType.BEGINS_NAME).type
        target_text_dict_dtype = extension_array.field(
            ArrowSpanType.TARGET_TEXT_DICT_NAME).type
        extension_array = pa.ExtensionArray.from_storage(
            ArrowSpanType(index_dtype, target_text_dict_dtype),
            extension_array)

    assert pa.types.is_struct(extension_array.storage.type)

    # Create target text StringTable and text_ids from dictionary array
    target_text_dict_array = extension_array.storage.field(
        ArrowSpanType.TARGET_TEXT_DICT_NAME)
    table_texts = target_text_dict_array.dictionary.to_pylist()
    string_table = StringTable.from_things(table_texts)
    text_ids = target_text_dict_array.indices.to_numpy()

    # Get the begins/ends pyarrow arrays
    begins_array = extension_array.storage.field(ArrowSpanType.BEGINS_NAME)
    ends_array = extension_array.storage.field(ArrowSpanType.ENDS_NAME)

    # Zero-copy convert arrays to numpy
    begins = begins_array.to_numpy()
    ends = ends_array.to_numpy()

    return SpanArray((string_table, text_ids), begins, ends)
Example #13
    def test_left_to_right(self):
        test_text = "Is it weird in here, or is it just me?"
        spans = [
            Span(test_text, 0, 3),
            Span(test_text, 2, 3),
            Span(test_text, 3, 3),
            Span(test_text, 1, 3),
            Span(test_text, 0, 4),  # index 4
            Span(test_text, 5, 7),  # index 5
            Span(test_text, 6, 9),
            Span(test_text, 8, 9),  # index 7
        ]
        df = pd.DataFrame({
            "s": SpanArray._from_sequence(spans),
            "ix": range(len(spans))
        })
        c_df = consolidate(df, on="s", how="left_to_right")
        self._assertArrayEquals(list(c_df.index), [4, 5, 7])
Example #14
    def __eq__(self, other):
        """
        Pandas/Numpy-style array/series comparison function.

        :param other: Second operand of a Pandas "==" comparison with the series
        that wraps this TokenSpanArray.

        :return: Returns a boolean mask indicating which rows match `other`.
        """
        if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndex)):
            # Rely on pandas to unbox and dispatch to us.
            return NotImplemented
        elif (isinstance(other, TokenSpanArray) and len(self) == len(other)
              and self.same_tokens(other)):
            return np.logical_and(self.begin_token == other.begin_token,
                                  self.end_token == other.end_token)
        else:
            # Different tokens, no tokens, unexpected type ==> fall back on superclass
            return SpanArray.__eq__(self, other)
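
A short sketch of the comparison semantics, assuming ``SpanArray`` and ``TokenSpanArray`` are in scope; the tokens and offsets are illustrative.

import numpy as np

text = "a b c"
tokens = SpanArray(text, [0, 2, 4], [1, 3, 5])
lhs = TokenSpanArray(tokens, np.array([0, 1]), np.array([1, 3]))
rhs = TokenSpanArray(tokens, np.array([0, 2]), np.array([1, 3]))
print(lhs == rhs)  # element-wise mask: True for the first pair, False for the second
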
Example #15
def extract_regex(
    doc_text: str,
    compiled_regex: "re.Pattern"  # Double quotes for Python 3.6 compatibility
):
    """
    Identify all non-overlapping matches of a regular expression, as returned by
    ``re.Pattern.finditer()``, and return those locations as an array of spans.

    :param doc_text: Text of the document; will be the target text of the returned spans.

    :param compiled_regex: Regular expression to evaluate, compiled with either the ``re``
      or the ``regex`` package.

    :returns: A ``SpanArray`` containing a span for each match of the regex.
    """
    begins = []
    ends = []
    for a in compiled_regex.finditer(doc_text):
        begins.append(a.start())
        ends.append(a.end())

    return SpanArray(doc_text, begins, ends)
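
A minimal sketch of calling ``extract_regex`` with a pattern compiled by the standard ``re`` module.

import re

spans = extract_regex("to be or not to be", re.compile(r"\bto\b"))
# spans covers the two occurrences of "to", at character offsets [0, 2) and [13, 15)
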
def arrow_to_token_span(extension_array: pa.ExtensionArray) -> TokenSpanArray:
    """
    Convert a pyarrow.ExtensionArray with type ArrowTokenSpanType to
    a TokenSpanArray.

    :param extension_array: pyarrow.ExtensionArray with type ArrowTokenSpanType
    :return: TokenSpanArray
    """
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError(
                "Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    assert pa.types.is_struct(extension_array.storage.type)

    # Get target text from the begins field metadata and decode string
    metadata = extension_array.storage.type[
        ArrowTokenSpanType.BEGINS_NAME].metadata
    target_text = metadata[ArrowSpanType.TARGET_TEXT_KEY]
    if isinstance(target_text, bytes):
        target_text = target_text.decode()

    # Get the begins/ends pyarrow arrays
    token_begins_array = extension_array.storage.field(
        ArrowTokenSpanType.BEGINS_NAME)
    token_ends_array = extension_array.storage.field(
        ArrowTokenSpanType.ENDS_NAME)

    # Check if CharSpans have been split
    num_char_span_splits = extension_array.type.num_char_span_splits
    if num_char_span_splits > 0:
        char_begins_splits = []
        char_ends_splits = []
        for i in range(num_char_span_splits):
            char_begins_splits.append(
                extension_array.storage.field(ArrowSpanType.BEGINS_NAME +
                                              "_{}".format(i)))
            char_ends_splits.append(
                extension_array.storage.field(ArrowSpanType.ENDS_NAME +
                                              "_{}".format(i)))
        char_begins_array = pa.concat_arrays(char_begins_splits)
        char_ends_array = pa.concat_arrays(char_ends_splits)
    else:
        char_begins_array = extension_array.storage.field(
            ArrowSpanType.BEGINS_NAME)
        char_ends_array = extension_array.storage.field(
            ArrowSpanType.ENDS_NAME)

    # Remove any trailing padding
    if char_begins_array.null_count > 0:
        char_begins_array = char_begins_array[:-char_begins_array.null_count]
        char_ends_array = char_ends_array[:-char_ends_array.null_count]

    # Zero-copy convert arrays to numpy
    token_begins = token_begins_array.to_numpy()
    token_ends = token_ends_array.to_numpy()
    char_begins = char_begins_array.to_numpy()
    char_ends = char_ends_array.to_numpy()

    # Create the SpanArray, then the TokenSpanArray
    char_span = SpanArray(target_text, char_begins, char_ends)
    return TokenSpanArray(char_span, token_begins, token_ends)
Example #17
def _doc_to_df(doc: List[_SentenceData], column_names: List[str],
               iob_columns: List[bool],
               space_before_punct: bool) -> pd.DataFrame:
    """
    Convert the "Python objects" representation of a document from a
    CoNLL-2003 file into a `pd.DataFrame` of token metadata.

    :param doc: List of Python objects that represents the document.
    :param column_names: Names for the metadata columns that come after the
     token text. These names will be used to generate the column names of the
     dataframe that this function returns.
    :param iob_columns: Mask indicating which of the metadata columns after the
     token text should be treated as being in IOB format. If a column is in IOB format,
     the returned dataframe will contain *two* columns, holding IOB2 tags and
     entity type tags, respectively. For example, an input column "ent" will turn into
     output columns "ent_iob" and "ent_type".
    :param space_before_punct: If `True`, add whitespace before
     punctuation characters (and after left parentheses)
     when reconstructing the text of the document.
    :return: DataFrame with the following columns:
    * `span`: Span of each token, with character offsets.
      Backed by the concatenation of the tokens in the document into
      a single string with one sentence per line.
    * `ent_iob`: IOB2-format tags of tokens, exactly as they appeared
      in the original file, with no corrections applied.
    * `ent_type`: Entity type names for tokens tagged "I" or "B" in
      the `ent_iob` column; `None` everywhere else.
    * `sentence`: Span of the sentence that contains each token.
    * `line_num`: Line number of each token in the parsed file.
    """

    # Character offsets of tokens in the reconstructed document
    begins_list = []  # type: List[np.ndarray]
    ends_list = []  # type: List[np.ndarray]

    # Reconstructed text of each sentence
    sentences_list = []  # type: List[str]

    # Token offsets of sentences containing each token in the document.
    sentence_begins_list = []  # type: List[np.ndarray]
    sentence_ends_list = []  # type: List[np.ndarray]

    # Token metadata column values. Key is column name, value is metadata for
    # each token.
    meta_lists = _make_empty_meta_values(column_names, iob_columns)

    # Line numbers of the parsed file for each token in the doc
    doc_line_nums = []

    char_position = 0
    token_position = 0
    for sentence_num in range(len(doc)):
        sentence = doc[sentence_num]
        tokens = sentence.tokens

        # Don't put spaces before punctuation in the reconstituted string.
        no_space_before_mask = (np.zeros(len(tokens), dtype=bool)
                                if space_before_punct else
                                _SPACE_BEFORE_MATCH_FN(tokens))
        no_space_after_mask = (np.zeros(len(tokens), dtype=bool)
                               if space_before_punct else
                               _SPACE_AFTER_MATCH_FN(tokens))
        no_space_before_mask[0] = True  # No space before first token
        no_space_after_mask[-1] = True  # No space after last token
        shifted_no_space_after_mask = np.roll(no_space_after_mask, 1)
        prefixes = np.where(
            np.logical_or(no_space_before_mask, shifted_no_space_after_mask),
            "", " ")
        string_parts = np.ravel((prefixes, tokens), order="F")
        sentence_text = "".join(string_parts)
        sentences_list.append(sentence_text)

        lengths = np.array([len(t) for t in tokens])
        prefix_lengths = np.array([len(p) for p in prefixes])

        # Begin and end offsets, accounting for which tokens have spaces
        # before them.
        e = np.cumsum(lengths + prefix_lengths)
        b = e - lengths
        begins_list.append(b + char_position)
        ends_list.append(e + char_position)

        sentence_begin_token = token_position
        sentence_end_token = token_position + len(e)
        sentence_begins = np.repeat(sentence_begin_token, len(e))
        sentence_ends = np.repeat(sentence_end_token, len(e))
        sentence_begins_list.append(sentence_begins)
        sentence_ends_list.append(sentence_ends)

        for k in sentence.token_metadata.keys():
            meta_lists[k].extend(sentence.token_metadata[k])

        char_position += e[-1] + 1  # "+ 1" to account for newline
        token_position += len(e)

        doc_line_nums.extend(sentence.line_nums)

    begins = np.concatenate(begins_list)
    ends = np.concatenate(ends_list)
    doc_text = "\n".join(sentences_list)
    char_spans = SpanArray(doc_text, begins, ends)
    sentence_spans = TokenSpanArray(char_spans,
                                    np.concatenate(sentence_begins_list),
                                    np.concatenate(sentence_ends_list))

    ret = pd.DataFrame({"span": char_spans})
    for k, v in meta_lists.items():
        ret[k] = v
    ret["sentence"] = sentence_spans
    ret["line_num"] = pd.Series(doc_line_nums)
    return ret
Example #18
def make_bert_tokens(target_text: str, tokenizer) -> pd.DataFrame:
    """
    Tokenize the indicated text for BERT embeddings and return a DataFrame
    with one row per token.

    :param target_text: String to tokenize
    :param tokenizer: A tokenizer that is a subclass of huggingface transformers'
                      PreTrainedTokenizerFast, which supports `encode_plus` with
                      return_offsets_mapping=True.

    :returns: ``pd.DataFrame`` with following columns:

              * "id": unique integer ID for each token
              * "span": span of the token (with offsets measured in characters)
              * "input_id": integer ID suitable for input to a BERT embedding model
              * "token_type_id": list of token type ids to be fed to a model
              * "attention_mask": list of indices specifying which tokens should be
                attended to by the model
              * "special_tokens_mask": `True` if the token is a zero-length special token
                such as "start of document"

    """
    # noinspection PyPackageRequirements
    from transformers import PreTrainedTokenizerFast

    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise TypeError("Tokenizer must be an instance of "
                        "transformers.PreTrainedTokenizerFast that supports "
                        "encode_plus with return_offsets_mapping=True.")
    tokenized_result = tokenizer.encode_plus(target_text,
                                             return_special_tokens_mask=True,
                                             return_offsets_mapping=True)

    # Get offset mapping from tokenizer
    offsets = tokenized_result["offset_mapping"]

    # Init any special tokens at beginning
    i = 0
    while offsets[i] is None:
        offsets[i] = (0, 0)
        i += 1

    # Make a DataFrame to unzip (begin, end) offsets
    offset_df = pd.DataFrame(offsets, columns=["begin", "end"])

    # Convert special tokens mask to boolean
    special_tokens_mask = pd.Series(
        tokenized_result["special_tokens_mask"]).astype("bool")

    # Fill remaining special tokens to zero-length spans
    ends = offset_df["end"].fillna(method="ffill").astype("int32")
    begins = offset_df["begin"].mask(special_tokens_mask,
                                     other=ends).astype("int32")

    spans = SpanArray(target_text, begins, ends)

    token_features = pd.DataFrame({
        "token_id": special_tokens_mask.index,
        "span": spans,
        "input_id": tokenized_result["input_ids"],
        "token_type_id": tokenized_result["token_type_ids"],
        "attention_mask": tokenized_result["attention_mask"],
        "special_tokens_mask": special_tokens_mask,
    })

    return token_features
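
A hedged usage sketch; it requires the ``transformers`` package and downloads the ``bert-base-uncased`` tokenizer on first use.

from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
token_df = make_bert_tokens("Hello, world!", tokenizer)
print(token_df[["token_id", "span", "input_id", "special_tokens_mask"]])
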
Example #19
    def align_to_tokens(cls, tokens: Any, spans: Any):
        """
        Align a set of character or token-based spans to a specified
        tokenization, producing a `TokenSpanArray` of token-based spans.

        :param tokens: The tokens to align to, as any type that
         `SpanArray.make_array()` accepts.
        :param spans: The spans to align. These spans must all target the same text
         as `tokens`.
        :return: An array of `TokenSpan`s aligned to the tokens of `tokens`.
         Raises `ValueError` if any of the spans in `spans` doesn't start and
         end on a token boundary.
        """
        tokens = SpanArray.make_array(tokens)
        spans = SpanArray.make_array(spans)

        if not tokens.is_single_document:
            raise ValueError(
                f"Tokens cover more than one document (tokens are {tokens})")
        if not spans.is_single_document:
            raise ValueError(
                f"Spans cover more than one document (spans are {spans})")

        # Create and join temporary dataframes
        tokens_df = pd.DataFrame({
            "token_index": np.arange(len(tokens)),
            "token_begin": tokens.begin,
            "token_end": tokens.end
        })
        spans_df = pd.DataFrame({
            "span_index": np.arange(len(spans)),
            "span_begin": spans.begin,
            "span_end": spans.end
        })

        # Ignore zero-length tokens
        # TODO: Is this the right thing to do?
        tokens_df = tokens_df[
            tokens_df["token_begin"] != tokens_df["token_end"]]

        begin_matches = pd.merge(tokens_df,
                                 spans_df,
                                 left_on="token_begin",
                                 right_on="span_begin",
                                 how="right",
                                 indicator=True)

        mismatched = begin_matches[begin_matches["_merge"] == "right_only"]
        if len(mismatched.index) > 0:
            raise ValueError(
                f"The following span(s) did not align with the begin offset\n"
                f"of any token:\n"
                f"{mismatched[['span_index', 'span_begin', 'span_end']]}")

        end_matches = pd.merge(tokens_df,
                               spans_df,
                               left_on="token_end",
                               right_on="span_end",
                               how="right",
                               indicator=True)

        mismatched = end_matches[end_matches["_merge"] == "right_only"]
        if len(mismatched.index) > 0:
            raise ValueError(
                f"The following span(s) did not align with the end offset\n"
                f"of any token:\n"
                f"{mismatched[['span_index', 'span_begin', 'span_end']]}")

        # Join on span index to get (begin, end) pairs.
        begins_and_ends = pd.merge(begin_matches[["token_index",
                                                  "span_index"]],
                                   end_matches[["token_index", "span_index"]],
                                   on="span_index",
                                   suffixes=("_begin", "_end"),
                                   sort=True)

        return TokenSpanArray(tokens, begins_and_ends["token_index_begin"],
                              begins_and_ends["token_index_end"] + 1)
Example #20
    def __hash__(self):
        if self._hash is None:
            # Use the superclass hash function so that hash() and == are
            # consistent across types.
            self._hash = SpanArray.__hash__(self)
        return self._hash
Example #21
            return Span.__lt__(self, other)

    @property
    def tokens(self):
        return self._tokens

    @property
    def begin_token(self):
        return self._begin_token

    @property
    def end_token(self):
        return self._end_token


_EMPTY_SPAN_ARRAY_SINGLETON = SpanArray("", [], [])

_NULL_TOKEN_SPAN_SINGLETON = TokenSpan(_EMPTY_SPAN_ARRAY_SINGLETON,
                                       Span.NULL_OFFSET_VALUE,
                                       Span.NULL_OFFSET_VALUE)


@pd.api.extensions.register_extension_dtype
class TokenSpanDtype(SpanDtype):
    """
    Pandas datatype for a span that represents a range of tokens within a
    target string.
    """
    @property
    def type(self):
        # The type for a single row of a column of type TokenSpan
        return TokenSpan

    def tokens(self) -> SpanArray:
        return SpanArray(self._text, self._begins, self._ends)