def token_span_to_arrow(token_span: TokenSpanArray) -> pa.ExtensionArray:
    """
    Convert a TokenSpanArray to a pyarrow.ExtensionArray with a type
    of ArrowTokenSpanType and struct as the storage type. The resulting
    extension array can be serialized and transferred with standard
    Arrow protocols.

    Only the single-document case is supported: every non-null element of
    `token_span` must refer to the very same tokens object.

    :param token_span: A TokenSpanArray to be converted. Must contain at
     least one non-null element.
    :return: pyarrow.ExtensionArray containing TokenSpan data
    :raises NotImplementedError: If the installed PyArrow is older than
     2.0.0, or if the non-null elements refer to more than one tokens object.
    """
    if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
        raise NotImplementedError(
            "Arrow serialization for TokenSpanArray is not supported with "
            "PyArrow versions < 2.0.0")

    # Create arrays for begins/ends
    token_begins_array = pa.array(token_span.begin_token)
    token_ends_array = pa.array(token_span.end_token)

    # Filter out any empty SpanArrays
    null_mask = token_span.isna()
    non_null_tokens = token_span.tokens[~null_mask]
    assert len(non_null_tokens) > 0

    # All non-null elements must share one tokens object (identity, not
    # equality). Supporting several documents would require one dictionary
    # index per distinct tokens object, which Arrow cannot concatenate yet.
    first_tokens = non_null_tokens[0]
    if not all(tokens is first_tokens for tokens in non_null_tokens):
        raise NotImplementedError(
            "TokenSpan Multi-doc serialization not yet implemented due to "
            "ArrowNotImplementedError: Concat with dictionary unification NYI")
    tokens_arrays = [first_tokens]
    tokens_indices = pa.array([0] * len(token_span.tokens), mask=null_mask)

    # Convert each token SpanArray to Arrow and get as raw storage
    arrow_tokens_arrays = [span_to_arrow(sa).storage for sa in tokens_arrays]

    # Create a list array where each element is an ArrowSpanArray.
    # TODO: Building this directly with pa.array() fails because there is no
    #  sequence converter for dictionary-typed values, and extension arrays
    #  cannot be concatenated, hence the manual offsets over raw storage.
    # List offsets are cumulative: element i spans offsets[i]:offsets[i + 1].
    offsets = [0]
    for arrow_tokens_array in arrow_tokens_arrays:
        offsets.append(offsets[-1] + len(arrow_tokens_array))
    values = pa.concat_arrays(arrow_tokens_arrays)
    arrow_tokens_arrays_array = pa.ListArray.from_arrays(offsets, values)

    # Create a dictionary array mapping each token SpanArray index used to
    # the list of ArrowSpanArrays
    tokens_dict_array = pa.DictionaryArray.from_arrays(
        tokens_indices, arrow_tokens_arrays_array)

    typ = ArrowTokenSpanType(token_begins_array.type, tokens_dict_array.type)
    fields = list(typ.storage_type)

    storage = pa.StructArray.from_arrays(
        [token_begins_array, token_ends_array, tokens_dict_array],
        fields=fields)

    return pa.ExtensionArray.from_storage(typ, storage)
def make_tokens_and_features(
    target_text: str,
    language_model,
    add_left_and_right=False,
) -> pd.DataFrame:
    """
    Run a spaCy language model over a text and return the resulting tokens
    and per-token linguistic features as a single DataFrame.

    :param target_text: Text to analyze
    :param language_model: Preconfigured spaCy language model
     (`spacy.language.Language`) object
    :param add_left_and_right: If ``True``, add columns "left" and "right"
     containing references to previous and next tokens.

    :return: A `pd.DataFrame` with one row per token and the columns
     "id", "span", "lemma", "pos", "tag", "dep", "head", "shape",
     "ent_iob", "ent_type", "is_alpha", "is_stop" and "sentence", plus
     "left" and "right" when `add_left_and_right` is ``True``.
    """
    spacy_doc = language_model(target_text)

    # TODO: Performance tuning of the translation code that follows
    # Represent the character spans of the tokens
    tok_begins = np.array([t.idx for t in spacy_doc])
    tok_ends = np.array([t.idx + len(t) for t in spacy_doc])
    tokens_array = SpanArray(target_text, tok_begins, tok_ends)
    tokens_series = pd.Series(tokens_array)

    # spaCy identifies tokens by semi-arbitrary integer "indexes" (in practice,
    # the offset of the first character in the token). Translate from these
    # to a dense range of integer IDs that will correspond to the index of our
    # returned DataFrame.
    idx_to_id = {tok.idx: tok_id for tok_id, tok in enumerate(spacy_doc)}

    # Define the IOB categorical type with "O" == 0, "B"==1, "I"==2
    iob2_dtype = pd.CategoricalDtype(["O", "B", "I"], ordered=False)

    df_cols = {
        "id": range(len(tok_begins)),
        "span": tokens_series,
        "lemma": [t.lemma_ for t in spacy_doc],
        "pos": pd.Categorical([str(t.pos_) for t in spacy_doc]),
        "tag": pd.Categorical([str(t.tag_) for t in spacy_doc]),
        "dep": pd.Categorical([str(t.dep_) for t in spacy_doc]),
        # Dependency head expressed as a row ID rather than a spaCy index
        "head": np.array([idx_to_id[t.head.idx] for t in spacy_doc]),
        "shape": pd.Categorical([t.shape_ for t in spacy_doc]),
        "ent_iob": pd.Categorical([str(t.ent_iob_) for t in spacy_doc],
                                  dtype=iob2_dtype),
        "ent_type": pd.Categorical([str(t.ent_type_) for t in spacy_doc]),
        "is_alpha": np.array([t.is_alpha for t in spacy_doc]),
        "is_stop": np.array([t.is_stop for t in spacy_doc]),
        "sentence": _make_sentences_series(spacy_doc, tokens_array),
    }

    if add_left_and_right:
        # Use nullable int type because these columns contain nulls:
        # the first token has no left neighbor, the last has no right one.
        df_cols["left"] = pd.array(
            [None] + list(range(len(tok_begins) - 1)), dtype=pd.Int32Dtype()
        )
        df_cols["right"] = pd.array(
            list(range(1, len(tok_begins))) + [None], dtype=pd.Int32Dtype()
        )
    return pd.DataFrame(df_cols)
def align_bert_tokens_to_corpus_tokens(
        spans_df: pd.DataFrame,
        corpus_toks_df: pd.DataFrame) -> pd.DataFrame:
    """
    Widen entity spans produced by a BERT-based model so that they line up
    with the token boundaries of the corpus's own tokenization.

    :param spans_df: DataFrame of extracted entities. Must contain two
     columns: "span" and "ent_type". Other columns ignored.
    :param corpus_toks_df: DataFrame of the corpus's original tokenization,
     one row per token.
     Must contain a column "span" with character-based spans of the tokens.

    :returns: A new DataFrame with schema ["span", "ent_type"],
     where the "span" column contains token-based spans based off
     the *corpus* tokenization in `corpus_toks_df["span"]`. An empty
     `spans_df` is returned as a copy, unchanged.
    """
    if spans_df.shape[0] == 0:
        return spans_df.copy()

    corpus_tokens = corpus_toks_df["span"]

    # Pair every entity span with each corpus token it overlaps, then pull
    # the remaining entity columns back in.
    token_pairs = spanner.overlap_join(
        spans_df["span"], corpus_tokens, "span", "corpus_token")
    overlaps_df = token_pairs.merge(spans_df)

    # One row per entity span: combine its corpus tokens, keep its type.
    aggregations = {"corpus_token": "sum", "ent_type": "first"}
    agg_df = overlaps_df.groupby("span").aggregate(aggregations).reset_index()

    consolidated = spanner.consolidate(agg_df, "corpus_token")
    cons_df = consolidated[["corpus_token", "ent_type"]].rename(
        columns={"corpus_token": "span"})

    # Re-express the widened spans as token-based spans over the corpus tokens
    cons_df["span"] = TokenSpanArray.align_to_tokens(corpus_tokens,
                                                     cons_df["span"])
    return cons_df
def lemmatize(spans: Union[pd.Series, SpanArray, Iterable[Span]],
              token_features: pd.DataFrame,
              lemma_col_name: str = "lemma",
              token_span_col_name: str = "span") -> List[str]:
    """
    Normalize spans by replacing each covered token with its lemma, as
    recorded in a token features table.

    :param spans: Spans to be normalized. Each may represent zero or more
     tokens.
    :param token_features: DataFrame of token metadata. Index must be aligned
     with the token indices in `spans`.
    :param lemma_col_name: Optional custom name for the DataFrame column
     containing the lemmatized form of each token.
    :param token_span_col_name: Optional custom name for the DataFrame column
     containing the span of each token.

    :return: A list with one string per element of `spans`: the lemmas of
     the tokens that the span covers, joined by single space characters.
    """
    aligned = TokenSpanArray.align_to_tokens(
        token_features[token_span_col_name], SpanArray.make_array(spans))

    # TODO: Vectorize this loop
    return [
        " ".join(token_features[lemma_col_name][first_tok:end_tok])
        for first_tok, end_tok in zip(aligned.begin_token, aligned.end_token)
    ]
def _make_empty_series() -> pd.Series:
    """
    Build a series wrapping a TokenSpanArray of length zero.

    The array holds no spans, but it is still constructed over
    `_TOKENS_ARRAY`, so it carries token and text information.
    """
    no_spans = TokenSpanArray(_TOKENS_ARRAY, [], [])
    return pd.Series(no_spans)
def _make_join_arg(self) -> pd.Series:
    """
    Build the shared join argument that most of the following test cases
    use: five token spans over `_TOKENS_ARRAY`, wrapped in a series.
    """
    # [begin, end) token offsets of each example span
    token_ranges = [
        (23, 28),  # Knights of the Round Table
        (17, 19),  # searching for
        (1, 2),  # In
        (1, 2),  # In (second copy)
        (42, 45),  # Lancelot the Brave
    ]
    example_spans = [
        TokenSpan(_TOKENS_ARRAY, begin_tok, end_tok)
        for begin_tok, end_tok in token_ranges
    ]
    return pd.Series(TokenSpanArray._from_sequence(example_spans))
def extract_regex_tok(
    tokens: Union[SpanArray, pd.Series],
    compiled_regex: regex.Regex,
    min_len=1,
    max_len=1,
    output_col_name: str = "match",
):
    """
    Identify all (possibly overlapping) matches of a regular expression
    that start and end on token boundaries.

    :param tokens: ``SpanArray`` of token information, optionally wrapped in a
     `pd.Series`.
    :param compiled_regex: Regular expression to evaluate.
    :param min_len: Minimum match length in tokens
    :param max_len: Maximum match length (inclusive) in tokens
    :param output_col_name: (optional) name of column of matching spans in the
     returned DataFrame

    :returns: A single-column DataFrame containing a span for each match of the
     regex. Matches are grouped by length in tokens, and the index restarts
     for each length because the per-length results are concatenated as-is.
    :raises ValueError: If ``max_len < min_len``, because there is then
     nothing to concatenate.
    """
    tokens = SpanArray.make_array(tokens)

    num_tokens = len(tokens)
    # `otypes` must be given explicitly: without it, np.vectorize infers the
    # output type by calling the function on the first element and therefore
    # raises on zero-size input. That happens whenever a window length
    # exceeds the number of tokens (or there are no tokens at all).
    matches_regex_f = np.vectorize(
        lambda s: compiled_regex.fullmatch(s) is not None, otypes=[bool])

    # The built-in regex functionality of Pandas/Python does not have
    # an optimized single-pass RegexTok, so generate all the places
    # where there might be a match and run them through regex.fullmatch().
    # Note that this approach is asymptotically inefficient if max_len is large.
    # TODO: Performance tuning for both small and large max_len
    matches_list = []
    for cur_len in range(min_len, max_len + 1):
        # Every window of exactly cur_len consecutive tokens; empty when
        # cur_len > num_tokens.
        window_begin_toks = np.arange(0, num_tokens - cur_len + 1)
        window_end_toks = window_begin_toks + cur_len

        window_tok_spans = TokenSpanArray(tokens, window_begin_toks,
                                          window_end_toks)
        matches_list.append(
            pd.Series(window_tok_spans[matches_regex_f(
                window_tok_spans.covered_text)]))
    return pd.DataFrame({output_col_name: pd.concat(matches_list)})
def align_bert_tokens_to_corpus_tokens(
    spans_df: pd.DataFrame,
    corpus_toks_df: pd.DataFrame,
    spans_df_token_col: str = "span",
    corpus_df_token_col: str = "span",
    entity_type_col: str = "ent_type",
) -> pd.DataFrame:
    """
    Widen entity spans produced by a BERT-based model so that they line up
    with the token boundaries of the corpus's own tokenization.

    :param spans_df: DataFrame of extracted entities. Must contain two
     columns with span and entity type information, respectively. Other
     columns ignored.
    :param corpus_toks_df: DataFrame of the corpus's original tokenization,
     one row per token.
     Must contain a column with character-based spans of the tokens.
    :param spans_df_token_col: the name of the column in ``spans_df``
     containing its tokenization. By default, ``'span'``
    :param corpus_df_token_col: the name of the column in ``corpus_toks_df``
     that contains its tokenization. By default ``'span'``
    :param entity_type_col: the name of the column in ``spans_df`` that
     contains the entity types of the elements

    :returns: A new DataFrame with the columns ``"span"`` and
     ``entity_type_col``, where the "span" column contains token-based spans
     based off the *corpus* tokenization in
     ``corpus_toks_df[corpus_df_token_col]``. An empty ``spans_df`` is
     returned as a copy, unchanged.
    """
    if spans_df.shape[0] == 0:
        return spans_df.copy()

    corpus_tokens = corpus_toks_df[corpus_df_token_col]

    # Pair every entity span with each corpus token it overlaps, then pull
    # the remaining entity columns back in.
    token_pairs = spanner.overlap_join(
        spans_df[spans_df_token_col],
        corpus_tokens,
        "span",
        "corpus_token",
    )
    overlaps_df = token_pairs.merge(
        spans_df, left_on="span", right_on=spans_df_token_col)

    # One row per entity span: combine its corpus tokens, keep its type.
    aggregations = {"corpus_token": "sum", entity_type_col: "first"}
    agg_df = overlaps_df.groupby("span").aggregate(aggregations).reset_index()

    consolidated = spanner.consolidate(agg_df, "corpus_token")
    cons_df = consolidated[["corpus_token", entity_type_col]].rename(
        columns={"corpus_token": "span"})

    # Re-express the widened spans as token-based spans over the corpus tokens
    cons_df["span"] = TokenSpanArray.align_to_tokens(
        corpus_tokens, cons_df["span"])
    return cons_df
def make_span_from_entities(char_span: SpanArray,
                            entities_frame: pd.DataFrame,
                            entity_col: str = "text") -> TokenSpanArray:
    """
    Locate entity texts inside a tokenized document and return them as a
    token span array over the document's tokens.

    Starting at each token, the covered text of consecutive tokens is joined
    with single spaces for as long as some entity text starts with the
    joined string. The longest joined string that equals an entity text
    becomes a match, and scanning resumes after the match.

    :param char_span: Parsed tokens
    :param entities_frame: Entities DataFrame from `parse_response`
    :param entity_col: Column name for the entity text
    :return: TokenSpanArray for matching entities
    """
    entities = entities_frame[entity_col]
    entities_len = entities.str.len()
    total_tokens = len(char_span)

    begins = []
    ends = []

    pos = 0
    while pos < total_tokens:
        candidate = char_span[pos].covered_text
        match_end = pos  # stays at `pos` until a complete match is seen
        width = 1  # number of tokens currently joined into `candidate`

        while True:
            prefix_hits = entities.str.startswith(candidate)
            if not any(prefix_hits):
                # No entity begins with this text; extending cannot help
                break

            # Have a complete match, advance the end index
            if any(entities_len[prefix_hits] == len(candidate)):
                match_end = pos + width

            if pos + width >= total_tokens:
                # Ran out of tokens to append
                break

            # Try the next token
            candidate = candidate + " " + char_span[pos + width].covered_text
            width += 1

        if match_end == pos:
            pos += 1
        else:
            begins.append(pos)
            ends.append(match_end)
            pos = match_end

    return TokenSpanArray(char_span, begins, ends)
# Test case: runs overlap_join() between a fixed set of five token spans over
# _TOKENS_ARRAY and the matches in _CAPS_WORD["match"], once in each argument
# order, and compares str() of each result against an expected table passed
# through textwrap.dedent().
# NOTE(review): both expected-table literals are whitespace-sensitive, and
# their column alignment cannot be verified from this view -- confirm them
# against the actual rendering of the join results before relying on this test.
def test_overlaps_join(self): join_arg = pd.Series( TokenSpanArray._from_sequence( [ TokenSpan(_TOKENS_ARRAY, 23, 28), # Knights of the Round Table TokenSpan(_TOKENS_ARRAY, 17, 19), # searching for TokenSpan(_TOKENS_ARRAY, 1, 2), # In TokenSpan(_TOKENS_ARRAY, 1, 2), # In (second copy) TokenSpan(_TOKENS_ARRAY, 42, 45), # Lancelot the Brave ] ) ) result1 = overlap_join(join_arg, _CAPS_WORD["match"]) self.assertEqual( str(result1), textwrap.dedent( """\ first second 0 [23, 28): 'Knights of the Round Table' [23, 24): 'Knights' 1 [23, 28): 'Knights of the Round Table' [26, 27): 'Round' 2 [23, 28): 'Knights of the Round Table' [27, 28): 'Table' 3 [1, 2): 'In' [1, 2): 'In' 4 [1, 2): 'In' [1, 2): 'In' 5 [42, 45): 'Lancelot the Brave' [42, 43): 'Lancelot' 6 [42, 45): 'Lancelot the Brave' [44, 45): 'Brave'""" ), ) result2 = overlap_join(_CAPS_WORD["match"], join_arg) self.assertEqual( str(result2), textwrap.dedent( """\ first second 0 [1, 2): 'In' [1, 2): 'In' 1 [1, 2): 'In' [1, 2): 'In' 2 [23, 24): 'Knights' [23, 28): 'Knights of the Round Table' 3 [26, 27): 'Round' [23, 28): 'Knights of the Round Table' 4 [27, 28): 'Table' [23, 28): 'Knights of the Round Table' 5 [42, 43): 'Lancelot' [42, 45): 'Lancelot the Brave' 6 [44, 45): 'Brave' [42, 45): 'Lancelot the Brave'""" ), )
def _make_sentences_series(spacy_doc, tokens: SpanArray):
    """
    Subroutine of :func:`make_tokens_and_features`

    :param spacy_doc: parsed document (:class:`spacy.tokens.doc.Doc`) from a
     spaCy language model
    :param tokens: Token information for the current document as a
     :class:`SpanArray` object. Must contain the same tokens as `spacy_doc`.

    :returns: a Pandas Series with one element per token, holding the token
     span of the (single) sentence that the token is in
    """
    token_count = len(spacy_doc)

    # [begin, end) token offsets of the enclosing sentence, per token.
    # Tokens not covered by any sentence keep the -1 placeholder.
    sentence_begins = np.full(token_count, -1, dtype=np.int32)
    sentence_ends = np.full(token_count, -1, dtype=np.int32)

    for sentence in spacy_doc.sents:
        covered = slice(sentence.start, sentence.end)
        sentence_begins[covered] = sentence.start
        sentence_ends[covered] = sentence.end

    sentence_spans = TokenSpanArray(tokens, sentence_begins, sentence_ends)
    return pd.Series(sentence_spans)
def conll_to_bert(
    df: pd.DataFrame,
    tokenizer: Any,
    bert: Any,
    token_class_dtype: pd.CategoricalDtype,
    compute_embeddings: bool = True,
    overlap: int = 32,
    non_overlap: int = 64,
) -> pd.DataFrame:
    """
    Re-tokenize one CoNLL document with a BERT tokenizer and carry its
    entity labels over to the new tokens.

    :param df: One DataFrame from the :func:`conll_2003_to_dataframes`
     function, representing the tokens of a single document in the original
     tokenization.
    :param tokenizer: BERT tokenizer instance from the `transformers` library
    :param bert: PyTorch-based BERT model from the `transformers` library
    :param token_class_dtype: Pandas categorical type for representing
     token class labels, as returned by :func:`make_iob_tag_categories`
    :param compute_embeddings: True to generate BERT embeddings at each token
     position and add a column "embedding" to the returned DataFrame with
     the embeddings
    :param overlap: (optional) how much overlap there should be between
     adjacent windows for embeddings
    :param non_overlap: (optional) how much non-overlapping content there
     should be between the overlapping regions, at the middle of each window

    :returns: A version of the same DataFrame, but with BERT tokens, BERT
     embeddings for each token (if ``compute_embeddings`` is ``True``),
     and token class labels.
    """
    # Entities of the document, as spans over the original tokens
    entity_spans = conll.iob_to_spans(df)

    # Tokenize the document's full text with the BERT tokenizer
    document_text = df["span"].values[0].target_text
    bert_toks_df = make_bert_tokens(document_text, tokenizer)

    # Snap the entity spans onto BERT token boundaries, then turn them back
    # into per-token IOB tags and entity types.
    aligned_spans = TokenSpanArray.align_to_tokens(bert_toks_df["span"],
                                                   entity_spans["span"])
    iob_columns = conll.spans_to_iob(aligned_spans, entity_spans["ent_type"])
    bert_toks_df[["ent_iob", "ent_type"]] = iob_columns

    bert_toks_df = conll.add_token_classes(bert_toks_df, token_class_dtype)
    if not compute_embeddings:
        return bert_toks_df
    return add_embeddings(bert_toks_df, bert, overlap, non_overlap)
def _make_syntax_dataframes(syntax_response, original_text):
    """
    Turn the "tokens" and "sentences" entries of a syntax response into a
    pair of DataFrames.

    :param syntax_response: Mapping that may contain the keys "tokens" and
     "sentences"; a missing key is treated as an empty list.
    :param original_text: The analyzed text, used to build character spans.

    :return: A tuple ``(token_df, sentence_df)``. `token_df` has a "span"
     column in place of the location and text columns. `sentence_df` gets
     "span" and "sentence_span" columns only when tokens were present.
     Either DataFrame is empty when its entry is empty.
    """
    token_records = syntax_response.get("tokens", [])
    sentence_records = syntax_response.get("sentences", [])

    if len(token_records) == 0:
        char_span = None
        token_df = pd.DataFrame()
    else:
        token_table = util.make_table(token_records)
        location_col, location_name = util.find_column(token_table,
                                                       "location")
        text_col, text_name = util.find_column(token_table, "text")
        char_span = util.make_char_span(location_col, text_col, original_text)

        # Drop location, text columns that are duplicated in char_span,
        # then add the span column to the DataFrame
        token_df = token_table.drop([location_name, text_name]).to_pandas()
        token_df['span'] = char_span

    if len(sentence_records) == 0:
        return token_df, pd.DataFrame()

    sentence_table = util.make_table(sentence_records)
    sentence_df = sentence_table.to_pandas()
    if char_span is not None:
        # Sentence spans can only be aligned to tokens when tokens exist
        location_col, _ = util.find_column(sentence_table, "location")
        text_col, _ = util.find_column(sentence_table, "text")
        sentence_char_span = util.make_char_span(location_col, text_col,
                                                 original_text)
        sentence_span = TokenSpanArray.align_to_tokens(
            char_span, sentence_char_span)
        sentence_df['span'] = sentence_char_span
        sentence_df['sentence_span'] = sentence_span

    return token_df, sentence_df
def iob_to_spans(
    token_features: pd.DataFrame,
    iob_col_name: str = "ent_iob",
    span_col_name: str = "span",
    entity_type_col_name: str = "ent_type",
):
    """
    Convert token tags in Inside–Outside–Beginning (IOB2) format to a series of
    `TokenSpan`s of entities. See
    https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)
    for more information on IOB2 format.

    :param token_features: DataFrame of token features in the format returned by
     `make_tokens_and_features`. Index labels are used as token positions, so
     the index is expected to be the default 0..n-1 range.
    :param iob_col_name: Name of a column in `token_features` that contains the
     IOB2 tags as strings, "I", "O", or "B".
    :param span_col_name: Name of a column in `token_features` that
     contains the tokens as a `SpanArray`.
    :param entity_type_col_name: Optional name of a column in `token_features`
     that contains entity type information; or `None` if no such column exists.

    :return: A `pd.DataFrame` with the following columns:
     * `span`: Span (with token offsets) of each entity
     * `<value of entity_type_col_name>`: (optional) Entity type
    """
    # Start out with 1-token prefixes of all entities.
    begin_mask = token_features[iob_col_name] == "B"
    first_tokens = token_features[begin_mask].index
    if entity_type_col_name is None:
        # Placeholder values; this column is dropped from the result
        entity_types = np.zeros(len(first_tokens))
    else:
        entity_types = token_features[begin_mask][entity_type_col_name]

    # Add an extra "O" tag to the end of the IOB column to simplify the logic
    # for handling the case where the document ends with an entity.
    # pd.concat is used because Series.append no longer exists in pandas 2.x.
    iob_series = pd.concat(
        [token_features[iob_col_name], pd.Series(["O"])]
    ).reset_index(drop=True)

    entity_prefixes = pd.DataFrame({
        "ent_type": entity_types,
        "begin": first_tokens,  # Inclusive
        "end": first_tokens + 1,  # Exclusive
        "next_tag": iob_series.iloc[first_tokens + 1].values,
    })

    df_list = []  # Type: List[pd.DataFrame]

    if len(entity_prefixes.index) == 0:
        # Code below needs at least one element in the list for schema
        df_list = [entity_prefixes]

    # Iteratively expand the prefixes: an entity is complete once the tag
    # that follows it is "O" or "B"; otherwise it grows by one token.
    while len(entity_prefixes.index) > 0:
        complete_mask = entity_prefixes["next_tag"].isin(["O", "B"])
        complete_entities = entity_prefixes[complete_mask]
        incomplete_entities = entity_prefixes[~complete_mask].copy()
        incomplete_entities["end"] = incomplete_entities["end"] + 1
        incomplete_entities["next_tag"] = iob_series.iloc[
            incomplete_entities["end"]].values
        df_list.append(complete_entities)
        entity_prefixes = incomplete_entities

    # Entities were collected shortest-first; sort spans by location instead.
    all_entities = pd.concat(df_list).sort_values("begin")

    # Convert [begin, end) pairs to spans
    entity_spans_array = TokenSpanArray(
        token_features[span_col_name].values,
        all_entities["begin"].values,
        all_entities["end"].values,
    )
    if entity_type_col_name is None:
        return pd.DataFrame({"span": entity_spans_array})
    else:
        return pd.DataFrame({
            "span": entity_spans_array,
            entity_type_col_name: all_entities["ent_type"].values,
        })
def extract_dict(
    tokens: Union[SpanArray, pd.Series],
    dictionary: pd.DataFrame,
    output_col_name: str = "match",
):
    """
    Identify all matches of a dictionary on a sequence of tokens.

    :param tokens: `SpanArray` of token information, optionally wrapped in a
     `pd.Series`.
    :param dictionary: The dictionary to match, encoded as a `pd.DataFrame` in
     the format returned by `load_dict()`: one row per entry, with columns
     "toks_0", "toks_1", ... holding the entry's normalized tokens and nulls
     after the entry's last token.
    :param output_col_name: (optional) name of column of matching spans in the
     returned DataFrame

    :return: a single-column DataFrame of token ID spans of dictionary matches,
     sorted by begin offset
    """
    # Box tokens into a pd.Series if not already boxed.
    if isinstance(tokens, SpanArray):
        tokens = pd.Series(tokens)

    # Wrap the important parts of the tokens series in a temporary dataframe.
    # noinspection PyUnresolvedReferences
    toks_tmp = pd.DataFrame({
        "token_id": tokens.index,
        "normalized_text": tokens.array.normalized_covered_text,
    })

    # Start by matching the first token.
    matches = pd.merge(dictionary,
                       toks_tmp,
                       left_on="toks_0",
                       right_on="normalized_text")
    matches.rename(columns={"token_id": "begin_token_id"}, inplace=True)
    matches_col_names = list(matches.columns)  # We'll need this later

    # Check against remaining elements of matching dictionary entries and
    # accumulate the begin and end token offsets of all complete matches.
    begins_list = []
    ends_list = []
    max_entry_len = len(dictionary.columns)
    for match_len in range(1, max_entry_len):
        next_tok_col = "toks_{}".format(match_len)

        # Find matches of length match_len. Dictionary entries of this length
        # will have None in the column "toks_<match_len>".
        match_locs = pd.isna(matches[next_tok_col])
        match_begins = matches[match_locs]["begin_token_id"].to_numpy()
        begins_list.append(match_begins)
        ends_list.append(match_begins + match_len)

        # For the remaining partial matches against longer dictionary entries,
        # check the next token by merging with the tokens dataframe.
        potential_matches = matches[~match_locs].copy()
        potential_matches.drop("normalized_text", axis=1, inplace=True)
        potential_matches["next_token_id"] = (
            potential_matches["begin_token_id"] + match_len)
        potential_matches = pd.merge(potential_matches,
                                     toks_tmp,
                                     left_on="next_token_id",
                                     right_on="token_id")
        potential_matches = potential_matches[
            potential_matches["normalized_text"] ==
            potential_matches[next_tok_col]]
        # The result of the join has some extra columns that we don't need.
        matches = potential_matches[matches_col_names]

    # Whatever is left has matched every token column, i.e. these are complete
    # matches of the longest entries. The loop above only emits entries that
    # have a null in a later column, so without this step they would be lost
    # (and a one-column dictionary would produce nothing at all).
    if len(matches.index) > 0:
        full_len_begins = matches["begin_token_id"].to_numpy()
        begins_list.append(full_len_begins)
        ends_list.append(full_len_begins + max_entry_len)

    # Gather together all the sets of matches and wrap in a dataframe.
    if begins_list:
        begins = np.concatenate(begins_list)
        ends = np.concatenate(ends_list)
    else:
        # np.concatenate() rejects an empty list; no matches at all.
        begins = np.zeros(0, dtype=int)
        ends = np.zeros(0, dtype=int)
    result = pd.DataFrame(
        {output_col_name: TokenSpanArray(tokens.values, begins, ends)})
    # Results are sorted by number of tokens; sort by location instead.
    result["__begin"] = result[output_col_name].values.begin
    return result.sort_values("__begin")[[output_col_name]]
def spans_to_iob(
    token_spans: Union[TokenSpanArray, List[TokenSpan], pd.Series],
    span_ent_types: Union[str, Iterable, np.ndarray, pd.Series] = None
) -> pd.DataFrame:
    """
    Convert a series of `TokenSpan`s of entities to token tags in
    Inside–Outside–Beginning (IOB2) format. See
    https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)
    for more information on IOB2 format.

    :param token_spans: An object that can be converted to a `TokenSpanArray`
     via `TokenSpanArray.make_array()`. Should contain `TokenSpan`s aligned
     with the target tokenization. Usually you create this array by calling
     `TokenSpanArray.align_to_tokens()`.
    :param span_ent_types: List of entity type strings corresponding to each of
     the elements of `token_spans`, a single string to use for every span, or
     `None` to indicate null entity tags.

    :return: A `pd.DataFrame` with two columns:
     * "ent_iob": IOB2 tags as a categorical of the strings "O", "B", "I"
     * "ent_type": Entity type strings (or null values if `span_ent_types`
       is `None`)
     The DataFrame has one row per token, or no rows when `token_spans` is
     empty.
    """
    # Normalize inputs
    token_spans = TokenSpanArray.make_array(token_spans)
    num_spans = len(token_spans)
    if span_ent_types is None:
        span_ent_types = [None] * num_spans
    elif isinstance(span_ent_types, str):
        span_ent_types = [span_ent_types] * num_spans
    elif isinstance(span_ent_types, pd.Series):
        span_ent_types = span_ent_types.values

    # Define the IOB categorical type with "O" == 0, "B"==1, "I"==2
    iob2_dtype = pd.CategoricalDtype(["O", "B", "I"], ordered=False)

    # Handle an empty token span array
    if num_spans == 0:
        return pd.DataFrame({
            "ent_iob": pd.Series(dtype=iob2_dtype),
            "ent_type": pd.Series(dtype="string")
        })

    span_begins = token_spans.begin_token
    span_ends = token_spans.end_token

    # Initialize an IOB series with all 'O' entities (code 0 == "O")
    all_outside = np.zeros_like(token_spans.tokens.begin, dtype=np.int64)
    iob_tags = pd.Categorical.from_codes(codes=all_outside, dtype=iob2_dtype)

    # Assign the begin tags
    iob_tags[span_begins] = "B"

    # Fill in the remaining inside tags: only spans longer than one token
    # have any, and they run from the second token to the end of the span.
    has_inside = (span_ends - (span_begins + 1)) > 0
    inside_ranges = zip(span_begins[has_inside] + 1, span_ends[has_inside])
    for inside_begin, inside_end in inside_ranges:
        iob_tags[inside_begin:inside_end] = "I"

    # Use a similar process to generate entity type tags
    ent_types = np.full(len(token_spans.tokens), None, dtype=object)
    for ent_type, first_tok, end_tok in zip(span_ent_types, span_begins,
                                            span_ends):
        ent_types[first_tok:end_tok] = ent_type

    return pd.DataFrame({
        "ent_iob": iob_tags,
        "ent_type": pd.Series(ent_types, dtype="string")
    })
def arrow_to_token_span(extension_array: pa.ExtensionArray) -> TokenSpanArray:
    """
    Rebuild a TokenSpanArray from a pyarrow.ExtensionArray whose type is
    ArrowTokenSpanType.

    :param extension_array: pyarrow.ExtensionArray with type ArrowTokenSpanType,
     or a pyarrow.ChunkedArray holding exactly one such chunk
    :return: TokenSpanArray
    :raises NotImplementedError: If the installed PyArrow is older than 2.0.0
    :raises ValueError: If a ChunkedArray with more than one chunk is passed
    """
    if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
        raise NotImplementedError(
            "Arrow serialization for TokenSpanArray is not supported with "
            "PyArrow versions < 2.0.0")

    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError(
                "Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    storage = extension_array.storage
    assert pa.types.is_struct(storage.type)

    # Get the begins/ends pyarrow arrays
    token_begins_array = storage.field(ArrowTokenSpanType.BEGINS_NAME)
    token_ends_array = storage.field(ArrowTokenSpanType.ENDS_NAME)

    # Get the tokens as a dictionary array where indices map to a list of
    # ArrowSpanArrays
    tokens_dict_array = storage.field(ArrowTokenSpanType.TOKENS_NAME)
    tokens_indices = tokens_dict_array.indices
    arrow_tokens_arrays_array = tokens_dict_array.dictionary

    # Break up the list of ArrowSpanArrays and convert back to individual
    # SpanArrays. Consecutive offsets delimit one list element each.
    list_offsets = arrow_tokens_arrays_array.offsets.to_pylist()
    tokens_arrays = []
    span_type = None
    for start, stop in zip(list_offsets[:-1], list_offsets[1:]):
        arrow_tokens_array = arrow_tokens_arrays_array.values[start:stop]

        # Make an instance of ArrowSpanType, derived from the first element
        # and reused for all the others
        if span_type is None:
            begins_array = arrow_tokens_array.field(ArrowSpanType.BEGINS_NAME)
            target_text_dict_array = arrow_tokens_array.field(
                ArrowSpanType.TARGET_TEXT_DICT_NAME)
            span_type = ArrowSpanType(begins_array.type,
                                      target_text_dict_array.type)

        # Re-make the Arrow extension type to convert back to a SpanArray
        rewrapped = pa.ExtensionArray.from_storage(span_type,
                                                   arrow_tokens_array)
        tokens_arrays.append(arrow_to_span(rewrapped))

    # Map the token indices to the actual token SpanArray for each element in
    # the TokenSpanArray; null indices get the shared empty placeholder
    tokens = []
    for tokens_index in tokens_indices.to_pylist():
        if tokens_index is None:
            tokens.append(_EMPTY_SPAN_ARRAY_SINGLETON)
        else:
            tokens.append(tokens_arrays[tokens_index])

    # Zero-copy convert arrays to numpy
    token_begins = token_begins_array.to_numpy()
    token_ends = token_ends_array.to_numpy()

    return TokenSpanArray(tokens, token_begins, token_ends)
# SpaCy tokenizer (only) setup nlp = English() # Create a Tokenizer with the default settings for English # including punctuation rules and exceptions _tokenizer = nlp.tokenizer # Build up some example relations for the tests in this file _TEXT = """ In AD 932, King Arthur and his squire, Patsy, travel throughout Britain searching for men to join the Knights of the Round Table. Along the way, he recruits Sir Bedevere the Wise, Sir Lancelot the Brave, Sir Galahad the Pure... """ _TOKENS_SERIES = make_tokens(_TEXT, _tokenizer) _TOKENS_ARRAY = _TOKENS_SERIES.array # type: SpanArray _TOKEN_SPANS_ARRAY = TokenSpanArray.from_char_offsets(_TOKENS_ARRAY) _CAPS_WORD = extract_regex_tok(_TOKENS_ARRAY, regex.compile("[A-Z][a-z]*")) _CAPS_WORDS = extract_regex_tok( _TOKENS_ARRAY, regex.compile("[A-Z][a-z]*(\\s([A-Z][a-z]*))*"), 1, 2 ) _THE = extract_regex_tok(_TOKENS_ARRAY, regex.compile("[Tt]he")) class JoinTest(TestBase): def setUp(self): # Make it easier to see what's going on with join results self._prev_token_offsets_flag_value = TokenSpan.USE_TOKEN_OFFSETS_IN_REPR TokenSpan.USE_TOKEN_OFFSETS_IN_REPR = True def tearDown(self): # Restore TokenSpan repr formatting to avoid messing up other tests.
def arrow_to_token_span(extension_array: pa.ExtensionArray) -> TokenSpanArray:
    """
    Rebuild a TokenSpanArray from a pyarrow.ExtensionArray whose type is
    ArrowTokenSpanType.

    :param extension_array: pyarrow.ExtensionArray with type ArrowTokenSpanType,
     or a pyarrow.ChunkedArray holding exactly one such chunk
    :return: TokenSpanArray
    :raises ValueError: If a ChunkedArray with more than one chunk is passed
    """
    if isinstance(extension_array, pa.ChunkedArray):
        if extension_array.num_chunks > 1:
            raise ValueError(
                "Only pyarrow.Array with a single chunk is supported")
        extension_array = extension_array.chunk(0)

    storage = extension_array.storage
    assert pa.types.is_struct(storage.type)

    # Get target text from the begins field metadata and decode string
    metadata = storage.type[ArrowTokenSpanType.BEGINS_NAME].metadata
    target_text = metadata[ArrowSpanType.TARGET_TEXT_KEY]
    if isinstance(target_text, bytes):
        target_text = target_text.decode()

    # Get the begins/ends pyarrow arrays
    token_begins_array = storage.field(ArrowTokenSpanType.BEGINS_NAME)
    token_ends_array = storage.field(ArrowTokenSpanType.ENDS_NAME)

    # Check if CharSpans have been split into numbered fields; if so,
    # stitch the pieces back together in order.
    num_char_span_splits = extension_array.type.num_char_span_splits
    if num_char_span_splits > 0:
        split_ids = range(num_char_span_splits)
        char_begins_array = pa.concat_arrays([
            storage.field(ArrowSpanType.BEGINS_NAME + "_{}".format(i))
            for i in split_ids
        ])
        char_ends_array = pa.concat_arrays([
            storage.field(ArrowSpanType.ENDS_NAME + "_{}".format(i))
            for i in split_ids
        ])
    else:
        char_begins_array = storage.field(ArrowSpanType.BEGINS_NAME)
        char_ends_array = storage.field(ArrowSpanType.ENDS_NAME)

    # Remove any trailing padding: drop as many trailing entries as there
    # are nulls in each array
    begins_padding = char_begins_array.null_count
    if begins_padding > 0:
        char_begins_array = char_begins_array[:-begins_padding]
        char_ends_array = char_ends_array[:-char_ends_array.null_count]

    # Zero-copy convert arrays to numpy
    token_begins = token_begins_array.to_numpy()
    token_ends = token_ends_array.to_numpy()
    char_begins = char_begins_array.to_numpy()
    char_ends = char_ends_array.to_numpy()

    # Create the SpanArray, then the TokenSpanArray
    char_span = SpanArray(target_text, char_begins, char_ends)
    return TokenSpanArray(char_span, token_begins, token_ends)
def _doc_to_df(doc: List[_SentenceData],
               column_names: List[str],
               iob_columns: List[bool],
               space_before_punct: bool) -> pd.DataFrame:
    """
    Convert the "Python objects" representation of a document from a
    CoNLL-2003 file into a `pd.DataFrame` of token metadata.

    :param doc: List of Python objects that represents the document.
     Assumed to contain at least one sentence, and every sentence at least
     one token; the offset computations below index the first and last
     token of each sentence.
    :param column_names: Names for the metadata columns that come after the
     token text. These names will be used to generate the names of the
     dataframe that this function returns.
    :param iob_columns: Mask indicating which of the metadata columns after
     the token text should be treated as being in IOB format. If a column is
     in IOB format, the returned dataframe will contain *two* columns, holding
     IOB2 tags and entity type tags, respectively. For example, an input
     column "ent" will turn into output columns "ent_iob" and "ent_type".
    :param space_before_punct: If `True`, add whitespace before
     punctuation characters (and after left parentheses)
     when reconstructing the text of the document.
    :return: DataFrame with one row per token and the following columns:
     * `span`: Span of each token, with character offsets.
       Backed by the concatenation of the tokens in the document into
       a single string with one sentence per line.
     * One column per entry produced by `_make_empty_meta_values` for
       `column_names` / `iob_columns` (for example `ent_iob` and
       `ent_type` for an IOB-format input column "ent"), filled with the
       per-token metadata of each sentence.
     * `sentence`: Token-based span of the sentence containing each token.
     * `line_num`: line number of each token in the parsed file
    """
    # Character offsets of tokens in the reconstructed document
    begins_list = []  # Type: List[np.ndarray]
    ends_list = []  # Type: List[np.ndarray]

    # Reconstructed text of each sentence
    sentences_list = []  # Type: List[str]

    # Token offsets of sentences containing each token in the document.
    sentence_begins_list = []  # Type: List[np.ndarray]
    sentence_ends_list = []  # Type: List[np.ndarray]

    # Token metadata column values. Key is column name, value is metadata for
    # each token.
    meta_lists = _make_empty_meta_values(column_names, iob_columns)

    # Line numbers of the parsed file for each token in the doc
    doc_line_nums = []

    char_position = 0
    token_position = 0
    for sentence in doc:
        tokens = sentence.tokens

        # Don't put spaces before punctuation in the reconstituted string.
        # NOTE: Use the builtin `bool` as the dtype. The `np.bool` alias was
        # deprecated in NumPy 1.20 and removed in NumPy 1.24.
        no_space_before_mask = (
            np.zeros(len(tokens), dtype=bool) if space_before_punct
            else _SPACE_BEFORE_MATCH_FN(tokens))
        no_space_after_mask = (
            np.zeros(len(tokens), dtype=bool) if space_before_punct
            else _SPACE_AFTER_MATCH_FN(tokens))
        no_space_before_mask[0] = True  # No space before first token
        no_space_after_mask[-1] = True  # No space after last token

        # Shift by one so that position i holds the "no space after" flag of
        # token i - 1, i.e. whether token i must directly follow its
        # predecessor.
        shifted_no_space_after_mask = np.roll(no_space_after_mask, 1)
        prefixes = np.where(
            np.logical_or(no_space_before_mask,
                          shifted_no_space_after_mask),
            "", " ")

        # Interleave prefixes and tokens: prefix0, token0, prefix1, token1...
        string_parts = np.ravel((prefixes, tokens), order="F")
        sentence_text = "".join(string_parts)
        sentences_list.append(sentence_text)

        lengths = np.array([len(t) for t in tokens])
        prefix_lengths = np.array([len(p) for p in prefixes])

        # Begin and end offsets, accounting for which tokens have spaces
        # before them.
        e = np.cumsum(lengths + prefix_lengths)
        b = e - lengths
        begins_list.append(b + char_position)
        ends_list.append(e + char_position)

        # Every token of the sentence gets the same [begin, end) token range.
        sentence_begin_token = token_position
        sentence_end_token = token_position + len(e)
        sentence_begins_list.append(np.repeat(sentence_begin_token, len(e)))
        sentence_ends_list.append(np.repeat(sentence_end_token, len(e)))

        for k, values in sentence.token_metadata.items():
            meta_lists[k].extend(values)

        char_position += e[-1] + 1  # "+ 1" to account for newline
        token_position += len(e)
        doc_line_nums.extend(sentence.line_nums)

    begins = np.concatenate(begins_list)
    ends = np.concatenate(ends_list)
    doc_text = "\n".join(sentences_list)
    char_spans = SpanArray(doc_text, begins, ends)
    sentence_spans = TokenSpanArray(char_spans,
                                    np.concatenate(sentence_begins_list),
                                    np.concatenate(sentence_ends_list))

    ret = pd.DataFrame({"span": char_spans})
    for k, v in meta_lists.items():
        ret[k] = v
    ret["sentence"] = sentence_spans
    ret["line_num"] = pd.Series(doc_line_nums)
    return ret