Example #1
import numpy as np
import pandas as pd

# Span types from the Text Extensions for Pandas library; the exact import
# path may differ between library versions.
from text_extensions_for_pandas import SpanArray, TokenSpanArray


def make_tokens_and_features(
    target_text: str, language_model, add_left_and_right: bool = False
) -> pd.DataFrame:
    """
    :param target_text: Text to analyze
    :param language_model: Preconfigured spaCy language model
     (`spacy.language.Language` object)
    :param add_left_and_right: If ``True``, add columns "left" and "right"
     containing references to previous and next tokens.

    :return: A `pd.DataFrame` with one row per token, containing the tokens of the
             text plus the additional linguistic features that the language model
             generates (lemma, part of speech, dependency information, IOB entity
             tags, sentence spans, and so on).
    """
    spacy_doc = language_model(target_text)

    # TODO: Performance tuning of the translation code that follows
    # Represent the character spans of the tokens
    tok_begins = np.array([t.idx for t in spacy_doc])
    tok_ends = np.array([t.idx + len(t) for t in spacy_doc])
    tokens_array = SpanArray(target_text, tok_begins, tok_ends)
    tokens_series = pd.Series(tokens_array)
    # Also build single-token token-based spans to make it easier to build
    # larger token-based spans.
    token_spans = TokenSpanArray.from_char_offsets(tokens_series.array)
    # spaCy identifies tokens by semi-arbitrary integer "indexes" (in practice,
    # the offset of the first character in the token). Translate from these
    # to a dense range of integer IDs that will correspond to the index of our
    # returned DataFrame.
    idx_to_id = {token.idx: i for i, token in enumerate(spacy_doc)}
    # Define the IOB categorical type with "O" == 0, "B"==1, "I"==2
    iob2_dtype = pd.CategoricalDtype(["O", "B", "I"], ordered=False)
    df_cols = {
        "id": range(len(tok_begins)),
        "span": tokens_series,
        "lemma": [t.lemma_ for t in spacy_doc],
        "pos": pd.Categorical([str(t.pos_) for t in spacy_doc]),
        "tag": pd.Categorical([str(t.tag_) for t in spacy_doc]),
        "dep": pd.Categorical([str(t.dep_) for t in spacy_doc]),
        "head": np.array([idx_to_id[t.head.idx] for t in spacy_doc]),
        "shape": pd.Categorical([t.shape_ for t in spacy_doc]),
        "ent_iob": pd.Categorical([str(t.ent_iob_) for t in spacy_doc],
                                  dtype=iob2_dtype),
        "ent_type": pd.Categorical([str(t.ent_type_) for t in spacy_doc]),
        "is_alpha": np.array([t.is_alpha for t in spacy_doc]),
        "is_stop": np.array([t.is_stop for t in spacy_doc]),
        "sentence": _make_sentences_series(spacy_doc, tokens_array),
    }
    if add_left_and_right:
        # Use nullable int type because these columns contain nulls
        df_cols["left"] = pd.array(
            [None] + list(range(len(tok_begins) - 1)), dtype=pd.Int32Dtype()
        )
        df_cols["right"] = pd.array(
            list(range(1, len(tok_begins))) + [None], dtype=pd.Int32Dtype()
        )
    return pd.DataFrame(df_cols)
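
A minimal usage sketch for the function above (assuming the spaCy
`en_core_web_sm` model has been downloaded and that `make_tokens_and_features`
is importable from the module shown; the text and column selection are
illustrative only):

import spacy

# Load a small English pipeline; any preconfigured spaCy model works here.
nlp = spacy.load("en_core_web_sm")

tokens_df = make_tokens_and_features(
    "King Arthur and his squire, Patsy, travel throughout Britain.",
    nlp,
    add_left_and_right=True,
)

# One row per token: span, lemma, part of speech, dependency head, IOB entity
# tag, sentence span, and (because add_left_and_right=True) neighbor token IDs.
print(tokens_df[["id", "span", "lemma", "pos", "ent_iob", "ent_type"]].head())
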
Example #2
import regex

from spacy.lang.en import English

# Helpers from the Text Extensions for Pandas library; the exact import paths
# may differ between library versions.
from text_extensions_for_pandas import SpanArray, TokenSpan, TokenSpanArray
from text_extensions_for_pandas.io.spacy import make_tokens
from text_extensions_for_pandas.spanner import extract_regex_tok

# `TestBase` is a shared unittest helper defined elsewhere in this test suite.

# spaCy tokenizer-only setup
nlp = English()
# Create a Tokenizer with the default settings for English,
# including punctuation rules and exceptions
_tokenizer = nlp.tokenizer

# Build up some example relations for the tests in this file
_TEXT = """
In AD 932, King Arthur and his squire, Patsy, travel throughout Britain 
searching for men to join the Knights of the Round Table. Along the way, he 
recruits Sir Bedevere the Wise, Sir Lancelot the Brave, Sir Galahad the Pure...
"""
_TOKENS_SERIES = make_tokens(_TEXT, _tokenizer)
_TOKENS_ARRAY = _TOKENS_SERIES.array  # type: SpanArray
_TOKEN_SPANS_ARRAY = TokenSpanArray.from_char_offsets(_TOKENS_ARRAY)
_CAPS_WORD = extract_regex_tok(_TOKENS_ARRAY, regex.compile("[A-Z][a-z]*"))
_CAPS_WORDS = extract_regex_tok(
    _TOKENS_ARRAY, regex.compile("[A-Z][a-z]*(\\s([A-Z][a-z]*))*"), 1, 2
)
_THE = extract_regex_tok(_TOKENS_ARRAY, regex.compile("[Tt]he"))
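
A quick look at what these fixtures hold (a sketch, assuming `extract_regex_tok`
returns a DataFrame of matching token spans, as in Text Extensions for Pandas):

# Single capitalized tokens, e.g. "King", "Arthur", "Britain".
print(_CAPS_WORD.head())

# Capitalized sequences of one or two tokens, e.g. "King Arthur", "Round Table".
print(_CAPS_WORDS.head())

# Every occurrence of "the"/"The" in the example text.
print(_THE.head())
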


class JoinTest(TestBase):
    def setUp(self):
        # Make it easier to see what's going on with join results
        self._prev_token_offsets_flag_value = TokenSpan.USE_TOKEN_OFFSETS_IN_REPR
        TokenSpan.USE_TOKEN_OFFSETS_IN_REPR = True

    def tearDown(self):
        # Restore TokenSpan repr formatting to avoid messing up other tests.