Example #1

# Imports used by the examples in this listing.
from typing import Union

import pandas as pd
import spacy
import text_extensions_for_pandas as tp

def perform_targeted_dependency_parsing(
        spans_to_cover: Union[tp.SpanArray, pd.Series],
        language_model: spacy.language.Language) -> pd.DataFrame:
    """
    Optimized version of `perform_dependency_parsing` that we introduce in the
    third part of the series.
    
    Identifies the regions of the document to parse, parses those regions
    using spaCy's dependency parser, then converts the parser's output
    into a Pandas DataFrame of spans over the original document using Text
    Extensions for Pandas.
    """
    spans_to_cover = tp.SpanArray.make_array(spans_to_cover)

    # Special case: No spans. Return empty DataFrame with correct schema.
    if len(spans_to_cover) == 0:
        return pd.DataFrame({
            "id": pd.Series([], dtype=int),
            "span": pd.Series([], dtype=tp.SpanDtype()),
            "tag": pd.Series([], dtype=str),
            "dep": pd.Series([], dtype=str),
            "head": pd.Series([], dtype=int),
        })

    doc_text = spans_to_cover.document_text
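
    # Use a containment join to find the paragraphs that contain the spans
    # we need to cover.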
    all_paragraphs = find_paragraph_spans(doc_text)
    covered_paragraphs = tp.spanner.contain_join(pd.Series(all_paragraphs),
                                                 pd.Series(spans_to_cover),
                                                 "paragraph",
                                                 "span")["paragraph"].array

    offset = 0
    to_stack = []
    for paragraph_span in covered_paragraphs:
        # Tokenize and parse the paragraph
        paragraph_text = paragraph_span.covered_text
        paragraph_tokens = tp.io.spacy.make_tokens_and_features(
            paragraph_text,
            language_model)[["id", "span", "tag", "dep", "head"]]

        # Convert token spans to original document text
        span_array_before = paragraph_tokens["span"].array
        paragraph_tokens["span"] = \
            tp.SpanArray(paragraph_span.target_text,
                         paragraph_span.begin + span_array_before.begin,
                         paragraph_span.begin + span_array_before.end)

        # Adjust token IDs
        paragraph_tokens["id"] += offset
        paragraph_tokens["head"] += offset
        paragraph_tokens.index += offset

        to_stack.append(paragraph_tokens)
        offset += len(paragraph_tokens.index)
    return pd.concat(to_stack)
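
A minimal usage sketch (not part of the original series), assuming a spaCy model such as en_core_web_sm is installed and that `entity_mentions_df` is a DataFrame with a "span" column of entity mentions over a single document:

# Illustrative usage only. `entity_mentions_df` is assumed to hold a "span"
# column of entity-mention spans over one document.
nlp = spacy.load("en_core_web_sm")
parsed_tokens_df = perform_targeted_dependency_parsing(
    entity_mentions_df["span"], nlp)
print(parsed_tokens_df.head())
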
Example #2

def identify_persons_quoted_by_name(named_entities_result,
                                    semantic_roles_result=None
                                    ) -> pd.DataFrame:
    """
    The second phase of processing from the first part of this series, rolled into 
    a single function.
    
    :param named_entities_result: Response object from invoking Watson Natural Language
     Understanding's named entity model on the document
     
    :param semantic_roles_result: Response object from invoking Watson Natural Language
     Understanding's semantic roles model on the document, or None if the results of
     the semantic roles model are inside `named_entities_result`
    
    :returns: A Pandas DataFrame containing information about potential executives
     that the document quoted by name
    """

    # Convert the output of Watson Natural Language Understanding to DataFrames.
    dfs = tp.io.watson.nlu.parse_response(named_entities_result)
    entity_mentions_df = dfs["entity_mentions"]

    srl_dfs = (tp.io.watson.nlu.parse_response(semantic_roles_result)
               if semantic_roles_result is not None else dfs)
    semantic_roles_df = srl_dfs["semantic_roles"]

    # Extract mentions of person names
    person_mentions_df = entity_mentions_df[entity_mentions_df["type"] ==
                                            "Person"]

    # Extract instances of subjects that made statements
    quotes_df = semantic_roles_df[semantic_roles_df["action.normalized"] ==
                                  "say"]
    subjects_df = quotes_df[["subject.text"]].copy().reset_index(drop=True)

    # Identify the locations of subjects within the document.
    doc_text = entity_mentions_df["span"].array.document_text

    # Use str.index() to find where each string in "subject.text" begins
    begins = [doc_text.index(s) for s in subjects_df["subject.text"]]
    subjects_df["begin"] = pd.Series(begins, dtype=int)
    subjects_df["end"] = (subjects_df["begin"] +
                          subjects_df["subject.text"].str.len())
    subjects_df["span"] = tp.SpanArray(doc_text, subjects_df["begin"],
                                       subjects_df["end"])

    # Align subjects with person names
    execs_df = tp.spanner.contain_join(subjects_df["span"],
                                       person_mentions_df["span"], "subject",
                                       "person")
    return execs_df
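
A hedged usage sketch (not from the original code): `nlu_response` below is an illustrative placeholder for the response of a single Watson Natural Language Understanding request that asked for both the entities feature (with mentions enabled) and the semantic_roles feature.

# Illustrative usage only. `nlu_response` stands in for a Watson NLU response
# that contains both entity mentions and semantic roles for one document.
execs_df = identify_persons_quoted_by_name(nlu_response)
print(execs_df.head())
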
Example #3

import regex

def find_paragraph_spans(doc_text: str) -> tp.SpanArray:
    """
    Subroutine of perform_targeted_dependency_parsing that we introduce 
    in the third part of the series. Splits document text into paragraphs
    and returns a SpanArray containing one span per paragraph.
    """
    # Find paragraph boundaries
    break_locs = [(a.start(), a.end())
                  for a in regex.finditer(paragraph_break_re, doc_text)]
    boundaries = break_locs + [(len(doc_text), len(doc_text))]

    # Split the document on paragraph boundaries
    begins = []
    ends = []
    begin = 0
    for b in boundaries:
        end = b[0]
        if end > begin:  # Ignore zero-length paragraphs
            begins.append(begin)
            ends.append(end)
        begin = b[1]
    return tp.SpanArray(doc_text, begins, ends)
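
A small standalone sketch (illustrative only): the real `paragraph_break_re` is defined elsewhere in the series, so the pattern below is just a stand-in that treats two or more consecutive newlines as a paragraph break.

# Stand-in for the series' paragraph_break_re; the real pattern is defined
# elsewhere in the series.
paragraph_break_re = regex.compile(r"\n{2,}")

sample_text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."
paragraph_spans = find_paragraph_spans(sample_text)
for paragraph in paragraph_spans:
    print(paragraph.covered_text)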