from typing import Union

import pandas as pd
import regex
import spacy
import text_extensions_for_pandas as tp


def perform_targeted_dependency_parsing(
        spans_to_cover: Union[tp.SpanArray, pd.Series],
        language_model: spacy.language.Language) -> pd.DataFrame:
    """
    Optimized version of `perform_dependency_parsing` that we introduce in the
    third part of the series.

    Identifies regions of the document to parse, parses those regions using
    SpaCy's dependency parser, then converts the outputs of the parser into a
    Pandas DataFrame of spans over the original document using Text Extensions
    for Pandas.
    """
    spans_to_cover = tp.SpanArray.make_array(spans_to_cover)

    # Special case: No spans. Return an empty DataFrame with the correct schema.
    if len(spans_to_cover) == 0:
        return pd.DataFrame({
            "id": pd.Series([], dtype=int),
            "span": pd.Series([], dtype=tp.SpanDtype()),
            "tag": pd.Series([], dtype=str),
            "dep": pd.Series([], dtype=str),
            "head": pd.Series([], dtype=int),
        })

    doc_text = spans_to_cover.document_text
    all_paragraphs = find_paragraph_spans(doc_text)
    covered_paragraphs = tp.spanner.contain_join(
        pd.Series(all_paragraphs), pd.Series(spans_to_cover),
        "paragraph", "span")["paragraph"].array

    offset = 0
    to_stack = []
    for paragraph_span in covered_paragraphs:
        # Tokenize and parse the paragraph
        paragraph_text = paragraph_span.covered_text
        paragraph_tokens = tp.io.spacy.make_tokens_and_features(
            paragraph_text, language_model)[["id", "span", "tag", "dep", "head"]]

        # Convert token spans to locations within the original document text
        span_array_before = paragraph_tokens["span"].array
        paragraph_tokens["span"] = tp.SpanArray(
            paragraph_span.target_text,
            paragraph_span.begin + span_array_before.begin,
            paragraph_span.begin + span_array_before.end)

        # Adjust token IDs so they stay unique across paragraphs
        paragraph_tokens["id"] += offset
        paragraph_tokens["head"] += offset
        paragraph_tokens.index += offset
        to_stack.append(paragraph_tokens)
        offset += len(paragraph_tokens.index)

    return pd.concat(to_stack)
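

# Hypothetical usage sketch, not part of the original pipeline: shows one way
# to drive perform_targeted_dependency_parsing with a small spaCy model. The
# model name "en_core_web_sm", the example text, and the hand-built span are
# assumptions for illustration; in the series, spans_to_cover comes from
# Watson NLU entity mentions over a real press release.
def _example_targeted_parsing() -> pd.DataFrame:
    nlp = spacy.load("en_core_web_sm")
    text = ("Acme Corp. announced record earnings today.\n\n"
            "\"We are thrilled,\" said Jane Smith, chief executive of Acme.")
    # Pretend this single span over "Jane Smith" came from an upstream model.
    begin = text.index("Jane Smith")
    spans = tp.SpanArray(text, [begin], [begin + len("Jane Smith")])
    # Only the paragraph containing the span gets tokenized and parsed.
    return perform_targeted_dependency_parsing(spans, nlp)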


def identify_persons_quoted_by_name(named_entities_result,
                                    semantic_roles_result=None) -> pd.DataFrame:
    """
    The second phase of processing from the first part of this series, rolled
    into a single function.

    :param named_entities_result: Response object from invoking Watson Natural
     Language Understanding's named entity model on the document
    :param semantic_roles_result: Response object from invoking Watson Natural
     Language Understanding's semantic roles model on the document, or None if
     the results of the semantic roles model are inside `named_entities_result`

    :returns: A Pandas DataFrame containing information about potential
     executives that the document quoted by name
    """
    # Convert the output of Watson Natural Language Understanding to DataFrames.
    dfs = tp.io.watson.nlu.parse_response(named_entities_result)
    entity_mentions_df = dfs["entity_mentions"]
    srl_dfs = (tp.io.watson.nlu.parse_response(semantic_roles_result)
               if semantic_roles_result is not None else dfs)
    semantic_roles_df = srl_dfs["semantic_roles"]

    # Extract mentions of person names
    person_mentions_df = entity_mentions_df[entity_mentions_df["type"] == "Person"]

    # Extract instances of subjects that made statements
    quotes_df = semantic_roles_df[semantic_roles_df["action.normalized"] == "say"]
    subjects_df = quotes_df[["subject.text"]].copy().reset_index(drop=True)

    # Identify the locations of subjects within the document.
    doc_text = entity_mentions_df["span"].array.document_text

    # Use str.index() to find where the strings in "subject.text" begin
    begins = [doc_text.index(s) for s in subjects_df["subject.text"]]
    subjects_df["begin"] = pd.Series(begins, dtype=int)
    subjects_df["end"] = (subjects_df["begin"]
                          + subjects_df["subject.text"].str.len())
    subjects_df["span"] = tp.SpanArray(doc_text, subjects_df["begin"],
                                       subjects_df["end"])

    # Align subjects with person names
    execs_df = tp.spanner.contain_join(subjects_df["span"],
                                       person_mentions_df["span"],
                                       "subject", "person")
    return execs_df
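

# Hypothetical usage sketch, not part of the original pipeline: one way to
# obtain the NLU response object that identify_persons_quoted_by_name expects.
# The API key, service URL, version date, and document text are placeholders;
# the Features configuration mirrors the one used in the first part of the
# series, where a single analyze() call returns both entity mentions and
# semantic roles.
def _example_identify_persons(api_key: str, service_url: str,
                              document_text: str) -> pd.DataFrame:
    from ibm_watson import NaturalLanguageUnderstandingV1
    from ibm_watson.natural_language_understanding_v1 import (
        Features, EntitiesOptions, SemanticRolesOptions)
    from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

    nlu = NaturalLanguageUnderstandingV1(
        version="2021-01-01", authenticator=IAMAuthenticator(api_key))
    nlu.set_service_url(service_url)
    # One call retrieves both entity mentions and semantic roles, so the
    # second argument of identify_persons_quoted_by_name can stay None.
    response = nlu.analyze(
        text=document_text, return_analyzed_text=True,
        features=Features(entities=EntitiesOptions(mentions=True),
                          semantic_roles=SemanticRolesOptions())).get_result()
    return identify_persons_quoted_by_name(response)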


def find_paragraph_spans(doc_text: str) -> tp.SpanArray:
    """
    Subroutine of `perform_targeted_dependency_parsing` that we introduce in
    the third part of the series.

    Splits document text into paragraphs and returns a SpanArray containing
    one span per paragraph.
    """
    # Find paragraph boundaries
    break_locs = [(a.start(), a.end())
                  for a in regex.finditer(paragraph_break_re, doc_text)]
    boundaries = break_locs + [(len(doc_text), len(doc_text))]

    # Split the document on paragraph boundaries
    begins = []
    ends = []
    begin = 0
    for b in boundaries:
        end = b[0]
        if end > begin:  # Ignore zero-length paragraphs
            begins.append(begin)
            ends.append(end)
        begin = b[1]
    return tp.SpanArray(doc_text, begins, ends)
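

# Hypothetical usage sketch, not part of the original pipeline:
# find_paragraph_spans relies on the module-level pattern paragraph_break_re,
# which is defined elsewhere in this module. Assuming that pattern matches
# blank-line paragraph breaks, the three-paragraph sample below splits into
# three spans over the original string.
def _example_paragraph_spans() -> tp.SpanArray:
    sample = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."
    paragraphs = find_paragraph_spans(sample)
    for span in paragraphs:
        # Each span remembers its offsets into the original document text.
        print(span.begin, span.end, repr(span.covered_text))
    return paragraphs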