import warnings
from typing import Dict, List, Tuple, Union

import numpy as np
import pandas as pd
import pyarrow as pa

# The two imports below assume the text_extensions_for_pandas package layout.
from text_extensions_for_pandas.array.token_span import TokenSpanArray
from text_extensions_for_pandas.io.watson import util


def _make_syntax_dataframes(syntax_response, original_text):
    tokens = syntax_response.get("tokens", [])
    sentences = syntax_response.get("sentences", [])

    if len(tokens) > 0:
        token_table = util.make_table(tokens)
        location_col, location_name = util.find_column(token_table, "location")
        text_col, text_name = util.find_column(token_table, "text")
        char_span = util.make_char_span(location_col, text_col, original_text)

        # Drop the location and text columns that are duplicated in char_span
        token_table = token_table.drop([location_name, text_name])

        # Add the span column to the tokens DataFrame
        token_df = token_table.to_pandas()
        token_df['span'] = char_span
    else:
        char_span = None
        token_df = pd.DataFrame()

    if len(sentences) > 0:
        sentence_table = util.make_table(sentences)
        sentence_df = sentence_table.to_pandas()
        if char_span is not None:
            location_col, _ = util.find_column(sentence_table, "location")
            text_col, _ = util.find_column(sentence_table, "text")
            sentence_char_span = util.make_char_span(
                location_col, text_col, original_text)
            sentence_span = TokenSpanArray.align_to_tokens(
                char_span, sentence_char_span)
            sentence_df['span'] = sentence_char_span
            sentence_df['sentence_span'] = sentence_span
    else:
        sentence_df = pd.DataFrame()

    return token_df, sentence_df
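# Illustrative usage sketch (not part of the original module): shows the
# input shape _make_syntax_dataframes expects, i.e. the "syntax" section of a
# parsed Watson NLU response in which every token and sentence record carries
# a "location" offset pair and its covered "text". The record field names and
# the behavior of util.make_char_span are assumptions here.
def _demo_syntax_dataframes():
    text = "Hello world."
    syntax_response = {
        "tokens": [
            {"text": "Hello", "location": [0, 5]},
            {"text": "world", "location": [6, 11]},
            {"text": ".", "location": [11, 12]},
        ],
        "sentences": [{"text": "Hello world.", "location": [0, 12]}],
    }
    token_df, sentence_df = _make_syntax_dataframes(syntax_response, text)
    # token_df: one row per token with a character-level "span" column;
    # sentence_df: a character-level "span" plus a token-aligned
    # "sentence_span" column.
    return token_df, sentence_df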
def _make_entity_dataframes(
        entities: List,
        original_text: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Create the entities and entity_mentions DataFrames.

    :param entities: The "entities" section of a parsed NLU response
    :param original_text: Text of the document. This argument must be
     provided if there are entity mention spans.
    """
    if len(entities) == 0:
        return pd.DataFrame(), pd.DataFrame()

    table = util.make_table(entities)

    # Check if the response includes entity mentions
    mention_name_cols = [(name, table.column(name))
                         for name in table.column_names
                         if name.lower().startswith("mentions")]

    # Make entities and entity mentions (optional) DataFrames
    if len(mention_name_cols) > 0:
        mention_names, mention_cols = zip(*mention_name_cols)

        # Create the entities DataFrame with mention arrays dropped
        table = table.drop(mention_names)
        pdf = table.to_pandas()

        # Flatten the mention arrays to be put in a separate table
        mention_arrays = [
            pa.concat_arrays(col.iterchunks()) for col in mention_cols
        ]
        flat_mention_arrays = [a.flatten() for a in mention_arrays]
        table_mentions = pa.Table.from_arrays(flat_mention_arrays,
                                              names=mention_names)

        # Convert the location/text columns to a span
        location_col, location_name = util.find_column(table_mentions,
                                                       "location")
        text_col, text_name = util.find_column(table_mentions, "text")
        if original_text is None:
            raise ValueError(
                "Unable to construct target text for converting entity "
                "mentions to spans")
        char_span = util.make_char_span(location_col, text_col, original_text)
        table_mentions = table_mentions.drop([location_name, text_name])

        # Create the entity_mentions DataFrame
        pdf_mentions = table_mentions.to_pandas()
        pdf_mentions["span"] = char_span

        # Align the index of the parent entities DataFrame with the flattened
        # mentions DataFrame and forward-fill the parent values
        mention_offsets = mention_arrays[0].offsets.to_numpy()
        pdf_parent = pdf.set_index(mention_offsets[:-1])
        pdf_parent = pdf_parent.reindex(pdf_mentions.index, method="ffill")

        # Add columns from the entities parent DataFrame
        pdf_mentions["text"] = pdf_parent["text"]
        pdf_mentions["type"] = pdf_parent["type"]

        # Remove the "mentions." prefix from column names
        pdf_mentions.rename(columns={
            c: c.split("mentions.")[-1] for c in pdf_mentions.columns
        }, inplace=True)
    else:
        pdf = table.to_pandas()
        pdf_mentions = pd.DataFrame()

    return pdf, pdf_mentions
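# Illustrative usage sketch (not part of the original module): the "entities"
# section of a parsed NLU response with mentions enabled, so each entity
# record carries a "mentions" list of location/text records. Field names
# follow the Watson NLU schema; util.make_table is assumed to flatten the
# mention structs into list columns named "mentions.text"/"mentions.location".
def _demo_entity_dataframes():
    text = "IBM opened an office in Paris."
    entities = [
        {
            "type": "Company",
            "text": "IBM",
            "mentions": [{"text": "IBM", "location": [0, 3]}],
        },
        {
            "type": "Location",
            "text": "Paris",
            "mentions": [{"text": "Paris", "location": [24, 29]}],
        },
    ]
    entities_df, mentions_df = _make_entity_dataframes(entities, text)
    # entities_df: one row per entity, mention arrays dropped;
    # mentions_df: one row per mention with a "span" column and the parent
    # entity's "text" and "type" forward-filled onto it.
    return entities_df, mentions_df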
def _make_relations_dataframe_zero_copy(relations):
    if len(relations) == 0:
        return pd.DataFrame()

    table = util.make_table(relations)

    # Separate each argument into a column
    flattened_arguments = []
    drop_cols = []
    for name in table.column_names:
        if name.lower().startswith("arguments"):
            col = pa.concat_arrays(table.column(name).iterchunks())
            assert pa.types.is_list(col.type)
            is_nested_list = pa.types.is_list(col.type.value_type)
            name_split = name.split('.', maxsplit=1)
            first_list = col[0]
            num_arguments = len(first_list)
            null_count = 0

            # Get the flattened raw values
            raw = col
            offset_arrays = []
            while pa.types.is_list(raw.type):
                offset_arrays.append(raw.offsets)
                null_count += raw.null_count
                raw = raw.flatten()

            # TODO handle lists with null values
            if null_count > 0:
                continue

            # Convert values to numpy
            values = raw.to_numpy(zero_copy_only=False)  # string might copy
            offsets_list = [o.to_numpy() for o in offset_arrays]

            # Compute the length of each list in the array
            value_offsets = offsets_list.pop()
            value_lengths = value_offsets[1:] - value_offsets[:-1]

            # Separate the arguments into individual columns
            for i in range(num_arguments):
                arg_name = "{}.{}.{}".format(name_split[0], i, name_split[1])
                arg_lengths = value_lengths[i::num_arguments]

                # Fixed-length arrays can be sliced
                if not is_nested_list or len(np.unique(arg_lengths)) == 1:
                    num_elements = len(first_list[i]) if is_nested_list else 1

                    # Only 1 element, so leave in a primitive array
                    if not is_nested_list or num_elements == 1:
                        arg_values = values[i::num_arguments]
                        arg_array = pa.array(arg_values)
                    # Multiple elements, so put back in a list array
                    else:
                        arg_values = values.reshape(
                            [len(col) * num_arguments, num_elements])
                        # Rows interleave the arguments per relation, so take
                        # every num_arguments-th row for argument i
                        arg_values = arg_values[i::num_arguments]
                        arg_values = arg_values.flatten()
                        arg_offsets = np.cumsum(arg_lengths)
                        arg_offsets = np.insert(arg_offsets, 0, 0)
                        arg_array = pa.ListArray.from_arrays(
                            arg_offsets, arg_values)
                else:
                    # TODO Argument properties with variable-length arrays are
                    #  not currently supported
                    continue

                flattened_arguments.append((arg_array, arg_name))
            drop_cols.append(name)

    # Add the flattened argument columns
    for arg_array, arg_name in flattened_arguments:
        table = table.append_column(arg_name, arg_array)

    # Drop columns that have been flattened
    table = table.drop(drop_cols)

    return table.to_pandas()
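# Illustrative usage sketch (not part of the original module): two relations
# shaped like the NLU "relations" section, each with exactly two arguments
# (simplified; real responses also nest "entities" under each argument).
# Columns such as "arguments.text", holding one list per relation, come back
# out as per-argument columns "arguments.0.text" and "arguments.1.text".
def _demo_relations_zero_copy():
    relations = [
        {
            "type": "employedBy",
            "score": 0.86,
            "sentence": "John works for IBM.",
            "arguments": [
                {"text": "John", "location": [0, 4]},
                {"text": "IBM", "location": [15, 18]},
            ],
        },
        {
            "type": "basedIn",
            "score": 0.64,
            "sentence": "IBM is based in Armonk.",
            "arguments": [
                {"text": "IBM", "location": [0, 3]},
                {"text": "Armonk", "location": [16, 22]},
            ],
        },
    ]
    return _make_relations_dataframe_zero_copy(relations)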
def _make_relations_dataframe(relations, original_text, sentence_span_series):
    if len(relations) == 0:
        return pd.DataFrame()

    table = util.make_table(relations)

    location_cols = {}  # type: Dict[int, Tuple[Union[pa.Array, pa.ChunkedArray], str]]

    # Separate each argument into a column
    flattened_arguments = []
    drop_cols = []
    for name in table.column_names:
        if name.lower().startswith("arguments"):
            col = pa.concat_arrays(table.column(name).iterchunks())
            assert pa.types.is_list(col.type)
            name_split = name.split('.', maxsplit=1)
            num_arguments = len(col[0])
            value_series = col.values.to_pandas()

            # Separate the arguments into individual columns
            for i in range(num_arguments):
                arg_name = "{}.{}.{}".format(name_split[0], i, name_split[1])
                arg_series = value_series[i::num_arguments]
                arg_array = pa.array(arg_series)

                # If the list array is fixed length with 1 element, it can be
                # flattened
                temp = arg_array
                while pa.types.is_list(temp.type):
                    temp = temp.flatten()
                    if len(temp) == len(arg_array):
                        # TODO also need to verify each offset increments by 1?
                        arg_array = temp

                if name.lower().endswith("location"):
                    location_cols[i] = (arg_array,
                                        "{}.{}".format(name_split[0], i))

                flattened_arguments.append((arg_array, arg_name))
            drop_cols.append(name)

    # Add the flattened argument columns
    for arg_array, arg_name in flattened_arguments:
        table = table.append_column(arg_name, arg_array)

    # Replace argument location and text columns with spans
    arg_span_cols = {}
    for arg_i, (location_col, arg_prefix) in location_cols.items():
        text_col, text_name = util.find_column(table,
                                               "{}.text".format(arg_prefix))
        arg_span_cols["{}.span".format(arg_prefix)] = util.make_char_span(
            location_col, text_col, original_text)
        drop_cols.extend(["{}.location".format(arg_prefix), text_name])
    add_cols = arg_span_cols.copy()

    # Build the sentence span and drop the plain-text sentence column
    sentence_col, sentence_name = util.find_column(table, "sentence")
    arg_col_names = list(arg_span_cols.keys())
    if len(arg_col_names) > 0:
        first_arg_span_array = arg_span_cols[arg_col_names[0]]
        sentence_matches = []
        for i, arg_span in enumerate(first_arg_span_array):
            arg_begin = arg_span.begin
            arg_end = arg_span.end
            # Start the search in the middle of the sentences, then walk up
            # or down until the sentence containing the argument span is found
            j = len(sentence_span_series) // 2
            found = False
            while not found:
                sentence_span = sentence_span_series[j]
                if arg_begin >= sentence_span.end:
                    j += 1
                elif arg_end <= sentence_span.begin:
                    j -= 1
                else:
                    contains = [
                        sentence_span.contains(a[i])
                        for a in arg_span_cols.values()
                    ]
                    if not (all(contains) and sentence_span.covered_text
                            == sentence_col[i].as_py()):
                        msg = f"Mismatched sentence span for: {sentence_span}"
                        if not all(contains):
                            msg += f"\nContains Args: {all(contains)}"
                        if sentence_span.covered_text != sentence_col[i].as_py():
                            msg += f"\nSpanText: '{sentence_span.covered_text}'" \
                                   f"\nSentence: '{sentence_col[i]}'"
                        warnings.warn(msg)
                    sentence_matches.append(j)
                    found = True

        relations_sentence = sentence_span_series[sentence_matches]
        add_cols["sentence_span"] = relations_sentence.reset_index(drop=True)
        drop_cols.append(sentence_name)
    else:
        warnings.warn("Could not make sentence span column for relations")

    # Drop columns that have been flattened or replaced by spans
    table = table.drop(drop_cols)
    df = table.to_pandas()

    # Insert the additional span columns
    for col_name, col in add_cols.items():
        df[col_name] = col

    return df
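# Illustrative usage sketch (not part of the original module): builds the
# sentence_span_series argument from the output of _make_syntax_dataframes,
# which is how these helpers are assumed to be wired together by the caller.
# Locations are character offsets into the same original_text used for the
# syntax parse.
def _demo_relations_dataframe():
    text = "John works for IBM."
    syntax_response = {
        "tokens": [
            {"text": "John", "location": [0, 4]},
            {"text": "works", "location": [5, 10]},
            {"text": "for", "location": [11, 14]},
            {"text": "IBM", "location": [15, 18]},
            {"text": ".", "location": [18, 19]},
        ],
        "sentences": [{"text": text, "location": [0, 19]}],
    }
    _, sentence_df = _make_syntax_dataframes(syntax_response, text)
    relations = [{
        "type": "employedBy",
        "score": 0.86,
        "sentence": text,
        "arguments": [
            {"text": "John", "location": [0, 4]},
            {"text": "IBM", "location": [15, 18]},
        ],
    }]
    # The result carries "arguments.N.span" columns plus a "sentence_span"
    # column matched against the sentence spans from the syntax parse.
    return _make_relations_dataframe(relations, text, sentence_df['span'])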