def query(self, query, names=None, propagate=False, keyerror='ignore'):
    if names is None:
        names = list(self.dfs.keys())
    elif isinstance(names, str):
        names = [names]
    dfs = dict(self.dfs)
    seen = set()
    for name1 in names:
        if name1 not in seen:
            try:
                dfs[name1] = dfs[name1].query(query)
                seen.add(name1)
            except UndefinedVariableError:
                if keyerror == "ignore":
                    continue
                else:
                    raise
            if propagate:
                for name2, df2 in dfs.items():
                    if name2 not in seen:
                        dfs[name2] = merge_with_spans(
                            dfs[name1][[c for c in dfs[name1].columns
                                        if c in df2.columns and c.endswith("_id")]],
                            df2,
                            how="inner")
                        seen.add(name2)
    # main_df = self.dfs[self.main].query(query).reset_index(drop=True)
    # dfs = [main_df, *((merge_with_spans(main_df[[c for c in main_df.columns if c in df.columns and c.endswith("_id")]], df, how="inner")
    #                    if df is not None else None)
    #                   for df in list(self.dfs.values())[1:])]
    return Dataset(**{key: df.copy() for key, df in dfs.items()})
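# Minimal usage sketch for `query` (hypothetical data and column names; assumes, as the
# method above does, that the surrounding Dataset class stores its frames in `self.dfs`).
# Frames lacking the queried column raise UndefinedVariableError, ignored by default;
# with propagate=True, the remaining frames are restricted through their shared "*_id"
# columns. The helper name below is purely illustrative.
def _example_query_usage():
    dataset = Dataset(
        docs=pd.DataFrame({"doc_id": ["d0", "d1"], "text": ["first doc", "second doc"]}),
        mentions=pd.DataFrame({"doc_id": ["d0", "d0", "d1"],
                               "mention_id": ["m0", "m1", "m2"],
                               "label": ["DRUG", "DISEASE", "DRUG"]}),
    )
    # Keep only DRUG mentions and propagate the filter to the other frames
    return dataset.query("label == 'DRUG'", names="mentions", propagate=True)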
def apply_deltas(positions, deltas, on, position_columns=None):
    if not isinstance(on, (tuple, list)):
        on = [on]
    if position_columns is None:
        position_columns = {'begin': 'left', 'end': 'right'}
    positions = positions.copy()
    positions['_id_col'] = np.arange(len(positions))
    mention_deltas = merge_with_spans(positions[[*position_columns, *on, '_id_col']], deltas,
                                      on=on, suffixes=('_pos', '_delta'), how='inner')
    # To be faster, we remove categorical columns (they may only be in 'on') before the remaining ops
    mention_deltas = mention_deltas[[c for c in mention_deltas.columns if c not in on]]
    positions = positions.set_index('_id_col')
    mention_deltas = mention_deltas.set_index('_id_col')
    delta_col_map, positions_col_map = make_merged_names_map(
        deltas.columns, [*position_columns, *on, '_id_col'],
        left_on=on, right_on=on, suffixes=('_delta', '_pos'))
    for col, side in position_columns.items():
        mention_deltas.eval(
            f"shift = ({delta_col_map['end']} <= {positions_col_map[col]}) * {delta_col_map['delta']}",
            inplace=True)
        mention_deltas.eval(
            f"between_magnet = {delta_col_map['begin']} < {positions_col_map[col]} and {positions_col_map[col]} < {delta_col_map['end']}",
            inplace=True)
        if side == "left":
            mention_deltas.eval(
                f"between_magnet = between_magnet * ({delta_col_map['begin']} - {positions_col_map[col]})",
                inplace=True)
        elif side == "right":
            mention_deltas.eval(
                f"between_magnet = between_magnet * ({delta_col_map['end']} + {delta_col_map['delta']} - {positions_col_map[col]})",
                inplace=True)
        order = "first" if side == "left" else "last"
        tmp = (mention_deltas
               .sort_values(['_id_col', delta_col_map['begin' if side == 'left' else 'end']])
               .groupby('_id_col')
               .agg({"shift": "sum",
                     **{n: order for n in mention_deltas.columns if n not in ("shift", "_id_col")}}))
        positions[col] = positions[col].add(tmp['shift'] + tmp['between_magnet'], fill_value=0)
    positions = positions.reset_index(drop=True)
    return positions
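# Illustrative sketch for `apply_deltas` (hypothetical frames and helper name). `positions`
# holds the spans to remap and `deltas` describes edited regions of the text: each row gives
# the begin/end of an edited region and the signed length change in the `delta` column.
# Per the eval expressions above, boundaries located at or after the end of an edited region
# are shifted by `delta`; begins falling strictly inside a region snap to its start, ends to
# its new end.
def _example_apply_deltas_usage():
    positions = pd.DataFrame({"doc_id": ["d0", "d0"],
                              "begin": [2, 20],
                              "end": [7, 25]})
    # One edit in d0: the region [10, 15) shrank by 3 characters
    deltas = pd.DataFrame({"doc_id": ["d0"],
                           "begin": [10],
                           "end": [15],
                           "delta": [-3]})
    return apply_deltas(positions, deltas, on="doc_id")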
def take(self, ids):
    main_df = self.dfs[self.main].iloc[ids].reset_index(drop=True)
    dfs = [
        main_df,
        *((merge_with_spans(main_df[[c for c in main_df.columns
                                     if c in df.columns and c.endswith('_id')]],
                            df, how="inner")
           if df is not None else None)
          for df in list(self.dfs.values())[1:])
    ]
    return Dataset(**dict(zip(self.dfs.keys(), dfs)))
def load_n2c2_2019_task3(validation_split=0.2, random_state=42, split="train"):
    path = env.resource("n2c2/")
    dataset = []
    for filename in sorted(os.listdir(path / '{}_norm'.format(split))):
        if filename.endswith('.norm'):
            with open(path / '{}_norm'.format(split) / filename) as f:
                for line in f:
                    (mention_id, label, *spans) = line.strip('\n').split('||')
                    begins, ends = [int(b) for b in spans[::2]], [int(e) for e in spans[1::2]]
                    dataset.append({
                        "doc_id": filename.replace('.norm', ''),
                        "mention_id": mention_id,
                        "label": label,
                        "begin": begins,
                        "end": ends,
                    })
    texts = []
    for filename in sorted(os.listdir(path / '{}_note'.format(split))):
        if filename.endswith('.txt'):
            with open(path / '{}_note'.format(split) / filename) as f:
                texts.append({
                    "doc_id": filename.replace('.txt', ''),
                    "text": f.read().strip('\n'),
                })
    with open(path / '{}_file_list.txt'.format(split)) as f:
        train_files = pd.Series([n.strip('\n') for n in f.readlines()])
        train_files.name = 'doc_id'
    docs = merge_with_spans(train_files, pd.DataFrame(texts), on='doc_id')
    rng = check_random_state(random_state)
    if split == "train":
        docs['split'] = rng.choice(['train', 'val'], size=len(docs),
                                   p=[1 - validation_split, validation_split])
    else:
        docs['split'] = 'test'
    mentions = pd.DataFrame(dataset)
    fragments = (mentions[['doc_id', 'mention_id', 'begin', 'end']]
                 .nlstruct.flatten("fragment_id", tile_index=False)
                 .astype({"fragment_id": object}))
    return Dataset(
        docs=docs[["doc_id", "text", "split"]],
        mentions=mentions[["doc_id", "mention_id", "label"]],
        fragments=fragments[["doc_id", "mention_id", "fragment_id", "begin", "end"]])
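# Usage sketch (requires the n2c2 2019 task 3 files under the "n2c2/" resource directory
# resolved by `env.resource`). Assuming, as in `query`/`take` above, that the Dataset
# instance keeps its frames in a `dfs` mapping keyed by the constructor keyword names,
# the loader yields one frame per document, per mention, and per (possibly discontinuous)
# fragment. The helper name is illustrative only.
def _example_load_n2c2_usage():
    dataset = load_n2c2_2019_task3(validation_split=0.2, random_state=42, split="train")
    docs = dataset.dfs["docs"]            # doc_id, text, split
    mentions = dataset.dfs["mentions"]    # doc_id, mention_id, label
    fragments = dataset.dfs["fragments"]  # doc_id, mention_id, fragment_id, begin, end
    return docs, mentions, fragments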
def merge_pred_and_gold(pred, gold,
                        on=('doc_id', ('begin', 'end'), 'label'),
                        span_policy='partial_strict',
                        atom_pred_level=None, atom_gold_level=None,
                        suffixes=('_pred', '_gold')):
    """
    Performs an outer merge between pred and gold whose rows fall in 3 configurations:
        - (pred == nan, gold != nan) => pred_count = 0, gold_count = 1, tp = 0
        - (pred != nan, gold == nan) => pred_count = 1, gold_count = 0, tp = 0
        - (pred != nan, gold != nan) => pred_count = 1, gold_count = 1, tp = 1

    Rows are matched on the columns given in `on`, using `span_policy` to decide when
    two spans match.

    Parameters
    ----------
    pred: pd.DataFrame
    gold: pd.DataFrame
    on: typing.Sequence of (str or tuple)
    span_policy: str
    atom_pred_level: (typing.Sequence of str) or str
    atom_gold_level: (typing.Sequence of str) or str
    suffixes: tuple of str

    Returns
    -------
    pd.DataFrame
    """
    delete_atom_pred_level = delete_atom_gold_level = False
    if isinstance(atom_pred_level, (list, tuple)):
        pred = pred.assign(_pred_id=pred[atom_pred_level].nlstruct.factorize())
        atom_pred_level = '_pred_id'
        delete_atom_pred_level = True
    elif atom_pred_level is None:
        pred = pred.assign(_pred_id=np.arange(len(pred)))
        atom_pred_level = '_pred_id'
        delete_atom_pred_level = True
    if isinstance(atom_gold_level, (list, tuple)):
        gold = gold.assign(_gold_id=gold[atom_gold_level].nlstruct.factorize())
        atom_gold_level = '_gold_id'
        delete_atom_gold_level = True
    elif atom_gold_level is None:
        gold = gold.assign(_gold_id=np.arange(len(gold)))
        atom_gold_level = '_gold_id'
        delete_atom_gold_level = True
    # pred_names, gold_names = make_merged_names(pred.columns, gold.columns, left_on=on, right_on=on,
    #                                            left_columns=pred.columns, right_columns=gold.columns)
    # pred_names_map = dict(zip(pred.columns, pred_names))
    # gold_names_map = dict(zip(gold.columns, gold_names))
    # categoricals = {}
    # for col in pred.columns:
    #     if hasattr(pred[col], 'cat'):
    #         categoricals[pred_names_map[col]] = pred[col].cat.categories
    #         pred[col] = pred[col].cat.codes
    # for col in gold.columns:
    #     if hasattr(gold[col], 'cat'):
    #         categoricals[gold_names_map[col]] = gold[col].cat.categories
    #         gold[col] = gold[col].cat.codes
    merged = merge_with_spans(pred, gold, on=on, how='inner',
                              span_policy=span_policy, suffixes=suffixes)
    overlap_size_names = [c for c in merged.columns if c.startswith("overlap_size_")]
    merged = merged.groupby([atom_pred_level, atom_gold_level], as_index=False, observed=True).agg({
        **{n: 'sum' for n in overlap_size_names},
        **{n: 'first' for n in merged.columns
           if n not in (*overlap_size_names, atom_pred_level, atom_gold_level)},
    })
    if overlap_size_names:
        merged = merged.sort_values(overlap_size_names)
    res = None
    if not len(merged):
        res = merged.iloc[:0]
    while len(merged):
        tmp = merged
        tmp = tmp.groupby(atom_gold_level, as_index=False, observed=True).last()
        tmp = tmp.groupby(atom_pred_level, as_index=False, observed=True).last()
        res = pd.concat((res, tmp), sort=False) if res is not None else tmp
        merged = merged[np.logical_and(~merged[atom_pred_level].isin(res[atom_pred_level]),
                                       ~merged[atom_gold_level].isin(res[atom_gold_level]))]
    pred = pred.groupby([atom_pred_level], as_index=False, observed=True).last()
    gold = gold.groupby([atom_gold_level], as_index=False, observed=True).last()
    res = pd.concat((
        res,
        pred[~pred[atom_pred_level].isin(res[atom_pred_level])][list(set(res.columns) & set(pred.columns))],
        gold[~gold[atom_gold_level].isin(res[atom_gold_level])][list(set(res.columns) & set(gold.columns))],
    ), sort=False)
    # for col, categories in categoricals.items():
    #     res[col] = pd.Categorical.from_codes(res[col].fillna(-1).astype(int), categories=categories)
    res['pred_count'] = (~res[atom_pred_level].isnull()).astype(int)
    res['gold_count'] = (~res[atom_gold_level].isnull()).astype(int)
    res['tp'] = res['pred_count'] * res['gold_count']
    res['root'] = 0
    res = res.drop(columns=(([atom_pred_level] if delete_atom_pred_level else [])
                            + ([atom_gold_level] if delete_atom_gold_level else [])))
    return res
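# Sketch of how the output of `merge_pred_and_gold` can be turned into span-level metrics
# (hypothetical frames and helper name). Every output row carries pred_count, gold_count
# and tp, so precision/recall/F1 follow from simple column sums.
def _example_merge_pred_and_gold_usage():
    pred = pd.DataFrame({"doc_id": ["d0", "d0"], "begin": [0, 10], "end": [4, 15],
                         "label": ["DRUG", "DISEASE"]})
    gold = pd.DataFrame({"doc_id": ["d0", "d0"], "begin": [0, 20], "end": [4, 25],
                         "label": ["DRUG", "DISEASE"]})
    merged = merge_pred_and_gold(pred, gold)
    tp = merged["tp"].sum()
    n_pred = merged["pred_count"].sum()
    n_gold = merged["gold_count"].sum()
    precision = tp / n_pred if n_pred else 0.
    recall = tp / n_gold if n_gold else 0.
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.
    return precision, recall, f1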
def partition_spans(smalls, large,
                    overlap_policy="merge_large",
                    new_id_name="sample_id",
                    span_policy="partial_strict"):
    """
    Parameters
    ----------
    smalls: pd.DataFrame[begin, end, ...]
        Ex: tokens
    large: pd.DataFrame[begin, end, ...]
        Ex: sentences
    overlap_policy: str or bool
        One of
        - "merge_large": keeps small untouched but merges large spans that overlap the same small span
          ex: partition_spans(mentions, sentences) -> merges sentences
        - "split_small": keeps large untouched and splits each small span at the boundaries of the
          large spans it overlaps
        - "small_to_leftmost_large": keeps small and large untouched, and assigns each small to the
          leftmost large that overlaps it
          ex: partition_spans(tokens, mentions) -> assigns each token to the leftmost mention that touches it
        - "small_to_rightmost_large": keeps small and large untouched, and assigns each small to the
          rightmost large that overlaps it
          ex: partition_spans(tokens, mentions) -> assigns each token to the rightmost mention that touches it
        - "small_to_biggest_overlap_large": keeps small and large untouched, and assigns each small to
          the large span that overlaps it the most
          ex: partition_spans(tokens, mentions) -> assigns each token to the mention that overlaps it the most
        - False: does nothing and allows multiple matchings between small and large
    new_id_name: str
        If overlap_policy == "merge_large", this is the column that will host the newly created ids
        per merged span
    span_policy: str
        Which policy to use to detect span overlaps

    Returns
    -------
    (list of pd.DataFrame, pd.DataFrame, pd.DataFrame or None)
        The re-partitioned small frames (begin/end relative to their assigned large span),
        the resulting large frame, and the old-to-new id mapping of the large spans
        (None unless overlap_policy == "merge_large")
    """
    assert overlap_policy in ("merge_large", "split_small", "small_to_leftmost_large",
                              "small_to_rightmost_large", "small_to_biggest_overlap_large",
                              False), f"Unknown small overlap policy '{overlap_policy}'"
    if not isinstance(smalls, (list, tuple)):
        smalls = [smalls]
    merged_id_cols = doc_id_cols = None
    if overlap_policy == "merge_large":
        original_new_id_name = new_id_name
        while new_id_name in large.columns:
            new_id_name = "_" + new_id_name
        large = large.copy()
        old_to_new = None
        has_created_new_id_col = False
        for small in smalls:
            doc_id_cols, small_id_cols, large_id_cols, small_val_cols, large_val_cols = preprocess_ids(large, small)
            large_id_cols = [c for c in large_id_cols]
            # Merge sentences and mentions
            merged = merge_with_spans(small, large, span_policy=span_policy, how='right',
                                      on=[*doc_id_cols, ("begin", "end")])
            # If a mention overlaps multiple sentences, assign it to the last sentence
            small_ids = merged[doc_id_cols + small_id_cols].nlstruct.factorize(group_nans=False)
            if has_created_new_id_col:
                large_ids = merged[doc_id_cols + [new_id_name]].nlstruct.factorize(group_nans=False)
            else:
                large_ids = merged[doc_id_cols + large_id_cols].nlstruct.factorize(group_nans=False)
            merged[new_id_name] = make_id_from_merged(large_ids, small_ids, apply_on=[(0, large_ids)])[0]
            merged["begin"] = merged[['begin_x', 'begin_y']].min(axis=1)
            merged["end"] = merged[['end_x', 'end_y']].max(axis=1)
            large = (merged
                     .groupby(new_id_name, as_index=False, observed=True)
                     .agg({**{n: 'first' for n in [*doc_id_cols, *large_id_cols] if n != new_id_name},
                           'begin': 'min',
                           'end': 'max'})
                     .astype({"begin": int, "end": int, **large[doc_id_cols].dtypes}))
            large = large[doc_id_cols + [new_id_name] + ["begin", "end"]]
            large[new_id_name] = large['begin']
            large = large.nlstruct.groupby_assign(
                doc_id_cols, {new_id_name: lambda x: tuple(np.argsort(np.argsort(x)))})
            old_to_new = large[doc_id_cols + [new_id_name]].drop_duplicates().reset_index(drop=True)
        merged_id_cols = [new_id_name]
        # large[original_new_id_name] = large[doc_id_cols + [new_id_name]].apply(lambda x: "/".join(map(str, x[doc_id_cols])) + "/" + str(x[new_id_name]), axis=1).astype("category")
        # large = large.drop(columns={*doc_id_cols, new_id_name} - {original_new_id_name})
    else:
        original_new_id_name = None
        # merged = merged.drop_duplicates([*doc_id_cols, *small_id_cols], keep=overlap_policy)
        doc_id_cols, small_id_cols, large_id_cols, small_val_cols, large_val_cols = preprocess_ids(large, smalls[0])
        merged_id_cols = large_id_cols
        new_id_name = None
        old_to_new = None

    # Merge sentences and mentions
    new_smalls = []
    for small in smalls:
        doc_id_cols, small_id_cols, large_id_cols, small_val_cols, large_val_cols = preprocess_ids(large, small)
        merged = merge_with_spans(small, large[doc_id_cols + large_id_cols + ['begin', 'end']],
                                  how='inner', span_policy=span_policy,
                                  on=[*doc_id_cols, ("begin", "end")])
        if overlap_policy == "small_to_biggest_overlap_large":
            merged = (merged.sort_values([*doc_id_cols, *small_id_cols, 'overlap_size_0'])
                      .drop_duplicates([*doc_id_cols, *small_id_cols], keep="last"))
        elif overlap_policy == "small_to_leftmost_large":
            merged = (merged.sort_values([*doc_id_cols, *small_id_cols, 'begin_y'])
                      .drop_duplicates([*doc_id_cols, *small_id_cols], keep="first"))
        elif overlap_policy == "small_to_rightmost_large":
            merged = (merged.sort_values([*doc_id_cols, *small_id_cols, 'begin_y'])
                      .drop_duplicates([*doc_id_cols, *small_id_cols], keep="last"))
        elif overlap_policy == "split_small":
            merged = merged.assign(begin_x=np.maximum(merged['begin_x'], merged['begin_y']),
                                   end_x=np.minimum(merged['end_x'], merged['end_y']))
        new_small = (merged
                     .assign(begin=merged["begin_x"] - merged["begin_y"],
                             end=merged["end_x"] - merged["begin_y"])
                     .astype({"begin": int, "end": int})
                     [[*doc_id_cols, *(merged_id_cols or ()), *small_id_cols, *small_val_cols, "begin", "end"]])
        if new_id_name:
            new_small[new_id_name] = new_small[new_id_name].astype(str)
            new_small[new_id_name] = new_small[new_id_name].str.zfill(new_small[new_id_name].str.len().max())
            new_small[original_new_id_name] = join_cols(
                new_small[doc_id_cols + ([new_id_name] if new_id_name not in doc_id_cols else [])], "/")
            new_small = new_small.drop(columns={*doc_id_cols, new_id_name} - {original_new_id_name})
        new_smalls.append(new_small)

    if original_new_id_name:
        if new_id_name:
            large[new_id_name] = large[new_id_name].astype(str)
            large[new_id_name] = large[new_id_name].str.zfill(large[new_id_name].str.len().max())
            large[original_new_id_name] = join_cols(large[doc_id_cols + [new_id_name]], "/")
            large = large.drop(columns={*doc_id_cols, new_id_name} - {original_new_id_name})
            new_doc_id_cols = [c if c != original_new_id_name else f'_{c}' for c in doc_id_cols]
            old_to_new[new_id_name] = old_to_new[new_id_name].astype(str)
            old_to_new[new_id_name] = old_to_new[new_id_name].str.zfill(old_to_new[new_id_name].str.len().max())
            (old_to_new[original_new_id_name], old_to_new[new_doc_id_cols]) = (
                # old_to_new[doc_id_cols + [new_id_name]].apply(lambda x: "/".join(map(str, x[doc_id_cols])) + "/" + str(x[new_id_name]), axis=1),
                join_cols(old_to_new[doc_id_cols + [new_id_name]], "/"),
                old_to_new[doc_id_cols])
            if new_id_name not in (*new_doc_id_cols, original_new_id_name):
                del old_to_new[new_id_name]
        new_smalls = [small.astype({original_new_id_name: large[original_new_id_name].dtype})
                      for small in new_smalls]
    return new_smalls, large, old_to_new
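# Usage sketch for `partition_spans` (hypothetical frames and helper name): re-partition
# mentions by sentence. With overlap_policy="merge_large", sentences sharing a mention are
# merged into a single "sample_id" span, the returned small frames carry begin/end offsets
# relative to the large span they fall into, and an old-to-new id mapping of the large spans
# is returned as well.
def _example_partition_spans_usage():
    sentences = pd.DataFrame({"doc_id": ["d0", "d0"], "sentence_id": [0, 1],
                              "begin": [0, 20], "end": [20, 40]})
    mentions = pd.DataFrame({"doc_id": ["d0"], "mention_id": ["m0"],
                             "label": ["DISEASE"], "begin": [15], "end": [25]})
    (new_mentions,), merged_sentences, old_to_new = partition_spans(
        [mentions], sentences,
        overlap_policy="merge_large", new_id_name="sample_id", span_policy="partial_strict")
    return new_mentions, merged_sentences, old_to_new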
def encode_as_tag(small, large, label_cols=None, tag_names=None, tag_scheme="bio",
                  use_token_idx=False, verbose=0, groupby=None):
    """
    Parameters
    ----------
    small: pd.DataFrame
        Ex: tokens
    large: pd.DataFrame
        Ex: mentions
    label_cols: str or list of str
        Label column(s) of `large` to encode, ex: "label"
    tag_names: str or list of str
        Tag column name that will be created for each label column
    tag_scheme: str
        BIO / BIOUL tagging scheme
    use_token_idx: bool
        Use token positions instead of char spans, defaults to False
    verbose: int
        If verbose > 0, show a progress bar
    groupby: str or list of str, optional
        If given, encode one set of tag columns per group of `large` rows,
        prefixing each tag column name with the group value

    Returns
    -------
    (pd.DataFrame, dict)
        The `small` frame with the new tag columns, and the mapping from each tag column
        to its label categories
    """
    assert tag_scheme in ("bio", "bioul", "raw")
    doc_id_cols, small_id_cols, large_id_cols, small_val_cols, large_val_cols = preprocess_ids(large, small)
    # assert len(large_val_cols) < 2, "Cannot encode more than one column as tags"
    assert len(large_val_cols) > 0, "Must have a column to encode as tags"
    if label_cols is None:
        label_cols = large_val_cols
    if isinstance(label_cols, str):
        label_cols = [label_cols]
    if tag_names is None:
        tag_names = label_cols
    if isinstance(tag_names, str):
        tag_names = [tag_names]
    label_categories = {}

    # Map mentions to small as a tag
    large = large.sort_values([*doc_id_cols, "begin", "end"])
    for label, mentions_of_group in (large.groupby(groupby, as_index=False, observed=True)
                                     if groupby is not None else [(None, large)]):
        assert label not in large_val_cols, f"Cannot groupby {label} value because there is already a column with this name"
        group_tag_names = ["/".join(s for s in (label, tag_name) if s is not None) for tag_name in tag_names]
        if use_token_idx:
            merged = merge_with_spans(
                mentions_of_group,
                small[[*doc_id_cols, *small_id_cols, *(c for c in small_val_cols if c != "token_idx"), "token_idx"]],
                on=doc_id_cols,
                suffixes=('_large', '')
            ).query("begin <= token_idx and token_idx < end")
        else:
            merged = merge_with_spans(mentions_of_group, small,
                                      span_policy='partial_strict',
                                      on=[*doc_id_cols, ("begin", "end")],
                                      suffixes=('_large', ''))
        # If a token overlaps multiple mentions, assign it to the last mention
        len_before = len(merged)
        merged = merged.drop_duplicates([*doc_id_cols, *small_id_cols], keep='last')
        if len_before - len(merged) > 0:
            warn(f"Dropped {len_before - len(merged)} duplicated tags caused by overlapping mentions")
        merged_id_cols = doc_id_cols + large_id_cols + small_id_cols

        # Encode mention labels as a tag
        tags = merged[merged_id_cols + label_cols].sort_values(merged_id_cols)
        if tag_scheme != "raw":
            keep_cols = list(set(doc_id_cols + large_id_cols) - set(label_cols))
            tags = (
                # convert all categorical dtypes of the group cols to plain dtypes (str, int, object, ...)
                # to accelerate concatenation inside the groupby
                tags.astype({k: dtype if not hasattr(dtype, 'categories') else dtype.categories.dtype
                             for k, dtype in tags.dtypes[keep_cols].items()})
                    .rename(dict(zip(label_cols, group_tag_names)), axis=1)
                    .nlstruct.groupby_assign(
                        doc_id_cols + large_id_cols,
                        {tag_name: lambda labels: make_tag_scheme(len(labels), labels.iloc[0], tag_scheme)
                         for tag_name, label_col in zip(group_tag_names, label_cols)})
                    # convert back each group column dtype to its original categorical dtype
                    .astype(tags.dtypes[keep_cols])
                    [doc_id_cols + small_id_cols + group_tag_names])
        # merged = merged[[*merged_id_cols, *small_val_cols, "begin", "end"]].merge(tags)
        small = small.merge(tags, on=doc_id_cols + small_id_cols, how="left")
        if tag_scheme != "raw":
            try:
                for tag_name, label_col in zip(group_tag_names, label_cols):
                    unique_labels = (sorted(set(label for label in mentions_of_group[label_col] if label is not None))
                                     if not hasattr(mentions_of_group[label_col], 'cat')
                                     else mentions_of_group[label_col].cat.categories)
                    label_categories[tag_name] = unique_labels
                    small[tag_name] = small[tag_name].fillna("O").astype(pd.CategoricalDtype(
                        ["O", *(tag for label in unique_labels
                                for tag in ("B-" + str(label), "I-" + str(label)))]
                        if tag_scheme == "bio" else
                        ["O", *(tag for label in unique_labels
                                for tag in ("B-" + str(label), "I-" + str(label),
                                            "L-" + str(label), "U-" + str(label)))]))
            except Exception:
                raise Exception(f"Error occurred during the encoding of label column '{label_col}' into tag '{tag_name}'")
    # return small[doc_id_cols + small_id_cols].merge(merged, how='left')
    return small, label_categories
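# Usage sketch for `encode_as_tag` (hypothetical frames and helper name): tag tokens with
# the mentions that cover them. Each label column of `large` becomes a categorical tag
# column on `small` ("O", "B-<label>", "I-<label>", ... depending on tag_scheme), and the
# per-tag label vocabularies are returned alongside the tagged frame.
def _example_encode_as_tag_usage():
    tokens = pd.DataFrame({"doc_id": ["d0"] * 4, "token_id": [0, 1, 2, 3],
                           "begin": [0, 5, 10, 15], "end": [4, 9, 14, 19]})
    mentions = pd.DataFrame({"doc_id": ["d0"], "mention_id": ["m0"],
                             "begin": [5], "end": [14], "label": ["DISEASE"]})
    tagged_tokens, label_categories = encode_as_tag(tokens, mentions,
                                                    label_cols="label", tag_scheme="bio")
    return tagged_tokens, label_categories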