def is_various(annotation):
    """Tell whether `annotation` is none of {edu, turn, paragraph, dialogue}.

    It seems to capture only Resources (to be confirmed).
    """
    known_kinds = (is_edu, is_turn, is_paragraph, is_dialogue)
    return not any(matches(annotation) for matches in known_kinds)
def is_various(annotation):
    """Return True iff `annotation` is not an edu, turn, paragraph or dialogue.

    It seems to capture only Resources (to be confirmed).
    """
    for belongs_to_kind in (is_edu, is_turn, is_paragraph, is_dialogue):
        if belongs_to_kind(annotation):
            # one of the four known kinds: not "various"
            return False
    return True
def fix_dialogue_boundaries(dir_ling, dir_situ, doc, seg_path=None):
    """Fix dialogue boundaries in a woven game.

    Dialogue boundaries are adjusted in the woven version, so they are
    tighter around the dialogues that existed in the annotated version.
    Each fixed document is saved back under `dir_situ`.

    Parameters
    ----------
    dir_ling: filepath
        Path to the folder of the original version of the game.
    dir_situ: filepath
        Path to the folder of the woven version of the game.
    doc: string
        Name of the game.
    seg_path: optional
        Not used by this function at the moment (TODO: document or
        remove).
    """
    def is_interesting(k):
        """Select files for this game only, annotator GOLD (or none)."""
        return (k.doc == doc
                and (k.annotator == 'GOLD' or k.annotator is None))

    # locate and read files of the linguistic (original) version
    dir_ling = os.path.abspath(dir_ling)
    reader_ling = Reader(dir_ling)
    files_ling = reader_ling.filter(reader_ling.files(), is_interesting)
    corpus_ling = reader_ling.slurp(cfiles=files_ling, verbose=True)

    # locate and read files of the situated (woven) version
    dir_situ = os.path.abspath(dir_situ)
    reader_situ = Reader(dir_situ)
    files_situ = reader_situ.filter(reader_situ.files(), is_interesting)
    corpus_situ = reader_situ.slurp(cfiles=files_situ, verbose=True)

    # need a TimestampCache to generate unit_id for new dialogues
    tcache = TimestampCache()
    for key, doc_situ in sorted(corpus_situ.items()):
        # the linguistic corpus is expected to contain the same key
        doc_ling = corpus_ling[key]
        print(key)
        doc_situ_fixed = _fix_dialogue_boundaries(tcache, doc_ling, doc_situ)
        # DEBUG
        dlgs = sorted((x for x in doc_situ_fixed.units if is_dialogue(x)),
                      key=lambda x: x.span)
        dlg_beg = [x.span.char_start for x in dlgs]
        dlg_end = [x.span.char_end for x in dlgs]
        # materialize the pairs: on Python 3, printing a bare zip object
        # would only show its repr, not the boundaries
        print(list(zip(dlg_beg, dlg_end)))
        # end DEBUG
        save_document(dir_situ, key, doc_situ_fixed)
def shift_dialogues(doc_src, doc_res, updates, gen):
    """Transpose dialogue split from target to source document.

    Remove all dialogues from updates.

    Parameters
    ----------
    doc_src : Document
        Source (augmented) document.
    doc_res : Document
        Result document, originally a copy of doc_tgt with unshifted
        annotations. This function modifies `doc_res` by shifting the
        boundaries of its dialogues according to `updates`, and
        stretching the first and last dialogues so as to cover the
        same span as dialogues from `doc_src`. If `gen >= 3`, dialogues
        of `doc_res` that overlap the same game turn are merged: the
        first one is kept and the others are removed from
        `doc_res.units`.
    updates : set of updates
        Updates computed by `compute_updates`. Modified in place.
    gen: int
        Generation of annotations included in `doc_src` and the output.

    Returns
    -------
    updates : Updates
        Trimmed down set of `updates`: no more dialogue.
    """
    # `reduce` is not a builtin on Python 3 ; functools provides it on
    # both Python 2 (>= 2.6) and Python 3
    from functools import reduce

    # seed for the unions below: without it, reduce() raises a TypeError
    # when there is no dialogue in doc_res
    no_match = np.array([], dtype=int)

    if gen < 3:
        dlgs_src = sorted(
            [x for x in doc_src.units if x.type.lower() == 'dialogue'],
            key=lambda y: y.span)
        dlgs_res = sorted(
            [x for x in doc_res.units if x.type.lower() == 'dialogue'],
            key=lambda y: y.span)
        # NEW 2016-06-15 adjust dialogue boundaries
        # for each target dialogue, find the smallest enclosing sequence of
        # source dialogues and map to it
        dlgs_src_beg = np.array([x.span.char_start for x in dlgs_src])
        dlgs_tgt_sbeg = np.array(
            [shift_char(x.span.char_start + 1, updates) - 1
             for x in dlgs_res])
        # NB: we need to broadcast (- 1) to get the source dialogue whose
        # start immediately precedes the start of the shifted target
        # dialogue
        tgt2src_beg = (
            np.searchsorted(dlgs_src_beg, dlgs_tgt_sbeg, side='right') - 1)
        # fancy indexing copies, so the overwrites below leave
        # dlgs_src_beg untouched
        dlgs_tgt_abeg = dlgs_src_beg[tgt2src_beg]
        # map the shifted end of each target dialogue to the first larger end
        # of a source dialogue
        dlgs_src_end = np.array([x.span.char_end for x in dlgs_src])
        dlgs_tgt_send = np.array(
            [shift_char(x.span.char_end - 1, updates) + 1
             for x in dlgs_res])
        tgt2src_end = np.searchsorted(dlgs_src_end, dlgs_tgt_send)
        dlgs_tgt_aend = dlgs_src_end[tgt2src_end]
        # overwrite the adjusted beginning and end, when a game turn
        # overlaps with two different tgt dialogues ;
        # each overlap in the matching signals a split, in the linguistic
        # version, that happens in the middle of a game turn
        for i, (end_cur, beg_nxt) in enumerate(
                zip(tgt2src_end[:-1], tgt2src_beg[1:])):
            if beg_nxt <= end_cur:
                # linguistic turns from the same game turn, in different
                # target dialogues => use the shifted cut point from tgt
                dlgs_tgt_aend[i] = dlgs_tgt_send[i]
                dlgs_tgt_abeg[i + 1] = dlgs_tgt_send[i]
        # find source dialogues included in the shifted+expanded target
        # dialogues
        src_matched = set(reduce(
            np.union1d,
            (np.arange(x_beg, x_end + 1)
             for x_beg, x_end in zip(tgt2src_beg, tgt2src_end)),
            no_match))
        for dlg_res, adj_start, adj_end in zip(
                dlgs_res, dlgs_tgt_abeg, dlgs_tgt_aend):
            dlg_res.span.char_start = adj_start
            dlg_res.span.char_end = adj_end
            # alt: dlg_res.span = Span(start, end)
            #
            # optionally, update timestamp, id, span as in
            # `stac.edit.cmd.split_dialogue.{_actually_split,_set}`
    else:
        # situated version: we can rely on game turns
        # 1. get the first and last turn of each game turn in _src:
        # these turns and those in between must end up in the same
        # dialogue
        turns_src = sorted((x for x in doc_src.units if is_turn(x)),
                           key=lambda x: x.span)
        turns_src_tid = np.array(
            [x.features['Identifier'] for x in turns_src])
        turns_src_beg = np.array([x.span.char_start for x in turns_src])
        turns_src_end = np.array([x.span.char_end for x in turns_src])
        # * locate game turns (index of first and last turn)
        gturn_idc = game_turns(doc_src, turns_src, gen=3)
        gturn_idc_beg = np.array(gturn_idc)
        gturn_idc_end = np.array(
            [i - 1 for i in gturn_idc[1:]] + [len(turns_src) - 1])
        # 2. get the identifier of the first and last turn of each dialogue
        # in _res: these turns and those in between must end up in the same
        # dialogue
        turns_res = sorted((x for x in doc_res.units if is_turn(x)),
                           key=lambda x: x.span)
        turns_res_tid = np.array(
            [x.features['Identifier'] for x in turns_res])
        turns_res_beg = np.array([x.span.char_start for x in turns_res])
        turns_res_end = np.array([x.span.char_end for x in turns_res])
        # align dialogue spans with turn spans
        dlgs_res = sorted((x for x in doc_res.units if is_dialogue(x)),
                          key=lambda x: x.span)
        dlgs_res_beg = np.array([x.span.char_start for x in dlgs_res])
        dlgs_res_end = np.array([x.span.char_end for x in dlgs_res])
        dlgs_res_ti_beg = np.searchsorted(turns_res_beg, dlgs_res_beg)
        dlgs_res_ti_end = np.searchsorted(
            turns_res_end, dlgs_res_end, side='right') - 1
        # ... and finally
        dlgs_res_tid_beg = turns_res_tid[dlgs_res_ti_beg]
        dlgs_res_tid_end = turns_res_tid[dlgs_res_ti_end]
        # 3. map _res dialogues to _src game turns
        # (build the list of source turn ids once, not once per lookup ;
        # .index() raises ValueError for a turn id absent from _src)
        src_tids = list(turns_src_tid)
        dlgs_res_ti_beg = np.array(
            [src_tids.index(x) for x in dlgs_res_tid_beg])
        dlgs_res_ti_end = np.array(
            [src_tids.index(x) for x in dlgs_res_tid_end])
        # * align the beginning (resp. end) indices of game turns and _res
        # dialogues
        dlg2gturn_beg = (
            np.searchsorted(gturn_idc_beg, dlgs_res_ti_beg, side='right')
            - 1)
        dlg2gturn_end = np.searchsorted(gturn_idc_end, dlgs_res_ti_end)
        # * turn indices of the adjusted beginning and end of the _res
        # dialogues
        # initialize along the boundaries of game turns
        dlg_res_src_abeg = [gturn_idc_beg[i] for i in dlg2gturn_beg]
        dlg_res_src_aend = [gturn_idc_end[i] for i in dlg2gturn_end]
        # 4. make dialogue boundaries coincide with game turn boundaries,
        # which occasionally implies merging dialogues from _res
        # * compute a partition on dialogues such that any pair of dialogues
        # overlapping a given game turn are in the same class ;
        # no dialogue => no class (avoids indexing a phantom dialogue 0)
        dlg2grp = [0] if dlgs_res else []
        for gturn_end_cur, gturn_beg_nxt in zip(
                dlg2gturn_end[:-1], dlg2gturn_beg[1:]):
            if gturn_beg_nxt <= gturn_end_cur:
                # two _res dialogues overlap a single game turn:
                # put in the same class (to merge dialogues)
                dlg2grp.append(dlg2grp[-1])
            else:
                dlg2grp.append(dlg2grp[-1] + 1)
        # keep one dialogue for each class of dialogues
        for _, grp in itertools.groupby(enumerate(dlg2grp),
                                        key=lambda x: x[1]):
            dlg_idc_merged = [x[0] for x in grp]
            # adjust boundaries of the first dialogue of the group
            # index of first and last dialogues
            di_beg = dlg_idc_merged[0]
            di_end = dlg_idc_merged[-1]
            # index of first and last turns of these dialogues
            ti_beg = dlg_res_src_abeg[di_beg]
            ti_end = dlg_res_src_aend[di_end]
            # keep first dialogue, update its features to include those
            # from the other dialogues in the same class
            new_dlg = dlgs_res[di_beg]
            new_dlg.span.char_start = turns_src_beg[ti_beg]
            new_dlg.span.char_end = turns_src_end[ti_end]
            dlgs_res_merged = [dlgs_res[i] for i in dlg_idc_merged]
            for feat in ['Trades', 'Gets', 'Dice_rolling']:
                new_dlg.features[feat] = _concatenate_features(
                    dlgs_res_merged, feat)
            # remove merged dialogues [1:] from doc_res
            for i in dlg_idc_merged[1:]:
                doc_res.units.remove(dlgs_res[i])
        # transfer each unmatched (non-overlapping) game turn as a dialogue
        # (which already exists in doc_src)
        src_matched = set(reduce(
            np.union1d,
            (np.arange(x_beg, x_end + 1)
             for x_beg, x_end in zip(dlg2gturn_beg, dlg2gturn_end)),
            no_match))
        # each dialogue in doc_src is a game turn
        dlgs_src = sorted((x for x in doc_src.units if is_dialogue(x)),
                          key=lambda x: x.span)

    # remove all source and target dialogues from updates
    # (shared by both branches ; `dlgs_res` still includes the dialogues
    # merged away above)
    for dlg_res in dlgs_res:
        if dlg_res in updates.abnormal_tgt_only:
            updates.abnormal_tgt_only.remove(dlg_res)
    for i, dlg_src in enumerate(dlgs_src):
        if dlg_src in updates.abnormal_src_only:
            updates.abnormal_src_only.remove(dlg_src)
        if i in src_matched and dlg_src in updates.expected_src_only:
            # remove matched source dialogues, leave the unmatched
            # ones in expected_src_only, so that they are added later
            # to the woven document
            updates.expected_src_only.remove(dlg_src)
    return updates
def read_game_as_dataframes(game_folder, sel_annotator=None, thorough=True,
                            strip_cdus=False, attach_len=False):
    """Read an annotated game as dataframes.

    Parameters
    ----------
    game_folder : path
        Path to the game folder.
    sel_annotator : str, optional
        Identifier of the annotator whose version we want. If `None`,
        the existing metal annotator will be used (BRONZE|SILVER|GOLD).
    thorough : boolean, defaults to True
        If True, check that annotations in 'units' and 'unannotated'
        that are expected to have a strict equivalent in 'dialogue'
        actually do. (These checks are still FIXME placeholders.)
    strip_cdus : boolean, defaults to False
        If True, strip CDUs with the "head" strategy and sloppy=True.
    attach_len : boolean, defaults to False
        If True, compute attachment length. This requires
        strip_cdus=True ; it is ignored otherwise.

    Returns
    -------
    dfs : tuple of DataFrame
        DataFrames for the annotated game, in this order: turns,
        dialogues, segments, dialogue acts, schemas, schema members,
        discourse relations, resources, preferences, relations from
        the "units" stage.

    Raises
    ------
    ValueError
        If an annotation does not have the expected shape: unknown
        type of unit, discourse segment or preference with features,
        schema with unexpected features or with relation members,
        relation from a stage other than 'units' and 'discourse'.
    """
    if sel_annotator is None:
        sel_annotator = 'metal'

    # rows of the future dataframes
    df_turns = []  # turns
    df_segs = []  # segments: EDUs, EEUs
    df_dlgs = []  # dialogues
    df_schms = []  # schemas: CDUs
    df_schm_mbrs = []  # schema members
    df_disc_rels = []  # discourse relations
    df_acts = []  # dialogue acts
    df_res = []  # resources
    df_pref = []  # preferences
    df_unit_rels = []  # relations from the "units" stage (anaphora)

    print(game_folder)  # DEBUG
    game_upfolder, game_name = os.path.split(game_folder)
    game_corpus = StacReader(game_upfolder).slurp(doc_glob=game_name)
    # give integer indices to segments, and EDUs in particular
    seg_idx = 0
    eeu_idx = 0
    edu_idx = 0
    for doc_key, doc_val in sorted(game_corpus.items()):
        doc = doc_key.doc
        subdoc = doc_key.subdoc
        stage = doc_key.stage
        annotator = doc_key.annotator
        # skip docs not from a selected annotator
        if ((sel_annotator == 'metal'
             and annotator not in ('BRONZE', 'SILVER', 'GOLD'))
            or (sel_annotator != 'metal'
                and annotator != sel_annotator)):
            continue

        # process annotations in doc
        # print(doc, subdoc, stage, annotator)  # verbose
        # NOTE(review): `doc_text` is not used below ; call kept as in
        # the original
        doc_text = doc_val.text()
        # print(doc_text)
        for anno in sorted(doc_val.units, key=lambda x: x.span):
            # attributes common to all units
            unit_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type, span, text
                'type': anno.type,
                'span_beg': anno.span.char_start,
                'span_end': anno.span.char_end,
                'text': doc_val.text(span=anno.span),
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional?
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get(
                    'lastModificationDate', None),
            }

            # fields specific to each type of unit
            if is_paragraph(anno):
                # paragraph: ignore? one per turn
                pass
            elif is_turn(anno):
                # turn
                # comments = anno.features['Comments']
                # if comments == 'Please write in remarks...':
                unit_dict.update({
                    # features
                    'timestamp': anno.features['Timestamp'],
                    'comments': anno.features['Comments'],
                    'developments': anno.features['Developments'],
                    'turn_id': anno.features['Identifier'],
                    'emitter': anno.features['Emitter'],
                    'resources': anno.features['Resources'],
                })
                if stage == 'discourse':
                    df_turns.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_edu(anno):
                # segment: EDU or EEU
                if stage == 'discourse':
                    if anno.features:
                        raise ValueError(
                            'Wow, a discourse segment has *features*')
                    # assign index among segments, across the whole doc
                    unit_dict['seg_idx'] = seg_idx
                    seg_idx += 1
                    if anno.type == 'NonplayerSegment':  # EEU
                        unit_dict['eeu_idx'] = eeu_idx
                        eeu_idx += 1
                    else:  # EDU
                        unit_dict['edu_idx'] = edu_idx
                        edu_idx += 1
                    #
                    df_segs.append(unit_dict)
                elif stage == 'units':
                    # each entry (should) correspond to an entry in df_segs
                    act_dict = {
                        'global_id': anno.identifier(),  # foreign key
                        'surface_act': anno.features['Surface_act'],
                        'addressee': anno.features['Addressee'],
                    }
                    assert (sorted(anno.features.keys())
                            == ['Addressee', 'Surface_act'])
                    df_acts.append(act_dict)
                if thorough and stage in ('units', 'unannotated'):
                    # maybe metadata in 'units' has changed? eg. last
                    # modification date, last modifier
                    pass  # FIXME check existence (exact duplicate)
            elif is_dialogue(anno):
                expected_dlg_features = set(
                    ['Dice_rolling', 'Gets', 'Trades'])
                if set(anno.features.keys()).issubset(
                        expected_dlg_features):
                    unit_dict.update({
                        # features
                        'gets': anno.features.get('Gets', None),
                        'trades': anno.features.get('Trades', None),
                        'dice_rolls': anno.features.get(
                            'Dice_rolling', None),
                    })
                else:
                    # unexpected features: warn, but keep the dialogue
                    # (without feature columns)
                    warn_msg = 'Dialogue {}: unexpected features {}'.format(
                        anno.identifier(),
                        ', '.join(x for x in sorted(anno.features.keys())
                                  if x not in expected_dlg_features))
                    warnings.warn(warn_msg)
                if stage == 'discourse':
                    df_dlgs.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_resource(anno):
                unit_dict.update({
                    # features
                    'status': anno.features['Status'],
                    'kind': anno.features['Kind'],
                    'correctness': anno.features['Correctness'],
                    'quantity': anno.features['Quantity'],
                })
                assert (sorted(anno.features.keys())
                        == ['Correctness', 'Kind', 'Quantity', 'Status'])
                df_res.append(unit_dict)
            elif is_preference(anno):
                if anno.features:
                    print(anno.__dict__)
                    raise ValueError('Preference with features {}'.format(
                        anno.features))
                df_pref.append(unit_dict)
            else:
                print(anno.__dict__)
                raise ValueError('what unit is this?')
            # print('Unit', anno)

        for anno in doc_val.schemas:
            # in 'discourse': CDUs ;
            # in 'units': combinations of resources (OR, AND)
            schm_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional? metadata
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get(
                    'lastModificationDate', None),
            }
            # assumption: no feature
            if anno.features:
                # compare lists of keys: on Python 3, `keys()` is a view
                # that never equals a list, so the bare comparison would
                # reject every schema with features
                feat_names = list(anno.features.keys())
                if stage == 'units':
                    if feat_names == ['Operator']:
                        schm_dict.update({
                            'operator': anno.features['Operator'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
                elif stage == 'discourse':
                    # tolerate 'default': 'default' for the moment, but
                    # should probably cleaned out
                    if feat_names == ['default']:
                        schm_dict.update({
                            'default': anno.features['default'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
            df_schms.append(schm_dict)
            # associate to this schema each of its members ; assumptions:
            # - members should be units or schemas (no relation)
            if anno.relations:
                raise ValueError('Wow, a schema with *relation members*')
            for member in anno.members:
                member_dict = {
                    'member_id': member.identifier(),
                    'schema_id': anno.identifier(),
                }
                df_schm_mbrs.append(member_dict)
            # TODO post-verification: check that all members do exist
            # (should be useless as stac-check should catch it)

        # RELATIONS
        # * rewrite endpoints of relations if strip_cdus
        if strip_cdus:
            endpts = dict()  # map relation ids to (src_id, tgt_id)
            dgr = Graph.from_doc(game_corpus, doc_key)
            # strip CDUs on a copy of the graph
            dgraph = copy.deepcopy(dgr)
            dgraph.strip_cdus(sloppy=True, mode='head')
            for edge in dgraph.relations():
                # NOTE(review): hard-coded identifier, looks like a
                # leftover debugging trap -- confirm before removing
                if "asoubeille_1414085458642" in edge:
                    print('Wop', edge)
                    raise ValueError('gni')
                links = dgraph.links(edge)
                # get the identifiers of the relation and its endpoints
                # to replace CDU ids with segment indices
                anno_rel = dgraph.annotation(edge)
                # as of 2017-06-24, anno_rel has no origin (why?) at
                # this point
                anno_rel.origin = doc_key  # temporary(?) fix
                #
                anno_src = dgraph.annotation(links[0])
                anno_tgt = dgraph.annotation(links[1])
                gid_rel = anno_rel.identifier()
                if gid_rel.endswith('_0'):
                    # strip_cdus appends an integer to each copy of
                    # the relation ; with mode="head", we only expect
                    # one such copy per relation so "_0" should be a
                    # sufficient match, which we can cut off for the
                    # mapping
                    gid_rel = gid_rel[:-2]
                gid_src = anno_src.identifier()
                gid_tgt = anno_tgt.identifier()
                endpts[gid_rel] = (gid_src, gid_tgt)

        # * process relations
        for anno in doc_val.relations:
            # attributes common to all(?) types of annotations
            # * global ids of the relation and its endpoints
            gid_rel = anno.identifier()
            gid_src = anno.source.identifier()
            gid_tgt = anno.target.identifier()
            # * build dict
            rel_dict = {
                # identification
                'global_id': gid_rel,
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'last_modifier': anno.metadata['lastModifier'],
                'last_modif_date': anno.metadata['lastModificationDate'],
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
            }
            # attributes specific to relations
            if 'Argument_scope' not in anno.features:
                # required feature
                w_msg = '{}: relation {} has no Argument_scope'.format(
                    str(doc_key), anno.identifier()
                )
                warnings.warn(w_msg)
            # if strip_cdus, replace endpoints of *discourse* relations
            # with segment ids
            if strip_cdus and is_relation_instance(anno):
                gid_src, gid_tgt = endpts[gid_rel]
            rel_dict.update({
                # features
                'arg_scope': anno.features.get('Argument_scope', None),  # req
                'comments': anno.features.get('Comments', None),  # opt
                # endpoints
                'source': gid_src,
                'target': gid_tgt,
            })
            if stage == 'discourse':
                df_disc_rels.append(rel_dict)
            elif stage == 'units':
                df_unit_rels.append(rel_dict)
            else:
                raise ValueError(
                    "relation from stage not in {'units', 'discourse'}")

    # create dataframes
    df_turns = pd.DataFrame(df_turns, columns=TURN_COLS)
    df_dlgs = pd.DataFrame(df_dlgs, columns=DLG_COLS)
    df_segs = pd.DataFrame(df_segs, columns=SEG_COLS)
    df_acts = pd.DataFrame(df_acts, columns=ACT_COLS)
    df_schms = pd.DataFrame(df_schms, columns=SCHM_COLS)
    df_schm_mbrs = pd.DataFrame(df_schm_mbrs, columns=SCHM_MBRS_COLS)
    df_disc_rels = pd.DataFrame(df_disc_rels, columns=REL_COLS)
    df_unit_rels = pd.DataFrame(df_unit_rels, columns=REL_COLS)
    df_res = pd.DataFrame(df_res, columns=RES_COLS)
    df_pref = pd.DataFrame(df_pref, columns=PREF_COLS)

    # add columns computed from other dataframes
    # * for segments: retrieve the turn_id and the char positions of the
    # beg and end of the segment in the turn text
    def get_seg_turn_cols(seg):
        """Helper to retrieve turn info for a segment (EDU, EEU)."""
        doc = seg['doc']
        subdoc = seg['subdoc']
        seg_beg = seg['span_beg']
        seg_end = seg['span_end']
        # turns of the same (doc, subdoc) whose span covers the segment
        cand_turns = df_turns[(df_turns['span_beg'] <= seg_beg) &
                              (seg_end <= df_turns['span_end']) &
                              (doc == df_turns['doc']) &
                              (subdoc == df_turns['subdoc'])]
        # NB: cand_turns should contain a unique turn
        # compute the beg and end (char) positions of the segment in the turn
        # so we can match between the situated and linguistic versions when
        # the segmentation has changed
        turn_text = cand_turns['text'].item()
        seg_text = seg['text']
        # first occurrence of the segment text in the turn text
        # (-1 if not found)
        turn_span_beg = turn_text.find(seg_text)
        turn_span_end = turn_span_beg + len(seg_text)
        turn_dict = {
            'turn_id': cand_turns['turn_id'].item(),
            'turn_span_beg': turn_span_beg,
            'turn_span_end': turn_span_end,
        }
        return pd.Series(turn_dict)

    seg_turn_cols = df_segs.apply(get_seg_turn_cols, axis=1)
    df_segs = pd.concat([df_segs, seg_turn_cols], axis=1)
    # * length of attachments
    # 2017-06-29 restricted to *discourse* relations, for the time being
    if strip_cdus and attach_len:
        df_disc_rels = compute_rel_attributes(df_segs, df_disc_rels)

    return (df_turns, df_dlgs, df_segs, df_acts, df_schms, df_schm_mbrs,
            df_disc_rels, df_res, df_pref, df_unit_rels)
def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Parameters
    ----------
    corpus: dict
        Mapping from file identifiers (with attributes `doc`, `subdoc`,
        `stage`, `annotator`) to documents.

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus.
        Kinds are among 'edu', 'turn', 'tstar', 'dialogue', 'cdu',
        'rel' ; kinds with no annotation are left out.
    """
    rows = {
        anno_type: []
        for anno_type in ['edu', 'turn', 'tstar', 'dialogue', 'cdu', 'rel']
    }

    for file_id, doc in corpus.items():
        # common stuff: get general info (doc, subdoc, annotator)
        doc_name = file_id.doc
        subdoc_name = file_id.subdoc
        stage = file_id.stage
        annotator = file_id.annotator
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            common_cols = {
                'anno_id': anno.identifier(),
                'doc': doc_name,
                'subdoc': subdoc_name,
                'stage': stage,
                'annotator': annotator,
                'type': anno.type,  # ? maybe not
            }
            # pick the kind of row and its specific columns
            if is_edu(anno):
                kind, feats = 'edu', edu_feats(doc, ctx, anno)
            elif is_cdu(anno):
                kind, feats = 'cdu', cdu_feats(anno)
            elif is_relation_instance(anno):
                kind, feats = 'rel', rel_feats(doc, ctx, anno)
            elif is_dialogue(anno):
                kind, feats = 'dialogue', dlg_feats(anno)
            elif is_turn(anno):
                kind, feats = 'turn', turn_feats(anno)
            elif is_turn_star(anno):
                kind, feats = 'tstar', tstar_feats(anno)
            elif anno.type in [
                    'paragraph', 'Resource', 'Anaphora',
                    'Several_resources', 'Preference'
            ]:
                # each paragraph (normally) corresponds to a Turn
                # so just ignore them ;
                # the situation is less clear-cut for 'Resource',
                # 'Anaphora', 'Several_resources'
                continue
            else:
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # raise ValueError(err_msg)
                print('W: {}'.format(err_msg))
                continue
            # merge common and specific columns ; specific columns win on
            # a key clash, as in `dict(common.items() + feats.items())`,
            # which only works on Python 2 (dict views cannot be added)
            row = dict(common_cols)
            row.update(feats)
            rows[kind].append(row)

    res = {
        anno_type: pd.DataFrame(data=row_list)
        for anno_type, row_list in rows.items()
        if row_list
    }
    return res
def shift_dialogues(doc_src, doc_res, updates, gen):
    """Transpose dialogue split from target to source document.

    Remove all dialogues from updates.

    Parameters
    ----------
    doc_src : Document
        Source (augmented) document.
    doc_res : Document
        Result document, originally a copy of doc_tgt with unshifted
        annotations. This function modifies `doc_res` by shifting the
        boundaries of its dialogues according to `updates`, and
        stretching the first and last dialogues so as to cover the
        same span as dialogues from `doc_src`. If `gen >= 3`, dialogues
        of `doc_res` that overlap the same game turn are merged: the
        first one is kept and the others are removed from
        `doc_res.units`.
    updates : set of updates
        Updates computed by `compute_updates`. Modified in place.
    gen: int
        Generation of annotations included in `doc_src` and the output.

    Returns
    -------
    updates : Updates
        Trimmed down set of `updates`: no more dialogue.
    """
    # `reduce` is not a builtin on Python 3 ; functools provides it on
    # both Python 2 (>= 2.6) and Python 3
    from functools import reduce

    # seed for the unions below: without it, reduce() raises a TypeError
    # when there is no dialogue in doc_res
    no_match = np.array([], dtype=int)

    if gen < 3:
        dlgs_src = sorted(
            [x for x in doc_src.units if x.type.lower() == 'dialogue'],
            key=lambda y: y.span)
        dlgs_res = sorted(
            [x for x in doc_res.units if x.type.lower() == 'dialogue'],
            key=lambda y: y.span)
        # NEW 2016-06-15 adjust dialogue boundaries
        # for each target dialogue, find the smallest enclosing sequence of
        # source dialogues and map to it
        dlgs_src_beg = np.array([x.span.char_start for x in dlgs_src])
        dlgs_tgt_sbeg = np.array(
            [shift_char(x.span.char_start + 1, updates) - 1
             for x in dlgs_res])
        # NB: we need to broadcast (- 1) to get the source dialogue whose
        # start immediately precedes the start of the shifted target
        # dialogue
        tgt2src_beg = (
            np.searchsorted(dlgs_src_beg, dlgs_tgt_sbeg, side='right') - 1)
        # fancy indexing copies, so the overwrites below leave
        # dlgs_src_beg untouched
        dlgs_tgt_abeg = dlgs_src_beg[tgt2src_beg]
        # map the shifted end of each target dialogue to the first larger end
        # of a source dialogue
        dlgs_src_end = np.array([x.span.char_end for x in dlgs_src])
        dlgs_tgt_send = np.array(
            [shift_char(x.span.char_end - 1, updates) + 1
             for x in dlgs_res])
        tgt2src_end = np.searchsorted(dlgs_src_end, dlgs_tgt_send)
        dlgs_tgt_aend = dlgs_src_end[tgt2src_end]
        # overwrite the adjusted beginning and end, when a game turn
        # overlaps with two different tgt dialogues ;
        # each overlap in the matching signals a split, in the linguistic
        # version, that happens in the middle of a game turn
        for i, (end_cur, beg_nxt) in enumerate(
                zip(tgt2src_end[:-1], tgt2src_beg[1:])):
            if beg_nxt <= end_cur:
                # linguistic turns from the same game turn, in different
                # target dialogues => use the shifted cut point from tgt
                dlgs_tgt_aend[i] = dlgs_tgt_send[i]
                dlgs_tgt_abeg[i + 1] = dlgs_tgt_send[i]
        # find source dialogues included in the shifted+expanded target
        # dialogues
        src_matched = set(reduce(
            np.union1d,
            (np.arange(x_beg, x_end + 1)
             for x_beg, x_end in zip(tgt2src_beg, tgt2src_end)),
            no_match))
        for dlg_res, adj_start, adj_end in zip(
                dlgs_res, dlgs_tgt_abeg, dlgs_tgt_aend):
            dlg_res.span.char_start = adj_start
            dlg_res.span.char_end = adj_end
            # alt: dlg_res.span = Span(start, end)
            #
            # optionally, update timestamp, id, span as in
            # `stac.edit.cmd.split_dialogue.{_actually_split,_set}`
    else:
        # situated version: we can rely on game turns
        # 1. get the first and last turn of each game turn in _src:
        # these turns and those in between must end up in the same
        # dialogue
        turns_src = sorted((x for x in doc_src.units if is_turn(x)),
                           key=lambda x: x.span)
        turns_src_tid = np.array(
            [x.features['Identifier'] for x in turns_src])
        turns_src_beg = np.array([x.span.char_start for x in turns_src])
        turns_src_end = np.array([x.span.char_end for x in turns_src])
        # * locate game turns (index of first and last turn)
        gturn_idc = game_turns(doc_src, turns_src, gen=3)
        gturn_idc_beg = np.array(gturn_idc)
        gturn_idc_end = np.array(
            [i - 1 for i in gturn_idc[1:]] + [len(turns_src) - 1])
        # 2. get the identifier of the first and last turn of each dialogue
        # in _res: these turns and those in between must end up in the same
        # dialogue
        turns_res = sorted((x for x in doc_res.units if is_turn(x)),
                           key=lambda x: x.span)
        turns_res_tid = np.array(
            [x.features['Identifier'] for x in turns_res])
        turns_res_beg = np.array([x.span.char_start for x in turns_res])
        turns_res_end = np.array([x.span.char_end for x in turns_res])
        # align dialogue spans with turn spans
        dlgs_res = sorted((x for x in doc_res.units if is_dialogue(x)),
                          key=lambda x: x.span)
        dlgs_res_beg = np.array([x.span.char_start for x in dlgs_res])
        dlgs_res_end = np.array([x.span.char_end for x in dlgs_res])
        dlgs_res_ti_beg = np.searchsorted(turns_res_beg, dlgs_res_beg)
        dlgs_res_ti_end = np.searchsorted(
            turns_res_end, dlgs_res_end, side='right') - 1
        # ... and finally
        dlgs_res_tid_beg = turns_res_tid[dlgs_res_ti_beg]
        dlgs_res_tid_end = turns_res_tid[dlgs_res_ti_end]
        # 3. map _res dialogues to _src game turns
        # (build the list of source turn ids once, not once per lookup ;
        # .index() raises ValueError for a turn id absent from _src)
        src_tids = list(turns_src_tid)
        dlgs_res_ti_beg = np.array(
            [src_tids.index(x) for x in dlgs_res_tid_beg])
        dlgs_res_ti_end = np.array(
            [src_tids.index(x) for x in dlgs_res_tid_end])
        # * align the beginning (resp. end) indices of game turns and _res
        # dialogues
        dlg2gturn_beg = (
            np.searchsorted(gturn_idc_beg, dlgs_res_ti_beg, side='right')
            - 1)
        dlg2gturn_end = np.searchsorted(gturn_idc_end, dlgs_res_ti_end)
        # * turn indices of the adjusted beginning and end of the _res
        # dialogues
        # initialize along the boundaries of game turns
        dlg_res_src_abeg = [gturn_idc_beg[i] for i in dlg2gturn_beg]
        dlg_res_src_aend = [gturn_idc_end[i] for i in dlg2gturn_end]
        # 4. make dialogue boundaries coincide with game turn boundaries,
        # which occasionally implies merging dialogues from _res
        # * compute a partition on dialogues such that any pair of dialogues
        # overlapping a given game turn are in the same class ;
        # no dialogue => no class (avoids indexing a phantom dialogue 0)
        dlg2grp = [0] if dlgs_res else []
        for gturn_end_cur, gturn_beg_nxt in zip(
                dlg2gturn_end[:-1], dlg2gturn_beg[1:]):
            if gturn_beg_nxt <= gturn_end_cur:
                # two _res dialogues overlap a single game turn:
                # put in the same class (to merge dialogues)
                dlg2grp.append(dlg2grp[-1])
            else:
                dlg2grp.append(dlg2grp[-1] + 1)
        # keep one dialogue for each class of dialogues
        for _, grp in itertools.groupby(enumerate(dlg2grp),
                                        key=lambda x: x[1]):
            dlg_idc_merged = [x[0] for x in grp]
            # adjust boundaries of the first dialogue of the group
            # index of first and last dialogues
            di_beg = dlg_idc_merged[0]
            di_end = dlg_idc_merged[-1]
            # index of first and last turns of these dialogues
            ti_beg = dlg_res_src_abeg[di_beg]
            ti_end = dlg_res_src_aend[di_end]
            # keep first dialogue, update its features to include those
            # from the other dialogues in the same class
            new_dlg = dlgs_res[di_beg]
            new_dlg.span.char_start = turns_src_beg[ti_beg]
            new_dlg.span.char_end = turns_src_end[ti_end]
            dlgs_res_merged = [dlgs_res[i] for i in dlg_idc_merged]
            for feat in ['Trades', 'Gets', 'Dice_rolling']:
                new_dlg.features[feat] = _concatenate_features(
                    dlgs_res_merged, feat)
            # remove merged dialogues [1:] from doc_res
            for i in dlg_idc_merged[1:]:
                doc_res.units.remove(dlgs_res[i])
        # transfer each unmatched (non-overlapping) game turn as a dialogue
        # (which already exists in doc_src)
        src_matched = set(reduce(
            np.union1d,
            (np.arange(x_beg, x_end + 1)
             for x_beg, x_end in zip(dlg2gturn_beg, dlg2gturn_end)),
            no_match))
        # each dialogue in doc_src is a game turn
        dlgs_src = sorted((x for x in doc_src.units if is_dialogue(x)),
                          key=lambda x: x.span)

    # remove all source and target dialogues from updates
    # (shared by both branches ; `dlgs_res` still includes the dialogues
    # merged away above)
    for dlg_res in dlgs_res:
        if dlg_res in updates.abnormal_tgt_only:
            updates.abnormal_tgt_only.remove(dlg_res)
    for i, dlg_src in enumerate(dlgs_src):
        if dlg_src in updates.abnormal_src_only:
            updates.abnormal_src_only.remove(dlg_src)
        if i in src_matched and dlg_src in updates.expected_src_only:
            # remove matched source dialogues, leave the unmatched
            # ones in expected_src_only, so that they are added later
            # to the woven document
            updates.expected_src_only.remove(dlg_src)
    return updates
def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Parameters
    ----------
    corpus: dict
        Mapping from file identifiers (with attributes `doc`, `subdoc`,
        `stage`, `annotator`) to documents.

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus.
        Kinds are among 'edu', 'turn', 'tstar', 'dialogue', 'cdu',
        'rel' ; kinds with no annotation are left out.
    """
    rows = {anno_type: []
            for anno_type in ['edu', 'turn', 'tstar', 'dialogue',
                              'cdu', 'rel']}

    for file_id, doc in corpus.items():
        # common stuff: get general info (doc, subdoc, annotator)
        doc_name = file_id.doc
        subdoc_name = file_id.subdoc
        stage = file_id.stage
        annotator = file_id.annotator
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            common_cols = {
                'anno_id': anno.identifier(),
                'doc': doc_name,
                'subdoc': subdoc_name,
                'stage': stage,
                'annotator': annotator,
                'type': anno.type,  # ? maybe not
            }
            # pick the kind of row and its specific columns
            if is_edu(anno):
                kind, feats = 'edu', edu_feats(doc, ctx, anno)
            elif is_cdu(anno):
                kind, feats = 'cdu', cdu_feats(anno)
            elif is_relation_instance(anno):
                kind, feats = 'rel', rel_feats(doc, ctx, anno)
            elif is_dialogue(anno):
                kind, feats = 'dialogue', dlg_feats(anno)
            elif is_turn(anno):
                kind, feats = 'turn', turn_feats(anno)
            elif is_turn_star(anno):
                kind, feats = 'tstar', tstar_feats(anno)
            elif anno.type in ['paragraph', 'Resource', 'Anaphora',
                               'Several_resources', 'Preference']:
                # each paragraph (normally) corresponds to a Turn
                # so just ignore them ;
                # the situation is less clear-cut for 'Resource',
                # 'Anaphora', 'Several_resources'
                continue
            else:
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # raise ValueError(err_msg)
                print('W: {}'.format(err_msg))
                continue
            # merge common and specific columns ; specific columns win on
            # a key clash, as in `dict(common.items() + feats.items())`,
            # which only works on Python 2 (dict views cannot be added)
            row = dict(common_cols)
            row.update(feats)
            rows[kind].append(row)

    res = {anno_type: pd.DataFrame(data=row_list)
           for anno_type, row_list in rows.items()
           if row_list}
    return res
def read_game_as_dataframes(game_folder, sel_annotator=None, thorough=True):
    """Read an annotated game as dataframes.

    Parameters
    ----------
    game_folder : path
        Path to the game folder.
    sel_annotator : str, optional
        Identifier of the annotator whose version we want. If `None`,
        the existing metal annotator will be used (BRONZE|SILVER|GOLD).
    thorough : boolean, defaults to True
        If True, check that annotations in 'units' and 'unannotated'
        that are expected to have a strict equivalent in 'discourse'
        actually do. These checks are currently placeholders (see the
        FIXME markers), so this flag has no effect yet.

    Returns
    -------
    dfs : tuple of DataFrame
        DataFrames for the annotated game, in this order: turns,
        dialogues, segments, dialogue acts, schemas, schema members,
        relations, resources, preferences.

    Raises
    ------
    ValueError
        If a discourse segment or a preference has features, if a
        schema has unexpected features or relation members, or if a
        unit is of an unknown kind.
    """
    if sel_annotator is None:
        sel_annotator = 'metal'

    # rows are accumulated as lists of dicts, converted at the end
    df_turns = []  # turns
    df_segs = []  # segments: EDUs, EEUs
    df_dlgs = []  # dialogues
    df_schms = []  # schemas: CDUs
    df_schm_mbrs = []  # schema members
    df_rels = []  # relations
    df_acts = []  # dialogue acts
    df_res = []  # resources
    df_pref = []  # preferences

    print(game_folder)  # DEBUG
    game_upfolder, game_name = os.path.split(game_folder)
    game_corpus = StacReader(game_upfolder).slurp(doc_glob=game_name)
    for doc_key, doc_val in sorted(game_corpus.items()):
        doc = doc_key.doc
        subdoc = doc_key.subdoc
        stage = doc_key.stage
        annotator = doc_key.annotator
        # skip docs not from a selected annotator
        if ((sel_annotator == 'metal' and
             annotator not in ('BRONZE', 'SILVER', 'GOLD')) or
            (sel_annotator != 'metal' and
             annotator != sel_annotator)):
            continue

        # process annotations in doc
        # UNITS
        for anno in doc_val.units:
            # attributes common to all units
            unit_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type, span, text
                'type': anno.type,
                'span_beg': anno.span.char_start,
                'span_end': anno.span.char_end,
                'text': doc_val.text(span=anno.span),
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional?
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get(
                    'lastModificationDate', None),
            }

            # fields specific to each type of unit
            if is_paragraph(anno):
                # paragraph: ignore? one per turn
                pass
            elif is_turn(anno):
                # turn
                # comments = anno.features['Comments']
                # if comments == 'Please write in remarks...':
                unit_dict.update({
                    # features
                    'timestamp': anno.features['Timestamp'],
                    'comments': anno.features['Comments'],
                    'developments': anno.features['Developments'],
                    'turn_id': anno.features['Identifier'],
                    'emitter': anno.features['Emitter'],
                    'resources': anno.features['Resources'],
                })
                # only the 'discourse' version of a turn is stored
                if stage == 'discourse':
                    df_turns.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_edu(anno):
                # segment: EDU or EEU
                if stage == 'discourse':
                    if anno.features:
                        raise ValueError(
                            'Wow, a discourse segment has *features*')
                    df_segs.append(unit_dict)
                elif stage == 'units':
                    # each entry (should) correspond to an entry in df_segs
                    act_dict = {
                        'global_id': anno.identifier(),  # foreign key
                        'surface_act': anno.features['Surface_act'],
                        'addressee': anno.features['Addressee'],
                    }
                    assert (sorted(anno.features.keys()) ==
                            ['Addressee', 'Surface_act'])
                    df_acts.append(act_dict)
                if thorough and stage in ('units', 'unannotated'):
                    # maybe metadata in 'units' has changed? eg. last
                    # modification date, last modifier
                    pass  # FIXME check existence (exact duplicate)
            elif is_dialogue(anno):
                expected_dlg_features = set(
                    ['Dice_rolling', 'Gets', 'Trades'])
                if set(anno.features.keys()).issubset(expected_dlg_features):
                    unit_dict.update({
                        # features
                        'gets': anno.features.get('Gets', None),
                        'trades': anno.features.get('Trades', None),
                        'dice_rolls': anno.features.get(
                            'Dice_rolling', None),
                    })
                else:
                    # the dialogue is still stored below, but without
                    # its feature columns
                    warn_msg = 'Dialogue {}: unexpected features {}'.format(
                        anno.identifier(),
                        ', '.join(x for x in sorted(anno.features.keys())
                                  if x not in expected_dlg_features))
                    warnings.warn(warn_msg)

                if stage == 'discourse':
                    df_dlgs.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_resource(anno):
                unit_dict.update({
                    # features
                    'status': anno.features['Status'],
                    'kind': anno.features['Kind'],
                    'correctness': anno.features['Correctness'],
                    'quantity': anno.features['Quantity'],
                })
                assert (sorted(anno.features.keys()) ==
                        ['Correctness', 'Kind', 'Quantity', 'Status'])
                df_res.append(unit_dict)
            elif is_preference(anno):
                if anno.features:
                    print(anno.__dict__)
                    raise ValueError('Preference with features {}'.format(
                        anno.features))
                df_pref.append(unit_dict)
            else:
                print(anno.__dict__)
                raise ValueError('what unit is this?')
            # print('Unit', anno)

        # SCHEMAS
        for anno in doc_val.schemas:
            # in 'discourse': CDUs ;
            # in 'units': combinations of resources (OR, AND)
            schm_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional? metadata
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get(
                    'lastModificationDate', None),
            }
            # assumption: no feature
            if anno.features:
                # compare the keys as a list: on python 3, keys() is a
                # view that never equals a list
                feat_names = list(anno.features.keys())
                if stage == 'units':
                    if feat_names == ['Operator']:
                        schm_dict.update({
                            'operator': anno.features['Operator'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
                elif stage == 'discourse':
                    # tolerate 'default': 'default' for the moment, but
                    # should probably cleaned out
                    if feat_names == ['default']:
                        schm_dict.update({
                            'default': anno.features['default'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
            df_schms.append(schm_dict)
            # associate to this schema each of its members ; assumptions:
            # - members should be units or schemas (no relation)
            if anno.relations:
                raise ValueError('Wow, a schema with *relation members*')
            for member in anno.members:
                member_dict = {
                    'member_id': member.identifier(),
                    'schema_id': anno.identifier(),
                }
                df_schm_mbrs.append(member_dict)
            # TODO post-verification: check that all members do exist
            # (should be useless as stac-check should catch it)

        # RELATIONS
        for anno in doc_val.relations:
            # attributes common to all(?) types of annotations
            # NOTE(review): unlike units and schemas, the modification
            # metadata is read by direct indexing here, so a missing
            # key raises KeyError -- confirm this is intended
            rel_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'last_modifier': anno.metadata['lastModifier'],
                'last_modif_date': anno.metadata['lastModificationDate'],
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
            }
            # attributes specific to relations
            if 'Argument_scope' not in anno.features:
                # required feature
                w_msg = '{}: relation {} has no Argument_scope'.format(
                    str(doc_key), anno.identifier())
                warnings.warn(w_msg)

            rel_dict.update({
                # features
                'arg_scope': anno.features.get('Argument_scope', None),  # req
                'comments': anno.features.get('Comments', None),  # opt
                # endpoints
                'source': anno.source.identifier(),
                'target': anno.target.identifier(),
            })
            df_rels.append(rel_dict)

    # create dataframes
    df_turns = pd.DataFrame(df_turns, columns=TURN_COLS)
    df_dlgs = pd.DataFrame(df_dlgs, columns=DLG_COLS)
    df_segs = pd.DataFrame(df_segs, columns=SEG_COLS)
    df_acts = pd.DataFrame(df_acts, columns=ACT_COLS)
    df_schms = pd.DataFrame(df_schms, columns=SCHM_COLS)
    df_schm_mbrs = pd.DataFrame(df_schm_mbrs, columns=SCHM_MBRS_COLS)
    df_rels = pd.DataFrame(df_rels, columns=REL_COLS)
    df_res = pd.DataFrame(df_res, columns=RES_COLS)
    df_pref = pd.DataFrame(df_pref, columns=PREF_COLS)

    # add columns computed from other dataframes
    # * for segments: retrieve the turn_id and the char positions of the
    # beg and end of the segment in the turn text
    def get_seg_turn_cols(seg):
        """Helper to retrieve turn info for a segment (EDU, EEU).

        Returns a Series with the `turn_id` of the enclosing turn and
        the position of the segment text inside the turn text.
        """
        doc = seg['doc']
        subdoc = seg['subdoc']
        seg_beg = seg['span_beg']
        seg_end = seg['span_end']
        cand_turns = df_turns[(df_turns['span_beg'] <= seg_beg) &
                              (seg_end <= df_turns['span_end']) &
                              (doc == df_turns['doc']) &
                              (subdoc == df_turns['subdoc'])]
        # NB: cand_turns should contain a unique turn ; `.item()` raises
        # if it does not
        # compute the beg and end (char) positions of the segment in the turn
        # so we can match between the situated and linguistic versions when
        # the segmentation has changed
        turn_text = cand_turns['text'].item()
        seg_text = seg['text']
        # str.find returns -1 if the segment text is not in the turn text
        turn_span_beg = turn_text.find(seg_text)
        turn_span_end = turn_span_beg + len(seg_text)
        turn_dict = {
            'turn_id': cand_turns['turn_id'].item(),
            'turn_span_beg': turn_span_beg,
            'turn_span_end': turn_span_end,
        }
        return pd.Series(turn_dict)

    seg_turn_cols = df_segs.apply(get_seg_turn_cols, axis=1)
    df_segs = pd.concat([df_segs, seg_turn_cols], axis=1)

    return (df_turns, df_dlgs, df_segs, df_acts, df_schms, df_schm_mbrs,
            df_rels, df_res, df_pref)
def read_game_as_dataframes(game_folder, sel_annotator=None, thorough=True,
                            strip_cdus=False, attach_len=False):
    """Read an annotated game as dataframes.

    Parameters
    ----------
    game_folder : path
        Path to the game folder.
    sel_annotator : str, optional
        Identifier of the annotator whose version we want. If `None`,
        the existing metal annotator will be used (BRONZE|SILVER|GOLD).
    thorough : boolean, defaults to True
        If True, check that annotations in 'units' and 'unannotated'
        that are expected to have a strict equivalent in 'discourse'
        actually do. These checks are currently placeholders (see the
        FIXME markers), so this flag has no effect yet.
    strip_cdus : boolean, defaults to False
        If True, strip CDUs with the "head" strategy and sloppy=True.
    attach_len : boolean, defaults to False
        If True, compute attachment length. This requires
        strip_cdus=True ; otherwise it is silently ignored.

    Returns
    -------
    dfs : tuple of DataFrame
        DataFrames for the annotated game, in this order: turns,
        dialogues, segments, dialogue acts, schemas, schema members,
        discourse relations, resources, preferences, relations from
        the 'units' stage.

    Raises
    ------
    ValueError
        If a discourse segment or a preference has features, if a
        schema has unexpected features or relation members, if a unit
        is of an unknown kind, or if a relation comes from a stage
        other than 'units' or 'discourse'.
    """
    if sel_annotator is None:
        sel_annotator = 'metal'

    # rows are accumulated as lists of dicts, converted at the end
    df_turns = []  # turns
    df_segs = []  # segments: EDUs, EEUs
    df_dlgs = []  # dialogues
    df_schms = []  # schemas: CDUs
    df_schm_mbrs = []  # schema members
    df_disc_rels = []  # discourse relations
    df_acts = []  # dialogue acts
    df_res = []  # resources
    df_pref = []  # preferences
    df_unit_rels = []  # relations from the "units" stage (anaphora)

    print(game_folder)  # DEBUG
    game_upfolder, game_name = os.path.split(game_folder)
    game_corpus = StacReader(game_upfolder).slurp(doc_glob=game_name)
    # give integer indices to segments, and EDUs in particular
    seg_idx = 0
    eeu_idx = 0
    edu_idx = 0
    for doc_key, doc_val in sorted(game_corpus.items()):
        doc = doc_key.doc
        subdoc = doc_key.subdoc
        stage = doc_key.stage
        annotator = doc_key.annotator
        # skip docs not from a selected annotator
        if ((sel_annotator == 'metal' and
             annotator not in ('BRONZE', 'SILVER', 'GOLD')) or
            (sel_annotator != 'metal' and
             annotator != sel_annotator)):
            continue

        # process annotations in doc
        # UNITS
        # units are visited in span order so that segment indices
        # follow the order of the text
        for anno in sorted(doc_val.units, key=lambda x: x.span):
            # attributes common to all units
            unit_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type, span, text
                'type': anno.type,
                'span_beg': anno.span.char_start,
                'span_end': anno.span.char_end,
                'text': doc_val.text(span=anno.span),
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional?
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get(
                    'lastModificationDate', None),
            }

            # fields specific to each type of unit
            if is_paragraph(anno):
                # paragraph: ignore? one per turn
                pass
            elif is_turn(anno):
                # turn
                # comments = anno.features['Comments']
                # if comments == 'Please write in remarks...':
                unit_dict.update({
                    # features
                    'timestamp': anno.features['Timestamp'],
                    'comments': anno.features['Comments'],
                    'developments': anno.features['Developments'],
                    'turn_id': anno.features['Identifier'],
                    'emitter': anno.features['Emitter'],
                    'resources': anno.features['Resources'],
                })
                # only the 'discourse' version of a turn is stored
                if stage == 'discourse':
                    df_turns.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_edu(anno):
                # segment: EDU or EEU
                if stage == 'discourse':
                    if anno.features:
                        raise ValueError(
                            'Wow, a discourse segment has *features*')
                    # assign index among segments, across the whole doc
                    unit_dict['seg_idx'] = seg_idx
                    seg_idx += 1
                    if anno.type == 'NonplayerSegment':  # EEU
                        unit_dict['eeu_idx'] = eeu_idx
                        eeu_idx += 1
                    else:  # EDU
                        unit_dict['edu_idx'] = edu_idx
                        edu_idx += 1
                    #
                    df_segs.append(unit_dict)
                elif stage == 'units':
                    # each entry (should) correspond to an entry in df_segs
                    act_dict = {
                        'global_id': anno.identifier(),  # foreign key
                        'surface_act': anno.features['Surface_act'],
                        'addressee': anno.features['Addressee'],
                    }
                    assert (sorted(anno.features.keys()) ==
                            ['Addressee', 'Surface_act'])
                    df_acts.append(act_dict)
                if thorough and stage in ('units', 'unannotated'):
                    # maybe metadata in 'units' has changed? eg. last
                    # modification date, last modifier
                    pass  # FIXME check existence (exact duplicate)
            elif is_dialogue(anno):
                expected_dlg_features = set(
                    ['Dice_rolling', 'Gets', 'Trades'])
                if set(anno.features.keys()).issubset(expected_dlg_features):
                    unit_dict.update({
                        # features
                        'gets': anno.features.get('Gets', None),
                        'trades': anno.features.get('Trades', None),
                        'dice_rolls': anno.features.get(
                            'Dice_rolling', None),
                    })
                else:
                    # the dialogue is still stored below, but without
                    # its feature columns
                    warn_msg = 'Dialogue {}: unexpected features {}'.format(
                        anno.identifier(),
                        ', '.join(x for x in sorted(anno.features.keys())
                                  if x not in expected_dlg_features))
                    warnings.warn(warn_msg)

                if stage == 'discourse':
                    df_dlgs.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_resource(anno):
                unit_dict.update({
                    # features
                    'status': anno.features['Status'],
                    'kind': anno.features['Kind'],
                    'correctness': anno.features['Correctness'],
                    'quantity': anno.features['Quantity'],
                })
                assert (sorted(anno.features.keys()) ==
                        ['Correctness', 'Kind', 'Quantity', 'Status'])
                df_res.append(unit_dict)
            elif is_preference(anno):
                if anno.features:
                    print(anno.__dict__)
                    raise ValueError('Preference with features {}'.format(
                        anno.features))
                df_pref.append(unit_dict)
            else:
                print(anno.__dict__)
                raise ValueError('what unit is this?')
            # print('Unit', anno)

        # SCHEMAS
        for anno in doc_val.schemas:
            # in 'discourse': CDUs ;
            # in 'units': combinations of resources (OR, AND)
            schm_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional? metadata
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get(
                    'lastModificationDate', None),
            }
            # assumption: no feature
            if anno.features:
                # compare the keys as a list: on python 3, keys() is a
                # view that never equals a list
                feat_names = list(anno.features.keys())
                if stage == 'units':
                    if feat_names == ['Operator']:
                        schm_dict.update({
                            'operator': anno.features['Operator'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
                elif stage == 'discourse':
                    # tolerate 'default': 'default' for the moment, but
                    # should probably cleaned out
                    if feat_names == ['default']:
                        schm_dict.update({
                            'default': anno.features['default'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
            df_schms.append(schm_dict)
            # associate to this schema each of its members ; assumptions:
            # - members should be units or schemas (no relation)
            if anno.relations:
                raise ValueError('Wow, a schema with *relation members*')
            for member in anno.members:
                member_dict = {
                    'member_id': member.identifier(),
                    'schema_id': anno.identifier(),
                }
                df_schm_mbrs.append(member_dict)
            # TODO post-verification: check that all members do exist
            # (should be useless as stac-check should catch it)

        # RELATIONS
        # * rewrite endpoints of relations if strip_cdus
        if strip_cdus:
            endpts = dict()  # map relation ids to (src_id, tgt_id)
            dgr = Graph.from_doc(game_corpus, doc_key)
            # work on a copy so the graph built from the corpus is left
            # untouched by strip_cdus
            dgraph = copy.deepcopy(dgr)
            dgraph.strip_cdus(sloppy=True, mode='head')
            for edge in dgraph.relations():
                # NOTE(review): hard-coded trap on one annotation id,
                # looks like a debugging leftover -- confirm whether it
                # can be removed
                if "asoubeille_1414085458642" in edge:
                    print('Wop', edge)
                    raise ValueError('gni')
                links = dgraph.links(edge)
                # get the identifiers of the relation and its endpoints
                # to replace CDU ids with segment indices
                anno_rel = dgraph.annotation(edge)
                # as of 2017-06-24, anno_rel has no origin (why?) at
                # this point
                anno_rel.origin = doc_key  # temporary(?) fix
                #
                anno_src = dgraph.annotation(links[0])
                anno_tgt = dgraph.annotation(links[1])
                gid_rel = anno_rel.identifier()
                if gid_rel.endswith('_0'):
                    # strip_cdus appends an integer to each copy of
                    # the relation ; with mode="head", we only expect
                    # one such copy per relation so "_0" should be a
                    # sufficient match, which we can cut off for the
                    # mapping
                    gid_rel = gid_rel[:-2]
                gid_src = anno_src.identifier()
                gid_tgt = anno_tgt.identifier()
                endpts[gid_rel] = (gid_src, gid_tgt)

        # * process relations
        for anno in doc_val.relations:
            # attributes common to all(?) types of annotations
            # * global ids of the relation and its endpoints
            gid_rel = anno.identifier()
            gid_src = anno.source.identifier()
            gid_tgt = anno.target.identifier()
            # * build dict
            # NOTE(review): unlike units and schemas, the modification
            # metadata is read by direct indexing here, so a missing
            # key raises KeyError -- confirm this is intended
            rel_dict = {
                # identification
                'global_id': gid_rel,
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'last_modifier': anno.metadata['lastModifier'],
                'last_modif_date': anno.metadata['lastModificationDate'],
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
            }
            # attributes specific to relations
            if 'Argument_scope' not in anno.features:
                # required feature
                w_msg = '{}: relation {} has no Argument_scope'.format(
                    str(doc_key), anno.identifier())
                warnings.warn(w_msg)

            # if strip_cdus, replace endpoints of *discourse* relations
            # with segment ids
            if strip_cdus and is_relation_instance(anno):
                gid_src, gid_tgt = endpts[gid_rel]

            rel_dict.update({
                # features
                'arg_scope': anno.features.get('Argument_scope', None),  # req
                'comments': anno.features.get('Comments', None),  # opt
                # endpoints
                'source': gid_src,
                'target': gid_tgt,
            })
            if stage == 'discourse':
                df_disc_rels.append(rel_dict)
            elif stage == 'units':
                df_unit_rels.append(rel_dict)
            else:
                raise ValueError(
                    "relation from stage not in {'units', 'discourse'}")

    # create dataframes
    df_turns = pd.DataFrame(df_turns, columns=TURN_COLS)
    df_dlgs = pd.DataFrame(df_dlgs, columns=DLG_COLS)
    df_segs = pd.DataFrame(df_segs, columns=SEG_COLS)
    df_acts = pd.DataFrame(df_acts, columns=ACT_COLS)
    df_schms = pd.DataFrame(df_schms, columns=SCHM_COLS)
    df_schm_mbrs = pd.DataFrame(df_schm_mbrs, columns=SCHM_MBRS_COLS)
    df_disc_rels = pd.DataFrame(df_disc_rels, columns=REL_COLS)
    df_unit_rels = pd.DataFrame(df_unit_rels, columns=REL_COLS)
    df_res = pd.DataFrame(df_res, columns=RES_COLS)
    df_pref = pd.DataFrame(df_pref, columns=PREF_COLS)

    # add columns computed from other dataframes
    # * for segments: retrieve the turn_id and the char positions of the
    # beg and end of the segment in the turn text
    def get_seg_turn_cols(seg):
        """Helper to retrieve turn info for a segment (EDU, EEU).

        Returns a Series with the `turn_id` of the enclosing turn and
        the position of the segment text inside the turn text.
        """
        doc = seg['doc']
        subdoc = seg['subdoc']
        seg_beg = seg['span_beg']
        seg_end = seg['span_end']
        cand_turns = df_turns[(df_turns['span_beg'] <= seg_beg) &
                              (seg_end <= df_turns['span_end']) &
                              (doc == df_turns['doc']) &
                              (subdoc == df_turns['subdoc'])]
        # NB: cand_turns should contain a unique turn ; `.item()` raises
        # if it does not
        # compute the beg and end (char) positions of the segment in the turn
        # so we can match between the situated and linguistic versions when
        # the segmentation has changed
        turn_text = cand_turns['text'].item()
        seg_text = seg['text']
        # str.find returns -1 if the segment text is not in the turn text
        turn_span_beg = turn_text.find(seg_text)
        turn_span_end = turn_span_beg + len(seg_text)
        turn_dict = {
            'turn_id': cand_turns['turn_id'].item(),
            'turn_span_beg': turn_span_beg,
            'turn_span_end': turn_span_end,
        }
        return pd.Series(turn_dict)

    seg_turn_cols = df_segs.apply(get_seg_turn_cols, axis=1)
    df_segs = pd.concat([df_segs, seg_turn_cols], axis=1)

    # * length of attachments
    # 2017-06-29 restricted to *discourse* relations, for the time being
    if strip_cdus and attach_len:
        df_disc_rels = compute_rel_attributes(df_segs, df_disc_rels)

    return (df_turns, df_dlgs, df_segs, df_acts, df_schms, df_schm_mbrs,
            df_disc_rels, df_res, df_pref, df_unit_rels)
def _fix_dialogue_boundaries(tcache, doc_ling, doc_situ):
    """Do the job.

    All dialogues of `doc_situ` are removed and replaced with:
    * one dialogue per group of `doc_ling` dialogues that overlap the
      same game turn(s), with boundaries aligned on game turn
      boundaries and features concatenated over the group,
    * one feature-less dialogue per game turn that no `doc_ling`
      dialogue overlaps.

    Parameters
    ----------
    tcache: TimestampCache
        Timestamp cache to generate unit identifiers for new dialogues.
    doc_ling: GlozzDocument
        Linguistic version of the game.
    doc_situ: GlozzDocument
        Situated version of the game ; modified in place. It must
        contain at least one dialogue if some game turn is unmatched,
        because new dialogues are created by copying its first one.

    Returns
    -------
    doc_situ: GlozzDocument
        Fixed version of doc_situ.
    """
    doc_key = doc_situ.origin

    # 1. get the identifier of the first and last turn of each game turn
    # in _situ: these turns and those in between must end up in the same
    # dialogue
    turns_situ = sorted((x for x in doc_situ.units if is_turn(x)),
                        key=lambda x: x.span)
    turns_situ_tid = np.array([x.features['Identifier']
                               for x in turns_situ])
    turns_situ_beg = np.array([x.span.char_start for x in turns_situ])
    turns_situ_end = np.array([x.span.char_end for x in turns_situ])
    # * locate game turns (index of first and last turn)
    gturn_idc = game_turns(doc_situ, turns_situ, gen=3)
    gturn_idc_beg = np.array(gturn_idc)
    # a game turn ends right before the next one begins ; the last game
    # turn ends with the last turn
    gturn_idc_end = np.array(
        [i - 1 for i in gturn_idc[1:]] + [len(turns_situ) - 1])

    # 2. get the identifier of the first and last turn of each dialogue in
    # _ling: these turns and those in between must end up in the same
    # dialogue
    turns_ling = sorted((x for x in doc_ling.units if is_turn(x)),
                        key=lambda x: x.span)
    # DIRTY special processing for pilot02_01
    if doc_key.doc == 'pilot02' and doc_key.subdoc == '01':
        # ignore turns 26-27 that were moved down from _01 to _02
        turns_ling = turns_ling[:-2]
    turns_ling_tid = np.array([x.features['Identifier']
                               for x in turns_ling])
    turns_ling_beg = np.array([x.span.char_start for x in turns_ling])
    turns_ling_end = np.array([x.span.char_end for x in turns_ling])
    # align dialogue spans with turn spans
    dlgs_ling = sorted((x for x in doc_ling.units if is_dialogue(x)),
                       key=lambda x: x.span)
    # DIRTY
    if doc_key.doc == 'pilot02' and doc_key.subdoc == '01':
        # turns 26-27 are in the last dialogue in _01, in _ling
        dlgs_ling = dlgs_ling[:-1]
    dlgs_ling_beg = np.array([x.span.char_start for x in dlgs_ling])
    dlgs_ling_end = np.array([x.span.char_end for x in dlgs_ling])
    dlgs_ling_ti_beg = np.searchsorted(turns_ling_beg, dlgs_ling_beg)
    dlgs_ling_ti_end = np.searchsorted(turns_ling_end, dlgs_ling_end,
                                       side='right') - 1
    # ... and finally
    dlgs_ling_tid_beg = turns_ling_tid[dlgs_ling_ti_beg]
    dlgs_ling_tid_end = turns_ling_tid[dlgs_ling_ti_end]

    # 3. map _ling dialogues to _situ game turns
    # * locate the first and last turn of each _ling dialogue in the
    # list of turns in _situ
    # NB: we don't need indices in the list of turns from _ling anymore
    # hence it is safe to overwrite dlgs_ling_ti_{beg,end}
    # (the list of _situ turn ids is built once for all lookups ;
    # `index` raises ValueError if a _ling turn is absent from _situ)
    situ_tids = list(turns_situ_tid)
    dlgs_ling_ti_beg = np.array(
        [situ_tids.index(x) for x in dlgs_ling_tid_beg])
    dlgs_ling_ti_end = np.array(
        [situ_tids.index(x) for x in dlgs_ling_tid_end])
    # * align the beginning (resp. end) indices of game turns and _ling
    # dialogues
    dlg2gturn_beg = (np.searchsorted(gturn_idc_beg, dlgs_ling_ti_beg,
                                     side='right') - 1)
    dlg2gturn_end = np.searchsorted(gturn_idc_end, dlgs_ling_ti_end)
    # * turn indices of the adjusted beginning and end of the _ling
    # dialogues
    # initialize along the boundaries of game turns
    dlg_ling_situ_abeg = [gturn_idc_beg[i] for i in dlg2gturn_beg]
    dlg_ling_situ_aend = [gturn_idc_end[i] for i in dlg2gturn_end]

    # 4. make dialogue boundaries coincide with game turn boundaries,
    # which occasionally implies merging dialogues from _ling
    # * compute a partition on dialogues such that any pair of
    # dialogues overlapping a given game turn are in the same
    # class
    # (no class at all if there is no _ling dialogue)
    dlg2grp = [0] if len(dlg2gturn_beg) > 0 else []
    for gturn_end_cur, gturn_beg_nxt in zip(dlg2gturn_end[:-1],
                                            dlg2gturn_beg[1:]):
        if gturn_beg_nxt <= gturn_end_cur:
            # two _ling dialogues overlap a single game turn:
            # put in the same class (to merge dialogues)
            dlg2grp.append(dlg2grp[-1])
        else:
            dlg2grp.append(dlg2grp[-1] + 1)

    # remove all dialogues from the units in doc_situ,
    # they will be replaced with (hopefully) clean ones
    dlgs_situ = sorted((x for x in doc_situ.units if is_dialogue(x)),
                       key=lambda x: x.span)
    for dlg_situ in dlgs_situ:
        doc_situ.units.remove(dlg_situ)

    # create one dialogue for each class of dialogues
    for _, grp in itertools.groupby(enumerate(dlg2grp),
                                    key=lambda x: x[1]):
        dlg_idc_merged = [x[0] for x in grp]
        # adjust boundaries of the first dialogue of the group
        # index of first and last dialogues
        di_beg = dlg_idc_merged[0]
        di_end = dlg_idc_merged[-1]
        # index of first and last turns of these dialogues
        ti_beg = dlg_ling_situ_abeg[di_beg]
        ti_end = dlg_ling_situ_aend[di_end]
        # create dialogue, use the 1st _ling dialogue as basis then
        # customize
        dlg0 = dlgs_ling[di_beg]
        new_dlg = copy.deepcopy(dlg0)
        new_dlg.origin = doc_key
        new_dlg.span.char_start = turns_situ_beg[ti_beg]
        new_dlg.span.char_end = turns_situ_end[ti_end]
        dlgs_ling_merged = [dlgs_ling[i] for i in dlg_idc_merged]
        for feat in ['Trades', 'Gets', 'Dice_rolling']:
            new_dlg.features[feat] = _concatenate_features(
                dlgs_ling_merged, feat)
        # add the new dialogue to doc_situ
        doc_situ.units.append(new_dlg)

    # create a new dialogue for each unmatched (non-overlapping) game
    # turn
    # * indices of the game turns overlapped by a _ling dialogue ;
    # accumulated in a plain set so that an empty list of dialogues
    # simply yields an empty set
    gturns_matched = set()
    for gt_beg, gt_end in zip(dlg2gturn_beg, dlg2gturn_end):
        gturns_matched.update(range(gt_beg, gt_end + 1))

    for i, (gturn_idx_beg, gturn_idx_end) in enumerate(zip(
            gturn_idc_beg, gturn_idc_end)):
        if i not in gturns_matched:
            new_dlg_span = Span(turns_situ_beg[gturn_idx_beg],
                                turns_situ_end[gturn_idx_end])
            # UGLY this works just like split_dialogue:
            # create a new dialogue by copying an existing dialogue,
            # re-assign it an annotation id and span using a timestamp
            # cache, then erase all features
            new_dlg = copy.deepcopy(dlgs_situ[0])
            _set(tcache, new_dlg_span, new_dlg)
            new_dlg.features = {}
            # ... "et voila": add this dialogue to the document
            doc_situ.units.append(new_dlg)
            # TODO restore dialogue features from the game events?

    return doc_situ