def _fuse_corpus(corpus, postags):
    """Merge any dialogue/unit level documents together (in place).

    Documents at the 'unannotated' or 'units' stage are fused with
    themselves; 'discourse' documents are fused with their 'units'
    twin, whose corpus entry is then removed.
    """
    stale_unit_keys = []
    for key in corpus:
        stage = key.stage
        if stage in ('unannotated', 'units'):
            # Slightly abusive Context-only use of fuse_edus: pass the same
            # document as both arguments so the EDUs behave like contexts.
            # Covers two situations:
            # * 'unannotated': feature extraction for live mode dialogue act
            #   extraction, where by definition there is no units stage
            # * 'units': feature extraction for live mode discourse parsing
            #   (no discourse stage yet, but a units stage may have been
            #   inferred earlier in the pipeline) or for dialogue act
            #   annotation from corpus
            corpus[key] = fuse_edus(corpus[key], corpus[key], postags[key])
        elif stage == 'discourse':
            ukey = twin_key(key, 'units')
            corpus[key] = fuse_edus(corpus[key], corpus[ukey], postags[key])
            stale_unit_keys.append(ukey)
    # deletions deferred so we never resize the dict mid-iteration
    for ukey in stale_unit_keys:
        del corpus[ukey]
def _apppend_subdoc_entry(settings, hlist, key):
    """
    Append a bullet point for a given subdocument, pointing to any
    reports and helper elements we may have generated
    """
    report = settings.report

    # local defs instead of named lambdas (PEP 8 E731); same call contract
    def mk_report_path(k, odir):
        return report.mk_output_path(odir, k, '.report.html')

    def mk_svg_path(k, odir):
        return report.mk_output_path(odir, k, '.svg')

    k_review = twin_key(key, 'review')
    k_units = twin_key(key, 'units')
    k_discourse = twin_key(key, 'discourse')
    h_sub_li = h.elem(hlist, 'li', text=' (' + key.subdoc + ')')

    # one entry per stage-specific artefact hanging off this bullet
    add_element(settings, k_units, h_sub_li,
                issues_descr(report, k_units), mk_report_path)
    add_element(settings, k_discourse, h_sub_li,
                issues_descr(report, k_discourse), mk_report_path)
    add_element(settings, k_discourse, h_sub_li, 'graph', mk_svg_path)
    add_element(settings, k_review, h_sub_li, 'parses', parsed_file_name)
def __init_read_corpus(self, is_interesting, corpus_dir):
    """
    Read the corpus specified in our args.

    Side effects: sets ``self.anno_files`` (interesting files plus their
    'unannotated' twins), ``self.corpus`` (slurped annotations) and
    ``self.contexts`` (per-document EDU contexts).
    """
    reader = stac.Reader(corpus_dir)
    all_files = reader.files()
    self.anno_files = reader.filter(all_files, is_interesting)
    # BUGFIX: snapshot the keys with list() before the loop below inserts
    # into self.anno_files — on Python 3, .keys() is a live view and
    # mutating the dict while iterating it raises RuntimeError
    interesting = list(self.anno_files)
    for key in interesting:
        # also pull in the unannotated twin of each interesting doc
        ukey = twin_key(key, 'unannotated')
        if ukey in all_files:
            self.anno_files[ukey] = all_files[ukey]
    self.corpus = reader.slurp(self.anno_files, verbose=True)
    self.contexts = {k: Context.for_edus(self.corpus[k])
                     for k in self.corpus}
def __init_read_corpus(self, is_interesting, corpus_dir):
    """
    Read the corpus specified in our args
    """
    reader = stac.Reader(corpus_dir)
    all_files = reader.files()
    self.anno_files = reader.filter(all_files, is_interesting)
    # iterate over a snapshot of the keys: the body may add 'unannotated'
    # twins to self.anno_files, and we must not resize a dict while
    # iterating its live key view
    for key in list(self.anno_files):
        ukey = twin_key(key, 'unannotated')
        if ukey in all_files:
            self.anno_files[ukey] = all_files[ukey]
    self.corpus = reader.slurp(self.anno_files, verbose=True)
    self.contexts = {k: Context.for_edus(self.corpus[k])
                     for k in self.corpus}
def cross_check_against(inputs, key1, stage='unannotated'):
    """
    Compare annotations with their equivalents on a twin document
    in the corpus.

    :param inputs: corpus inputs to check against
    :param key1: key of the document under scrutiny
    :param stage: stage of the twin document to compare with
        (default 'unannotated')
    :return: a pair of (sorted missing/excess items, id mismatches);
        if the twin document is missing entirely, a pair of empty
        containers is returned after a warning on stderr
    """
    key2 = twin_key(key1, stage)
    try:
        missing = cross_check_units(inputs, key2, key1,
                                    MissingItem.missing_status)
        excess = cross_check_units(inputs, key1, key2,
                                   MissingItem.excess_status)
        mismatches = check_unit_ids(inputs, key1, key2)
        missing_excess = []
        for vals in missing.values():
            missing_excess.extend(vals)
        for vals in excess.values():
            missing_excess.extend(vals)
        return sorted_first_widest(missing_excess), mismatches
    except MissingDocumentException as oops:
        # BUGFIX: sys.stderr was passed as a positional argument (and so
        # printed as a value) instead of as the output stream — a botched
        # "print >> sys.stderr" Python 2 -> 3 conversion
        print("ARGH! Can't cross-check ", oops.k, file=sys.stderr)
        # NOTE(review): the success path returns a list first, not a dict;
        # callers presumably only test truthiness, so the ({}, {}) fallback
        # is kept for backward compatibility — TODO confirm
        return ({}, {})