Пример #1
0
def _fuse_corpus(corpus, postags):
    "Merge any dialogue/unit level documents together"
    to_delete = []
    for key in corpus:
        if key.stage == 'unannotated':
            # slightly abusive use of fuse_edus to just get the effect of
            # having EDUs that behave like contexts
            #
            # context: feature extraction for live mode dialogue acts
            # extraction, so by definition we don't have a units stage
            corpus[key] = fuse_edus(corpus[key], corpus[key], postags[key])
        elif key.stage == 'units':
            # similar Context-only abuse of fuse-edus (here, we have a units
            # stage but no dialogue to make use of)
            #
            # context: feature extraction for
            # - live mode discourse parsing (by definition we don't have a
            #   discourse stage yet, but we might have a units stage
            #   inferred earlier in the parsing pipeline)
            # - dialogue act annotation from corpus
            corpus[key] = fuse_edus(corpus[key], corpus[key], postags[key])
        elif key.stage == 'discourse':
            ukey = twin_key(key, 'units')
            corpus[key] = fuse_edus(corpus[key], corpus[ukey], postags[key])
            to_delete.append(ukey)
    for key in to_delete:
        del corpus[key]
Пример #2
0
def _fuse_corpus(corpus, postags):
    "Merge any dialogue/unit level documents together"
    to_delete = []
    for key in corpus:
        if key.stage == 'unannotated':
            # slightly abusive use of fuse_edus to just get the effect of
            # having EDUs that behave like contexts
            #
            # context: feature extraction for live mode dialogue acts
            # extraction, so by definition we don't have a units stage
            corpus[key] = fuse_edus(corpus[key], corpus[key], postags[key])
        elif key.stage == 'units':
            # similar Context-only abuse of fuse-edus (here, we have a units
            # stage but no dialogue to make use of)
            #
            # context: feature extraction for
            # - live mode discourse parsing (by definition we don't have a
            #   discourse stage yet, but we might have a units stage
            #   inferred earlier in the parsing pipeline)
            # - dialogue act annotation from corpus
            corpus[key] = fuse_edus(corpus[key], corpus[key], postags[key])
        elif key.stage == 'discourse':
            ukey = twin_key(key, 'units')
            corpus[key] = fuse_edus(corpus[key], corpus[ukey], postags[key])
            to_delete.append(ukey)
    for key in to_delete:
        del corpus[key]
Пример #3
0
def _apppend_subdoc_entry(settings, hlist, key):
    """
    Append a bullet point for a given subdocument, pointing to
    any reports and helper elements we may have generated.

    A new 'li' element labelled with `key.subdoc` is created under `hlist`,
    then `add_element` is called on it four times: for the units report,
    the discourse report, the discourse graph and the review-stage parses.
    """
    report = settings.report

    def mk_report_path(k, odir):
        "Output path of the HTML report for `k` under `odir`"
        return report.mk_output_path(odir, k, '.report.html')

    def mk_svg_path(k, odir):
        "Output path of the SVG file for `k` under `odir`"
        return report.mk_output_path(odir, k, '.svg')

    # the same subdocument, seen at each of the stages we refer to
    k_review = twin_key(key, 'review')
    k_units = twin_key(key, 'units')
    k_discourse = twin_key(key, 'discourse')

    label = ' (' + key.subdoc + ')'
    h_sub_li = h.elem(hlist, 'li', text=label)

    units_descr = issues_descr(report, k_units)
    add_element(settings, k_units, h_sub_li, units_descr, mk_report_path)

    discourse_descr = issues_descr(report, k_discourse)
    add_element(settings, k_discourse, h_sub_li, discourse_descr,
                mk_report_path)

    add_element(settings, k_discourse, h_sub_li, 'graph', mk_svg_path)
    add_element(settings, k_review, h_sub_li, 'parses', parsed_file_name)
Пример #4
0
 def __init_read_corpus(self, is_interesting, corpus_dir):
     """
     Read the corpus specified in our args

     `corpus_dir` is handed to `stac.Reader`, and `is_interesting` is the
     filter applied to the files the reader finds. For every selected
     file, its 'unannotated' twin is also selected when the reader knows
     about it.

     Sets `self.anno_files` (the selected files), `self.corpus` (the
     result of slurping them) and `self.contexts` (one `Context.for_edus`
     result per corpus key).
     """
     reader = stac.Reader(corpus_dir)
     all_files = reader.files()
     self.anno_files = reader.filter(all_files, is_interesting)
     # Take a snapshot of the keys: the loop below inserts new entries
     # into self.anno_files, and iterating over a live keys() view while
     # the dictionary grows raises RuntimeError in Python 3.
     interesting = list(self.anno_files)
     for key in interesting:
         ukey = twin_key(key, 'unannotated')
         if ukey in all_files:
             self.anno_files[ukey] = all_files[ukey]
     self.corpus = reader.slurp(self.anno_files, verbose=True)
     self.contexts = {k: Context.for_edus(self.corpus[k])
                      for k in self.corpus}
Пример #5
0
def _apppend_subdoc_entry(settings, hlist, key):
    """
    Append a bullet point for a given subdocument, pointing to
    any reports and helper elements we may have generated.

    The bullet point is an 'li' element created under `hlist` and labelled
    with `key.subdoc`. It is then passed to `add_element` once for each of:
    the units report, the discourse report, the discourse graph and the
    review-stage parses.
    """
    report = settings.report

    def mk_report_path(k, odir):
        "Where the HTML report for `k` goes within `odir`"
        return report.mk_output_path(odir, k, '.report.html')

    def mk_svg_path(k, odir):
        "Where the SVG file for `k` goes within `odir`"
        return report.mk_output_path(odir, k, '.svg')

    # keys for the same subdocument at the other stages of interest
    k_review = twin_key(key, 'review')
    k_units = twin_key(key, 'units')
    k_discourse = twin_key(key, 'discourse')

    bullet_text = ' (' + key.subdoc + ')'
    h_sub_li = h.elem(hlist, 'li', text=bullet_text)

    # reports on issues found at the units and discourse stages
    descr_units = issues_descr(report, k_units)
    add_element(settings, k_units, h_sub_li, descr_units, mk_report_path)
    descr_discourse = issues_descr(report, k_discourse)
    add_element(settings, k_discourse, h_sub_li, descr_discourse,
                mk_report_path)

    # helper elements
    add_element(settings, k_discourse, h_sub_li, 'graph', mk_svg_path)
    add_element(settings, k_review, h_sub_li, 'parses', parsed_file_name)
Пример #6
0
 def __init_read_corpus(self, is_interesting, corpus_dir):
     """
     Read the corpus specified in our args.

     Files under `corpus_dir` that pass `is_interesting` are selected,
     along with the 'unannotated' twin of each one when the reader lists
     it. The selection is stored in `self.anno_files`, the slurped
     documents in `self.corpus`, and one `Context.for_edus` result per
     document in `self.contexts`.
     """
     reader = stac.Reader(corpus_dir)
     all_files = reader.files()
     self.anno_files = reader.filter(all_files, is_interesting)

     # iterate over a frozen copy of the keys, since we add entries to
     # self.anno_files as we go
     for selected in tuple(self.anno_files):
         unannotated = twin_key(selected, 'unannotated')
         if unannotated not in all_files:
             continue
         self.anno_files[unannotated] = all_files[unannotated]

     self.corpus = reader.slurp(self.anno_files, verbose=True)

     contexts = {}
     for doc_key in self.corpus:
         contexts[doc_key] = Context.for_edus(self.corpus[doc_key])
     self.contexts = contexts
Пример #7
0
def cross_check_against(inputs, key1, stage='unannotated'):
    """
    Compare annotations with their equivalents on a twin document
    in the corpus

    The twin of `key1` is `twin_key(key1, stage)`. Units are cross-checked
    in both directions (results tagged with `MissingItem.missing_status`
    and `MissingItem.excess_status` respectively), and unit ids are
    compared with `check_unit_ids`.

    Returns a pair: the missing and excess items pooled together and
    passed through `sorted_first_widest`, and the result of
    `check_unit_ids`. If a `MissingDocumentException` is raised along the
    way, a message is written to standard error and `({}, {})` is
    returned instead.
    """
    key2 = twin_key(key1, stage)
    try:
        missing = cross_check_units(inputs, key2, key1,
                                    MissingItem.missing_status)
        excess = cross_check_units(inputs, key1, key2,
                                   MissingItem.excess_status)
        mismatches = check_unit_ids(inputs, key1, key2)
        # pool the per-key item lists, missing items first
        missing_excess = []
        for found in (missing, excess):
            for vals in found.values():
                missing_excess.extend(vals)

        return sorted_first_widest(missing_excess), mismatches
    except MissingDocumentException as oops:
        # `file=` is required here: passed positionally, sys.stderr would
        # just have its repr printed on standard output
        print("ARGH! Can't cross-check ", oops.k, file=sys.stderr)
        return ({}, {})
Пример #8
0
def cross_check_against(inputs, key1, stage='unannotated'):
    """
    Compare annotations with their equivalents on a twin document
    in the corpus

    `key1` is compared against `twin_key(key1, stage)`: units are
    cross-checked both ways, once with `MissingItem.missing_status` and
    once with `MissingItem.excess_status`, and unit ids are checked with
    `check_unit_ids`.

    Returns a pair whose first member is `sorted_first_widest` applied to
    all missing and excess items, and whose second member is the result of
    `check_unit_ids`. When a `MissingDocumentException` is raised, a
    message goes to standard error and the pair `({}, {})` is returned.
    """
    key2 = twin_key(key1, stage)
    try:
        missing = cross_check_units(inputs, key2, key1,
                                    MissingItem.missing_status)
        excess = cross_check_units(inputs, key1, key2,
                                   MissingItem.excess_status)
        mismatches = check_unit_ids(inputs, key1, key2)
        # flatten both result dictionaries into one list, missing first
        missing_excess = []
        for vals in missing.values():
            missing_excess.extend(vals)
        for vals in excess.values():
            missing_excess.extend(vals)

        return sorted_first_widest(missing_excess), mismatches
    except MissingDocumentException as oops:
        # sys.stderr must be given as the `file` keyword argument; as a
        # positional argument it is printed as a value on standard output
        print("ARGH! Can't cross-check ", oops.k, file=sys.stderr)
        return ({}, {})