예제 #1
0
    def mk_unit(partial, counter):
        """Turn a PartialUnit plus a counter value into a complete Unit.

        The span, type and features are taken from ``partial``; the unit id
        and the creation date are both derived from ``counter`` (the id is
        additionally prefixed with the enclosing scope's ``author``).
        """
        # Glozz appears to identify items by their (author, creation date)
        # pair and to disregard the unit ID entirely (the ID is assumed to
        # be author_date), so the creation date is what has to be distinct.
        metadata = {}
        metadata['author'] = author
        metadata['creation-date'] = mk_creation_date(counter)
        metadata['lastModifier'] = 'n/a'
        metadata['lastModificationDate'] = '0'

        unit_id = '_'.join((author, str(counter)))
        return Unit(unit_id,
                    partial.span,
                    partial.type,
                    partial.features,
                    metadata)
예제 #2
0
def read_node(node, context=None):
    """Convert one Glozz XML element into the matching Python value.

    The conversion is chosen from ``node.tag`` and, for ``metadata`` and
    ``positioning`` elements, from ``context`` as well.

    Parameters:
        node: an ElementTree-style element (needs ``tag``, ``attrib``,
            ``text``, ``findall`` and iteration over children).
        context: name of the enclosing element kind (``'annotations'``,
            ``'unit'``, ``'relation'`` or ``'schema'``) for tags whose
            meaning depends on where they occur; ``None`` otherwise.

    Returns, by tag:
        annotations       -> ``(hashcode or None, units, relations, schemas)``
        characterisation  -> ``(type, feature dict)``
        feature           -> ``(name, stripped text or None)``
        featureSet        -> dict built from its ``feature`` children
        metadata          -> the ``corpusHashcode`` attribute when
                             ``context == 'annotations'``, else a dict of
                             child tag -> stripped text
        positioning       -> ``Span`` (unit), ``RelSpan`` (relation) or a
                             triple of frozensets of ids (schema)
        relation / schema / unit -> ``Relation`` / ``Schema`` / ``Unit``
        singlePosition    -> the ``index`` attribute as an int
        start / end       -> value of the nested ``singlePosition``
        term / embedded-* -> the ``id`` attribute
        type              -> stripped text
    Any other tag (or a ``positioning`` element with an unexpected context)
    falls through and yields ``None``.

    Raises:
        GlozzException: if a relation's positioning does not hold exactly
            two ``term`` children.
    """
    def get_one(name, default, ctx=None):
        # Delegates to on_single_element; presumably that reads the single
        # child called `name` and falls back to `default` when it is
        # absent -- confirm against its definition.
        def convert(child):
            return read_node(child, ctx)
        return on_single_element(node, default, convert, name)

    def get_all(name):
        # Children are read without any context.
        return [read_node(child) for child in node.findall(name)]

    tag = node.tag

    if tag == 'annotations':
        hashcode = get_one('metadata', '', 'annotations')
        # Compare by value: identity against a string literal is
        # implementation-dependent (and a SyntaxWarning on Python 3.8+).
        if hashcode == '':
            hashcode = None
        units = get_all('unit')
        rels = get_all('relation')
        schemas = get_all('schema')
        return (hashcode, units, rels, schemas)

    elif tag == 'characterisation':
        fs = get_one('featureSet', {})
        unit_type = get_one('type', None)
        return (unit_type, fs)

    elif tag == 'feature':
        attr = node.attrib['name']
        val = node.text.strip() if node.text else None
        return (attr, val)

    # TODO throw exception if we see more than one instance of a key
    elif tag == 'featureSet':
        return dict(get_all('feature'))

    elif tag == 'metadata' and context == 'annotations':
        return node.attrib['corpusHashcode']

    elif tag == 'metadata':
        # An empty child element has text None; record it as '' instead of
        # failing on None.strip().
        return {child.tag: (child.text or '').strip() for child in node}

    elif tag == 'positioning' and context == 'unit':
        start = get_one('start', None)
        end = get_one('end', None)
        return Span(start, end)

    elif tag == 'positioning' and context == 'relation':
        terms = get_all('term')
        if len(terms) != 2:
            raise GlozzException("Was expecting exactly 2 terms, but got %d" %
                                 len(terms))
        return RelSpan(terms[0], terms[1])

    elif tag == 'positioning' and context == 'schema':
        units = frozenset(get_all('embedded-unit'))
        relations = frozenset(get_all('embedded-relation'))
        schemas = frozenset(get_all('embedded-schema'))
        return units, relations, schemas

    elif tag == 'relation':
        rel_id = node.attrib['id']
        # NOTE(review): if get_one hands back the None default here, the
        # tuple unpacking fails -- a characterisation child is required.
        (unit_type, fs) = get_one('characterisation', None)
        span = get_one('positioning', None, 'relation')
        metadata = get_one('metadata', {})
        return Relation(rel_id, span, unit_type, fs, metadata=metadata)

    elif tag == 'schema':
        anno_id = node.attrib['id']
        (anno_type, fs) = get_one('characterisation', None)
        units, rels, schemas = get_one('positioning', None, 'schema')
        metadata = get_one('metadata', {})
        return Schema(anno_id,
                      units,
                      rels,
                      schemas,
                      anno_type,
                      fs,
                      metadata=metadata)

    elif tag == 'singlePosition':
        return int(node.attrib['index'])

    elif tag == 'start' or tag == 'end':
        return get_one('singlePosition', None)

    elif tag in [
            'term', 'embedded-unit', 'embedded-relation', 'embedded-schema'
    ]:
        return node.attrib['id']

    elif tag == 'type':
        return node.text.strip()

    elif tag == 'unit':
        unit_id = node.attrib['id']
        (unit_type, fs) = get_one('characterisation', None)
        span = get_one('positioning', None, 'unit')
        metadata = get_one('metadata', {})
        return Unit(unit_id, span, unit_type, fs, metadata=metadata)
예제 #3
0
 def __init__(self, id, start, end):
     """Set up a Unit covering ``Span(start, end)`` under the given id.

     The two remaining positional arguments passed to ``Unit.__init__``
     are an empty string and an empty dict.
     """
     span = Span(start, end)
     Unit.__init__(self, id, span, '', {})
예제 #4
0
    def _mk_doc(self):
        """Build an educe.annotation.Document out of this graph.

        Every name in ``self.speakers`` that is not a CDU becomes a
        one-character 'Segment' unit plus a matching 'Turn' unit; every
        entry of ``self.cdus`` becomes a 'Complex_discourse_unit' schema;
        every edge in ``self.down`` becomes a relation.  The created
        annotations are also recorded in ``self.anno_map``, keyed by name
        (or by ``(source, target)`` pair for relations).

        Raises ValueError for an edge tag other than 'S' or 'C'.
        """
        def offset_of(name):
            # distance of the name's character from 'a'
            return ord(name) - ord('a')

        def to_glozz_id(name):
            return 'du_' + str(offset_of(name))

        def is_cdu(name):
            return name in self.cdus

        units = []
        schemas = []
        relations = []

        # Elementary units: anything with speakers that is not a CDU.
        for name, speakers in self.speakers.items():
            if is_cdu(name):
                continue

            begin = offset_of(name)
            edu_id = to_glozz_id(name)
            edu = Unit(edu_id, Span(begin, begin + 1), 'Segment', {})
            emitter = list(speakers)[0]
            turn_features = {'Identifier': begin, 'Emitter': emitter}
            turn = Unit('t' + edu_id, Span(begin, begin + 1), 'Turn',
                        turn_features)

            units.append(edu)
            units.append(turn)
            self.anno_map[name] = edu

        # Complex units: members are split into EDU ids and nested CDU ids.
        for name, members in self.cdus.items():
            edu_members = {to_glozz_id(m) for m in members if not is_cdu(m)}
            no_relations = set()
            cdu_members = {to_glozz_id(m) for m in members if is_cdu(m)}
            cdu = Schema(to_glozz_id(name), edu_members, no_relations,
                         cdu_members, 'Complex_discourse_unit', {})
            schemas.append(cdu)
            self.anno_map[name] = cdu

        # Relations are numbered in creation order, so the number of
        # relations built so far doubles as the next index.
        for source in self.down:
            for target, tag in self.down[source]:
                rel_id = 'rel_' + str(len(relations))
                if tag == 'S':
                    label = 'Q-Elab'
                elif tag == 'C':
                    label = 'Contrast'
                else:
                    raise ValueError('Unknown tag {0}'.format(tag))

                endpoints = RelSpan(to_glozz_id(source), to_glozz_id(target))
                relation = Relation(rel_id, endpoints, label, {})
                relations.append(relation)
                self.anno_map[(source, target)] = relation

        # One dialogue unit stretching from 0 to the furthest unit end.
        last_char = max(u.text_span().char_end for u in units)
        units.append(Unit('dialogue_0', Span(0, last_char), 'Dialogue', {}))

        return Document(units, relations, schemas, string.ascii_lowercase)
예제 #5
0
파일: tests.py 프로젝트: irit-melodi/educe
 def __init__(self, id, start, end):
     """Set up a Unit covering ``Span(start, end)`` under the given id.

     The two remaining positional arguments passed to ``Unit.__init__``
     are an empty string and an empty dict.
     """
     span = Span(start, end)
     Unit.__init__(self, id, span, '', {})