def load_anno(self, annofilename, annotype='unannotated', basename=''):
    """
    Read a Glozz annotation file and register what it contains.

    annofilename : path of the XML annotation file
    annotype     : stored as-is in self.annotype
    basename     : stored as-is in self.basename

    Every 'unit', 'relation' and 'schema' element of the file becomes a
    Record appended to self.units, self.relations or self.schemas.
    Elements whose id is already found in self.ids are skipped.
    self.units is re-sorted by end position, and self.delta is
    incremented once the whole file has been processed.
    """

    def nid(raw_id):
        """
        Unique id of an annotation element.

        Some ids are re-used from one file to another. The scheme that
        suffixed ids with self.delta is disabled, so the id is handed
        back untouched.
        """
        return raw_id

    def common(src, tgt):
        """
        Fill in the fields shared by all record kinds.

        Returns True when the id of the record is already in self.ids.
        """
        tgt._base = self
        tgt.id = nid(src.id)
        tgt.oid = src.id
        charac = src.characterisation
        tgt.type = charac.type._val()
        tgt.features = {
            feat.name: feat._val()
            for feat in charac.featureSet.all('feature')
        }
        tgt.inRelation = []
        tgt.ordRelation = {}
        tgt.inSchema = []
        return tgt.id in self.ids

    self.annoname = annofilename
    self.annotype = annotype
    self.basename = basename

    root = nxml.load(annofilename)

    # Units: text spans delimited by a start and an end index
    for unit_elt in root.all('unit'):
        unit = Record('unit')
        if common(unit_elt, unit):
            continue
        unit.startPos = int(unit_elt.positioning.start.singlePosition.index)
        unit.endPos = int(unit_elt.positioning.end.singlePosition.index)
        self.units.append(unit)

    # Keep the units ordered by the position where they end
    self.units = sorted(self.units, key=lambda rec: rec.endPos)

    # Relations: ordered list of the ids of their terms
    for rel_elt in root.all('relation'):
        relation = Record('relation')
        if common(rel_elt, relation):
            continue
        relation.nodes = [
            nid(term.id) for term in rel_elt.positioning.all('term')
        ]
        self.relations.append(relation)

    # Schemas: ids of embedded units, then relations, then schemas
    for schema_elt in root.all('schema'):
        schema = Record('schema')
        if common(schema_elt, schema):
            continue
        members = chain(
            schema_elt.positioning.all('embedded-unit'),
            schema_elt.positioning.all('embedded-relation'),
            schema_elt.positioning.all('embedded-schema'),
        )
        schema.nodes = [nid(member.id) for member in members]
        self.schemas.append(schema)

    # Annotation parsing completed
    self.delta += 1
def load_anno(self, annofilename, annotype='unannotated', basename=''):
    """
    Load the Glozz annotations stored in `annofilename`.

    The three arguments are recorded in self.annoname, self.annotype
    and self.basename. Units, relations and schemas read from the file
    are appended (as Record objects) to self.units, self.relations and
    self.schemas, unless their id already appears in self.ids. Units
    end up sorted by end position; self.delta grows by one per call.
    """

    def nid(elt_id):
        """
        Id under which an element is registered.

        Ids may be re-used from one file to another; the variant that
        appended self.delta to the id is switched off, so this is the
        identity for now.
        """
        return elt_id

    def common(src, tgt):
        """
        Copy onto `tgt` the data every kind of record carries and tell
        whether its id is already known (True means: skip it).
        """
        tgt._base = self
        tgt.id = nid(src.id)
        tgt.oid = src.id
        descr = src.characterisation
        tgt.type = descr.type._val()
        features = {}
        for feature in descr.featureSet.all('feature'):
            features[feature.name] = feature._val()
        tgt.features = features
        tgt.inRelation = list()
        tgt.ordRelation = dict()
        tgt.inSchema = list()
        already_known = tgt.id in self.ids
        return already_known

    self.annoname, self.annotype, self.basename = (
        annofilename, annotype, basename)

    document = nxml.load(annofilename)

    for xml_unit in document.all('unit'):
        new_unit = Record('unit')
        if not common(xml_unit, new_unit):
            new_unit.startPos = int(
                xml_unit.positioning.start.singlePosition.index)
            new_unit.endPos = int(
                xml_unit.positioning.end.singlePosition.index)
            self.units.append(new_unit)

    # Make sure units are ordered by end position
    def end_position(rec):
        return rec.endPos

    self.units = sorted(self.units, key=end_position)

    for xml_relation in document.all('relation'):
        new_relation = Record('relation')
        if not common(xml_relation, new_relation):
            terms = xml_relation.positioning.all('term')
            new_relation.nodes = [nid(term.id) for term in terms]
            self.relations.append(new_relation)

    for xml_schema in document.all('schema'):
        new_schema = Record('schema')
        if not common(xml_schema, new_schema):
            # Embedded units first, then relations, then schemas
            embedded = chain(
                xml_schema.positioning.all('embedded-unit'),
                xml_schema.positioning.all('embedded-relation'),
                xml_schema.positioning.all('embedded-schema'))
            new_schema.nodes = [nid(item.id) for item in embedded]
            self.schemas.append(new_schema)

    # Annotation parsing completed
    self.delta += 1
def __init__(self, annofilename):
    """
    Build the token table from an XML file.

    annofilename : filename with XML data

    self.tokens receives one (word, lemma) tuple per token, both values
    passed through self.unescape. self.sen_ids starts at [0] and gets
    the running token count appended after each sentence.
    """
    self.tokens = []
    self.sen_ids = [0]
    root = nxml.load(annofilename)
    for sentence in root.document.sentences.all('sentence'):
        for token in sentence.tokens.all('token'):
            word = self.unescape(token.one('word')._val())
            lemma = self.unescape(token.one('lemma')._val())
            self.tokens.append((word, lemma))
        # Token count so far marks where this sentence ends
        self.sen_ids.append(len(self.tokens))
def __init__(self, annofilename):
    """
    Initialise the instance from the XML data in `annofilename`.

    Fills self.tokens with (word, lemma) tuples, unescaped through
    self.unescape, in document order. self.sen_ids holds 0 followed by
    the total number of tokens read at the end of each sentence.
    """
    fields = ('word', 'lemma')
    self.tokens = list()
    self.sen_ids = [0]
    document = nxml.load(annofilename).document
    for sent in document.sentences.all('sentence'):
        for tok in sent.tokens.all('token'):
            entry = tuple(
                [self.unescape(tok.one(name)._val()) for name in fields])
            self.tokens.append(entry)
        self.sen_ids.append(len(self.tokens))
def load_anno(self, annofilename):
    """
    Load Glozz annotations from `annofilename`.

    Every 'unit', 'relation' and 'schema' element of the file becomes a
    Record appended to self.units, self.relations or self.schemas.
    Ids are suffixed with the current value of self.delta, and elements
    whose suffixed id is already in self.ids are skipped. self.delta is
    incremented once the file has been processed.
    """

    def nid(raw_id):
        """Return `raw_id` suffixed with the current self.delta."""
        return "{0}_{1}".format(raw_id, self.delta)

    def common(src, tgt):
        """
        Fill in the fields shared by all record kinds.

        Returns True when the id of the record is already in self.ids.
        """
        tgt._base = self
        tgt.id = nid(src.id)
        cr = src.characterisation
        tgt.type = cr.type._val()
        tgt.features = {
            f.name: f._val() for f in cr.featureSet.all('feature')
        }
        tgt.inRelation = []
        tgt.ordRelation = {}
        tgt.inSchema = []
        return tgt.id in self.ids

    annoelt = nxml.load(annofilename)

    for ru in annoelt.all('unit'):
        u = Record('unit')
        if common(ru, u):
            continue
        u.startPos = int(ru.positioning.start.singlePosition.index)
        u.endPos = int(ru.positioning.end.singlePosition.index)
        self.units.append(u)

    for rr in annoelt.all('relation'):
        r = Record('relation')
        if common(rr, r):
            continue
        r.nodes = [nid(t.id) for t in rr.positioning.all('term')]
        self.relations.append(r)

    # A schema can embed relations and other schemas as well as units.
    # Collect all three kinds (units first) so no member is dropped.
    embedded_tags = ('embedded-unit', 'embedded-relation', 'embedded-schema')
    for rs in annoelt.all('schema'):
        s = Record('schema')
        if common(rs, s):
            continue
        s.nodes = [
            nid(t.id)
            for tag in embedded_tags
            for t in rs.positioning.all(tag)
        ]
        self.schemas.append(s)

    # Annotation parsing completed
    self.delta += 1