def make_igt_raw_tier(block, options):
    """Build the raw ('r') odin tier from a block's lines.

    Each entry of ``block['lines']`` is unpacked as an
    ``(attributes, text)`` pair and becomes one Item with id ``r1``,
    ``r2``, ... in order. A block without a ``'lines'`` key yields a
    tier with no items.

    ``options`` is accepted but not used by this function.
    """
    raw_items = []
    for number, (attrs, text) in enumerate(block.get('lines', []), start=1):
        raw_items.append(
            Item(id='r{}'.format(number), attributes=attrs, text=text)
        )
    return Tier(
        id='r',
        type='odin',
        attributes={'state': 'raw'},
        items=raw_items,
    )
def make_igt_raw_tier(block, options):
    """Build the raw ('r') odin tier from a block's line dictionaries.

    Each entry of ``block['lines']`` is a dict. Its ``'content'`` value
    (empty string if absent) is passed through
    ``replace_invalid_xml_chars`` with ``options['replacement_char']``
    and becomes the item text; every other key/value of the entry is
    kept as the item's attributes. Items are numbered ``r1``, ``r2``, ...

    A block without a ``'lines'`` key yields a tier with no items.
    """
    items = []
    for j, linedata in enumerate(block.get('lines', []), start=1):
        text = replace_invalid_xml_chars(
            linedata.get('content', ''), options['replacement_char']
        )
        # Copy so the caller's dict is left untouched.
        attrs = linedata.copy()
        # The text lookup above tolerates a missing 'content' key, so the
        # removal must as well; a plain ``del`` would raise KeyError here.
        attrs.pop('content', None)
        items.append(Item(id='r{}'.format(j), attributes=attrs, text=text))
    return Tier(id='r', type='odin', attributes={'state': 'raw'}, items=items)
def make_phrase_tier(tier_id, aln_tokens):
    """Build a 'phrases' tier holding a single item.

    The item's id is ``tier_id`` followed by ``1``; its text is every
    token found in the second element of each entry of ``aln_tokens``,
    joined with single spaces in order.
    """
    words = []
    for aligned in aln_tokens:
        # Only the second element (the token sequence) contributes text.
        words.extend(aligned[1])
    phrase = Item(id='{}1'.format(tier_id), text=' '.join(words))
    return Tier(id=tier_id, type='phrases', items=[phrase])
def add_normalized_tier(igt, options):
    """Append a normalized ('n') odin tier to ``igt``.

    The source tier is ``igt.get('c')``, with ``igt['r']`` as the
    default. Its items are passed to ``normalize_items`` and the new
    tier is aligned to the source tier's id.

    ``options`` is accepted but not used by this function.
    """
    # NOTE: igt['r'] is evaluated before the call, so the 'r' lookup
    # happens even when a 'c' tier is present.
    source_tier = igt.get('c', default=igt['r'])
    normalized = normalize_items(source_tier.items)
    igt.append(
        Tier(
            id='n',
            type='odin',
            alignment=source_tier.id,
            attributes={'state': 'normalized'},
            items=normalized,
        )
    )
def add_cleaned_tier(igt, options):
    """Append a cleaned ('c') odin tier to ``igt``.

    The items of ``igt['r']`` are passed to ``clean_items`` and the new
    tier is aligned to that tier's id.

    ``options`` is accepted but not used by this function.
    """
    source_tier = igt['r']
    cleaned = clean_items(source_tier.items)
    igt.append(
        Tier(
            id='c',
            type='odin',
            alignment=source_tier.id,
            attributes={'state': 'cleaned'},
            items=cleaned,
        )
    )
def default_decode_tier(elem):
    """Decode a ``tier`` element into a Tier and clear the element.

    The element's tag is split into namespace and local name; the local
    name must be ``'tier'``. The ``id`` and ``type`` attributes, the
    remaining attributes, and all ``metadata`` and ``item`` children are
    decoded into the returned Tier.

    ``elem`` is cleared before returning, so it must not be relied on
    afterwards.
    """
    namespace, local_name = _qname_split(elem.tag)
    assert local_name == 'tier'

    # Read everything needed from the element first; it is cleared below.
    tier_id = elem.get('id')
    tier_type = elem.get('type')
    other_attrs = get_attributes(elem, ignore=('id', 'type'))
    metadata = [decode_metadata(md) for md in elem.findall('metadata')]
    items = [decode_item(child) for child in elem.findall('item')]
    nsmap = elem.attrib.nsmap

    decoded = Tier(
        id=tier_id,
        type=tier_type,
        attributes=other_attrs,
        metadata=metadata,
        items=items,
        namespace=namespace,
        nsmap=nsmap,
    )
    # Presumably releases the element's content once decoded -- the
    # reason is not visible here.
    elem.clear()
    return decoded
def add_normalized_tier(igt, base_tier):
    """Append a normalized odin tier, derived from ``base_tier``, to ``igt``.

    The tier id is the first of ``'n'``, ``'on'``, ``'normalized'``,
    ``'odin-normalized'`` for which ``igt.get(id)`` returns None. If all
    of them are taken, a warning is logged and ``igt`` is left
    unchanged. Otherwise the items come from
    ``normalize_items(base_tier, <chosen id>)`` and the new tier is
    aligned to ``base_tier.id``.
    """
    candidates = ('n', 'on', 'normalized', 'odin-normalized')
    # Pick the first preset ID that is not already used in this IGT.
    norm_id = next(
        (cand for cand in candidates if igt.get(cand) is None),
        None,
    )
    if norm_id is None:
        logging.warning(
            'No preset ID for normalized tier was available for IGT with id: {}'
            .format(str(igt.id))
        )
        return

    normalized = normalize_items(base_tier, norm_id)
    igt.append(
        Tier(
            id=norm_id,
            type='odin',
            alignment=base_tier.id,
            attributes={'state': 'normalized'},
            items=normalized,
        )
    )
def add_cleaned_tier(igt, raw_tier):
    """Append a cleaned odin tier, derived from ``raw_tier``, to ``igt``.

    The tier id is the first of ``'c'``, ``'oc'``, ``'cleaned'``,
    ``'odin-cleaned'`` for which ``igt.get(id)`` returns None. If all of
    them are taken, a warning is logged and ``igt`` is left unchanged.
    Otherwise the items come from ``clean_items(raw_tier, <chosen id>)``
    and the new tier is aligned to ``raw_tier.id``.
    """
    candidates = ('c', 'oc', 'cleaned', 'odin-cleaned')
    # Pick the first preset ID that is not already used in this IGT.
    clean_id = next(
        (cand for cand in candidates if igt.get(cand) is None),
        None,
    )
    if clean_id is None:
        logging.warning(
            'No preset ID for cleaned tier was available for IGT with id: {}'
            .format(str(igt.id))
        )
        return

    cleaned = clean_items(raw_tier, clean_id)
    igt.append(
        Tier(
            id=clean_id,
            type='odin',
            alignment=raw_tier.id,
            attributes={'state': 'cleaned'},
            items=cleaned,
        )
    )
def make_tier(tier_type, tier_id, aligned_tokens, algn_tier):
    """Build a tier of token items, optionally aligned to another tier.

    ``aligned_tokens`` is a sequence of ``(target_token, source_tokens)``
    pairs. Every source token becomes one Item whose id is ``tier_id``
    followed by a running number starting at 1.

    * If ``aligned_tokens == [(None, None)]`` the tier has no items.
    * If ``algn_tier`` is given, the tier gets an ``alignment``
      attribute with ``algn_tier.id``; pairs are matched positionally
      with ``algn_tier.items`` and each new item is aligned to the
      matching item's id.
    * Otherwise the items carry no alignment.
    """
    attrs = OrderedDict()
    items = []

    def _append(text, **extra):
        # IDs start at 1 and follow the order in which items are added.
        items.append(
            Item(id='{}{}'.format(tier_id, len(items) + 1), text=text, **extra)
        )

    if aligned_tokens == [(None, None)]:
        # Nothing to do: return the tier without items.
        return Tier(id=tier_id, type=tier_type, items=items, attributes=attrs)

    if algn_tier is not None:
        attrs['alignment'] = algn_tier.id
        # zip_longest pads the shorter side with None; a length mismatch
        # therefore surfaces as an error in the unpacking/lookup below.
        pairs = zip_longest(algn_tier.items, aligned_tokens)
        for tgt_item, (tgt_tok, src_toks) in pairs:
            assert tgt_tok == tgt_item.text  # FIXME is this necessary?
            for src_tok in src_toks:
                _append(src_tok, attributes={'alignment': tgt_item.id})
    else:
        for _, src_toks in aligned_tokens:
            for src_tok in src_toks:
                _append(src_tok)

    return Tier(id=tier_id, type=tier_type, items=items, attributes=attrs)
def make_tier(tier_type, tier_id, refattr, aln_tokens, algn_tier):
    """Build a tier of token items, optionally referencing another tier.

    ``aln_tokens`` is a sequence of ``(target_token, source_tokens)``
    pairs. Every source token becomes one Item whose id is ``tier_id``
    followed by a running number starting at 1.

    * If ``aln_tokens == [(None, None)]`` the tier has no items.
    * If both ``refattr`` and ``algn_tier`` are given, the tier gets the
      attribute ``refattr`` set to ``algn_tier.id``; pairs are matched
      positionally with ``algn_tier.items`` and each new item gets
      ``refattr`` set to the matching item's id.
    * Otherwise the items carry no reference attribute.
    """
    attrs = OrderedDict()
    items = []

    def _append(text, **extra):
        # IDs start at 1 and follow the order in which items are added.
        items.append(
            Item(id='{}{}'.format(tier_id, len(items) + 1), text=text, **extra)
        )

    if aln_tokens == [(None, None)]:
        # Nothing to do: return the tier without items.
        return Tier(id=tier_id, type=tier_type, items=items, attributes=attrs)

    if refattr is not None and algn_tier is not None:
        attrs[refattr] = algn_tier.id
        # zip_longest pads the shorter side with None; a length mismatch
        # therefore surfaces as an error in the unpacking/lookup below.
        pairs = zip_longest(algn_tier.items, aln_tokens)
        for tgt_item, (_tgt_tok, src_toks) in pairs:
            for src_tok in src_toks:
                _append(src_tok, attributes={refattr: tgt_item.id})
    else:
        for _, src_toks in aln_tokens:
            for src_tok in src_toks:
                _append(src_tok)

    return Tier(id=tier_id, type=tier_type, items=items, attributes=attrs)