def test_basic(self):
    """Check the members of an Item built with id, type, attributes, content."""
    item = Item(
        id='i1',
        type='basic',
        attributes={'attr':'val'},
        content='content',
    )
    self.assertEqual(item.type, 'basic')
    self.assertEqual(item.id, 'i1')
    # a freshly constructed item has no containing tier, igt, or corpus
    for parent in ('tier', 'igt', 'corpus'):
        self.assertEqual(getattr(item, parent), None)
    self.assertEqual(item.attributes, {'attr':'val'})
    self.assertEqual(item.content, 'content')
    # a sub-span of non-null content is the matching slice of that content
    self.assertEqual(item.span(0,1), 'c')
def test_empty(self):
    """Check that an Item built with no arguments has only empty members."""
    item = Item()
    # empty members
    for member in ('type', 'id', 'tier', 'igt', 'corpus'):
        self.assertEqual(getattr(item, member), None)
    self.assertEqual(item.attributes, dict())
    self.assertEqual(item.content, None)
    # sub-spans of null content is also null content
    self.assertEqual(item.span(0,1), None)
def remove_citations(items):
    """
    Move citation text out of IGT lines and into separate 'M' items.

    Each item whose primary tag is one of L, G, T, L-G, L-T, or L-G-T is
    searched with `citation_re`. When a match is found and judged
    removable, the matched text is stripped from the item (the item is
    modified in place) and a new item with the same id, carrying the
    matched text and an 'M' tag, is inserted directly after it. The
    first of the secondary tags AC, LN, CN found on the item is moved
    to the new item.

    Args:
        items: list of items to process
    Returns:
        a new list with the original items and any new 'M' items
    """

    def removable(m, t, i):
        # citation matches are removable if they don't look like
        # translation alternates or bracketed glosses
        if t in ('L', 'G'):
            start, end = m.span()
            # neighbours ordered nearest-first in each direction; slicing
            # items[:i] avoids the wrap-around of items[i - 1::-1] at i == 0
            above = items[:i][::-1]
            below = items[i + 1:]
            if t == 'L':
                # look down then up for nearest G
                others = below + above
                t2 = 'G'
            else:
                # look up then down for nearest L
                others = above + below
                t2 = 'L'
            other = next(
                (o for o in others if get_tags(o)[0] == t2),
                None
            )
            # if the paired line has material in the same columns, the
            # match is probably aligned content rather than a citation
            if other and (other.text or '')[start:end].strip() != '':
                return False
        elif re.match(r'\s*[{}].*[{}]\s*$'.format(OPENQUOTES, CLOSEQUOTES),
                      m.group('inner1') or m.group('inner2'),
                      re.U):
            return False
        return True

    new_items = []
    for i, item in enumerate(items):
        new_items.append(item)  # add now; text might be modified later
        tags = get_tags(item)
        if tags[0] not in ('L', 'G', 'T', 'L-G', 'L-T', 'L-G-T'):
            continue
        match = citation_re.search(item.text)
        if match and removable(match, tags[0], i):
            # copy the attributes so the two 'tag' assignments below can
            # never write to the same mapping
            meta_item = Item(id=item.id,
                             text=match.group(0).strip(),
                             attributes=dict(item.attributes))
            m_tags = ['M']
            item.text = citation_re.sub('', item.text).rstrip()
            # at most one secondary tag moves over to the meta item
            if 'AC' in tags:
                tags.remove('AC')
                m_tags.append('AC')
            elif 'LN' in tags:
                tags.remove('LN')
                m_tags.append('LN')
            elif 'CN' in tags:
                tags.remove('CN')
                m_tags.append('CN')
            # what about other tags? LN, CN, EX
            item.attributes['tag'] = '+'.join(tags)
            meta_item.attributes['tag'] = '+'.join(m_tags)
            new_items.append(meta_item)
    return new_items
def copy_items(items):
    """Return a list of new Items built from each item's id, type, attributes, and text."""
    copies = []
    for source in items:
        duplicate = Item(
            id=source.id,
            type=source.type,
            attributes=source.attributes,
            text=source.text,
        )
        copies.append(duplicate)
    return copies
def copy_items(items):
    """
    Return a list of new Items mirroring *items*.

    Each copy is built from the source item's id, type, reference
    attributes (alignment, content, segmentation), attributes, and text.
    """
    copies = []
    for source in items:
        duplicate = Item(
            id=source.id,
            type=source.type,
            alignment=source.alignment,
            content=source.content,
            segmentation=source.segmentation,
            attributes=source.attributes,
            text=source.text,
        )
        copies.append(duplicate)
    return copies
def make_igt_raw_tier(block, options):
    """
    Build the raw 'odin' tier for an IGT block.

    Each entry of block['lines'] is unpacked as an (attributes, text)
    pair and becomes an item with id r1, r2, ... in order. *options* is
    accepted but not used here.
    """
    raw_items = []
    for number, (attrs, text) in enumerate(block.get('lines', []), 1):
        raw_items.append(
            Item(id='r{}'.format(number), attributes=attrs, text=text)
        )
    return Tier(
        id='r',
        type='odin',
        attributes={'state': 'raw'},
        items=raw_items,
    )
def make_igt_raw_tier(block, options):
    """
    Build the raw 'odin' tier for an IGT block.

    Each entry of block['lines'] is a mapping. Its 'content' value
    (empty string if absent) is passed through
    `replace_invalid_xml_chars` with options['replacement_char'] and
    becomes the item text; every other key becomes an item attribute.
    Items get the ids r1, r2, ... in order.

    Args:
        block: mapping that may contain a 'lines' list
        options: mapping that must contain 'replacement_char'
    Returns:
        a Tier with id 'r', type 'odin', and attribute state='raw'
    """
    items = []
    for j, linedata in enumerate(block.get('lines', [])):
        text = replace_invalid_xml_chars(linedata.get('content', ''),
                                         options['replacement_char'])
        attrs = linedata.copy()
        # 'content' is treated as optional above, so don't fail here
        # when it is missing
        attrs.pop('content', None)
        items.append(Item(id='r{}'.format(j + 1),
                          attributes=attrs,
                          text=text))
    tier = Tier(id='r',
                type='odin',
                attributes={'state': 'raw'},
                items=items)
    return tier
def default_decode_item(elem):
    """
    Build an Item from an element whose local tag name is 'item'.

    The element's id and type become the Item's id and type, all other
    attributes are gathered with `get_attributes`, and the element is
    cleared once the Item has been constructed.
    """
    namespace, local_name = _qname_split(elem.tag)
    assert local_name == 'item'
    item_id = elem.get('id')
    item_type = elem.get('type')
    other_attrs = get_attributes(elem, ignore=('id', 'type'))
    decoded = Item(
        id=item_id,
        type=item_type,
        attributes=other_attrs,
        text=elem.text,
        namespace=namespace,
        nsmap=elem.attrib.nsmap,
    )
    # everything needed has been read; release the element's contents
    elem.clear()
    return decoded
def make_phrase_tier(tier_id, aln_tokens):
    """
    Build a 'phrases' tier holding a single item.

    The item's text is every token found in the second element of each
    entry of *aln_tokens*, joined by single spaces; its id is *tier_id*
    followed by '1'.
    """
    words = []
    for aln in aln_tokens:
        words.extend(aln[1])
    phrase = Item(id='{}1'.format(tier_id), text=' '.join(words))
    return Tier(id=tier_id, type='phrases', items=[phrase])
def setUp(self):
    """Create the items, tiers, igt, and corpus shared by the tests."""
    # item with nothing set
    self.i1 = Item()

    # item with id, type, one attribute, and text
    self.i2 = Item(id='i2', type='basic',
                   attributes={'attr':'val'}, text='text')

    # item with both an alignment and a content reference
    self.i_ac = Item(id='i_ac', alignment='i2', content='i2[0:2]')

    # item with a segmentation reference
    self.i_s = Item(id='i_s', segmentation='i2[2:4]')

    # item with a content reference that is overridden by its own text
    self.i_t = Item(id='i_t', content='i2', text='something else')

    # containing structure: three tiers in one igt in one corpus
    self.t_a = Tier(id='t_a', items=[self.i2])
    self.t_b = Tier(
        id='t_b',
        items=[self.i_ac, self.i_t],
        alignment='t_a',
        content='t_a',
    )
    self.t_c = Tier(id='t_c', items=[self.i_s], segmentation='t_a')
    self.igt = Igt(tiers=[self.t_a, self.t_b, self.t_c])
    self.xc = XigtCorpus(igts=[self.igt])
def make_tier(tier_type, tier_id, aligned_tokens, algn_tier):
    """
    Build a tier of one item per source token in *aligned_tokens*.

    *aligned_tokens* is a sequence of (target_token, source_tokens)
    pairs; the value [(None, None)] yields a tier with no items. When
    *algn_tier* is given, its items are paired positionally with
    *aligned_tokens*, the tier gets an 'alignment' attribute naming
    *algn_tier*, and each new item gets an 'alignment' attribute naming
    its paired item. Item ids are *tier_id* followed by 1, 2, ...
    """
    tier_attrs = OrderedDict()
    built = []

    def next_id():
        # ids are numbered from 1 in creation order
        return '{}{}'.format(tier_id, len(built) + 1)

    if aligned_tokens != [(None, None)]:
        if algn_tier is not None:
            tier_attrs['alignment'] = algn_tier.id
            pairs = zip_longest(algn_tier.items, aligned_tokens)
            for target_item, token_data in pairs:
                target_token, source_tokens = token_data
                # FIXME is this necessary?
                assert target_token == target_item.text
                for source in source_tokens:
                    built.append(
                        Item(id=next_id(),
                             text=source,
                             attributes={'alignment': target_item.id})
                    )
        else:
            for _, source_tokens in aligned_tokens:
                for source in source_tokens:
                    built.append(Item(id=next_id(), text=source))

    return Tier(id=tier_id, type=tier_type,
                items=built, attributes=tier_attrs)
def make_tier(tier_type, tier_id, refattr, aln_tokens, algn_tier):
    """
    Build a tier of one item per source token in *aln_tokens*.

    *aln_tokens* is a sequence of (target_token, source_tokens) pairs;
    the value [(None, None)] yields a tier with no items. When both
    *refattr* and *algn_tier* are given, the items of *algn_tier* are
    paired positionally with *aln_tokens*, the tier gets a *refattr*
    attribute naming *algn_tier*, and each new item gets a *refattr*
    attribute naming its paired item. Item ids are *tier_id* followed
    by 1, 2, ...
    """
    tier_attrs = OrderedDict()
    built = []

    def next_id():
        # ids are numbered from 1 in creation order
        return '{}{}'.format(tier_id, len(built) + 1)

    if aln_tokens != [(None, None)]:
        if refattr is not None and algn_tier is not None:
            tier_attrs[refattr] = algn_tier.id
            pairs = zip_longest(algn_tier.items, aln_tokens)
            for target_item, token_data in pairs:
                _, source_tokens = token_data
                for source in source_tokens:
                    built.append(
                        Item(id=next_id(),
                             text=source,
                             attributes={refattr:target_item.id})
                    )
        else:
            for _, source_tokens in aln_tokens:
                for source in source_tokens:
                    built.append(Item(id=next_id(), text=source))

    return Tier(id=tier_id, type=tier_type,
                items=built, attributes=tier_attrs)
def test_get_attribute(self):
    """Check get_attribute() defaults, local lookup, and tier inheritance."""
    i = Item(id='i1')
    # missing attribute: None unless a default is supplied
    assert i.get_attribute('attr') is None
    assert i.get_attribute('attr', 1) == 1
    # a set attribute wins over the default
    i.attributes['attr'] = 'val'
    assert i.get_attribute('attr', 1) == 'val'
    # nothing to inherit from while the item is not in a tier
    assert i.get_attribute('abc', inherit=True) is None
    # keep the tier referenced while the inherited lookup is checked
    t = Tier(id='t', items=[i], attributes={'abc': 'def'})
    assert i.get_attribute('abc', inherit=True) == 'def'
    # items prepared in setUp
    assert self.i1.get_attribute('attr') is None
    assert self.i1.get_attribute('attr', 1) == 1
    assert self.i2.get_attribute('attr') == 'val'
    assert self.i2.get_attribute('attr', 1) == 'val'
    assert self.i_ac.get_attribute('alignment') == 'i2'
def test_get_attribute(self):
    """Check get_attribute() defaults, local lookup, and tier inheritance."""
    check = self.assertEqual
    item = Item(id='i1')
    # missing attribute: None unless a default is supplied
    check(item.get_attribute('attr'), None)
    check(item.get_attribute('attr', 1), 1)
    # a set attribute wins over the default
    item.attributes['attr'] = 'val'
    check(item.get_attribute('attr', 1), 'val')
    # nothing to inherit from while the item is not in a tier
    check(item.get_attribute('abc', inherit=True), None)
    # keep the tier referenced while the inherited lookup is checked
    tier = Tier(id='t', items=[item], attributes={'abc': 'def'})
    check(item.get_attribute('abc', inherit=True), 'def')
    # items prepared in setUp
    check(self.i1.get_attribute('attr'), None)
    check(self.i1.get_attribute('attr', 1), 1)
    check(self.i2.get_attribute('attr'), 'val')
    check(self.i2.get_attribute('attr', 1), 'val')
    check(self.i_ac.get_attribute('alignment'), 'i2')
def remove_language_name(items, igt):
    """
    Move leading/trailing language names or codes into 'M+LN' items.

    The language code and name are looked up on *igt* with
    LANG_CODE_PATH and LANG_NAME_PATH. From them a set of candidate
    tokens is built: the code parts (split on ':') in their original,
    upper, and lower case; the name; the upper-cased name; an initials
    abbreviation for multi-word names; and the first three characters
    of the name. Every item whose primary tag is not 'M' has a
    candidate at its start and/or end (optionally wrapped in
    parentheses or brackets) replaced via `whitespace`, and a new item
    with the same id, tagged 'M+LN', is inserted after it for each
    removed span. If the text changed, the 'LN' tag is dropped from the
    item.

    Args:
        items: list of items; matching items are modified in place
        igt: the object queried for the language code and name
    Returns:
        a new list of items, or *items* itself when no candidate tokens
        could be derived
    """
    new_items = []
    lgcode = xp.find(igt, LANG_CODE_PATH)
    lgname = xp.find(igt, LANG_NAME_PATH)
    lgtoks = []
    if lgcode and '?' not in lgcode and '*' not in lgcode:
        codes = set(lgcode.split(':'))  # split up complex codes
        codes.update(map(str.upper, list(codes)))
        codes.update(map(str.lower, list(codes)))
        lgtoks.extend(codes)
    if lgname and '?' not in lgname:
        lgtoks.append(lgname)
        lgtoks.append(lgname.upper())
        if re.search('[- ]', lgname, re.U):
            # abbreviation for multiword names; flags must be passed by
            # keyword (the third positional argument is maxsplit), and
            # empty parts from leading/trailing separators are skipped
            parts = re.split(r'[- ]+', lgname, flags=re.U)
            abbrev = ''.join(part[0] for part in parts if part)
            if abbrev:
                lgtoks.append(abbrev)
        if re.search(r'^\w{3}', lgname, re.U):
            lgtoks.append(lgname[:3])
    if lgtoks:
        sig = '|'.join(re.escape(t) for t in lgtoks)
        start_lg_re = re.compile(r'^\s*[(\[]?({})[)\]]?'.format(sig), re.U)
        end_lg_re = re.compile(r'[(\[]?({})[)\]]?\s*$'.format(sig), re.U)
        for item in items:
            new_items.append(item)  # add now; might be modified later
            tags = get_tags(item)
            if tags[0] != 'M':
                orig = item.text
                m = start_lg_re.match(item.text)
                if m:
                    meta_item = Item(id=item.id,
                                     text=m.group(0).strip(),
                                     attributes=dict(item.attributes))
                    meta_item.attributes['tag'] = 'M+LN'
                    new_items.append(meta_item)
                    item.text = start_lg_re.sub(whitespace, item.text)
                m = end_lg_re.search(item.text)
                if m:
                    meta_item = Item(id=item.id,
                                     text=m.group(0).strip(),
                                     attributes=dict(item.attributes))
                    meta_item.attributes['tag'] = 'M+LN'
                    # goes to the output list, like the start match;
                    # appending to `items` would grow the list being
                    # iterated and mutate the caller's input
                    new_items.append(meta_item)
                    item.text = end_lg_re.sub(whitespace, item.text).rstrip()
                if 'LN' in tags and item.text != orig:
                    tags.remove('LN')
                    item.attributes['tag'] = '+'.join(tags)
    else:
        new_items = items
    return new_items
def test_resolve_ref(self):
    """Walk resolve_ref() through each stage of a missing-to-complete structure."""
    # item has no reference attribute
    b1 = Item(id='b1')
    with self.assertRaises(KeyError):
        b1.resolve_ref('alignment')

    # has a reference attribute, but is not contained by a tier
    b1.alignment = 'a1'
    with self.assertRaises(XigtStructureError):
        b1.resolve_ref('alignment')

    # item in tier, but tier has no reference attribute
    t_b = Tier(id='b', items=[b1])
    with self.assertRaises(KeyError):
        b1.resolve_ref('alignment')

    # tier has reference attribute, but is not contained by an Igt
    t_b.alignment = 'a'
    with self.assertRaises(XigtStructureError):
        b1.resolve_ref('alignment')

    # item in IGT, but referred tier doesn't exist
    igt = Igt(tiers=[t_b])
    with self.assertRaises(XigtStructureError):
        b1.resolve_ref('alignment')

    # referred tier exists, but has no item referred by item's alignment
    t_a = Tier(id='a')
    igt.append(t_a)
    with self.assertRaises(XigtStructureError):
        b1.resolve_ref('alignment')

    # referred item exists, but has no value (which resolves to '')
    a1 = Item(id='a1')
    t_a.append(a1)
    self.assertEqual(b1.resolve_ref('alignment'), '')

    # referred item has a value
    a1.text = 'text'
    self.assertEqual(b1.resolve_ref('alignment'), 'text')

    # stored item tests
    for unreferenced in (self.i1, self.i2):
        with self.assertRaises(KeyError):
            unreferenced.resolve_ref('alignment')
    self.assertEqual(self.i_ac.resolve_ref('alignment'), 'text')
    self.assertEqual(self.i_ac.resolve_ref('content'), 'te')
    self.assertEqual(self.i_s.resolve_ref('segmentation'), 'xt')
    self.assertEqual(self.i_t.resolve_ref('content'), 'text')
class TestItem(unittest.TestCase):
    """Tests for Item, alone and inside a Tier / Igt / XigtCorpus structure."""

    def setUp(self):
        """Create the items, tiers, igt, and corpus shared by the tests."""
        # item with nothing set
        self.i1 = Item()

        # item with id, type, one attribute, and text
        self.i2 = Item(id='i2', type='basic',
                       attributes={'attr':'val'}, text='text')

        # item with both an alignment and a content reference
        self.i_ac = Item(id='i_ac', alignment='i2', content='i2[0:2]')

        # item with a segmentation reference
        self.i_s = Item(id='i_s', segmentation='i2[2:4]')

        # item with a content reference that is overridden by its own text
        self.i_t = Item(id='i_t', content='i2', text='something else')

        # containing structure: three tiers in one igt in one corpus
        self.t_a = Tier(id='t_a', items=[self.i2])
        self.t_b = Tier(
            id='t_b',
            items=[self.i_ac, self.i_t],
            alignment='t_a',
            content='t_a',
        )
        self.t_c = Tier(id='t_c', items=[self.i_s], segmentation='t_a')
        self.igt = Igt(tiers=[self.t_a, self.t_b, self.t_c])
        self.xc = XigtCorpus(igts=[self.igt])

    def test_init(self):
        """An id of '1' is rejected with ValueError."""
        with self.assertRaises(ValueError):
            Item(id='1')  # invalid id

    def test_id(self):
        """Each fixture item reports the id it was given (None if none)."""
        self.assertIs(self.i1.id, None)
        expected_ids = (
            (self.i2, 'i2'),
            (self.i_ac, 'i_ac'),
            (self.i_s, 'i_s'),
            (self.i_t, 'i_t'),
        )
        for item, expected in expected_ids:
            self.assertEqual(item.id, expected)

    def test_type(self):
        """Only i2 was given a type; the others report None."""
        self.assertIs(self.i1.type, None)
        self.assertEqual(self.i2.type, 'basic')
        for untyped in (self.i_ac, self.i_s, self.i_t):
            self.assertIs(untyped.type, None)

    def test_parents(self):
        """Items report their containing tier, igt, and corpus."""
        # i1 was never placed in a tier
        self.assertIs(self.i1.tier, None)
        self.assertIs(self.i1.igt, None)
        self.assertIs(self.i1.corpus, None)

        self.assertIs(self.i2.tier, self.t_a)
        self.assertIs(self.i2.igt, self.igt)
        self.assertIs(self.i2.corpus, self.xc)

        placed = (
            (self.i_ac, self.t_b),
            (self.i_s, self.t_c),
            (self.i_t, self.t_b),
        )
        for item, tier in placed:
            self.assertEqual(item.tier, tier)
            self.assertEqual(item.igt, self.igt)
            self.assertEqual(item.corpus, self.xc)

    def test_attributes(self):
        """The attributes mapping includes any reference attributes."""
        self.assertEqual(self.i1.attributes, dict())
        self.assertEqual(self.i2.attributes, {'attr':'val'})
        self.assertEqual(
            self.i_ac.attributes,
            {'alignment': 'i2', 'content': 'i2[0:2]'}
        )
        self.assertEqual(self.i_s.attributes, {'segmentation': 'i2[2:4]'})
        self.assertEqual(self.i_t.attributes, {'content': 'i2'})

    def test_reference_attributes(self):
        """Check alignment, content, and segmentation on every fixture item."""
        # segmentation cannot co-occur with alignment or content
        with self.assertRaises(XigtError):
            Item(alignment='a1', segmentation='b1')
        with self.assertRaises(XigtError):
            Item(content='a1', segmentation='b1')

        for plain in (self.i1, self.i2):
            self.assertIs(plain.alignment, None)
            self.assertIs(plain.content, None)
            self.assertIs(plain.segmentation, None)

        self.assertEqual(self.i_ac.alignment, 'i2')
        self.assertEqual(self.i_ac.content, 'i2[0:2]')
        self.assertIs(self.i_ac.segmentation, None)

        self.assertIs(self.i_s.alignment, None)
        self.assertIs(self.i_s.content, None)
        self.assertEqual(self.i_s.segmentation, 'i2[2:4]')

        self.assertEqual(self.i_t.alignment, None)
        self.assertEqual(self.i_t.content, 'i2')
        self.assertEqual(self.i_t.segmentation, None)

    def test_text(self):
        """text is only what was set on the item itself, never a resolved ref."""
        self.assertIs(self.i1.text, None)
        self.assertEqual(self.i2.text, 'text')
        self.assertIs(self.i_ac.text, None)
        self.assertIs(self.i_s.text, None)
        self.assertEqual(self.i_t.text, 'something else')

    def test_value(self):
        """value() gives the item's text, or else its resolved reference."""
        self.assertIs(self.i1.value(), None)
        expected_values = (
            (self.i2, 'text'),
            (self.i_ac, 'te'),
            (self.i_s, 'xt'),
            (self.i_t, 'something else'),
        )
        for item, expected in expected_values:
            self.assertEqual(item.value(), expected)

    def test_resolve_ref(self):
        """Walk resolve_ref() through each stage of a missing-to-complete structure."""
        # item has no reference attribute
        b1 = Item(id='b1')
        with self.assertRaises(KeyError):
            b1.resolve_ref('alignment')

        # has a reference attribute, but is not contained by a tier
        b1.alignment = 'a1'
        with self.assertRaises(XigtStructureError):
            b1.resolve_ref('alignment')

        # item in tier, but tier has no reference attribute
        t_b = Tier(id='b', items=[b1])
        with self.assertRaises(KeyError):
            b1.resolve_ref('alignment')

        # tier has reference attribute, but is not contained by an Igt
        t_b.alignment = 'a'
        with self.assertRaises(XigtStructureError):
            b1.resolve_ref('alignment')

        # item in IGT, but referred tier doesn't exist
        igt = Igt(tiers=[t_b])
        with self.assertRaises(XigtStructureError):
            b1.resolve_ref('alignment')

        # referred tier exists, but has no item referred by item's alignment
        t_a = Tier(id='a')
        igt.append(t_a)
        with self.assertRaises(XigtStructureError):
            b1.resolve_ref('alignment')

        # referred item exists, but has no value (which resolves to '')
        a1 = Item(id='a1')
        t_a.append(a1)
        self.assertEqual(b1.resolve_ref('alignment'), '')

        # referred item has a value
        a1.text = 'text'
        self.assertEqual(b1.resolve_ref('alignment'), 'text')

        # stored item tests
        for unreferenced in (self.i1, self.i2):
            with self.assertRaises(KeyError):
                unreferenced.resolve_ref('alignment')
        self.assertEqual(self.i_ac.resolve_ref('alignment'), 'text')
        self.assertEqual(self.i_ac.resolve_ref('content'), 'te')
        self.assertEqual(self.i_s.resolve_ref('segmentation'), 'xt')
        self.assertEqual(self.i_t.resolve_ref('content'), 'text')

    def test_span(self):
        """span() slices the item's value."""
        # sub-spans of null content is also null content
        self.assertIs(self.i1.span(0,1), None)
        self.assertEqual(self.i2.span(0,1), 't')
        expected_spans = (
            (self.i_ac, 'e'),
            (self.i_s, 't'),
            (self.i_t, 'o'),
        )
        for item, expected in expected_spans:
            self.assertEqual(item.span(1,2), expected)

    def test_get_attribute(self):
        """Check get_attribute() defaults, local lookup, and tier inheritance."""
        check = self.assertEqual
        item = Item(id='i1')
        # missing attribute: None unless a default is supplied
        check(item.get_attribute('attr'), None)
        check(item.get_attribute('attr', 1), 1)
        # a set attribute wins over the default
        item.attributes['attr'] = 'val'
        check(item.get_attribute('attr', 1), 'val')
        # nothing to inherit from while the item is not in a tier
        check(item.get_attribute('abc', inherit=True), None)
        # keep the tier referenced while the inherited lookup is checked
        tier = Tier(id='t', items=[item], attributes={'abc': 'def'})
        check(item.get_attribute('abc', inherit=True), 'def')
        # items prepared in setUp
        check(self.i1.get_attribute('attr'), None)
        check(self.i1.get_attribute('attr', 1), 1)
        check(self.i2.get_attribute('attr'), 'val')
        check(self.i2.get_attribute('attr', 1), 'val')
        check(self.i_ac.get_attribute('alignment'), 'i2')
def separate_secondary_translations(items):
    """
    Split translation lines holding several quoted translations.

    If any L, G, or L-G line carries a secondary 'DB' tag, *items* is
    returned untouched. Otherwise every 'T' line without a 'CR' tag is
    scanned with `basic_quoted_trans_re`; each non-empty quoted
    translation becomes its own item with id '<item id>_<n>'. Text
    preceding a translation is inspected: something like 'lit' adds the
    'LT' tag, something like 'or'/'also' (or any translation after the
    first bare one) adds the 'AL' tag, and if it contains a word
    character it is stored as the 'note' attribute. A matched judgment
    is stored as the 'judgment' attribute. When the preceding text is
    implausibly long, the line is kept unsplit.

    Args:
        items: list of items to process
    Returns:
        *items* itself in the 'DB' case, otherwise a new list of items
    """
    # sometimes translation lines with secondary translations are marked
    # as +DB even if they are for the same, single IGT
    for item in items:
        tags = get_tags(item)
        if tags[0] in ('L', 'G', 'L-G') and 'DB' in tags[1:]:
            # don't attempt
            return items
    # NOTE(review): this value is not used below -- confirm whether the
    # call can be dropped
    indent = min_indent(items, tags=('L', 'G', 'L-G', 'L-G-T', 'G-T'))
    new_items = []
    for item in items:
        tags = get_tags(item)
        text = item.text
        if tags[0] == 'T' and 'CR' not in tags[1:]:
            # normalize the separator between a closing and an opening
            # quote; flags must be passed by keyword because the fourth
            # positional argument of re.sub is count
            text = re.sub(
                r'([{cq}])\s*(\s|/)\s*([{oq}])'.format(oq=OPENQUOTES,
                                                       cq=CLOSEQUOTES),
                r'\1 \2 \3',
                text,
                flags=re.I | re.U
            )
            matches = [
                m for m in basic_quoted_trans_re.finditer(text)
                if m.group('t').strip()
            ]
            sub_items = []
            if matches:
                pos = 0
                bare_T_seen = False
                last_i = len(matches) - 1
                for i, match in enumerate(matches):
                    start, end = match.start(), match.end()
                    t = match.group('t')
                    # keep meaningful trailing text with the last
                    # translation
                    if i == last_i and re.search(r'\w|\d', text[end:], re.U):
                        t += text[end:]
                    pre = text[pos:start]
                    # some instances have bad matches... try to avoid with
                    # a hard limit of 30 chars for the note or note is 2x
                    # size of t
                    prelen = len(pre.strip())
                    if prelen > 30 or prelen >= (2 * len(t.strip())):
                        # give up on splitting: keep the original line
                        sub_items = []
                        new_items.append(item)
                        break
                    new_tags = list(tags)
                    if re.search(r'lit(?:eral(?:ly)?)?', pre):
                        if 'LT' not in new_tags:
                            new_tags.append('LT')
                    elif (re.search(r'(or|also|ii+|\b[bcd]\.)[ :,]', pre)
                          or bare_T_seen):
                        if 'AL' not in new_tags:
                            new_tags.append('AL')
                    else:
                        bare_T_seen = True
                    attrs = dict(item.attributes)
                    if match.group('judg'):
                        attrs['judgment'] = match.group('judg')
                    if re.search(r'\w|\d', pre, re.U):
                        attrs['note'] = pre.strip()
                    attrs['tag'] = '+'.join(new_tags)
                    sub_items.append(
                        Item(id=item.id + '_{}'.format(i + 1),
                             attributes=attrs,
                             text=t)
                    )
                    pos = end
                new_items.extend(sub_items)
            else:
                new_items.append(item)
        else:
            new_items.append(item)
    return new_items