def test_remove(self):
    """Check that remove() drops an Igt from positional and id lookup."""
    corpus = XigtCorpus(igts=[Igt(id='i1'), Igt(id='i2')])
    assert len(corpus) == 2

    first = corpus[0]
    corpus.remove(first)

    assert len(corpus) == 1
    # the former second element has moved to the front
    assert corpus[0].id == 'i2'
    # the removed id is no longer addressable
    with pytest.raises(KeyError):
        corpus['i1']
def write_instances(instance_list, out_path, type, overwrite=False):
    """Write a list of instances to ``out_path`` as a Xigt XML corpus.

    :param instance_list: instances to append to a fresh ``XigtCorpus``.
    :param out_path: destination file path; its directory is created
        if it does not exist yet.
    :param type: label used only in the "no instances" log message.
    :param overwrite: when False, an existing ``out_path`` is left
        untouched and an error is logged instead.
    """
    if os.path.exists(out_path) and not overwrite:
        SPLIT_LOG.error('File "{}" already exists and overwrite flag not set. Skipping!'.format(out_path))
        return

    # Create the directory if need be
    out_dir = os.path.dirname(out_path)
    try:
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
    except FileNotFoundError:
        # dirname is '' for a bare filename; there is nothing to create
        pass

    num_sents = len(instance_list)
    if num_sents == 0:
        SPLIT_LOG.warning("No instances allocated for {}. Skipping file.".format(type))
        return

    xc = XigtCorpus()
    for inst in instance_list:
        xc.append(inst)

    print("Writing {} instances to {}...".format(num_sents, out_path))

    # Sort before opening the file so that a failure while sorting does
    # not leave a truncated output file behind.
    sort_corpus(xc)

    # The context manager guarantees the handle is closed even if the
    # serializer raises.
    with open(out_path, 'w', encoding='utf-8') as f:
        xigtxml.dump(f, xc)
def setUp(self):
    """Build an empty corpus (``c1``) and a fully populated one (``c2``)."""
    self.c1 = XigtCorpus()

    meta_block = Metadata(type='meta', metas=[Meta(text='meta')])
    members = [Igt(id='i1'), Igt(id='i2')]
    self.c2 = XigtCorpus(
        id='xc1',
        type='basic',
        attributes={'attr': 'val'},
        metadata=[meta_block],
        igts=members,
    )
def write(out_fn, fn_idx):
    """Gather selected Igts from several files into one corpus and dump it.

    :param out_fn: destination handed straight to ``xigtxml.dump``.
    :param fn_idx: mapping of input filename to the positional indices
        of the Igts to keep from that file.
    """
    merged = XigtCorpus()
    for src_path, wanted in fn_idx.items():
        # if possible, try to decode needed igts only and skip the rest
        source = xigtxml.load(src_path, mode='transient')
        # corpus-level metadata of the inputs is ignored
        merged.extend(
            igt for pos, igt in enumerate(source) if pos in wanted
        )

    # assume the nsmap of the first igt is the same for all
    if merged.igts:
        merged.nsmap = merged[0].nsmap

    xigtxml.dump(out_fn, merged)
def separate_tiers(args):
    """Split a Xigt corpus into selected tiers and, optionally, the rest.

    Reads ``args.infile`` and writes to ``args.outfile`` a corpus whose
    Igts keep only the tiers whose type is in ``args.tiers``.  If
    ``args.remainder`` is set, a second corpus holding the other tiers
    is written to that path.

    :param args: object with ``tiers``, ``infile``, ``outfile`` and
        ``remainder`` attributes.
    """
    tiers = set(args.tiers)

    def _write_subset(out_path, keep):
        """Write the Igts of the input, with tiers filtered by *keep*."""
        # assuming XML for now
        # The input is re-read for every output file, exactly as before;
        # presumably tiers cannot be shared between two Igts.
        # TODO confirm whether a single load would suffice.
        with open(args.infile, 'r') as instream:
            src_xc = xigtxml.load(instream)
            out_xc = XigtCorpus(attributes=src_xc.attributes,
                                metadata=src_xc.metadata)
            for igt in src_xc.igts:
                out_xc.add(
                    Igt(id=igt.id,
                        type=igt.type,
                        attributes=igt.attributes,
                        metadata=igt.metadata,
                        tiers=[t for t in igt.tiers if keep(t)]))
        # Context manager so the output handle is flushed and closed
        # deterministically (previously it was never closed).
        with open(out_path, 'w') as outstream:
            xigtxml.dump(outstream, out_xc)

    _write_subset(args.outfile, lambda t: t.type in tiers)

    if not args.remainder:
        return

    _write_subset(args.remainder, lambda t: t.type not in tiers)
def test_append(self):
    """Only Igts with unused ids may be appended; order is preserved."""
    corpus = XigtCorpus()

    # every non-Igt object must be refused
    for bad_cls in (Item, Tier, XigtCorpus, Metadata, Meta):
        intruder = bad_cls()
        with self.assertRaises(XigtStructureError):
            corpus.append(intruder)
    self.assertEqual(len(corpus), 0)

    corpus.append(Igt(id='i1'))
    self.assertEqual(len(corpus), 1)

    # a second Igt with the same id is rejected
    duplicate = Igt(id='i1')
    with self.assertRaises(XigtError):
        corpus.append(duplicate)

    corpus.append(Igt(id='i2'))
    self.assertEqual(len(corpus), 2)
    self.assertEqual(corpus[0].id, 'i1')
    self.assertEqual(corpus[1].id, 'i2')
def test_insert(self):
    """Check insert() positions, duplicate-id rejection and clamping."""
    corpus = XigtCorpus()
    assert len(corpus) == 0

    corpus.insert(0, Igt(id='i1'))
    assert len(corpus) == 1

    # inserting an already-used id must fail
    with pytest.raises(XigtError):
        corpus.insert(0, Igt(id='i1'))

    corpus.insert(0, Igt(id='i2'))
    # an index past the end lands the Igt in last position
    corpus.insert(100, Igt(id='i3'))
    assert len(corpus) == 3

    for position, expected_id in enumerate(('i2', 'i1', 'i3')):
        assert corpus[position].id == expected_id
def eval_classifier(c, inst_list, context_feats=False, posdict=None):
    """Evaluate a gloss-line POS classifier against manually tagged Igts.

    Each instance is copied, stripped of its POS tags and re-tagged with
    the classifier; the classifier tags are then compared with the
    manual tags of the original instance.  As a side effect, all
    re-tagged copies are dumped to ``./enriched_ctn_dev.xml``.

    :param c: The classifier
    :param inst_list: A list of Igt instances to test against. Must
        already have POS tags.
    :param context_feats: passed to the classifier as both
        ``feat_next_gram`` and ``feat_prev_gram``.
    :param posdict: forwarded to ``classify_gloss_pos``.
    :return: the result of ``poseval`` over the collected sequences.
    """
    gold_sents = []
    eval_sents = []

    to_dump = XigtCorpus()

    for inst in inst_list:
        to_tag = inst.copy()
        strip_pos(to_tag)

        # Do the classification.
        to_tag.classify_gloss_pos(c, lowercase=True,
                                  feat_next_gram=context_feats,
                                  feat_prev_gram=context_feats,
                                  posdict=posdict)

        # The copy is dumped even if it is later skipped for evaluation.
        to_dump.append(to_tag)

        # Now, retrieve eval/gold.
        eval_tags = [v.value() for v in to_tag.get_pos_tags(GLOSS_WORD_ID, tag_method=INTENT_POS_CLASS)]
        gold_tags = [v.value() for v in inst.get_pos_tags(GLOSS_WORD_ID, tag_method=INTENT_POS_MANUAL)]

        tag_tokens = [POSToken('a', label=l) for l in eval_tags]
        gold_tokens = [POSToken('a', label=l) for l in gold_tags]

        # Sequences of different length cannot be compared token by token.
        if len(tag_tokens) != len(gold_tokens):
            print("LENGTH OF SEQUENCE IS MISMATCHED")
            continue

        gold_sents.append(gold_tokens)
        eval_sents.append(tag_tokens)

    # Context manager so the dump file is closed (it used to be leaked).
    with open('./enriched_ctn_dev.xml', 'w') as dump_file:
        xigtxml.dump(dump_file, to_dump)

    return poseval(eval_sents, gold_sents, details=True, csv=True, matrix=True)
def _xigt_import(infile, outfile, options):
    """Convert *infile* into Xigt XML at *outfile* via ``odin_igts``.

    :param infile: path of the input file, opened for reading.
    :param outfile: path of the XML file to write.
    :param options: passed unchanged to ``odin_igts``.
    """
    with open(infile, 'r') as source:
        with open(outfile, 'w') as sink:
            # both files stay open while the corpus is built and dumped
            corpus = XigtCorpus(
                igts=odin_igts(source, options),
                nsmap=_nsmap,
                mode='transient',
            )
            xigtxml.dump(sink, corpus)
def test_clear(self):
    """Check that clear() empties the corpus for index and id lookup."""
    corpus = XigtCorpus()
    corpus.extend([Igt(id='i1'), Igt(id='i2'), Igt(id='i3')])
    assert len(corpus) == 3

    corpus.clear()

    assert len(corpus) == 0
    # neither positional nor id-based get() finds anything afterwards
    for key in (0, 'i1'):
        assert corpus.get(key) is None
def test_clear(self):
    """Check that clear() empties the corpus for index and id lookup."""
    corpus = XigtCorpus()
    corpus.extend([Igt(id='i1'), Igt(id='i2'), Igt(id='i3')])
    self.assertEqual(len(corpus), 3)

    corpus.clear()

    self.assertEqual(len(corpus), 0)
    # neither positional nor id-based get() finds anything afterwards
    for key in (0, 'i1'):
        self.assertIs(corpus.get(key), None)
def do_filter(filelist, require_lang=False, require_gloss=False, require_trans=False,
              require_aln=False, require_gloss_pos=False, require_grammatical=False,
              max_instances=0):
    """Filter the instances of several corpus files into one new corpus.

    Every path is loaded with ``xc_load`` and passed through
    ``filter_xc`` with the given ``require_*`` flags; accepted instances
    are appended to a single new ``XigtCorpus``.

    :param filelist: iterable of corpus file paths.
    :param max_instances: stop once this many instances have been
        accepted in total; 0 means no limit.
    :return: ``(new_corp, examined, failures, successes)``.  Note that
        failures come before successes here, unlike in ``filter_xc``.
    """
    new_corp = XigtCorpus()

    FILTER_LOG.log(NORM_LEVEL, "Beginning filtering...")

    successes = 0
    failures = 0
    examined = 0

    for path in filelist:
        FILTER_LOG.log(1000, 'Opening file "{}" for filtering.'.format(os.path.basename(path)))
        xc = xc_load(path, mode=INCREMENTAL)
        instances, iter_examined, iter_success, iter_failures = filter_xc(
            xc, require_lang, require_gloss, require_trans, require_aln,
            require_gloss_pos, require_grammatical, max_instances, successes)

        for instance in instances:
            new_corp.append(instance)

        successes += iter_success
        failures += iter_failures
        examined += iter_examined

        # filter_xc only checks the limit after accepting an instance,
        # so without this guard every further file would add one more
        # instance beyond max_instances (and be loaded needlessly).
        if max_instances and successes >= max_instances:
            break

    return new_corp, examined, failures, successes
def default_decode_xigtcorpus(elem, igts=None, mode='full'):
    """Build a ``XigtCorpus`` from an ``xigt-corpus`` element.

    :param elem: the element to decode; its tag must be ``xigt-corpus``.
    :param igts: Igts to use instead of decoding the ``igt`` children;
        a falsy value triggers decoding of all ``igt`` children.
    :param mode: forwarded to the ``XigtCorpus`` constructor.
    :return: the decoded ``XigtCorpus``.
    """
    # xigt-corpus { attrs, metadata, content }
    namespace, local_name = _qname_split(elem.tag)
    assert local_name == 'xigt-corpus'

    # first get the attrs
    corpus_id = elem.get('id')
    corpus_attrs = get_attributes(elem, ignore=('id', ))
    corpus_metadata = [decode_metadata(node) for node in elem.findall('metadata')]

    if not igts:
        igts = [decode_igt(node) for node in elem.findall('igt')]

    return XigtCorpus(
        id=corpus_id,
        attributes=corpus_attrs,
        metadata=corpus_metadata,
        igts=igts,
        mode=mode,
        namespace=namespace,
        nsmap=elem.attrib.nsmap,
    )
def test_extend(self):
    """Check that extend() adds Igts in order and accepts an empty list."""
    corpus = XigtCorpus()
    assert len(corpus) == 0

    corpus.extend([Igt(id='i1')])
    assert len(corpus) == 1

    # extending with nothing changes nothing
    corpus.extend([])
    assert len(corpus) == 1

    corpus.extend([Igt(id='i2'), Igt(id='i3')])
    assert len(corpus) == 3

    for position, expected_id in enumerate(('i1', 'i2', 'i3')):
        assert corpus[position].id == expected_id
def test_extend(self):
    """Check that extend() adds Igts in order and accepts an empty list."""
    corpus = XigtCorpus()
    self.assertEqual(len(corpus), 0)

    corpus.extend([Igt(id='i1')])
    self.assertEqual(len(corpus), 1)

    # extending with nothing changes nothing
    corpus.extend([])
    self.assertEqual(len(corpus), 1)

    corpus.extend([Igt(id='i2'), Igt(id='i3')])
    self.assertEqual(len(corpus), 3)

    for position, expected_id in enumerate(('i1', 'i2', 'i3')):
        self.assertEqual(corpus[position].id, expected_id)
def test_insert(self):
    """Check insert() positions, duplicate-id rejection and clamping."""
    corpus = XigtCorpus()
    self.assertEqual(len(corpus), 0)

    corpus.insert(0, Igt(id='i1'))
    self.assertEqual(len(corpus), 1)

    # inserting an already-used id must fail
    duplicate = Igt(id='i1')
    with self.assertRaises(XigtError):
        corpus.insert(0, duplicate)

    corpus.insert(0, Igt(id='i2'))
    # an index past the end lands the Igt in last position
    corpus.insert(100, Igt(id='i3'))
    self.assertEqual(len(corpus), 3)

    for position, expected_id in enumerate(('i2', 'i1', 'i3')):
        self.assertEqual(corpus[position].id, expected_id)
def xigt_import(infile, outfile, options=None):
    """Convert a Toolbox file into a Xigt XML file.

    :param infile: path of the Toolbox file to read.
    :param outfile: path of the XML file to write.
    :param options: optional dict of settings; missing keys are filled
        in place with the module defaults.
    """
    if options is None:
        options = {}

    fallbacks = (
        ('tier_types', default_tier_types),
        ('alignments', default_alignments),
        ('record_markers', default_record_markers),
        ('attribute_map', default_attribute_map),
        ('error_recovery_method', default_error_recovery_method),
    )
    for key, value in fallbacks:
        options.setdefault(key, value)

    with open(infile, 'r') as source:
        with open(outfile, 'w') as sink:
            records = toolbox.read_toolbox_file(source)
            corpus = XigtCorpus(igts=toolbox_igts(records, options), mode='transient')
            xigtxml.dump(sink, corpus)
def xigt_import(infile, outfile, options=None):
    """Convert a Toolbox file into a Xigt XML file.

    :param infile: path of the Toolbox file to read.
    :param outfile: path of the XML file to write.
    :param options: optional dict of settings; missing keys are filled
        in place with the module defaults, and ``tb_alignments`` is
        always (re)computed from the resulting options.
    """
    if options is None:
        options = {}

    fallbacks = (
        ('record_markers', default_record_markers),
        ('igt_attribute_map', default_igt_attribute_map),
        ('tier_map', default_tier_map),
        ('make_phrase_tier', default_make_phrase_tier),
        ('tier_types', default_tier_types),
        ('alignments', default_alignments),
        ('error_recovery_method', default_error_recovery_method),
    )
    for key, value in fallbacks:
        options.setdefault(key, value)

    # just use existing info to create marker-based alignment info
    options['tb_alignments'] = _make_tb_alignments(options)

    with open(infile, 'r') as source:
        with open(outfile, 'w') as sink:
            records = toolbox.read_toolbox_file(source)
            corpus = XigtCorpus(igts=toolbox_igts(records, options), mode='transient')
            xigtxml.dump(sink, corpus)
def separate_tiers(args):
    """Split a Xigt corpus into selected tiers and, optionally, the rest.

    Reads ``args.infile`` and writes to ``args.outfile`` a corpus whose
    Igts keep only the tiers whose type is in ``args.tiers``.  If
    ``args.remainder`` is set, a second corpus holding the other tiers
    is written to that path.

    :param args: object with ``tiers``, ``infile``, ``outfile`` and
        ``remainder`` attributes.
    """
    tiers = set(args.tiers)

    def _write_subset(out_path, keep):
        """Write the Igts of the input, with tiers filtered by *keep*."""
        # assuming XML for now
        # The input is re-read for every output file, exactly as before;
        # presumably tiers cannot be shared between two Igts.
        # TODO confirm whether a single load would suffice.
        with open(args.infile, 'r') as instream:
            src_xc = xigtxml.load(instream)
            out_xc = XigtCorpus(attributes=src_xc.attributes,
                                metadata=src_xc.metadata)
            for igt in src_xc.igts:
                out_xc.add(
                    Igt(id=igt.id,
                        type=igt.type,
                        attributes=igt.attributes,
                        metadata=igt.metadata,
                        tiers=[t for t in igt.tiers if keep(t)]))
        # Context manager so the output handle is flushed and closed
        # deterministically (previously it was never closed).
        with open(out_path, 'w') as outstream:
            xigtxml.dump(outstream, out_xc)

    _write_subset(args.outfile, lambda t: t.type in tiers)

    if not args.remainder:
        return

    _write_subset(args.remainder, lambda t: t.type not in tiers)
def naacl_to_xigt(naacl_path):
    """Convert a file in the NAACL annotation format to a Xigt corpus.

    For every instance found in the file, an Igt is created with a raw
    ODIN tier (language, gloss and translation lines), uncleaned normal
    tiers, dependency structures for the language and translation lines
    where they can be parsed, and the manual bilingual alignment.

    :param naacl_path: path of the NAACL-format file to read.
    :return: the ``XigtCorpus`` holding one Igt per instance.
    """
    # Read through a context manager so the handle is closed promptly.
    with open(naacl_path, 'r') as naacl_file:
        content = naacl_file.read()

    # First, collect all the instances.
    instances = re.findall(r'Igt_id[\s\S]+?Q6.*Answer', content)

    xc = XigtCorpus()

    for instance_txt in instances:
        # Ids are assigned sequentially rather than taken from the file.
        inst = Igt(id='i{}'.format(len(xc)))

        lang_raw, gloss_raw, trans_raw = instance_txt.split('\n')[1:4]

        # Now, create the raw tier...
        raw_tier = Tier(id=gen_tier_id(inst, 'r'), type='odin', attributes={STATE_ATTRIBUTE: RAW_STATE})
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=lang_raw, attributes={ODIN_TAG_ATTRIBUTE: ODIN_LANG_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=gloss_raw, attributes={ODIN_TAG_ATTRIBUTE: ODIN_GLOSS_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=trans_raw, attributes={ODIN_TAG_ATTRIBUTE: ODIN_TRANS_TAG}))

        inst.append(raw_tier)
        xc.append(inst)

        # Generate the clean/normal tiers, but without any cleaning.
        generate_normal_tier(inst, clean=False)

        # Lang Dependency representation handling...
        lang_ds_str = re.search(r'Q6:([\s\S]+?)Q6:', instance_txt).group(1)
        lang_ds_lines = lang_ds_str.split('\n')[5:-3]

        try:
            lang_dt = parse_naacl_dep(lang(inst), lang_ds_lines)
            create_dt_tier(inst, lang_dt, lang(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError):
            # Best effort: an unparsable structure is simply left out.
            pass

        # Eng DS handling...
        eng_ds_str = re.search(r'Q3:([\s\S]+?)Q3:', instance_txt).group(1)
        eng_ds_lines = eng_ds_str.split('\n')[2:-3]

        try:
            eng_dt = parse_naacl_dep(trans(inst), eng_ds_lines)
            create_dt_tier(inst, eng_dt, trans(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError, ValueError):
            # Best effort: an unparsable structure is simply left out.
            pass

        # Add Alignment...
        biling_aln_str = re.search(r'Q5:([\s\S]+?)Q5:', instance_txt).group(1)
        biling_aln_lines = biling_aln_str.split('\n')[4:-3]

        # A raw line starting with a space shifts its indices by one.
        trans_offset = trans_raw.startswith(' ')
        gloss_offset = gloss_raw.startswith(' ')

        # Created outside the try so it is always bound afterwards.
        a = Alignment()
        try:
            for line in biling_aln_lines:
                gloss_s, trans_s = line.split()[0:2]

                if '.' in gloss_s:
                    continue

                gloss_i = int(gloss_s)
                # Apply the gloss offset once per line.  It used to sit
                # inside the loop below and was therefore subtracted
                # again for every additional translation index.
                if gloss_offset:
                    gloss_i -= 1

                for trans_token in trans_s.split(','):
                    trans_i = int(trans_token)
                    # An index of 0 is skipped and adds no pair.
                    if trans_i == 0:
                        continue
                    if trans_offset:
                        trans_i -= 1
                    a.add((trans_i, gloss_i))
        except Exception:
            # Deliberately best effort: on a malformed alignment line,
            # keep whatever pairs were collected so far.  Unlike the
            # former bare ``except``, this lets KeyboardInterrupt and
            # SystemExit through.
            pass

        set_bilingual_alignment(inst, trans(inst), gloss(inst), a, aln_method=INTENT_ALN_MANUAL)

    return xc
def test_get_attribute(self):
    """Check get_attribute() for present and missing keys."""
    corpus = XigtCorpus(attributes={'one': 1, 'two': 2})

    for key, expected in (('one', 1), ('two', 2)):
        assert corpus.get_attribute(key) == expected

    # a missing key yields None, with or without inherit=True
    assert corpus.get_attribute('three') is None
    assert corpus.get_attribute('three', inherit=True) == None
class TestXigtCorpus(unittest.TestCase):
    """Unit tests for the ``XigtCorpus`` container."""

    def setUp(self):
        """Build an empty corpus (``c1``) and a fully populated one (``c2``)."""
        self.c1 = XigtCorpus()

        self.c2 = XigtCorpus(
            id='xc1',
            type='basic',
            attributes={'attr': 'val'},
            metadata=[Metadata(type='meta', metas=[Meta(text='meta')])],
            igts=[Igt(id='i1'), Igt(id='i2')]
        )

    def test_init(self):
        """Invalid ids and duplicate Igt ids are rejected on construction."""
        self.assertRaises(ValueError, XigtCorpus, id='1')  # invalid id

        # don't allow multiple igts with the same ID
        self.assertRaises(XigtError, XigtCorpus,
                          igts=[Igt(id='i1'), Igt(id='i1')])

    def test_id(self):
        """The id is None by default, else the given value."""
        self.assertIs(self.c1.id, None)
        self.assertEqual(self.c2.id, 'xc1')

    def test_type(self):
        """The type is None by default, else the given value."""
        self.assertIs(self.c1.type, None)
        # Compare by value: identity of equal strings is an
        # interpreter interning detail, not a guarantee.
        self.assertEqual(self.c2.type, 'basic')

    def test_igts(self):
        """Contained Igts are exposed and point back to their corpus."""
        self.assertEqual(len(self.c1.igts), 0)
        self.assertEqual(len(self.c2.igts), 2)

        # contained Igts should now have their corpus specified
        for i in self.c2.igts:
            self.assertIs(i.corpus, self.c2)

    def test_attributes(self):
        """Attributes default to an empty dict."""
        self.assertEqual(self.c1.attributes, dict())
        self.assertEqual(self.c2.attributes, {'attr': 'val'})

    def test_metadata(self):
        """Metadata and its metas are reachable from the corpus."""
        self.assertEqual(len(self.c1.metadata), 0)

        self.assertEqual(self.c2.metadata[0].type, 'meta')
        self.assertEqual(len(self.c2.metadata[0].metas), 1)
        self.assertEqual(self.c2.metadata[0][0].text, 'meta')

    def test_get(self):
        """get() accepts an index or an id and falls back to a default."""
        self.assertIs(self.c1.get(0), None)
        self.assertIs(self.c1.get('i1'), None)
        self.assertEqual(self.c1.get('i1', default=1), 1)

        self.assertEqual(self.c2.get(0).id, 'i1')
        self.assertIs(self.c2.get(3), None)
        self.assertEqual(self.c2.get('i1').id, 'i1')
        # the default is ignored when the id exists
        self.assertEqual(
            self.c2.get('i1', default=Igt(id='i3')).id, 'i1'
        )

    def test_append(self):
        """Only Igts with unused ids may be appended; order is preserved."""
        xc = XigtCorpus()

        self.assertRaises(XigtStructureError, xc.append, Item())
        self.assertRaises(XigtStructureError, xc.append, Tier())
        self.assertRaises(XigtStructureError, xc.append, XigtCorpus())
        self.assertRaises(XigtStructureError, xc.append, Metadata())
        self.assertRaises(XigtStructureError, xc.append, Meta())
        self.assertEqual(len(xc), 0)

        xc.append(Igt(id='i1'))
        self.assertEqual(len(xc), 1)

        # a second Igt with the same id is rejected
        self.assertRaises(XigtError, xc.append, Igt(id='i1'))

        xc.append(Igt(id='i2'))
        self.assertEqual(len(xc), 2)
        self.assertEqual(xc[0].id, 'i1')
        self.assertEqual(xc[1].id, 'i2')

    def test_insert(self):
        """insert() honours positions and rejects duplicate ids."""
        xc = XigtCorpus()
        self.assertEqual(len(xc), 0)

        xc.insert(0, Igt(id='i1'))
        self.assertEqual(len(xc), 1)

        self.assertRaises(XigtError, xc.insert, 0, Igt(id='i1'))

        xc.insert(0, Igt(id='i2'))
        # an index past the end lands the Igt in last position
        xc.insert(100, Igt(id='i3'))
        self.assertEqual(len(xc), 3)
        self.assertEqual(xc[0].id, 'i2')
        self.assertEqual(xc[1].id, 'i1')
        self.assertEqual(xc[2].id, 'i3')

    def test_extend(self):
        """extend() adds Igts in order and accepts an empty list."""
        xc = XigtCorpus()
        self.assertEqual(len(xc), 0)

        xc.extend([Igt(id='i1')])
        self.assertEqual(len(xc), 1)

        xc.extend([])
        self.assertEqual(len(xc), 1)

        xc.extend([Igt(id='i2'), Igt(id='i3')])
        self.assertEqual(len(xc), 3)
        self.assertEqual(xc[0].id, 'i1')
        self.assertEqual(xc[1].id, 'i2')
        self.assertEqual(xc[2].id, 'i3')

    def test_clear(self):
        """clear() empties the corpus for index and id lookup."""
        xc = XigtCorpus()
        xc.extend([Igt(id='i1'), Igt(id='i2'), Igt(id='i3')])
        self.assertEqual(len(xc), 3)

        xc.clear()
        self.assertEqual(len(xc), 0)
        self.assertIs(xc.get(0), None)
        self.assertIs(xc.get('i1'), None)

    def test_get_attribute(self):
        """get_attribute() returns values, or None for missing keys."""
        xc = XigtCorpus(attributes={'one': 1, 'two': 2})
        self.assertEqual(xc.get_attribute('one'), 1)
        self.assertEqual(xc.get_attribute('two'), 2)
        self.assertIs(xc.get_attribute('three'), None)
        self.assertEqual(xc.get_attribute('three', inherit=True), None)
def filter_xc(xc, require_lang=False, require_gloss=False, require_trans=False,
              require_aln=False, require_gloss_pos=False, require_grammatical=False,
              max_instances=0, prev_good_instances=0):
    """Filter the instances of one corpus by the requested criteria.

    Each instance is checked against the enabled ``require_*`` tests in
    order; the first failing test rejects it (logged with its reason).
    Accepted instances are appended to a new ``XigtCorpus`` and have
    their tiers sorted.

    :param xc: iterable of ``Igt`` instances to examine.
    :param require_lang: reject instances without a language tier.
    :param require_gloss: reject instances without a gloss tier.
    :param require_trans: reject instances without a translation tier.
    :param require_aln: reject instances whose gloss and language tiers
        are missing or cannot be word-aligned.
    :param require_gloss_pos: reject instances without a gloss POS tier.
    :param require_grammatical: reject instances with a judgment
        attribute on any language, gloss or translation line.
    :param max_instances: stop once ``prev_good_instances`` plus the
        instances accepted here reach this number; 0 means no limit.
    :param prev_good_instances: instances already accepted elsewhere.
    :return: ``(new_corp, examined, successes, failures)``.
    """
    new_corp = XigtCorpus()

    examined = 0
    failures = 0
    successes = 0

    def fail(inst, reason):
        """Count and log a rejected instance."""
        nonlocal failures
        failures += 1
        FILTER_LOG.info(filter_string(inst).format("FAIL", '[' + reason + ']'))

    def trytier(inst, f):
        """Return ``f(inst)``, or None when the normalized line is missing."""
        try:
            return f(inst)
        except NoNormLineException:
            # A missing line is not counted as a failure by itself; the
            # require_* checks below decide.  (The former fail("Bad
            # Lines") call after the return was unreachable.)
            return None

    for inst in xc:
        examined += 1
        assert isinstance(inst, Igt)

        lt = trytier(inst, lang)
        gt = trytier(inst, gloss)
        tt = trytier(inst, trans)

        if require_lang and lt is None:
            fail(inst, "LANG")
            continue

        if require_gloss and gt is None:
            fail(inst, "GLOSS")
            continue

        if require_trans and tt is None:
            fail(inst, "TRANS")
            continue

        if require_aln:
            if gt is None:
                fail(inst, "ALIGN-GLOSS")
                continue
            if lt is None:
                fail(inst, "ALIGN-LANG")
                continue

            try:
                word_align(gt, lt)
            except GlossLangAlignException:
                fail(inst, "ALIGN")
                continue

        if require_grammatical:
            # Recompute all three for every instance.  They used to be
            # assigned only when the tier existed, so a missing tier
            # raised UnboundLocalError or silently reused the verdict
            # of an earlier instance.
            grammatical_ll = (
                [l for l in lang_lines(inst) if l.get_attribute(ODIN_JUDGMENT_ATTRIBUTE)]
                if lt else []
            )
            grammatical_gl = (
                gloss_line(inst).get_attribute(ODIN_JUDGMENT_ATTRIBUTE)
                if gt else None
            )
            grammatical_tl = (
                [l for l in trans_lines(inst) if l.get_attribute(ODIN_JUDGMENT_ATTRIBUTE)]
                if tt else []
            )
            if grammatical_ll or grammatical_gl or grammatical_tl:
                fail(inst, "UNGRAMMATICAL")
                continue

        if require_gloss_pos:
            # Without a gloss tier there can be no gloss POS tier; this
            # case used to crash on ``gt.id``.
            if gt is None or pos_tag_tier(inst, gt.id) is None:
                fail(inst, "GLOSS_POS")
                continue

        # Otherwise, attach to the new corpus.
        new_corp.append(inst)

        successes += 1
        FILTER_LOG.info(filter_string(inst).format("SUCCESS", ""))
        inst.sort_tiers()

        # -------------------------------------------
        # Break out of the loop if we've hit the maximum
        # number of good instances.
        # -------------------------------------------
        if max_instances and prev_good_instances + successes >= max_instances:
            break

    return new_corp, examined, successes, failures
def test_get_attribute(self):
    """Check get_attribute() for present and missing keys."""
    corpus = XigtCorpus(attributes={'one': 1, 'two': 2})

    for key, expected in (('one', 1), ('two', 2)):
        self.assertEqual(corpus.get_attribute(key), expected)

    # a missing key yields None, with or without inherit=True
    self.assertIs(corpus.get_attribute('three'), None)
    self.assertEqual(corpus.get_attribute('three', inherit=True), None)
def test_append(self):
    """Only Igts with unused ids may be appended; order is preserved."""
    corpus = XigtCorpus()

    # every non-Igt object must be refused
    for bad_cls in (Item, Tier, XigtCorpus, Metadata, Meta):
        with pytest.raises(XigtStructureError):
            corpus.append(bad_cls())
    assert len(corpus) == 0

    corpus.append(Igt(id='i1'))
    assert len(corpus) == 1

    # a second Igt with the same id is rejected
    with pytest.raises(XigtError):
        corpus.append(Igt(id='i1'))

    corpus.append(Igt(id='i2'))
    assert len(corpus) == 2

    for position, expected_id in enumerate(('i1', 'i2')):
        assert corpus[position].id == expected_id