def write_instances(instance_list, out_path, type, overwrite=False):
    """
    Serialize a list of Igt instances to a XIGT-XML file.

    :param instance_list: Igt instances to write out.
    :param out_path: Destination path for the XIGT-XML file.
    :param type: Label for this batch of instances (used only in log messages;
                 name kept for caller compatibility despite shadowing the builtin).
    :param overwrite: If False and out_path already exists, skip writing.
    """
    if os.path.exists(out_path) and not overwrite:
        SPLIT_LOG.error('File "{}" already exists and overwrite flag not set. Skipping!'.format(out_path))
        return

    # Create the output directory if need be. os.path.dirname() returns ''
    # for a bare filename, in which case there is nothing to create (the
    # original code relied on catching FileNotFoundError for that case).
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    num_sents = len(instance_list)
    if num_sents > 0:
        xc = XigtCorpus()
        for inst in instance_list:
            xc.append(inst)

        print("Writing {} instances to {}...".format(num_sents, out_path))
        sort_corpus(xc)
        # 'with' guarantees the handle is closed even if dump() raises
        # (the original leaked the handle on error).
        with open(out_path, 'w', encoding='utf-8') as f:
            xigtxml.dump(f, xc)
    else:
        # logging's warn() is a deprecated alias of warning().
        SPLIT_LOG.warning("No instances allocated for {}. Skipping file.".format(type))
def test_append(self):
    """XigtCorpus.append accepts only Igt children, with unique ids."""
    xc = XigtCorpus()

    # Any non-Igt child is structurally invalid at the corpus level.
    for bad_child in (Item(), Tier(), XigtCorpus(), Metadata(), Meta()):
        with self.assertRaises(XigtStructureError):
            xc.append(bad_child)
    self.assertEqual(len(xc), 0)

    xc.append(Igt(id='i1'))
    self.assertEqual(len(xc), 1)

    # A duplicate id is rejected.
    with self.assertRaises(XigtError):
        xc.append(Igt(id='i1'))

    xc.append(Igt(id='i2'))
    self.assertEqual(len(xc), 2)
    self.assertEqual(xc[0].id, 'i1')
    self.assertEqual(xc[1].id, 'i2')
def eval_classifier(c, inst_list, context_feats=False, posdict=None):
    """
    Evaluate a gloss-line POS classifier against manually tagged instances.

    :param c: The classifier
    :param inst_list: A list of Igt instances to test against. Must already have POS tags.
    :param context_feats: Whether to enable prev/next-gram context features
                          during classification.
    :param posdict: Optional POS dictionary passed through to the classifier.
    :return: The result of poseval() comparing classified vs. gold tags.
    """
    gold_sents = []
    eval_sents = []

    to_dump = XigtCorpus()

    for inst in inst_list:
        # Work on a copy so the gold instance keeps its manual tags.
        to_tag = inst.copy()
        strip_pos(to_tag)

        # Do the classification.
        to_tag.classify_gloss_pos(c, lowercase=True,
                                  feat_next_gram=context_feats,
                                  feat_prev_gram=context_feats,
                                  posdict=posdict)
        to_dump.append(to_tag)

        # Retrieve the eval (classified) and gold (manual) tag sequences.
        eval_tags = [v.value() for v in to_tag.get_pos_tags(GLOSS_WORD_ID, tag_method=INTENT_POS_CLASS)]
        gold_tags = [v.value() for v in inst.get_pos_tags(GLOSS_WORD_ID, tag_method=INTENT_POS_MANUAL)]

        # POSToken text is irrelevant for poseval; only the labels matter.
        tag_tokens = [POSToken('a', label=l) for l in eval_tags]
        gold_tokens = [POSToken('a', label=l) for l in gold_tags]

        # poseval needs equal-length sequences; skip mismatched instances.
        if len(tag_tokens) != len(gold_tokens):
            print("LENGTH OF SEQUENCE IS MISMATCHED")
            continue

        gold_sents.append(gold_tokens)
        eval_sents.append(tag_tokens)

    # 'with' ensures the dump file is flushed and closed (the original
    # leaked the handle from a bare open()).
    with open('./enriched_ctn_dev.xml', 'w', encoding='utf-8') as f:
        xigtxml.dump(f, to_dump)

    return poseval(eval_sents, gold_sents, details=True, csv=True, matrix=True)
def do_filter(filelist, require_lang=False, require_gloss=False, require_trans=False,
              require_aln=False, require_gloss_pos=False, require_grammatical=False,
              max_instances=0, ):
    """
    Filter the instances in each listed file, merging the survivors into
    a single corpus.

    :param filelist: Paths of XIGT files to filter.
    :param max_instances: If nonzero, stop once this many instances have
                          passed the filter (running total across files).
    :return: (merged_corpus, num_examined, num_failures, num_successes)
    """
    merged = XigtCorpus()
    FILTER_LOG.log(NORM_LEVEL, "Beginning filtering...")

    total_successes = 0
    total_failures = 0
    total_examined = 0

    for path in filelist:
        FILTER_LOG.log(1000, 'Opening file "{}" for filtering.'.format(os.path.basename(path)))
        corpus = xc_load(path, mode=INCREMENTAL)

        # The running success count is passed along so max_instances is
        # honored across files, not per file.
        kept, n_examined, n_success, n_fail = filter_xc(corpus, require_lang, require_gloss,
                                                        require_trans, require_aln,
                                                        require_gloss_pos, require_grammatical,
                                                        max_instances, total_successes)
        for kept_inst in kept:
            merged.append(kept_inst)

        total_successes += n_success
        total_failures += n_fail
        total_examined += n_examined

    return merged, total_examined, total_failures, total_successes
def filter_xc(xc, require_lang=False, require_gloss=False, require_trans=False,
              require_aln=False, require_gloss_pos=False, require_grammatical=False,
              max_instances=0, prev_good_instances=0):
    """
    Filter a corpus down to the instances that satisfy the given requirements.

    :param xc: The corpus to filter.
    :param require_aln: Require a successful gloss/lang word alignment.
    :param require_grammatical: Reject instances carrying a judgment
                                (ungrammaticality) attribute on any line.
    :param max_instances: If nonzero, stop examining once
                          prev_good_instances + successes reaches this count.
    :param prev_good_instances: Successes already accumulated by the caller.
    :return: (new_corpus, num_examined, num_successes, num_failures)
             NOTE: successes before failures, unlike do_filter's return.
    """
    new_corp = XigtCorpus()

    examined = 0
    failures = 0
    successes = 0
    my_filter = ''

    for inst in xc:
        examined += 1
        assert isinstance(inst, Igt)

        def fail(reason):
            nonlocal failures, my_filter
            my_filter = filter_string(inst).format("FAIL", '[' + reason + ']')
            failures += 1
            FILTER_LOG.info(my_filter)

        def success():
            nonlocal successes, my_filter
            my_filter = filter_string(inst).format("SUCCESS", "")
            successes += 1

        def trytier(f):
            # Return the tier, or None if the instance's lines are unusable.
            # (A dead `fail("Bad Lines")` after the return was removed; the
            # specific fail() reasons below report the missing tier instead.)
            try:
                return f(inst)
            except NoNormLineException:
                return None

        lt = trytier(lang)
        gt = trytier(gloss)
        tt = trytier(trans)

        if require_lang and lt is None:
            fail("LANG")
            continue
        if require_gloss and gt is None:
            fail("GLOSS")
            continue
        if require_trans and tt is None:
            fail("TRANS")
            continue

        if require_aln:
            if gt is None:
                fail("ALIGN-GLOSS")
                continue
            if lt is None:
                fail("ALIGN-LANG")
                continue
            try:
                word_align(gt, lt)
            except GlossLangAlignException:
                fail("ALIGN")
                continue

        if require_grammatical:
            # BUG FIX: these were previously assigned only inside the
            # lt/gt/tt guards, raising NameError whenever a tier was absent.
            grammatical_ll = grammatical_gl = grammatical_tl = None
            if lt:
                grammatical_ll = [l for l in lang_lines(inst) if l.get_attribute(ODIN_JUDGMENT_ATTRIBUTE)]
            if gt:
                # NOTE(review): assumes gloss_line() is non-None whenever the
                # gloss tier exists — confirm against gloss_line's contract.
                grammatical_gl = gloss_line(inst).get_attribute(ODIN_JUDGMENT_ATTRIBUTE)
            if tt:
                grammatical_tl = [l for l in trans_lines(inst) if l.get_attribute(ODIN_JUDGMENT_ATTRIBUTE)]
            if grammatical_ll or grammatical_gl or grammatical_tl:
                fail("UNGRAMMATICAL")
                continue

        if require_gloss_pos:
            # BUG FIX: guard gt before dereferencing gt.id; a missing gloss
            # tier previously raised AttributeError here when require_gloss
            # was not also set.
            if gt is None or pos_tag_tier(inst, gt.id) is None:
                fail("GLOSS_POS")
                continue

        # Otherwise, attach to the new corpus.
        new_corp.append(inst)
        success()
        FILTER_LOG.info(my_filter)
        inst.sort_tiers()

        # -------------------------------------------
        # Break out of the loop if we've hit the maximum
        # number of good instances.
        # -------------------------------------------
        if max_instances and prev_good_instances + successes >= max_instances:
            break

    return new_corp, examined, successes, failures
def naacl_to_xigt(naacl_path):
    """
    Convert the NAACL format to XIGT.

    :param naacl_path: Path to the NAACL-format file to convert.
    :return: A XigtCorpus with one Igt per NAACL instance.
    """
    # 'with' closes the input handle (the original leaked it).
    with open(naacl_path, 'r') as f:
        content = f.read()

    # First, collect all the instances.
    instances = re.findall(r'Igt_id[\s\S]+?Q6.*Answer', content)

    xc = XigtCorpus()

    for instance_txt in instances:
        inst = Igt(id='i{}'.format(len(xc)))
        lang_raw, gloss_raw, trans_raw = instance_txt.split('\n')[1:4]

        # Build the raw odin tier from the lang/gloss/trans lines.
        raw_tier = Tier(id=gen_tier_id(inst, 'r'), type='odin', attributes={STATE_ATTRIBUTE: RAW_STATE})
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=lang_raw, attributes={ODIN_TAG_ATTRIBUTE: ODIN_LANG_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=gloss_raw, attributes={ODIN_TAG_ATTRIBUTE: ODIN_GLOSS_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=trans_raw, attributes={ODIN_TAG_ATTRIBUTE: ODIN_TRANS_TAG}))

        inst.append(raw_tier)
        xc.append(inst)

        # Generate the clean/normal tiers, but without any cleaning.
        generate_normal_tier(inst, clean=False)

        # Lang dependency representation handling (best-effort: malformed
        # trees simply produce no dependency tier).
        lang_ds_str = re.search(r'Q6:([\s\S]+?)Q6:', instance_txt).group(1)
        lang_ds_lines = lang_ds_str.split('\n')[5:-3]
        try:
            lang_dt = parse_naacl_dep(lang(inst), lang_ds_lines)
            create_dt_tier(inst, lang_dt, lang(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError):
            pass

        # Eng DS handling...
        eng_ds_str = re.search(r'Q3:([\s\S]+?)Q3:', instance_txt).group(1)
        eng_ds_lines = eng_ds_str.split('\n')[2:-3]
        try:
            eng_dt = parse_naacl_dep(trans(inst), eng_ds_lines)
            create_dt_tier(inst, eng_dt, trans(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError, ValueError):
            pass

        # Add Alignment...
        biling_aln_str = re.search(r'Q5:([\s\S]+?)Q5:', instance_txt).group(1)
        biling_aln_lines = biling_aln_str.split('\n')[4:-3]

        # A leading space on the raw line appears to shift token indices by
        # one — TODO confirm against the NAACL annotation format spec.
        trans_offset = trans_raw.startswith(' ')
        gloss_offset = gloss_raw.startswith(' ')

        # BUG FIX: bind `a` outside the try so it is always defined at the
        # set_bilingual_alignment call below.
        a = Alignment()
        try:
            for line in biling_aln_lines:
                gloss_s, trans_s = line.split()[0:2]
                if '.' in gloss_s:
                    continue
                gloss_i = int(gloss_s)
                for trans_token in trans_s.split(','):
                    trans_i = int(trans_token)
                    if trans_i == 0:
                        continue
                    if trans_offset:
                        trans_i -= 1
                    if gloss_offset:
                        gloss_i -= 1
                    a.add((trans_i, gloss_i))
        except (ValueError, IndexError):
            # Best-effort parse: keep whatever pairs were added before the
            # malformed line. (Narrowed from a bare `except:`, which also
            # swallowed KeyboardInterrupt/SystemExit.)
            pass

        set_bilingual_alignment(inst, trans(inst), gloss(inst), a, aln_method=INTENT_ALN_MANUAL)

    return xc
def test_append(self):
    """XigtCorpus.append accepts only Igt children, with unique ids."""
    xc = XigtCorpus()

    # Any non-Igt child is structurally invalid at the corpus level.
    for bad_child in (Item(), Tier(), XigtCorpus(), Metadata(), Meta()):
        with pytest.raises(XigtStructureError):
            xc.append(bad_child)
    assert len(xc) == 0

    xc.append(Igt(id='i1'))
    assert len(xc) == 1

    # A duplicate id is rejected.
    with pytest.raises(XigtError):
        xc.append(Igt(id='i1'))

    xc.append(Igt(id='i2'))
    assert len(xc) == 2
    assert xc[0].id == 'i1'
    assert xc[1].id == 'i2'