def setUp(self):
    """Load the 'xigt/kor-ex.xml' test corpus and cache the items the tests use."""
    corpus_path = os.path.join(testfile_dir, 'xigt/kor-ex.xml')
    self.xc1 = xc_load(corpus_path)

    # Every fixture below is looked up in the first instance of the corpus.
    self.inst = self.xc1[0]

    # (attribute name on the test case, id of the item inside the instance)
    fixture_ids = (
        ('g1_2', 'g1.2'),
        ('m2_1', "m2.1"),
        ('p1', 'p1'),
        ('w1', "w1"),
        ('m_tier', "m"),
    )
    for attr_name, item_id in fixture_ids:
        setattr(self, attr_name, xigt_find(self.inst, id=item_id))

    # A fresh item attached to the "m" tier, segmenting spans of w1 and w4.
    self.new_m = Item(id="m5", tier=self.m_tier, segmentation="w1[0:2]+w4[2:3]")
def evaluate_instance(inst, classifier, tagger):
    """
    Classify the gloss-line POS tags of an instance and pair each result with
    the corresponding supervised tag.

    If the instance has no supervised tags on the gloss line but does have
    them on the language line, the language-line tags are projected onto the
    gloss line first.

    :param inst: Instance to evaluate.
    :type inst: RGIgt
    :param classifier: Classifier handed to ``classify_gloss_pos``.
    :type classifier: MalletMaxent
    :param tagger: Not used by this function; kept for interface compatibility.
    :type tagger: StanfordPOSTagger
    :return: ``(sup_tags, cls_tags)``: two parallel lists of ``POSToken``, the
             supervised tags and the classifier tags for the same words. Both
             are empty when no supervised gloss tags are available.
    """
    # Get the supervised POS tags...
    sup_gloss_tier = pos_tag_tier(inst, GLOSS_WORD_ID)
    sup_lang_tier = pos_tag_tier(inst, LANG_WORD_ID)

    # We will incrementally build up the tag sequences...
    sup_tags = []
    cls_tags = []

    # If there are no supervised tags on the gloss line, but there are on the
    # language line, project them across.
    if sup_gloss_tier is None and sup_lang_tier is not None:
        try:
            add_gloss_lang_alignments(inst)
            project_lang_to_gloss(inst)
            sup_gloss_tier = pos_tag_tier(inst, GLOSS_WORD_ID)
        except RGXigtException:
            # Deliberate best effort: if projection fails, the tier stays
            # unset and the instance contributes no tags.
            pass

    if not sup_gloss_tier:
        return sup_tags, cls_tags

    # Do the classification
    classify_gloss_pos(inst, classifier)
    cls_tier = pos_tag_tier(inst, GLOSS_WORD_ID, tag_method=INTENT_POS_CLASS)

    for sup_item in sup_gloss_tier:
        word_item = xigt_find(inst, id=sup_item.alignment)
        # Skip supervised tags whose aligned word cannot be found.
        if not word_item:
            continue
        word = word_item.value()

        # A word the classifier did not tag is reported as 'UNK'.
        cls_item = xigt_find(cls_tier, alignment=sup_item.alignment)
        cls_tag = 'UNK' if cls_item is None else cls_item.value()

        sup_tags.append(POSToken(word, label=sup_item.value()))
        cls_tags.append(POSToken(word, label=cls_tag))

    return sup_tags, cls_tags
def evaluate_classifier_on_instances(inst_list, classifier, feat_list, pos_class_matrix, gold_tagmap=None):
    """
    Given a list of instances, classify the gloss line of each one and score
    the result against its manual gloss POS tags.

    Instances without a manual gloss POS tier are skipped.

    :param inst_list: Instances to evaluate.
    :param classifier: Classifier handed to ``classify_gloss_pos``.
    :param feat_list: Collection of ``CLASS_FEATS_*`` constants selecting which
                      classifier features are enabled.
    :param pos_class_matrix: Receives one ``add(gold, predicted)`` call per
                             compared tag.
    :param gold_tagmap: Optional tag map; when given, each gold tag is passed
                        through ``gold_tagmap.get`` before comparison.
    :return: ``(matches, compares, accuracy)`` where ``accuracy`` is a
             percentage, or ``0.0`` when nothing was compared.
    """
    # Nothing to evaluate: report zeros rather than dividing by zero below.
    if not inst_list:
        return 0, 0, 0.0

    # Only load the POS dictionary if a dictionary-based feature is requested.
    dict_feats = (CLASS_FEATS_DICT, CLASS_FEATS_PDICT, CLASS_FEATS_NDICT)
    pd = load_posdict() if any(feat in feat_list for feat in dict_feats) else False

    matches = 0
    compares = 0

    for inst in inst_list:
        sup_postier = gloss_tag_tier(inst, tag_method=INTENT_POS_MANUAL)
        if sup_postier is None:
            continue

        # NOTE(review): the result is unused, but the call is retained in case
        # gloss() creates the gloss word tier as a side effect -- confirm.
        gloss(inst)

        classify_gloss_pos(inst, classifier,
                           posdict=pd,
                           feat_prev_gram=CLASS_FEATS_PRESW in feat_list,
                           feat_next_gram=CLASS_FEATS_NEXSW in feat_list,
                           feat_dict=CLASS_FEATS_DICT in feat_list,
                           feat_prev_gram_dict=CLASS_FEATS_PDICT in feat_list,
                           feat_next_gram_dict=CLASS_FEATS_NDICT in feat_list,
                           feat_suffix=CLASS_FEATS_SUF in feat_list,
                           feat_prefix=CLASS_FEATS_PRE in feat_list,
                           feat_morph_num=CLASS_FEATS_NUMSW in feat_list,
                           feat_has_number=CLASS_FEATS_NUM in feat_list,
                           feat_basic=CLASS_FEATS_SW in feat_list)

        cls_postier = gloss_tag_tier(inst, tag_method=INTENT_POS_CLASS)

        for cls_tag in cls_postier:
            # Only score words that also carry a manual tag.
            sup_tag = xigt_find(sup_postier, alignment=cls_tag.alignment)
            if sup_tag is None:
                continue

            sup_tag_v = sup_tag.value()
            if gold_tagmap is not None:
                sup_tag_v = gold_tagmap.get(sup_tag_v)

            cls_tag_v = cls_tag.value()
            pos_class_matrix.add(sup_tag_v, cls_tag_v)

            if cls_tag_v == sup_tag_v:
                matches += 1
            compares += 1

    # No instance had comparable tags: accuracy is defined as 0.0.
    accuracy = matches/compares*100 if compares else 0.0
    return matches, compares, accuracy
def word_align_test(self):
    """
    Check that each gloss word was automatically aligned to the expected
    language word.
    """
    observed = Alignment()

    for gloss_word in gloss(self.igt):
        gloss_idx = item_index(gloss_word)
        lang_word = xigt_find(self.igt, id=gloss_word.alignment)

        # Gloss words whose alignment target is missing contribute no pair.
        if lang_word is None:
            continue

        observed.add((gloss_idx, item_index(lang_word)))

    # Expect a one-to-one alignment of words 1 through 4.
    expected = Alignment([(n, n) for n in range(1, 5)])
    self.assertEqual(observed, expected)
def extract_tagger_from_instance(inst: Igt, output_stream, pos_source, tm):
    """
    Write one line of ``word/TAG`` training tokens for the language line of
    an instance.

    :param inst: Instance to read the language-line words and POS tags from.
    :param output_stream: Writable stream receiving the space-separated
                          tokens followed by a newline.
    :param pos_source: Tag method used to select the language-line POS tier.
    :param tm: Optional tag mapping; when truthy, each non-empty tag is
               replaced by ``tm[tag]``.
    :return: 1 if a sentence was written, 0 if the instance had no POS tags.
    """
    tag_tier = lang_tag_tier(inst, tag_method=pos_source)
    words = lang(inst)

    # -------------------------------------------
    # Only try extracting if there are in fact valid POS tags.
    # -------------------------------------------
    if not tag_tier:
        return 0

    for position, word_item in enumerate(words):
        # Words without an aligned tag fall back to the unknown-POS handler.
        tag_item = xigt_find(tag_tier, alignment=word_item.id)
        if tag_item is None:
            tag = handle_unknown_pos(inst, word_item)
        else:
            tag = tag_item.value()

        if tag and tm:
            tag = tm[tag]

        # -------------------------------------------
        # Do some cleaning on the output words
        # -------------------------------------------
        token = clean_lang_token(word_item.value(), lowercase=True)

        # Every token after the first is preceded by a space.
        template = '{}/{}' if position == 0 else ' {}/{}'
        output_stream.write(template.format(token, tag))

    output_stream.write('\n')
    output_stream.flush()
    return 1
def gather_gloss_pos_stats(inst, subword_dict, feat_list):
    """
    Look up the gloss POS tags of an instance and record (word, TAG)
    statistics, so that entries can later be filtered by how often they were
    seen.

    When the instance has POS tags on the language line but not on the gloss
    line, the language-line tags are projected onto the gloss line first.

    :param inst: Instance to process.
    :type inst: RGIgt
    :param subword_dict: Receives one ``add_word_tag(word, tag, prev_word,
                         next_word)`` call per recorded gloss word.
    :type subword_dict: SubwordDict
    :param feat_list: Collection of ``CLASS_FEATS_*`` constants describing the
                      enabled features.
    """
    # Grab the POS tiers and the gloss words...
    gloss_pos = gloss_tag_tier(inst)
    lang_pos = lang_tag_tier(inst)
    gloss_words = gloss(inst)

    use_alignment = CLASS_FEATS_ALN in feat_list
    if use_alignment:
        heur_align_inst(inst)
        get_trans_glosses_alignment(inst, aln_method=INTENT_ALN_HEUR)

    # If there are POS tags on the language line but not the gloss line...
    if gloss_pos is None and lang_pos is not None:
        add_gloss_lang_alignments(inst)
        project_lang_to_gloss(inst)
        gloss_pos = gloss_tag_tier(inst)

    # Without a gloss POS tier there is nothing to record.
    if gloss_pos is None:
        return

    last_index = len(gloss_words) - 1
    for idx, gloss_word in enumerate(gloss_words):
        tag = xigt_find(inst, alignment=gloss_word.id)
        if tag is None:
            continue

        # Neighbouring gloss words, lowercased; None at either edge.
        if idx > 0:
            prev_word = gloss_words[idx - 1].value().lower()
        else:
            prev_word = None

        if idx < last_index:
            next_word = gloss_words[idx + 1].value().lower()
        else:
            next_word = None

        # NOTE(review): statistics are only recorded when the alignment
        # feature is enabled -- confirm this gating is intended.
        if use_alignment:
            subword_dict.add_word_tag(gloss_word.value().lower(), tag.value(), prev_word, next_word)
def evaluate_lang_pos(lang, xc, e_tagger, pos_pla, gold_tagmap=None, outstream=sys.stdout):
    """
    Tag the language line of every instance that has manual POS tags and
    score the tagger output against those manual tags.

    Gold tags whose value is ``None`` or (after optional remapping) ``'JUNK'``
    are not scored. A gold tag with no counterpart in the tagger output counts
    as a miss and is recorded with the label ``'**UNK'``.

    :param lang: Not used by this function; kept for interface compatibility.
    :param xc: Iterable of instances to evaluate.
    :param e_tagger: Tagger handed to ``tag_lang_pos``.
    :param pos_pla: Receives one ``add(gold, predicted)`` call per scored tag.
    :type pos_pla: POSEvalDict
    :param gold_tagmap: Optional tag map applied to gold tags before
                        comparison; tags it cannot map are left unchanged.
    :param outstream: Not used by this function; kept for interface
                      compatibility.
    :return: ``(matches, compares)``.
    """
    matches = 0
    compares = 0

    # Iterate through each instance in the corpus.
    for inst in xc:
        gold_tag_tier = lang_tag_tier(inst, INTENT_POS_MANUAL)

        # If there are no gold tags for this instance, skip it.
        if gold_tag_tier is None:
            continue

        # Create the eval tag tier and retrieve it
        tag_lang_pos(inst, e_tagger)
        eval_tag_tier = lang_tag_tier(inst, INTENT_POS_TAGGER)

        # For each gold tag...
        for gold_tag in gold_tag_tier:
            # Find its matching tag on the eval side; there may be none if the
            # tagger produced no tier or no item for this word.
            eval_tag = None
            if eval_tag_tier is not None:
                eval_tag = xigt_find(eval_tag_tier, alignment=gold_tag.alignment)

            gold_tag_v = gold_tag.value()
            if gold_tag_v is None:
                continue

            if gold_tagmap:
                try:
                    gold_tag_v = gold_tagmap.get(gold_tag_v)
                except TagMapException:
                    # Keep the original tag if it cannot be remapped.
                    pass

            if gold_tag_v == 'JUNK':
                continue

            # A missing eval tag is scored as a miss instead of raising
            # AttributeError on None.
            eval_tag_v = '**UNK' if eval_tag is None else eval_tag.value()

            if gold_tag_v == eval_tag_v:
                matches += 1
            compares += 1

            # NOTE(review): the matrix receives the un-remapped gold tag,
            # while the comparison above uses the remapped one -- confirm.
            pos_pla.add(gold_tag.value(), eval_tag_v)

    return matches, compares
def evaluate_pos_projections_on_file(lang, xc, plma, pos_proj_matrix, tagger, gold_tagmap=None, trans_tagmap=None, outstream=sys.stdout):
    """
    Project translation-line POS tags onto the gloss line under every
    combination of alignment method and translation tag source, and score
    each projection against the manual gloss tags.

    Only instances that have manual gloss tags, manual translation tags and a
    manual alignment are evaluated.

    :param lang: Language key under which results are added to ``plma``.
    :param xc: Corpus to evaluate; iterated for instances, and ``xc.id`` is
               used for the returned corpus.
    :param plma: Receives ``add(lang, '<aln_method>:<tag_method>', matches,
                 compares)`` once per combination per evaluated instance.
    :type plma: PerLangMethodAccuracies
    :param pos_proj_matrix: Receives one ``add(gold, projected)`` call per
                            gloss word that has a gold tag.
    :type pos_proj_matrix: POSMatrix
    :param tagger: Tagger handed to ``tag_trans_pos``.
    :param gold_tagmap: Optional tag map for gold tags; unmappable tags are
                        kept as they are.
    :param trans_tagmap: Optional tag map for projected tags; unmappable tags
                         are kept as they are.
    :param outstream: Stream that receives the formatted ``plma`` summary.
    :return: A new ``XigtCorpus`` created from ``xc.id``; this function does
             not add any instances to it.
    """
    new_xc = XigtCorpus(xc.id)

    for inst in xc:
        gtt = gloss_tag_tier(inst, INTENT_POS_MANUAL)
        ttt = trans_tag_tier(inst, INTENT_POS_MANUAL)
        m_aln = get_trans_gloss_alignment(inst, INTENT_ALN_MANUAL)

        # Only continue if we have manual gloss tags, trans tags, and manual alignment.
        if any(required is None for required in (gtt, m_aln, ttt)):
            continue

        # Get the heuristic alignment (return value is not needed here)...
        heur_align_inst(inst)

        # And tag the translation line.
        tag_trans_pos(inst, tagger=tagger)

        # Now, iterate through each alignment method and set of tags.
        for aln_method in (INTENT_ALN_MANUAL, INTENT_ALN_HEUR):
            for trans_tag_method in (INTENT_POS_MANUAL, INTENT_POS_TAGGER):
                project_trans_pos_to_gloss(inst, aln_method=aln_method, trans_tag_method=trans_tag_method)
                proj_gtt = gloss_tag_tier(inst, tag_method=INTENT_POS_PROJ)

                # Counters are per instance and per method combination.
                matches = 0
                compares = 0

                for gw in gloss(inst):
                    gold_tag = xigt_find(gtt, alignment=gw.id)
                    proj_tag = xigt_find(proj_gtt, alignment=gw.id)

                    # Only gloss words with a gold tag are scored.
                    if gold_tag is None:
                        continue

                    gold_tag_v = gold_tag.value()

                    # Remap the tags if asked...
                    if gold_tagmap is not None:
                        try:
                            gold_tag_v = gold_tagmap.get(gold_tag_v)
                        except TagMapException:
                            pass

                    # NOTE(review): remapping is applied to real projected
                    # tags only, not to the '**UNK' placeholder -- confirm.
                    if proj_tag is not None:
                        proj_str = proj_tag.value()
                        if trans_tagmap is not None:
                            # Try to remap the tag, but keep it if it can't be remapped.
                            try:
                                proj_str = trans_tagmap.get(proj_str)
                            except TagMapException:
                                pass
                    else:
                        proj_str = '**UNK'

                    pos_proj_matrix.add(gold_tag_v, proj_str)

                    if proj_tag is not None and proj_str == gold_tag_v:
                        matches += 1
                    compares += 1

                method_key = '{}:{}'.format(aln_method, trans_tag_method)
                plma.add(lang, method_key, matches, compares)

    # NOTE(review): the summary is written once, after all instances --
    # confirm this placement.
    outstream.write('{}\n'.format(plma))
    return new_xc