예제 #1
0
파일: spantest.py 프로젝트: rgeorgi/intent
 def setUp(self):
     """
     Load the 'xigt/kor-ex.xml' test corpus and cache the objects used by the tests.

     The first instance of the corpus is stored on ``self.inst``; the items with
     ids ``g1.2``, ``m2.1``, ``p1``, ``w1`` and the ``m`` tier are looked up in
     that instance, and a new ``m5`` item is created on the ``m`` tier with a
     segmentation spanning parts of ``w1`` and ``w4``.
     """
     corpus_path = os.path.join(testfile_dir, 'xigt/kor-ex.xml')
     self.xc1 = xc_load(corpus_path)

     first_inst = self.xc1[0]
     self.inst = first_inst

     # Items and tiers looked up by id inside the first instance.
     self.g1_2 = xigt_find(first_inst, id='g1.2')
     self.m2_1 = xigt_find(first_inst, id="m2.1")
     self.p1 = xigt_find(first_inst, id='p1')
     self.w1 = xigt_find(first_inst, id="w1")
     self.m_tier = xigt_find(first_inst, id="m")

     # A fresh item attached to the "m" tier, segmenting two different words.
     self.new_m = Item(
         id="m5",
         tier=self.m_tier,
         segmentation="w1[0:2]+w4[2:3]",
     )
예제 #2
0
def evaluate_instance(inst, classifier, tagger):
    """
    Collect supervised and classifier-assigned gloss POS tags for one instance.

    If the instance has no POS tag tier on the gloss words but does have one on
    the language words, the language-line tags are projected onto the gloss
    line first (best effort). The gloss words are then classified and, for
    every supervised gloss tag whose aligned word can be found, one token is
    appended to each of the two returned lists.

    :param inst: Instance to evaluate.
    :type inst: RGIgt
    :param classifier: MalletMaxent
    :param tagger: StanfordPOSTagger (not used by this function; kept for
                   interface compatibility).
    :return: Tuple ``(sup_tags, cls_tags)`` of parallel lists of POSToken.
             Both lists are empty when no supervised gloss tags are available.
             A word with no classifier tag gets the label ``'UNK'``.
    """
    sup_gloss_tier = pos_tag_tier(inst, GLOSS_WORD_ID)
    sup_lang_tier = pos_tag_tier(inst, LANG_WORD_ID)

    # The tag sequences are built up incrementally below.
    sup_tags = []
    cls_tags = []

    # If there are no supervised tags on the gloss line, but there are on the
    # language line, try to project them over.
    if sup_gloss_tier is None and sup_lang_tier is not None:
        try:
            add_gloss_lang_alignments(inst)
            project_lang_to_gloss(inst)
            sup_gloss_tier = pos_tag_tier(inst, GLOSS_WORD_ID)
        except RGXigtException:
            # Deliberate best effort: if projection fails, the instance simply
            # contributes no tags.
            pass

    if not sup_gloss_tier:
        return sup_tags, cls_tags

    # Do the classification, then fetch the tier it produced.
    classify_gloss_pos(inst, classifier)
    cls_tier = pos_tag_tier(inst, GLOSS_WORD_ID, tag_method=INTENT_POS_CLASS)

    for sup_item in sup_gloss_tier:
        word_item = xigt_find(inst, id=sup_item.alignment)
        if not word_item:
            # No word found for this tag's alignment; nothing to compare.
            continue
        word = word_item.value()

        cls_item = xigt_find(cls_tier, alignment=sup_item.alignment)
        cls_tag = 'UNK' if cls_item is None else cls_item.value()

        sup_tags.append(POSToken(word, label=sup_item.value()))
        cls_tags.append(POSToken(word, label=cls_tag))

    return sup_tags, cls_tags
예제 #3
0
def evaluate_classifier_on_instances(inst_list, classifier, feat_list, pos_class_matrix, gold_tagmap=None):
    """
    Given a list of instances, classify their gloss words and score the result
    against the manual gloss POS tags.

    Instances without a manual gloss POS tier are skipped. Every compared
    (gold, classified) pair is also recorded in ``pos_class_matrix``.

    :param inst_list: Iterable of instances to evaluate.
    :param classifier: Classifier passed through to ``classify_gloss_pos``.
    :param feat_list: Collection of ``CLASS_FEATS_*`` constants selecting which
                      classifier features are enabled.
    :param pos_class_matrix: Object with an ``add(gold, predicted)`` method that
                             accumulates the tag pairs.
    :param gold_tagmap: Optional tag map; when given, each gold tag is replaced
                        by ``gold_tagmap.get(tag)`` before comparison.
    :return: Tuple ``(matches, compares, accuracy)`` where ``accuracy`` is a
             percentage. It is ``0.0`` when nothing was compared.
    """
    # The POS dictionary is only loaded when a dictionary-based feature is on.
    needs_posdict = (
        (CLASS_FEATS_DICT in feat_list)
        or (CLASS_FEATS_PDICT in feat_list)
        or (CLASS_FEATS_NDICT in feat_list)
    )
    pd = load_posdict() if needs_posdict else False

    matches = 0
    compares = 0

    for inst in inst_list:
        sup_postier = gloss_tag_tier(inst, tag_method=INTENT_POS_MANUAL)
        if sup_postier is None:
            continue

        # NOTE(review): the returned tier is not needed below; the call is kept
        # in case gloss() creates the gloss-word tier as a side effect - confirm.
        gloss(inst)

        classify_gloss_pos(inst, classifier,
                           posdict=pd,
                           feat_prev_gram=CLASS_FEATS_PRESW in feat_list,
                           feat_next_gram=CLASS_FEATS_NEXSW in feat_list,
                           feat_dict=CLASS_FEATS_DICT in feat_list,
                           feat_prev_gram_dict=CLASS_FEATS_PDICT in feat_list,
                           feat_next_gram_dict=CLASS_FEATS_NDICT in feat_list,
                           feat_suffix=CLASS_FEATS_SUF in feat_list,
                           feat_prefix=CLASS_FEATS_PRE in feat_list,
                           feat_morph_num=CLASS_FEATS_NUMSW in feat_list,
                           feat_has_number=CLASS_FEATS_NUM in feat_list,
                           feat_basic=CLASS_FEATS_SW in feat_list)

        cls_postier = gloss_tag_tier(inst, tag_method=INTENT_POS_CLASS)

        for cls_tag in cls_postier:
            sup_tag = xigt_find(sup_postier, alignment=cls_tag.alignment)
            if sup_tag is None:
                # No gold tag for this word, so there is nothing to score.
                continue

            sup_tag_v = sup_tag.value()
            if gold_tagmap is not None:
                sup_tag_v = gold_tagmap.get(sup_tag_v)

            cls_tag_v = cls_tag.value()
            pos_class_matrix.add(sup_tag_v, cls_tag_v)
            if cls_tag_v == sup_tag_v:
                matches += 1
            compares += 1

    # Guard against ZeroDivisionError when no tag pair was compared (empty
    # input, or no instance carried manual gloss tags).
    accuracy = matches / compares * 100 if compares else 0.0
    return matches, compares, accuracy
예제 #4
0
파일: tests.py 프로젝트: rgeorgi/intent
    def word_align_test(self):
        """
        Check that the gloss words were automatically aligned to the language
        words as expected.

        Builds an Alignment of (gloss word index, language word index) pairs
        from each gloss word's ``alignment`` attribute and compares it with the
        expected one-to-one alignment of four words.
        """
        expected = Alignment([(1,1),(2,2),(3,3),(4,4)])

        observed = Alignment()
        for gloss_word in gloss(self.igt):
            gloss_idx = item_index(gloss_word)
            lang_word = xigt_find(self.igt, id=gloss_word.alignment)
            if lang_word is None:
                # Gloss words whose alignment target cannot be found are skipped.
                continue
            observed.add((gloss_idx, item_index(lang_word)))

        self.assertEqual(observed, expected)
예제 #5
0
def extract_tagger_from_instance(inst: Igt, output_stream, pos_source, tm):
    """
    Write the language-line words of an instance, with their POS tags, as one
    line of space-separated ``word/TAG`` tokens.

    Nothing is written when the instance has no (or an empty) language POS tag
    tier for ``pos_source``.

    :param inst: Instance to extract from.
    :param output_stream: Writable text stream; it is flushed after the line.
    :param pos_source: Tag method used to select the language POS tag tier.
    :param tm: Optional tag map; when truthy, every non-empty tag string is
               replaced by ``tm[tag]``.
    :return: 1 if a training sentence was written, otherwise 0.
    """
    pos_tier = lang_tag_tier(inst, tag_method=pos_source)
    words = lang(inst)

    # Only try extracting if there are in fact valid POS tags.
    if not pos_tier:
        return 0

    for position, word_item in enumerate(words):
        tag_item = xigt_find(pos_tier, alignment=word_item.id)
        if tag_item is None:
            # No tag is aligned with this word; defer to the unknown-tag handler.
            tag = handle_unknown_pos(inst, word_item)
        else:
            tag = tag_item.value()

        if tag and tm:
            tag = tm[tag]

        # Do some cleaning on the output word.
        token = clean_lang_token(word_item.value(), lowercase=True)

        # Every token after the first is preceded by a single space.
        template = '{}/{}' if position == 0 else ' {}/{}'
        output_stream.write(template.format(token, tag))

    output_stream.write('\n')
    output_stream.flush()
    return 1
예제 #6
0
def gather_gloss_pos_stats(inst, subword_dict, feat_list):
    """
    Given an instance, look for the gloss POS tags and record statistics about
    them, so that they can later be filtered by how often each kind was seen.

    If the instance has POS tags on the language line but none on the gloss
    line, the language-line tags are projected onto the gloss line first.

    :param inst: Instance to process.
    :type inst: RGIgt
    :param subword_dict: Receives one ``add_word_tag(word, tag, prev_word,
                         next_word)`` call per tagged gloss word.
    :type subword_dict: SubwordDict
    :param feat_list: Collection of ``CLASS_FEATS_*`` constants. Heuristic
                      alignment is run, and words are recorded, only when
                      ``CLASS_FEATS_ALN`` is in it.
    """
    # Grab the POS tiers and the gloss words...
    gpos_tier = gloss_tag_tier(inst)
    lpos_tier = lang_tag_tier(inst)
    gw_tier = gloss(inst)

    use_alignment = CLASS_FEATS_ALN in feat_list
    if use_alignment:
        heur_align_inst(inst)
        get_trans_glosses_alignment(inst, aln_method=INTENT_ALN_HEUR)

    # If there are POS tags on the language line but not the gloss line,
    # project them across and fetch the gloss tier again.
    if gpos_tier is None and lpos_tier is not None:
        add_gloss_lang_alignments(inst)
        project_lang_to_gloss(inst)
        gpos_tier = gloss_tag_tier(inst)

    # Without a gloss POS tier there is nothing to count.
    if gpos_tier is None:
        return

    for position, gloss_word in enumerate(gw_tier):
        # NOTE(review): the lookup searches the whole instance rather than
        # gpos_tier - confirm this is intended.
        tag = xigt_find(inst, alignment=gloss_word.id)
        if tag is None:
            continue

        # Lowercased neighbours, or None at either end of the tier.
        prev_word = None
        if position > 0:
            prev_word = gw_tier[position - 1].value().lower()

        next_word = None
        if position < len(gw_tier) - 1:
            next_word = gw_tier[position + 1].value().lower()

        if use_alignment:
            subword_dict.add_word_tag(gloss_word.value().lower(), tag.value(), prev_word, next_word)
예제 #7
0
def evaluate_lang_pos(lang, xc, e_tagger, pos_pla, gold_tagmap=None, outstream=sys.stdout):
    """
    Tag the language line of every instance that has manual language POS tags
    and compare the tagger output against those gold tags.

    :param lang: Not used by this function.
    :param xc: Iterable of instances (the corpus).
    :param e_tagger: Tagger passed to ``tag_lang_pos``.
    :param pos_pla: Receives ``add(gold, predicted)`` for every gold tag that
                    has a value. The gold tag is recorded unmapped.
    :type pos_pla: POSEvalDict
    :param gold_tagmap: Optional tag map applied to the gold tag before the
                        comparison; a tag that raises TagMapException is kept
                        unchanged.
    :param outstream: Not used by this function.
    :return: Tuple ``(matches, compares)``. Gold tags that map to ``'JUNK'``
             are excluded from both counts.
    """
    matches = 0
    compares = 0

    # Iterate through each instance in the corpus.
    for inst in xc:
        gold_tag_tier = lang_tag_tier(inst, INTENT_POS_MANUAL)

        # If there are no gold tags for this instance, skip it.
        if gold_tag_tier is None:
            continue

        # Create the eval tag tier and retrieve it.
        tag_lang_pos(inst, e_tagger)
        eval_tag_tier = lang_tag_tier(inst, INTENT_POS_TAGGER)

        # For each gold tag...
        for gold_tag in gold_tag_tier:

            # Find its matching tag on the eval side, and compare.
            eval_tag = xigt_find(eval_tag_tier, alignment=gold_tag.alignment)
            gold_tag_v = gold_tag.value()

            if gold_tag_v is None:
                continue

            # The tagger may have produced no tag for this word. Score that as
            # a miss under an explicit placeholder instead of failing with an
            # AttributeError on None.
            eval_tag_v = '**UNK' if eval_tag is None else eval_tag.value()

            if gold_tagmap:
                try:
                    gold_tag_v = gold_tagmap.get(gold_tag_v)
                except TagMapException:
                    # Keep the original tag if it cannot be remapped.
                    pass

            if gold_tag_v != 'JUNK':
                if gold_tag_v == eval_tag_v:
                    matches += 1
                compares += 1

            pos_pla.add(gold_tag.value(), eval_tag_v)

    return matches, compares
예제 #8
0
def _remap_tag_or_keep(tagmap, tag):
    """Return ``tagmap.get(tag)``; keep *tag* if there is no map or the lookup raises TagMapException."""
    if tagmap is None:
        return tag
    try:
        return tagmap.get(tag)
    except TagMapException:
        return tag


def evaluate_pos_projections_on_file(lang, xc, plma, pos_proj_matrix, tagger, gold_tagmap=None, trans_tagmap=None, outstream=sys.stdout):
    """
    Evaluate POS tags projected from the translation line onto the gloss line,
    for every combination of alignment method (manual, heuristic) and
    translation tag source (manual, tagger).

    Only instances with manual gloss tags, manual translation tags and a manual
    alignment are evaluated. For each such instance and combination, the match
    and comparison counts are added to ``plma`` under the key
    ``'<aln_method>:<trans_tag_method>'``, and each (gold, projected) pair is
    added to ``pos_proj_matrix``. A gloss word with a gold tag but no projected
    tag is recorded as ``'**UNK'`` and counted as a miss.

    :param lang: Language key passed to ``plma.add``.
    :param xc: Corpus to evaluate; its ``id`` is used for the returned corpus.
    :type plma: PerLangMethodAccuracies
    :type pos_proj_matrix: POSMatrix
    :param tagger: Tagger used to tag the translation line.
    :param gold_tagmap: Optional tag map for gold tags; unmappable tags are kept.
    :param trans_tagmap: Optional tag map for projected tags; unmappable tags
                         are kept.
    :param outstream: Stream that receives the formatted ``plma`` at the end.
    :return: A new XigtCorpus with the same id as ``xc``. This function does
             not add any instances to it.
    """
    new_xc = XigtCorpus(xc.id)

    for inst in xc:
        gold_tier = gloss_tag_tier(inst, INTENT_POS_MANUAL)
        trans_tier = trans_tag_tier(inst, INTENT_POS_MANUAL)
        manual_aln = get_trans_gloss_alignment(inst, INTENT_ALN_MANUAL)

        # Only continue if we have manual gloss tags, trans tags, and manual alignment.
        has_gold_data = (
            gold_tier is not None
            and manual_aln is not None
            and trans_tier is not None
        )
        if not has_gold_data:
            continue

        # Run the heuristic aligner; its return value is not needed here.
        heur_align_inst(inst)

        # And tag the translation line.
        tag_trans_pos(inst, tagger=tagger)

        # Now, iterate through each alignment method and set of tags.
        for aln_method in [INTENT_ALN_MANUAL, INTENT_ALN_HEUR]:
            for trans_tag_method in [INTENT_POS_MANUAL, INTENT_POS_TAGGER]:
                project_trans_pos_to_gloss(inst, aln_method=aln_method, trans_tag_method=trans_tag_method)
                proj_tier = gloss_tag_tier(inst, tag_method=INTENT_POS_PROJ)

                # Score every gloss word that has a gold tag.
                matches = 0
                compares = 0
                for gloss_word in gloss(inst):
                    gold_tag = xigt_find(gold_tier, alignment=gloss_word.id)
                    proj_tag = xigt_find(proj_tier, alignment=gloss_word.id)

                    if gold_tag is None:
                        continue

                    # Remap the tags if asked...
                    gold_label = _remap_tag_or_keep(gold_tagmap, gold_tag.value())

                    if proj_tag is None:
                        proj_label = '**UNK'
                    else:
                        proj_label = _remap_tag_or_keep(trans_tagmap, proj_tag.value())

                    pos_proj_matrix.add(gold_label, proj_label)

                    if proj_tag is not None and proj_label == gold_label:
                        matches += 1
                    compares += 1

                method_key = '{}:{}'.format(aln_method, trans_tag_method)
                plma.add(lang, method_key, matches, compares)

    outstream.write('{}\n'.format(plma))

    return new_xc