Example #1
def _process_file(f):
    c = TwoLevelCountDict()
    d = TwoLevelCountDict()
    m = TwoLevelCountDict()

    print("Processing file {}".format(f))
    xc = xc_load(f)
    for inst in xc:
        LOG.info("Now on instance {}".format(inst.id))

        # Search for the gloss POS tier, if it exists.
        gpos = inst.find(alignment=GLOSS_WORD_ID, type=POS_TIER_TYPE)

        # If a gloss POS tier was found...
        if gpos:

            # Iterate through the projected tags.
            for gp in gpos:

                word = gp.igt.find(id=gp.attributes[ALIGNMENT])

                grams = tokenize_item(word, morpheme_tokenizer)

                # Add the (gram, POSTag) pair as something that was encountered.
                for gram in grams:
                    m.add(gram.content.lower(), gp.value())

                c.add(gp.value(), word.value().lower())
                d.add(word.value().lower(), gp.value())

    return (c, d, m)
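
Since _process_file returns its three TwoLevelCountDicts rather than writing anything out, the caller has to collect them. A minimal driver sketch, assuming glob and os are imported as elsewhere in these examples; the process_all name and directory layout are hypothetical:

def process_all(xigt_dir):
    results = []
    for path in glob(os.path.join(xigt_dir, '*.xml')):
        # Each call yields (POS -> word, word -> POS, gram -> POS) count dicts.
        c, d, m = _process_file(path)
        results.append((path, c, d, m))
    return results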
Example #2
    def test_ds_cycle(self):
        """
        The tree in the ds_cycle file has "woman" depend on both
        "arriving" and "browse."
        """
        xc = xc_load(ds_cycle)
        inst = xc[0]

        #  1    2       4        5       7    8    9
        # The woman, (after) arriving, began to browse.

        # (The commas count as words, hence the skipping)

        tgt_t = DepTree.fromstring("""
        (ROOT[0]
            (began[7]
                (woman[2]
                    (The[1])
                    (\(after\)[4] (arriving[5])))
                (browse[9]
                    (woman[2])
                    (to[8])
                )
            ))
        """, stype=DEPSTR_PTB)

        ds = get_ds(inst, trans(inst))
        self.assertTrue(tgt_t.structurally_eq(ds))

        self.assertIsNone(project_ds_tier(inst))
Example #3
    def test_nogloss(self):
        xp = xigt_testfile('missing_line_tests.xml')
        xc = xc_load(xp)
        no_gloss = xc[0]

        self.assertRaises(NoGlossLineException, gloss_line, no_gloss)
        self.assertRaises(NoNormLineException, gloss_line, no_gloss)
Example #4
    def test_ds_project(self):
        xc = xc_load(os.path.join(testfile_dir, 'xigt/index_error.xml'), do_basic_processing=True)
        inst = xc[0]
        heur_align_inst(inst)
        parse_translation_line(inst, dt=True)
        project_ds_tier(inst)
        proj_t = get_lang_ds(inst)

        tgt_t = DepTree.fromstring("""(ROOT[0] (salli-i[2] (Jumala[1]) (sata-a[4] ([[3])) (rake-i-ta[5]) (ja[6]) (tuhka-a[7] (].[8]))))""", stype=DEPSTR_PTB)

        self.assertTrue(tgt_t.similar(proj_t))

        inst2 = xc[1]
        heur_align_inst(inst2)
        parse_translation_line(inst2, dt=True)
        project_ds_tier(inst2)

        print(inst2)

        tgt2_t = DepTree.fromstring("""(ROOT[0]
                                            (unohta-a[2] (*Minua[1])
                                                (unohda-n[4]
                                                    (/Minä[3])
                                                    (/laula-tta-a[6] (pelo-tta-a[5]))
                                                )
                                            ))
                                        """, stype=DEPSTR_PTB)

        # Compare the projected tree against the target; assertTrue with two
        # arguments treats the second as a message and always passes.
        self.assertTrue(tgt2_t.similar(get_lang_ds(inst2)))
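
The projection pipeline this test exercises is: heuristically align the translation and gloss, parse the translation line as a dependency tree, project that tree onto the language line, and read back the result. The same steps on an arbitrary instance, as a sketch built only from the functions used above (the project_dependencies name is hypothetical):

def project_dependencies(inst):
    heur_align_inst(inst)                   # heuristic translation/gloss alignment
    parse_translation_line(inst, dt=True)   # dt=True requests a dependency parse
    project_ds_tier(inst)                   # project the parse across the alignment
    return get_lang_ds(inst)                # dependency tree over the language line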
Example #5
    def setUp(self):
        self.xc1 = xc_load(os.path.join(testfile_dir, 'xigt/kor-ex.xml'))
        self.inst = self.xc1[0]
        self.g1_2 = xigt_find(self.inst, id='g1.2')
        self.m2_1 = xigt_find(self.inst, id="m2.1")
        self.p1 = xigt_find(self.inst, id='p1')
        self.w1 = xigt_find(self.inst, id="w1")
        self.m_tier = xigt_find(self.inst, id="m")
        self.new_m = Item(id="m5", tier=self.m_tier, segmentation="w1[0:2]+w4[2:3]")
Example #6
def nfold_xaml():
    xaml_paths = glob("/Users/rgeorgi/Documents/code/dissertation/data/annotation/filtered/*.xml")

    lang_test = {}
    lang_train = {}
    lang_all  = {}

    tagger = StanfordPOSTagger(tagger_model)


    for xaml_path in xaml_paths:
        lang = os.path.basename(xaml_path)[:3]
        xc = xc_load(xaml_path)

        train, dev, test = split_instances(xc, train=0.5, test=0.5, dev=0.0)

        lang_train[lang] = train
        lang_all[lang] = train+test
        lang_test[lang] = test

    # Now, build our classifiers...

    all_other = POSEvalDict()
    all_all   = POSEvalDict()
    all_odin  = POSEvalDict()
    all_proj  = POSEvalDict()

    for lang in lang_all.keys():

        other_lang_instances = []
        # Copy the training list so extending it below doesn't also mutate
        # lang_train[lang] in place.
        all_lang_instances   = list(lang_train[lang])

        for other_lang in lang_all.keys():
            if other_lang != lang:
                other_lang_instances.extend(lang_all[other_lang])
                all_lang_instances.extend(lang_all[other_lang])

        other_lang_classifier = extract_from_instances(other_lang_instances, 'test.class', 'test.feats', '/dev/null')
        all_lang_classifier = extract_from_instances(all_lang_instances, 'all.class', 'all.feats', '/dev/null')


        test_instances = lang_test[lang]

        print(lang)
        prj_other_eval, cls_other_eval = evaluate_classifier_on_instances(test_instances, other_lang_classifier, tagger)
        prj_all_eval, cls_all_eval = evaluate_classifier_on_instances(test_instances, all_lang_classifier, tagger)
        prj_odin_eval, cls_odin_eval = evaluate_classifier_on_instances(test_instances, MalletMaxent('/Users/rgeorgi/Documents/code/dissertation/gc.classifier'), tagger)

        all_other += cls_other_eval
        all_all   += cls_all_eval
        all_odin  += cls_odin_eval
        all_proj  += prj_all_eval

    print('ALL')
    print('{:.2f},{:.2f},{:.2f},{:.2f},{:.2f}'.format(all_proj.precision(), all_proj.unaligned(), all_other.accuracy(), all_all.accuracy(), all_odin.accuracy()))
    print(all_proj.error_matrix(csv=True))
Example #7
    def test_filter_gloss_not_present(self):
        xp = xigt_testfile('missing_line_tests.xml')
        xc = xc_load(xp)

        test_xc, ex, fail, succ = filter_xc(xc, require_gloss=True)
        self.assertEqual(len(test_xc), 2)

        test_xc, ex, fail, succ = filter_xc(xc, require_gloss=False)
        self.assertEqual(len(test_xc), 3)

        test_xc, ex, fail, succ = filter_xc(xc, require_gloss=True, require_trans=True)
        self.assertEqual(len(test_xc), 1)

        test_xc, ex, fail, succ = filter_xc(xc, require_aln=True)
        self.assertEqual(len(test_xc), 0)

        test_xc, ex, fail, succ = filter_xc(xc, require_trans=True)
        self.assertEqual(len(test_xc), 2)
Example #8
def do_filter(filelist, require_lang=False, require_gloss=False, require_trans=False, require_aln=False, require_gloss_pos=False, require_grammatical=False, max_instances=0):
    new_corp = XigtCorpus()

    FILTER_LOG.log(NORM_LEVEL, "Beginning filtering...")

    successes = 0
    failures  = 0
    examined  = 0

    for path in filelist:
        FILTER_LOG.log(1000, 'Opening file "{}" for filtering.'.format(os.path.basename(path)))
        xc = xc_load(path, mode=INCREMENTAL)
        instances, iter_examined, iter_success, iter_failures = filter_xc(xc, require_lang, require_gloss, require_trans, require_aln, require_gloss_pos, require_grammatical, max_instances, successes)
        for instance in instances:
            new_corp.append(instance)

        successes += iter_success
        failures  += iter_failures
        examined  += iter_examined

    return new_corp, examined, failures, successes
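
A hypothetical invocation of do_filter, serializing the kept instances with xigtxml (the serializer used elsewhere in these examples); the file names are illustrative:

from xigt.codecs import xigtxml

paths = ['corpus1.xml', 'corpus2.xml']
filtered, examined, failures, successes = do_filter(paths, require_gloss=True, require_trans=True)
print('{} of {} instances kept'.format(successes, examined))
with open('filtered.xml', 'w', encoding='utf-8') as f:
    xigtxml.dump(f, filtered)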
Example #9
    def test_gloss_projection_unaligned(self):
        xc = xc_load(os.path.join(testfile_dir, "xigt/project_gloss_lang_tests.xml"))
        igt = xc[0]
        project_gloss_pos_to_lang(igt, tag_method=INTENT_POS_PROJ, unk_handling='keep')
        self.assertEqual('UNK', pos_tag_tier(igt, lang(igt).id, INTENT_POS_PROJ)[-1].value())
Example #10
    def setUp(self):
        self.xc = xc_load(os.path.join(testfile_dir, 'xigt/kor-ex.xml'))
Example #11
    def setUp(self):
        self.xp = xigt_testfile('multiple_alignments.xml')
        self.xc = xc_load(self.xp)
Example #12
"""
import os
from unittest import TestCase

from intent.alignment.Alignment import Alignment
from intent.consts import INTENT_ALN_HEUR, INTENT_ALN_GIZA, INTENT_POS_PROJ, INTENT_ALN_MANUAL
from intent.igt.create_tiers import lang, glosses, gloss, trans
from intent.igt.parsing import xc_load, parse_odin_inst
from intent.igt.references import xigt_find, item_index
from intent.igt.igt_functions import pos_tag_tier, project_gloss_pos_to_lang, giza_align_t_g, heur_align_corp, add_pos_tags, tier_tokens, classify_gloss_pos, tag_trans_pos, tier_text, set_bilingual_alignment, \
    get_trans_glosses_alignment, copy_xigt
from intent.interfaces.mallet_maxent import MalletMaxent
from intent.interfaces.stanford_tagger import StanfordPOSTagger
from intent.utils.env import posdict, tagger_model, testfile_dir, load_posdict

xc = xc_load(os.path.join(testfile_dir, "xigt/kor-ex.xml"))

class GlossAlignTest(TestCase):

    def test_gloss_projection_unaligned(self):
        xc = xc_load(os.path.join(testfile_dir, "xigt/project_gloss_lang_tests.xml"))
        igt = xc[0]
        project_gloss_pos_to_lang(igt, tag_method=INTENT_POS_PROJ, unk_handling='keep')
        self.assertEqual('UNK', pos_tag_tier(igt, lang(igt).id, INTENT_POS_PROJ)[-1].value())



#===============================================================================
# Unit Tests
#===============================================================================
Example #13
    def setUp(self):
        self.xc = xc_load(os.path.join(testfile_dir, 'xigt/morph_align_567.xml'))
Example #14
    def setUp(self):
        self.xc = xc_load(xigt_testfile('word_align.xml'))
Example #15
    def setUp(self):
        self.xc = xc_load(xigt_testfile('no_raw.xml'))
Example #16
def extract_from_xigt(input_filelist=(), classifier_prefix=None, classifier_feats=CLASS_FEATS_DEFAULT,
                      cfg_path=None, tagger_prefix=None,
                      dep_prefix=None, pos_method=None, aln_method=None,
                      sent_prefix=None, no_alignment_heur=False, sent_type=SENT_TYPE_T_G, **kwargs):

    # ------- Dictionaries for keeping track of gloss_pos preprocessing. --------

    # This dictionary tracks the gloss-line subwords encountered and the POS
    # tags they co-occur with (populated by gather_gloss_pos_stats below).
    subword_dict = SubwordDict()

    # -------------------------------------------
    # Map the pos_method and aln_method arguments
    # to the POS-tag source and the alignment type
    # that will be searched.
    # -------------------------------------------
    use_pos = ARG_POS_MAP[pos_method]
    use_aln = ALN_ARG_MAP[aln_method]

    # -------------------------------------------
    # Get the tagset mapping if provided
    # -------------------------------------------
    tagpath = kwargs.get('tagmap')
    tm = None if tagpath is None else TagMap(tagpath)

    # =============================================================================
    # 1) SET UP
    # =============================================================================

    extracted_tagged_snts = 0
    extracted_parsed_snts = 0
    inst_count = 0


    if dep_prefix or tagger_prefix:
        if use_pos == ARG_POS_NONE:
            EXTRACT_LOG.log(NORM_LEVEL, 'Not using POS tags for extraction.')
        elif use_pos is None:
            EXTRACT_LOG.log(NORM_LEVEL, "Using any available POS tags for extraction.")
        else:
            EXTRACT_LOG.log(NORM_LEVEL, 'Using language line tags produced by method "{}"...'.format(use_pos))


    # Set up the classifier....
    if classifier_prefix is not None:
        EXTRACT_LOG.log(NORM_LEVEL, "Gathering statistics on POS tags...")

    # Set up the tagger training file...
    if tagger_prefix is not None:
        tagger_train_path = tagger_prefix+'_tagger_train.txt'
        tagger_model_path = tagger_prefix+'.tagger'


        EXTRACT_LOG.log(NORM_LEVEL, 'Opening tagger training file at "{}"'.format(tagger_train_path))
        fileutils.makedirs(os.path.dirname(tagger_train_path))
        tagger_train_f = open(tagger_train_path, 'w', encoding='utf-8')

    # Set up the dependency parser output if it's specified...
    dep_train_f = None
    dep_train_path = None
    if dep_prefix is not None:
        dep_train_path = dep_prefix+'_dep_train.txt'
        EXTRACT_LOG.log(NORM_LEVEL, 'Writing dependency parser training data to "{}"'.format(dep_train_path))

        # Make the containing directory if it does not exist.
        fileutils.makedirs(os.path.dirname(dep_prefix))

        # Write out the training file.
        dep_train_f = open(dep_train_path, 'w', encoding='utf-8')

    # Set up the files for writing out alignment.
    if sent_prefix is not None:
        fileutils.makedirs(os.path.dirname(sent_prefix))
        e_f = open(sent_prefix + '_e.txt', 'w', encoding='utf-8')
        f_f = open(sent_prefix + '_f.txt', 'w', encoding='utf-8')

    # Set up the CFG path for writing.
    if cfg_path is not None:
        fileutils.makedirs(os.path.dirname(cfg_path))
        cfg_f = open(cfg_path, 'w', encoding='utf-8')

    # -------------------------------------------
    # Iterate over the provided files.
    # -------------------------------------------
    for path in input_filelist:
        xc = xc_load(path, mode=INCREMENTAL)

        # -------------------------------------------
        # Do the appropriate extraction for each
        # -------------------------------------------
        for inst in xc:
            inst_count += 1
            if tagger_prefix is not None:
                extracted_tagged_snts += extract_tagger_from_instance(inst, tagger_train_f, use_pos, tm)

            if dep_prefix is not None:
                extracted_parsed_snts += extract_parser_from_instance(inst, dep_train_f, use_pos, tm)

            if classifier_prefix is not None:
                gather_gloss_pos_stats(inst, subword_dict, classifier_feats)

            if sent_prefix is not None:
                try:
                    extract_sents_from_inst(inst, e_f, f_f, no_alignment_heur=no_alignment_heur,
                                            sent_type=sent_type, aln_method=use_aln)
                except NoNormLineException:
                    pass

            if cfg_path:
                extract_cfg_rules_from_inst(inst, cfg_f)

    # -------------------------------------------
    # After looping
    # -------------------------------------------

    EXTRACT_LOG.log(NORM_LEVEL, "{} instances processed.".format(inst_count))

    # Write punctuation examples into the tagger training data, then train.
    if tagger_prefix is not None:
        if extracted_tagged_snts == 0:
            EXTRACT_LOG.error("No tags were found. Not writing out file.")
            tagger_train_f.close()
            unlink(tagger_train_path)
        else:
            for t in ['?','“','"',"''","'",',','…','/','--','-','``','`',':',';','«','»']:
                tagger_train_f.write('{}/PUNC\n'.format(t))
            tagger_train_f.close()
            EXTRACT_LOG.log(NORM_LEVEL, 'Training postagger using "{}"'.format(tagger_train_path))
            # Now, train the POStagger...
            train_postagger(tagger_train_path, tagger_model_path)
            EXTRACT_LOG.log(NORM_LEVEL, "Tagger training complete.")



    # =============================================================================
    # Classifier output...
    # =============================================================================

    if classifier_prefix is not None:

        # The path for the svm-light-based features.
        class_dir  = os.path.dirname(classifier_prefix)
        os.makedirs(class_dir, exist_ok=True)

        feat_path  =  classifier_prefix+'.feats.txt'
        class_path  = classifier_prefix+'.classifier'

        write_out_gram_dict(subword_dict, feat_path, classifier_feats)

        EXTRACT_LOG.log(NORM_LEVEL, "Training classifier.")
        train_txt(feat_path, class_path)
        EXTRACT_LOG.log(NORM_LEVEL, "Complete.")

    if cfg_path:
        cfg_f.close()

    # -------------------------------------------
    # Train
    # -------------------------------------------
    if dep_prefix:
        if extracted_parsed_snts == 0:
            EXTRACT_LOG.error("No dependency parses were found. Not training parser.")
            dep_train_f.close()
            unlink(dep_train_path)
        else:
            EXTRACT_LOG.log(NORM_LEVEL, "{} dependency parses found. Training parser...".format(extracted_parsed_snts))
            dep_train_f.close()
            dep_parser_path = dep_prefix+'.depparser'
            mp = MSTParser()
            mp.train(dep_train_path, dep_parser_path)
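
An illustrative call to extract_from_xigt; the prefixes are hypothetical, and valid pos_method/aln_method values are whatever keys ARG_POS_MAP and ALN_ARG_MAP define (both default to None here, which the logging above describes as "use any available"):

extract_from_xigt(
    input_filelist=['enriched_corpus.xml'],
    tagger_prefix='models/mylang',       # writes models/mylang_tagger_train.txt, trains models/mylang.tagger
    classifier_prefix='models/mylang',   # writes .feats.txt, trains .classifier
    dep_prefix='models/mylang',          # writes _dep_train.txt, trains .depparser
)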
Example #17
    def test_basic_processing(self):
        xc = xc_load(self.path, do_basic_processing=True)
Example #18
    def setUp(self):
        path = xigt_testfile('multiple_line_tests.xml')
        self.xc = xc_load(path)
Example #19
    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)
        self.xc = xc_load(xigt_proj, do_basic_processing=False)
        self.inst1 = self.xc[0]
        self.inst2 = self.xc[1]
Example #20
def evaluate_intent(filelist, classifier_path=None, eval_alignment=None, eval_ds=None, eval_posproj=None,
                    classifier_feats=CLASS_FEATS_DEFAULT,
                    eval_tagger=None,
                    gold_tagmap=None, trans_tagmap=None, outpath=None):
    """
    Given a list of files that have manual POS tags and manual alignment,
    evaluate the various INTENT methods on that file.

    :param filelist: List of paths to evaluate against.
    :type filelist: list[str]
    :param classifier_path: Path to the classifier model
    :type classifier_path: str
    :param eval_alignment:
    """
    tagger = StanfordPOSTagger(tagger_model)

    outstream = sys.stdout
    if outpath is not None:
        outstream = open(outpath, mode='w', encoding='utf-8')

    # =============================================================================
    # Set up the objects to run as "servers"
    # =============================================================================

    # Fall back to the default `classifier` model path only when no explicit
    # path was given, so we don't load a model that is immediately discarded.
    classifier_obj = MalletMaxent(classifier_path if classifier_path is not None else classifier)

    class_matches, class_compares = 0, 0

    e_tagger = None
    if eval_tagger is not None:
        e_tagger = StanfordPOSTagger(eval_tagger)

    mas = MultAlignScorer()
    ds_plma = PerLangMethodAccuracies()
    pos_plma= PerLangMethodAccuracies()

    pos_pla = POSEvalDict()

    pos_proj_matrix = POSMatrix()
    pos_class_matrix = POSMatrix()

    # -------------------------------------------
    # If a tag map is specified, let's load it.
    # -------------------------------------------
    g_tm = TagMap(gold_tagmap) if gold_tagmap is not None else None
    t_tm = TagMap(trans_tagmap) if trans_tagmap is not None else None

    # Go through all the files in the list...
    for f in filelist:
        outstream.write('Evaluating on file: {}\n'.format(f))
        xc = xc_load(f, mode=FULL)
        lang = os.path.basename(f)

        # -------------------------------------------
        # Test the classifier if evaluation is requested.
        # -------------------------------------------
        if classifier_path is not None:
            matches, compares, acc = evaluate_classifier_on_instances(xc, classifier_obj, classifier_feats,
                                                                      pos_class_matrix, gold_tagmap=g_tm)
            outstream.write('{},{},{},{:.2f}\n'.format(lang, matches, compares, acc))
            class_matches += matches
            class_compares += compares

        # -------------------------------------------
        # Test alignment if requested.
        # -------------------------------------------
        if eval_alignment:
            mas.add_corpus('gold', INTENT_ALN_MANUAL, lang, xc)
            EVAL_LOG.log(NORM_LEVEL, "Evaluating heuristic methods...")
            evaluate_heuristic_methods_on_file(f, xc, mas, classifier_obj, tagger, lang)

            EVAL_LOG.log(NORM_LEVEL, "Evaluating statistical methods...")
            evaluate_statistic_methods_on_file(f, xc, mas, classifier_obj, tagger, lang)

        # -------------------------------------------
        # Test DS Projection if requested
        # -------------------------------------------
        if eval_ds:
            evaluate_ds_projections_on_file(lang, xc, ds_plma, outstream=outstream)
            outstream.write('{}\n'.format(ds_plma))

        # -------------------------------------------
        #  Test POS Projection
        # -------------------------------------------
        if eval_posproj:
            evaluate_pos_projections_on_file(lang, xc, pos_plma, pos_proj_matrix, tagger, gold_tagmap=g_tm, trans_tagmap=t_tm, outstream=outstream)

        if e_tagger is not None:
            evaluate_lang_pos(lang, xc, e_tagger, pos_pla, gold_tagmap=g_tm, outstream=outstream)



    if eval_alignment:
        mas.eval_all(outstream=outstream)

    if eval_ds:
        outstream.write('{}\n'.format(ds_plma))

    if e_tagger is not None:
        outstream.write('{},{},{},{:.2f}\n'.format(lang, pos_pla.all_matches(), pos_pla.fulltotal(), pos_pla.accuracy()))
        e_tagger.close()

    # Report the POS tagging accuracy...
    if classifier_path is not None:
        outstream.write("ALL...\n")
        outstream.write('{},{},{:.2f}\n'.format(class_matches, class_compares, class_matches/class_compares*100))
        outstream.write('{}\n'.format(pos_class_matrix))

    if eval_posproj:
        outstream.write('{}\n'.format(pos_proj_matrix))

    # Only close the stream if we opened a file above (don't close sys.stdout).
    if outpath is not None:
        outstream.close()
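
A sketch of how evaluate_intent might be driven; the file names, classifier path, and output path are all hypothetical:

evaluate_intent(['deu.xml', 'fra.xml'],
                classifier_path='gc.classifier',   # also enables the per-language classifier report
                eval_alignment=True,               # heuristic + statistical alignment evaluation
                outpath='eval_report.csv')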
Example #21
def produce_tagger(inpath, out_f, method, kwargs=None):
    # NOTE: kwargs must be a mapping whose get() accepts (key, default, type) --
    # see the kwargs.get('limit', 0, int) call below -- so a plain dict won't do.

    if kwargs.get('xc'):
        xc = kwargs.get('xc')
    else:
        # Load the xigt corpus.
        xc = xc_load(inpath)

    corp_length = len(xc)

    # Before reducing the size of the corpus, filter out
    # instances lacking g/t alignment for classification and projection...
    if method == classification or method in normal_proj:
        xc.require_one_to_one()
        corp_length = len(xc)

    # Also, filter out instances where a translation line is missing
    # if we are projecting. (This overlaps with the above, but leaves
    # direct giza alignments to not require one to one alignment.)
    if method in projection:
        xc.require_trans_lines()
        corp_length = len(xc)



    limit = kwargs.get('limit', 0, int)
    if limit:
        xc.igts = xc.igts[:limit]
        corp_length = len(xc)


    # Giza Realignment ---------------------------------------------------------
    # If we are using a giza based approach, we will want to
    # realign the corpus now, since it is heuristic by default.
    if method == giza_proj:
        xc.giza_align_t_g(kwargs.get('resume'))

    elif method == giza_direct:
        xc.giza_align_l_t()

    TAGLOG.info('Producing tagfile for "%s"' % os.path.relpath(out_f.name))

    #===========================================================================
    # ADD PUNC
    #===========================================================================
    out_f.write('''./PUNC
?/PUNC
“/PUNC
"/PUNC
''/PUNC
'/PUNC
,/PUNC
…/PUNC
//PUNC
--/PUNC
``/PUNC
:/PUNC
;/PUNC
«/PUNC
»/PUNC
-/PUNC\n''')

    for i, inst in enumerate(xc):

        if i % 25 == 0:
            TAGLOG.info('Processing instance %d' % i)

        # If we are doing classification
        if method == classification:
            inst.classify_gloss_pos(kwargs.get('classifier'), posdict=kwargs.get('posdict'))
            inst.project_gloss_to_lang()

        # If we are doing normal projection via the gloss line
        elif method in normal_proj:
            try:
                inst.project_trans_to_gloss()
            except ProjectionTransGlossException as ptge:
                TAGLOG.warning(ptge)
                continue
            inst.project_gloss_to_lang()


        # Otherwise, we are looking at doing the direct translation
        # to language based approach.
        elif method == giza_direct:
            inst.project_trans_to_lang()

        # Raise an exception if we somehow got a different method.
        else:
            raise TagProductionException('Method "%s" is not defined for producing taggers.' % method)


        # Whichever method, get the gloss line tags:
        sequence = inst.get_lang_sequence()


        # If we get a "skip" and "UNK" appears in the sequence...
        if kwargs.get('skip') and len(sequence) != len([i for i in sequence if i.label != UNK]):
            corp_length -= 1
            continue

        else:
            # Replace the "UNK" with "NOUN"
            for i, pos_token in enumerate(sequence):
                if pos_token.label == 'UNK' and kwargs.get('unk_nouns'):
                    pos_token.label = "NOUN"
                elif pos_token.label == 'UNK' and kwargs.get('unk_classify'):
                    classifier = kwargs.get('classifier')

                    kwargs['prev_gram'] = ''
                    kwargs['next_gram'] = ''

                    if i > 0:
                        kwargs['prev_gram'] = inst.gloss[i-1].get_content()
                    if i < len(inst.gloss)-1:
                        kwargs['next_gram'] = inst.gloss[i+1].get_content()

                    pos_token.label = classifier.classify_string(inst.gloss[i].get_content(), **kwargs).largest()[0]


                out_f.write('%s/%s ' % (pos_token.seq, pos_token.label))
            out_f.write('\n')
            out_f.flush()

    out_f.close()
    return corp_length
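
Because produce_tagger calls kwargs.get('limit', 0, int), it needs an argument object whose get() accepts a type; the throwaway Args shim below (hypothetical) mimics that on top of a plain dict. classification is the method constant the function itself checks for, and the classifier path is illustrative:

class Args(dict):
    def get(self, key, default=None, t=None):
        val = super().get(key, default)
        return t(val) if (t is not None and val is not None) else val

out_f = open('tagger_train.txt', 'w', encoding='utf-8')
n = produce_tagger('corpus.xml', out_f, classification,
                   Args(limit=100, classifier=MalletMaxent('gc.classifier')))
print('{} instances written'.format(n))  # produce_tagger closes out_f itself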
Example #22
    # # Get the language line words projected onto the gloss...
    # for inst in train_xc:
    #     word_align(inst.gloss, inst.lang)
    #     inst.project_lang_to_gloss(tagmap = './data/tagset_mappings/ctn.txt')
    #     inst.tag_trans_pos(tagger)
    #     inst.heur_align()
    #     inst.project_trans_to_gloss()
    #     fix_ctn_gloss_line(inst, tag_method=INTENT_POS_PROJ)
    #
    # print("Done.")
    #
    # xigtxml.dump(open(ctn_train_processed, 'w', encoding='utf-8'), train_xc)
    # sys.exit()

    print("Loading Processed CTN Train corpus...", end=" ", flush=True)
    train_xc    = xc_load(ctn_train_processed)
    print("Done.")

    print("Loading Processed CTN Dev corpus...", end=" ", flush=True)
    dev_xc    = xc_load(ctn_dev_processed)
    print("Done.")

    #
    # # =============================================================================
    # # 2) Train a classifier based on the projected gloss line.
    # # =============================================================================
    #

    index_list = [35,70,106,141,284,569,854,1139,1424,1708,1993,7120]

    for train_stop_index in index_list:
Example #23
    def setUp(self):

        self.xc = xc_load(dep_file)
        self.inst = self.xc[3]
Example #24
    def test_inst_2(self):
        xp = xigt_testfile('xigt-projection-tests.xml')
        xc = xc_load(xp)
        do_projection(**{ARG_INFILE: xp, 'aln_method': ARG_ALN_GIZA, ARG_OUTFILE: '/dev/null'})
Example #25
    def broken_german_test(self):

        xc = xc_load(os.path.join(testfile_dir, 'xigt/broken-german-instance.xml'))
        inst = xc[0]
        self.assertIsNotNone(classify_gloss_pos(inst))