예제 #1
0
def write_instances(instance_list, out_path, type, overwrite=False):
    """
    Write a list of instances to a single XIGT-XML file.

    The instances are gathered into a fresh XigtCorpus, sorted with
    sort_corpus() and serialized with xigtxml.dump().

    :param instance_list: Instances to write; nothing is written if it is empty.
    :param out_path: Path of the output file. Its parent directory is created if missing.
    :param type: Label used only in the "no instances" warning message.
    :param overwrite: If false, an existing file at out_path is left untouched.
    """
    if os.path.exists(out_path) and not overwrite:
        SPLIT_LOG.error('File "{}" already exists and overwrite flag not set. Skipping!'.format(out_path))
        return

    # Create the directory if need be. exist_ok guards against another
    # process creating it between the existence check and makedirs().
    out_dir = os.path.dirname(out_path)
    try:
        if not os.path.exists(out_dir):
            os.makedirs(out_dir, exist_ok=True)
    except FileNotFoundError:
        # dirname is '' for a bare filename; there is nothing to create.
        pass

    num_sents = len(instance_list)
    if num_sents == 0:
        SPLIT_LOG.warning("No instances allocated for {}. Skipping file.".format(type))
        return

    xc = XigtCorpus()
    for inst in instance_list:
        xc.append(inst)

    print("Writing {} instances to {}...".format(num_sents, out_path))

    # Sort before opening so a failure here does not leave a truncated file.
    sort_corpus(xc)

    # The context manager closes the file even if serialization raises.
    with open(out_path, 'w', encoding='utf-8') as f:
        xigtxml.dump(f, xc)
예제 #2
0
 def test_append(self):
     # A corpus accepts only Igt children, and rejects a repeated Igt id.
     corpus = XigtCorpus()

     # Every non-Igt object must be refused and leave the corpus empty.
     for make_child in (Item, Tier, XigtCorpus, Metadata, Meta):
         child = make_child()
         with self.assertRaises(XigtStructureError):
             corpus.append(child)
     self.assertEqual(len(corpus), 0)

     corpus.append(Igt(id='i1'))
     self.assertEqual(len(corpus), 1)

     # A second Igt reusing the id 'i1' is an error.
     duplicate = Igt(id='i1')
     with self.assertRaises(XigtError):
         corpus.append(duplicate)

     corpus.append(Igt(id='i2'))
     self.assertEqual(len(corpus), 2)

     # Children are retrievable by position in insertion order.
     for position, expected_id in enumerate(('i1', 'i2')):
         self.assertEqual(corpus[position].id, expected_id)
예제 #3
0
def eval_classifier(c, inst_list, context_feats=False, posdict=None):
    """
    Tag copies of the given instances with a classifier and score the
    predicted gloss POS tags against the manually assigned ones.

    As a side effect, every tagged copy is written to
    './enriched_ctn_dev.xml' in the current working directory.

    :param c: The classifier
    :param inst_list: A list of Igt instances to test against. Must already have POS tags.
    :param context_feats: Passed to classify_gloss_pos() as both feat_next_gram and feat_prev_gram.
    :param posdict: Passed through to classify_gloss_pos().
    :return: The result of poseval() over the predicted and gold tag sequences.
    """

    gold_sents = []
    eval_sents = []

    to_dump = XigtCorpus()

    for inst in inst_list:

        # Work on a copy with its POS tags stripped so the original keeps
        # the manual tags used as the gold standard below.
        to_tag = inst.copy()
        strip_pos(to_tag)

        # Do the classification.
        to_tag.classify_gloss_pos(c, lowercase=True,
                                  feat_next_gram=context_feats,
                                  feat_prev_gram=context_feats,
                                  posdict=posdict)

        # The tagged copy is dumped even if it is later excluded from scoring.
        to_dump.append(to_tag)

        # Now, retrieve eval/gold.
        eval_tags = [v.value() for v in to_tag.get_pos_tags(GLOSS_WORD_ID, tag_method=INTENT_POS_CLASS)]
        gold_tags = [v.value() for v in inst.get_pos_tags(GLOSS_WORD_ID, tag_method=INTENT_POS_MANUAL)]

        # Only the labels are compared, so every token gets the same dummy form.
        tag_tokens = [POSToken('a', label=l) for l in eval_tags]
        gold_tokens = [POSToken('a', label=l) for l in gold_tags]

        if len(tag_tokens) != len(gold_tokens):
            print("LENGTH OF SEQUENCE IS MISMATCHED")
            continue

        gold_sents.append(gold_tokens)
        eval_sents.append(tag_tokens)

    # Close the dump file deterministically, and write it as UTF-8 like the
    # other XIGT-XML writer in this file instead of the locale default.
    with open('./enriched_ctn_dev.xml', 'w', encoding='utf-8') as dump_file:
        xigtxml.dump(dump_file, to_dump)

    return poseval(eval_sents, gold_sents, details=True, csv=True, matrix=True)
예제 #4
0
파일: filter.py 프로젝트: rgeorgi/intent
def do_filter(filelist, require_lang=False, require_gloss=False, require_trans=False, require_aln=False, require_gloss_pos=False, require_grammatical=False, max_instances=0):
    """
    Filter the instances of several XIGT files into one combined corpus.

    Each file is loaded incrementally and run through filter_xc() with the
    given requirements; the accepted instances are collected into a new
    XigtCorpus.

    :param filelist: Paths of the files to filter.
    :param require_lang: Passed through to filter_xc().
    :param require_gloss: Passed through to filter_xc().
    :param require_trans: Passed through to filter_xc().
    :param require_aln: Passed through to filter_xc().
    :param require_gloss_pos: Passed through to filter_xc().
    :param require_grammatical: Passed through to filter_xc().
    :param max_instances: Stop once this many instances have been accepted
        across all files. 0 means no limit.
    :return: Tuple of (new corpus, examined, failures, successes). Note that
        failures precedes successes here, the reverse of what filter_xc() returns.
    """
    new_corp = XigtCorpus()

    FILTER_LOG.log(NORM_LEVEL, "Beginning filtering...")

    successes = 0
    failures = 0
    examined = 0

    for path in filelist:
        # filter_xc() only tests the cap after accepting an instance, so
        # calling it again once the cap is met would admit one extra
        # instance per remaining file. Stop before opening more files.
        if max_instances and successes >= max_instances:
            break

        FILTER_LOG.log(1000, 'Opening file "{}" for filtering.'.format(os.path.basename(path)))
        xc = xc_load(path, mode=INCREMENTAL)
        instances, iter_examined, iter_success, iter_failures = filter_xc(xc, require_lang, require_gloss, require_trans, require_aln, require_gloss_pos, require_grammatical, max_instances, successes)
        for instance in instances:
            new_corp.append(instance)

        successes += iter_success
        failures += iter_failures
        examined += iter_examined

    return new_corp, examined, failures, successes
예제 #5
0
파일: filter.py 프로젝트: rgeorgi/intent
def filter_xc(xc, require_lang=False, require_gloss=False, require_trans=False, require_aln=False, require_gloss_pos=False, require_grammatical=False, max_instances=0, prev_good_instances=0):
    """
    Collect the instances of a corpus that satisfy the requested requirements.

    Every instance is logged to FILTER_LOG as either FAIL (with the reason)
    or SUCCESS. Accepted instances are appended to a new XigtCorpus and have
    their tiers sorted.

    :param xc: Iterable of Igt instances to examine.
    :param require_lang: Reject instances whose lang tier cannot be retrieved.
    :param require_gloss: Reject instances whose gloss tier cannot be retrieved.
    :param require_trans: Reject instances whose trans tier cannot be retrieved.
    :param require_aln: Reject instances lacking a gloss or lang tier, or for
        which word_align() raises GlossLangAlignException.
    :param require_gloss_pos: Reject instances without a POS tag tier for the gloss tier.
    :param require_grammatical: Reject instances where any lang, gloss or trans
        line carries the ODIN judgment attribute.
    :param max_instances: Stop once prev_good_instances plus the instances
        accepted here reaches this number. 0 means no limit.
    :param prev_good_instances: Instances already accepted by earlier calls.
    :return: Tuple of (new corpus, examined, successes, failures).
    """
    new_corp = XigtCorpus()

    examined = 0
    failures = 0
    successes = 0

    def fail(inst, reason):
        """Count and log the rejection of an instance."""
        nonlocal failures
        message = filter_string(inst).format("FAIL", '['+reason+']')
        failures += 1
        FILTER_LOG.info(message)

    def trytier(f, inst):
        """Return f(inst), or None if the normalized line is missing."""
        try:
            return f(inst)
        except NoNormLineException:
            # A missing tier only matters if it is required; the checks
            # in the main loop decide that, so nothing is counted here.
            return None

    for inst in xc:
        examined += 1
        assert isinstance(inst, Igt)

        lt = trytier(lang, inst)
        gt = trytier(gloss, inst)
        tt = trytier(trans, inst)

        if require_lang and lt is None:
            fail(inst, "LANG")
            continue
        if require_gloss and gt is None:
            fail(inst, "GLOSS")
            continue
        if require_trans and tt is None:
            fail(inst, "TRANS")
            continue

        if require_aln:
            if gt is None:
                fail(inst, "ALIGN-GLOSS")
                continue
            if lt is None:
                fail(inst, "ALIGN-LANG")
                continue

            try:
                word_align(gt, lt)
            except GlossLangAlignException:
                fail(inst, "ALIGN")
                continue

        if require_grammatical:
            # Start from "nothing flagged" for every instance so a missing
            # tier neither raises UnboundLocalError nor reuses the result
            # computed for the previous instance.
            judged_lang = []
            judged_gloss = None
            judged_trans = []

            if lt:
                judged_lang = [l for l in lang_lines(inst) if l.get_attribute(ODIN_JUDGMENT_ATTRIBUTE)]
            if gt:
                judged_gloss = gloss_line(inst).get_attribute(ODIN_JUDGMENT_ATTRIBUTE)
            if tt:
                judged_trans = [l for l in trans_lines(inst) if l.get_attribute(ODIN_JUDGMENT_ATTRIBUTE)]

            # NOTE(review): presumably any judgment attribute marks the line
            # as ungrammatical -- confirm against the ODIN conventions.
            if judged_lang or judged_gloss or judged_trans:
                fail(inst, "UNGRAMMATICAL")
                continue

        if require_gloss_pos:
            # Without a gloss tier there is no id to look POS tags up by,
            # so that case counts as a failure rather than an AttributeError.
            if gt is None or pos_tag_tier(inst, gt.id) is None:
                fail(inst, "GLOSS_POS")
                continue

        # Otherwise, attach to the new corpus.
        new_corp.append(inst)

        success_message = filter_string(inst).format("SUCCESS", "")
        successes += 1
        FILTER_LOG.info(success_message)
        inst.sort_tiers()

        # -------------------------------------------
        # Break out of the loop if we've hit the maximum
        # number of good instances.
        # -------------------------------------------
        if max_instances and prev_good_instances+successes >= max_instances:
            break

    return new_corp, examined, successes, failures
예제 #6
0
파일: naacl.py 프로젝트: rgeorgi/intent
def naacl_to_xigt(naacl_path):
    """
    Convert the NAACL format to XIGT.

    For every instance found in the file, an Igt is built with a raw tier
    (lang, gloss and trans lines), normalized tiers, dependency trees for the
    lang and trans lines where they can be parsed, and a manual bilingual
    alignment between the trans and gloss lines.

    :param naacl_path: Path of the NAACL-format text file to read.
    :return: A XigtCorpus holding one Igt per instance, with ids 'i0', 'i1', ...
    """
    # Read through a context manager so the handle is closed promptly.
    with open(naacl_path, 'r') as naacl_file:
        content = naacl_file.read()

    # First, collect all the instances.
    instances = re.findall(r'Igt_id[\s\S]+?Q6.*Answer', content)

    xc = XigtCorpus()

    for instance_txt in instances:
        inst = Igt(id='i{}'.format(len(xc)))

        # Lines 2-4 of an instance are the language, gloss and translation lines.
        lang_raw, gloss_raw, trans_raw = instance_txt.split('\n')[1:4]

        # Now, create the raw tier...
        raw_tier = Tier(id=gen_tier_id(inst, 'r'), type='odin', attributes={STATE_ATTRIBUTE:RAW_STATE})
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=lang_raw, attributes={ODIN_TAG_ATTRIBUTE:ODIN_LANG_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=gloss_raw, attributes={ODIN_TAG_ATTRIBUTE:ODIN_GLOSS_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=trans_raw, attributes={ODIN_TAG_ATTRIBUTE:ODIN_TRANS_TAG}))

        inst.append(raw_tier)
        xc.append(inst)

        # Generate the clean/normal tiers, but without any cleaning.
        generate_normal_tier(inst, clean=False)

        # Lang Dependency representation handling...
        # The first 5 and last 3 lines of the section are skipped
        # (presumably header/footer boilerplate -- TODO confirm).
        lang_ds_str = re.search(r'Q6:([\s\S]+?)Q6:', instance_txt).group(1)
        lang_ds_lines = lang_ds_str.split('\n')[5:-3]

        try:
            lang_dt = parse_naacl_dep(lang(inst), lang_ds_lines)
            create_dt_tier(inst, lang_dt, lang(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError):
            # Best effort: an unparseable tree just leaves the tier out.
            pass

        # Eng DS handling...
        eng_ds_str = re.search(r'Q3:([\s\S]+?)Q3:', instance_txt).group(1)
        eng_ds_lines = eng_ds_str.split('\n')[2:-3]

        try:
            eng_dt = parse_naacl_dep(trans(inst), eng_ds_lines)
            create_dt_tier(inst, eng_dt, trans(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError, ValueError):
            pass

        # Add Alignment...
        biling_aln_str = re.search(r'Q5:([\s\S]+?)Q5:', instance_txt).group(1)
        biling_aln_lines = biling_aln_str.split('\n')[4:-3]

        # A raw line that starts with a space shifts its indices down by one.
        trans_offset = trans_raw.startswith(' ')
        gloss_offset = gloss_raw.startswith(' ')

        # Bound outside the try so the alignment always exists for this
        # instance and is never a leftover from the previous one.
        a = Alignment()
        try:
            for line in biling_aln_lines:
                gloss_s, trans_s = line.split()[0:2]

                # Gloss indices containing '.' are skipped.
                if '.' in gloss_s:
                    continue

                gloss_i = int(gloss_s)
                # Apply the gloss offset once per line. Doing it inside the
                # loop below would shift the index again for every
                # additional translation token on the same line.
                if gloss_offset:
                    gloss_i -= 1

                for trans_token in trans_s.split(','):
                    trans_i = int(trans_token)
                    # NOTE(review): index 0 presumably means "unaligned" -- confirm.
                    if trans_i == 0:
                        continue
                    if trans_offset:
                        trans_i -= 1
                    a.add((trans_i, gloss_i))
        except Exception:
            # Best effort: on a malformed line keep whatever was aligned so
            # far. Unlike a bare except, this lets KeyboardInterrupt and
            # SystemExit propagate.
            pass

        set_bilingual_alignment(inst, trans(inst), gloss(inst), a, aln_method=INTENT_ALN_MANUAL)

    return xc
예제 #7
0
파일: test_model.py 프로젝트: xigt/xigt
 def test_append(self):
     # A corpus accepts only Igt children, and rejects a repeated Igt id.
     corpus = XigtCorpus()

     # Every non-Igt object must be refused and leave the corpus empty.
     for make_child in (Item, Tier, XigtCorpus, Metadata, Meta):
         with pytest.raises(XigtStructureError):
             corpus.append(make_child())
     assert len(corpus) == 0

     corpus.append(Igt(id='i1'))
     assert len(corpus) == 1

     # A second Igt reusing the id 'i1' is an error.
     with pytest.raises(XigtError):
         corpus.append(Igt(id='i1'))

     corpus.append(Igt(id='i2'))
     assert len(corpus) == 2

     # Children are retrievable by position in insertion order.
     for position, expected_id in enumerate(('i1', 'i2')):
         assert corpus[position].id == expected_id