Exemplo n.º 1
0
    def test_ds_cycle(self):
        """
        Check dependency handling when one word is attached in two places.

        The expected tree for the first instance of the ds_cycle file lists
        "woman" twice: once under "began" and once under "browse". The
        dependency structure read back for the translation tier must be
        structurally equal to that tree, and projecting the dependency tier
        for this instance is expected to return None.
        """
        xc = xc_load(ds_cycle)
        inst = xc[0]

        #  1    2       4        5       7    8    9
        # The woman, (after) arriving, began to browse.

        # (The commas count as words, hence the skipping)

        # Raw string: the backslash-escaped parentheses have to reach the
        # tree parser verbatim, and "\(" is not a valid escape sequence in
        # an ordinary string literal.
        tgt_t = DepTree.fromstring(r"""
        (ROOT[0]
            (began[7]
                (woman[2]
                    (The[1])
                    (\(after\)[4] (arriving[5])))
                (browse[9]
                    (woman[2])
                    (to[8])
                )
            ))
        """, stype=DEPSTR_PTB)

        ds = get_ds(inst, trans(inst))
        self.assertTrue(tgt_t.structurally_eq(ds))

        self.assertIsNone(project_ds_tier(inst))
Exemplo n.º 2
0
    def set_bilingual_align_test(self):
        """
        Store a hand-built translation/gloss alignment on the instance, then
        request it back using the manual alignment method.
        """
        index_pairs = [(1,1),(1,2),(2,8),(4,3),(5,7),(6,5)]
        manual_aln = Alignment(index_pairs)

        set_bilingual_alignment(
            self.igt,
            trans(self.igt),
            glosses(self.igt),
            manual_aln,
            INTENT_ALN_MANUAL,
        )

        # NOTE(review): the value read back is not compared with manual_aln,
        # so this only verifies that the lookup completes without raising.
        get_trans_glosses_alignment(self.igt, INTENT_ALN_MANUAL)
Exemplo n.º 3
0
    def test_read_proj_ds_tree(self):
        """
        Project the translation-side dependency tree of inst2 onto its
        language words and compare the result with the expected tree.
        """
        source_tree = get_ds(self.inst2, trans(self.inst2))
        target_words = lang(self.inst2)
        alignment = get_trans_gloss_alignment(self.inst2)

        expected_tree = DepTree.fromstring("""
        (ROOT[0]
            (glaubst[2]
                (Was[1])
                (Du[3])
                (wer[4])
                (angerufen[5] (hat[6]))
            ))
        """, stype=DEPSTR_PTB)

        projected_tree = project_ds(source_tree, target_words, alignment)
        is_match = projected_tree.structurally_eq(expected_tree)

        self.assertTrue(is_match)
Exemplo n.º 4
0
def extract_sents_from_inst(inst: Igt, out_src, out_tgt, aln_method=None, no_alignment_heur = True, sent_type=SENT_TYPE_T_G):
    """
    Write one parallel sentence pair from an instance to two output streams.

    The source side is always the translation line. The target side is the
    language line for SENT_TYPE_T_L and the gloss line for SENT_TYPE_T_G.
    Both strings are lowercased and written one per line, and both streams
    are flushed right after the sentence pair has been written.

    :param inst: Instance to extract the sentences from.
    :param out_src: Writable stream receiving the source (translation) text.
    :param out_tgt: Writable stream receiving the target text.
    :param aln_method: Passed through to get_trans_aligned_wordpairs.
    :param no_alignment_heur: When false, aligned word pairs are appended to
        both streams as additional one-word lines.
    :param sent_type: Either SENT_TYPE_T_G or SENT_TYPE_T_L.
    :raises Exception: If sent_type is neither of the supported values.
    """
    # Source side: the translation line.
    source_tier = trans(inst)
    src_str = tier_text(source_tier, remove_whitespace_inside_tokens=True).lower()

    # Target side: language or gloss line, depending on the sentence type.
    if sent_type == SENT_TYPE_T_L:
        target_tier = lang(inst)
    elif sent_type == SENT_TYPE_T_G:
        target_tier = gloss(inst)
    else:
        raise Exception("Invalid sent type")
    tgt_str = tier_text(target_tier, remove_whitespace_inside_tokens=True).lower()

    # Emit the sentence pair.
    out_src.write(src_str + '\n')
    out_tgt.write(tgt_str + '\n')
    out_src.flush()
    out_tgt.flush()

    # Nothing further to do unless heuristic word pairs were requested.
    if no_alignment_heur:
        return

    word_pairs = get_trans_aligned_wordpairs(inst, aln_method=aln_method, add_align=True, sent_type=sent_type)
    for source_word, target_word in word_pairs:
        out_src.write(source_word.lower() + '\n')
        out_tgt.write(target_word.lower() + '\n')
Exemplo n.º 5
0
def enrich(**kwargs):
    """
    Enrich the instances of a Xigt corpus and write the result out.

    The corpus is loaded from the input path; depending on the requested
    methods the translation line is POS-tagged and parsed, gloss POS tags
    are classified, alignments are added, and POS tags and parse structures
    are projected. One status line is logged per instance, and the corpus is
    dumped to the output afterwards.

    Settings are read from ``kwargs``:

    * ``ARG_OUTFILE`` -- required; either an object with a ``write``
      attribute or a value that is handed to ``writefile``.
    * ``ARG_INFILE`` -- path of the Xigt-XML input, opened as UTF-8.
    * ``PARSE_VAR``, ``POS_VAR``, ``ALN_VAR`` -- the requested parse, POS
      and alignment methods (each defaults to an empty list).
    * ``'max_parse_length'`` -- instances whose translation tier is longer
      than this are not parsed (default 10).
    * ``ALN_SYM_VAR`` -- symmetrization passed to ``giza_align_t_g``
      (default ``SYMMETRIC_INTERSECT``).
    * ``'proj_aln'`` -- key into ``ALN_ARG_MAP`` selecting the alignment
      used for projection (default ``ARG_ALN_ANY``).
    * ``'class_path'`` -- read, but not used by this function.

    The process is terminated through ``sys.exit`` when no output file is
    given, when the tagger cannot be initialized, or when tagging raises a
    CriticalTaggerError.

    :raises GizaAlignmentException: Re-raised after being logged when the
        giza alignment step fails.
    """

    global classifier

    if ARG_OUTFILE not in kwargs:
        ENRICH_LOG.critical("No output file specified.")
        # NOTE(review): sys.exit() without an argument exits with status 0,
        # unlike the other fatal paths below, which exit with status 2.
        sys.exit()

    # =============================================================================
    # Set up the alternate classifier path...
    # =============================================================================

    # NOTE(review): class_path is read here but never used afterwards; the
    # classifier below is always built from the module-level `classifier`.
    class_path = kwargs.get('class_path')

    #===========================================================================
    # Set up the different arguments...
    #===========================================================================
    inpath = kwargs.get(ARG_INFILE)

    parse_args = kwargs.get(PARSE_VAR, [])
    pos_args = kwargs.get(POS_VAR, [])
    aln_args = kwargs.get(ALN_VAR, [])

    max_parse_length = kwargs.get('max_parse_length', 10)

    if not (parse_args or pos_args or aln_args):
        ENRICH_LOG.warning("No enrichment specified. Basic processing only will be performed.")

    #===========================================================================
    # Sanity check the arguments.
    #===========================================================================

    # Check that alignment is asked for if projection is asked for.
    if (ARG_POS_PROJ in pos_args or ARG_PARSE_PROJ in parse_args) and (not aln_args):
        # Logger.warn is a deprecated alias; use Logger.warning.
        ENRICH_LOG.warning("You have asked for projection methods but have not requested "
                           "alignments to be generated. Projection may fail if alignment not already present in file.")

    ENRICH_LOG.log(1000, 'Loading input file...')
    with open(inpath, 'r', encoding='utf-8') as in_f:
        corp = xigtxml.load(in_f, mode=INCREMENTAL)

        # -------------------------------------------
        # Initialize the English tagger if:
        #   A) "proj" option is selected for pos.
        #   B) "trans" option is given for pos.
        #   C) "heurpos" option is given for alignment.
        # -------------------------------------------
        s = None
        if ARG_POS_PROJ in pos_args or ARG_POS_TRANS in pos_args or ARG_ALN_HEURPOS in aln_args:
            ENRICH_LOG.log(1000, 'Initializing tagger...')
            tagger = c.getpath('stanford_tagger_trans')

            try:
                s = StanfordPOSTagger(tagger)
            except TaggerError as te:
                ENRICH_LOG.critical(te)
                sys.exit(2)

        # -------------------------------------------
        # Initialize the parser if:
        #    A) "trans" option is given for parse
        #    B) "proj" option is given for parse.
        # -------------------------------------------
        # `sp` is only bound here; it is only used further down under the
        # same condition.
        if ARG_PARSE_TRANS in parse_args or ARG_PARSE_PROJ in parse_args:
            ENRICH_LOG.log(1000, "Intializing English parser...")
            sp = stanford_parser.StanfordParser()

        # -------------------------------------------
        # Initialize the classifier if:
        #    A) "class" option is given for pos
        #    B) "heurpos" option is given for alignment.
        # -------------------------------------------
        # As with `sp`, `p` is only bound (and only used) under this condition.
        m = None
        if ARG_POS_CLASS in pos_args or ARG_ALN_HEURPOS in aln_args:
            ENRICH_LOG.log(1000, "Initializing gloss-line classifier...")
            p = load_posdict()
            m = mallet_maxent.MalletMaxent(classifier)


        # -- 1b) Giza Gloss to Translation alignment --------------------------------------
        if ARG_ALN_GIZA in aln_args or ARG_ALN_GIZAHEUR in aln_args:
            ENRICH_LOG.log(1000, 'Aligning gloss and translation lines using mgiza++...')

            try:
                if ARG_ALN_GIZAHEUR in aln_args:
                    giza_align_t_g(corp, resume=True, use_heur=True, symmetric=kwargs.get(ALN_SYM_VAR, SYMMETRIC_INTERSECT))
                if ARG_ALN_GIZA in aln_args:
                    giza_align_t_g(corp, resume=True, use_heur=False, symmetric=kwargs.get(ALN_SYM_VAR, SYMMETRIC_INTERSECT))
            except GizaAlignmentException as gae:
                gl = logging.getLogger('giza')
                gl.critical(str(gae))
                # Bare raise re-raises the active exception unchanged.
                raise

        # -------------------------------------------
        # Define the reasons for failure
        # (loop-invariant, so defined once rather than per instance)
        # -------------------------------------------
        F_GLOSS_LINE = "NOGLOSS"
        F_LANG_LINE  = "NOLANG"
        F_TRANS_LINE = "NOTRANS"
        F_BAD_LINES  = "BADLINES"
        F_L_G_ALN    = "L_G_ALIGN"
        F_T_G_ALN    = "G_T_ALIGN"
        F_NO_TRANS_POS="NO_POS_TRANS"
        F_PROJECTION = "PROJECTION"
        F_UNKNOWN    = "UNKNOWN"
        F_PARSELEN   = "OVER_MAX_LENGTH"

        # -------------------------------------------
        # Begin iterating through the corpus
        # -------------------------------------------

        for inst in corp:

            # The instance id is filled in now; the doubled braces leave two
            # placeholders for the status and the reasons, filled in below.
            feedback_string = 'Instance {:15s}: {{:20s}}{{}}'.format(inst.id)

            reasons = []
            inst_status = None

            def fail(reason):
                # Record a (de-duplicated) failure reason and mark the instance.
                nonlocal inst_status, reasons
                if reason not in reasons:
                    reasons.append(reason)
                inst_status = 'WARN'

            def success():
                nonlocal inst_status
                inst_status = 'OK'

            try:

                # -------------------------------------------
                # Get the different lines
                # -------------------------------------------
                def tryline(func):
                    # Return func(inst), or None when no normalized line exists.
                    nonlocal inst
                    try:
                        return func(inst)
                    except NoNormLineException:
                        return None

                gl = tryline(gloss_line)
                tls = tryline(trans_lines)
                lls  = tryline(lang_lines)

                has_gl = gl is not None
                has_tl = tls is not None
                has_ll = lls is not None

                has_all = lambda: (has_gl and has_tl and has_ll)


                # -------------------------------------------
                # Translation Line
                # -------------------------------------------
                if has_tl:

                    if ARG_POS_PROJ in pos_args or ARG_POS_TRANS in pos_args or ARG_ALN_HEURPOS in aln_args:

                        try:
                            tag_trans_pos(inst, s)
                        except CriticalTaggerError as cte:
                            ENRICH_LOG.critical(str(cte))
                            sys.exit(2)

                    if ARG_PARSE_PROJ in parse_args or ARG_PARSE_TRANS in parse_args:
                        if len(trans(inst)) <= max_parse_length:
                            parse_translation_line(inst, sp, pt=True, dt=True)
                        else:
                            fail(F_PARSELEN)

                # 4) POS tag the gloss line --------------------------------------------
                if has_gl:
                    if ARG_POS_CLASS in pos_args or ARG_ALN_HEURPOS in aln_args:
                        classify_gloss_pos(inst, m, posdict=p)

                # -------------------------------------------
                # Try getting alignments.
                # -------------------------------------------
                if has_gl and has_ll:
                    try:
                        add_gloss_lang_alignments(inst)
                    except GlossLangAlignException:
                        fail(F_L_G_ALN)

                if has_gl and has_tl:
                    if ARG_ALN_HEURPOS in aln_args:
                        heur_align_inst(inst, use_pos=True)
                    if ARG_ALN_HEUR in aln_args:
                        heur_align_inst(inst, use_pos=False)

                # -------------------------------------------
                # Now, do the necessary projection tasks.
                # -------------------------------------------

                # Project the classifier tags...
                if has_ll and has_gl and ARG_POS_CLASS in pos_args:
                    try:
                        project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_CLASS)
                    except GlossLangAlignException:
                        fail(F_L_G_ALN)

                # -------------------------------------------
                # Do the trans-to-lang projection...
                # -------------------------------------------

                if has_all():
                    proj_aln_method = ALN_ARG_MAP[kwargs.get('proj_aln', ARG_ALN_ANY)]
                    aln = get_trans_gloss_alignment(inst, aln_method=proj_aln_method)
                    if not aln or len(aln) == 0:
                        fail(F_T_G_ALN)
                    else:
                        # -------------------------------------------
                        # POS Projection
                        # -------------------------------------------
                        if ARG_POS_PROJ in pos_args:
                            trans_tags = trans_tag_tier(inst)

                            if not trans_tags:
                                fail(F_NO_TRANS_POS)
                            else:
                                project_trans_pos_to_gloss(inst)
                                try:
                                    project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_PROJ)
                                except GlossLangAlignException:
                                    fail(F_L_G_ALN)

                        # -------------------------------------------
                        # Parse projection
                        # -------------------------------------------
                        if ARG_PARSE_PROJ in parse_args:
                            try:
                                project_pt_tier(inst, proj_aln_method=proj_aln_method)
                            except PhraseStructureProjectionException:
                                fail(F_PROJECTION)
                            except NoAlignmentProvidedError:
                                fail(F_T_G_ALN)

                            try:
                                project_ds_tier(inst, proj_aln_method=proj_aln_method)
                            except ProjectionException:
                                fail(F_PROJECTION)
                            except NoAlignmentProvidedError:
                                fail(F_T_G_ALN)



                # Sort the tiers... ----------------------------------------------------
                inst.sort_tiers()

            except Exception as e:
                # Deliberately best-effort: any other error on one instance is
                # logged at debug level and recorded as UNKNOWN, and the loop
                # continues with the next instance.
                ENRICH_LOG.debug(e)
                fail(F_UNKNOWN)

            if not reasons:
                success()


            ENRICH_LOG.info(feedback_string.format(inst_status, ','.join(reasons)))

        ENRICH_LOG.log(1000, 'Writing output file...')

        # The output may be an already-open writable object or something
        # that writefile() turns into one.
        if hasattr(kwargs.get(ARG_OUTFILE), 'write'):
            xigtxml.dump(kwargs.get(ARG_OUTFILE), corp)
        else:
            xigtxml.dump(writefile(kwargs.get(ARG_OUTFILE)), corp)

        ENRICH_LOG.log(1000, 'Done.')
        ENRICH_LOG.log(1000, "{} instances written.".format(len(corp)))
Exemplo n.º 6
0
def naacl_to_xigt(naacl_path):
    """
    Convert a file in the NAACL annotation format to a XIGT corpus.

    Every stretch of text from an ``Igt_id`` marker through the
    ``Q6 ... Answer`` marker is treated as one instance. The second, third
    and fourth lines of that text are taken as the raw language, gloss and
    translation lines. Dependency trees are read from the ``Q6`` (language)
    and ``Q3`` (translation) sections, and the translation/gloss word
    alignment from the ``Q5`` section.

    Dependency and alignment parsing is best-effort: an instance whose
    sections cannot be parsed is still added, just without that annotation
    (or with the alignment pairs collected before the error).

    :param naacl_path: Path of the NAACL-format text file.
    :return: The XigtCorpus holding one instance per example found.
    """
    # Read through a context manager so the file handle is always closed.
    with open(naacl_path, 'r') as naacl_f:
        content = naacl_f.read()

    # First, collect all the instances.
    instances = re.findall(r'Igt_id[\s\S]+?Q6.*Answer', content)

    xc = XigtCorpus()

    for instance_txt in instances:
        # Instance ids are simply numbered in the order they are found.
        inst = Igt(id='i{}'.format(len(xc)))

        lang_raw, gloss_raw, trans_raw = instance_txt.split('\n')[1:4]

        # Now, create the raw tier...
        raw_tier = Tier(id=gen_tier_id(inst, 'r'), type='odin', attributes={STATE_ATTRIBUTE:RAW_STATE})
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=lang_raw, attributes={ODIN_TAG_ATTRIBUTE:ODIN_LANG_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=gloss_raw, attributes={ODIN_TAG_ATTRIBUTE:ODIN_GLOSS_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=trans_raw, attributes={ODIN_TAG_ATTRIBUTE:ODIN_TRANS_TAG}))

        inst.append(raw_tier)
        xc.append(inst)

        # Generate the clean/normal tiers, but without any cleaning.
        generate_normal_tier(inst, clean=False)

        # Lang Dependency representation handling...
        lang_ds_str = re.search(r'Q6:([\s\S]+?)Q6:', instance_txt).group(1)
        lang_ds_lines = lang_ds_str.split('\n')[5:-3]

        # NOTE(review): the dependency tiers are created with
        # INTENT_POS_MANUAL as parse_method -- confirm that a POS constant
        # (rather than a dependency-structure one) is intended here.
        try:
            lang_dt = parse_naacl_dep(lang(inst), lang_ds_lines)
            create_dt_tier(inst, lang_dt, lang(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError):
            # Best-effort: leave the instance without a language tree.
            pass

        # Eng DS handling...
        eng_ds_str = re.search(r'Q3:([\s\S]+?)Q3:', instance_txt).group(1)
        eng_ds_lines = eng_ds_str.split('\n')[2:-3]

        try:
            eng_dt = parse_naacl_dep(trans(inst), eng_ds_lines)
            create_dt_tier(inst, eng_dt, trans(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError, ValueError):
            # Best-effort: leave the instance without a translation tree.
            pass

        # Add Alignment...
        biling_aln_str = re.search(r'Q5:([\s\S]+?)Q5:', instance_txt).group(1)
        biling_aln_lines = biling_aln_str.split('\n')[4:-3]

        # Indices are shifted down by one for a line that starts with a space.
        trans_offset = trans_raw.startswith(' ')
        gloss_offset = gloss_raw.startswith(' ')

        # Created outside the try block so that every instance gets its own
        # alignment object, even when parsing the lines fails.
        a = Alignment()
        try:
            for line in biling_aln_lines:
                gloss_s, trans_s = line.split()[0:2]

                if '.' in gloss_s:
                    continue

                gloss_i = int(gloss_s)

                # Apply the gloss offset exactly once per line. Doing this
                # inside the loop below would shift the gloss index again for
                # every additional translation token on the same line.
                if gloss_offset:
                    gloss_i -= 1

                for trans_token in trans_s.split(','):
                    trans_i = int(trans_token)
                    # Translation index 0 is skipped.
                    if trans_i == 0:
                        continue
                    if trans_offset:
                        trans_i -= 1
                    a.add((trans_i, gloss_i))
        except Exception:
            # Best-effort: keep whatever pairs were collected before the
            # malformed line. KeyboardInterrupt and SystemExit now propagate.
            pass

        set_bilingual_alignment(inst, trans(inst), gloss(inst), a, aln_method=INTENT_ALN_MANUAL)

    return xc
Exemplo n.º 7
0
 def line_test(self):
     """
     The gloss and translation tiers of the fixture instance should render
     to the expected text.
     """
     gloss_text = tier_text(gloss(self.igt))
     self.assertEqual(gloss_text, 'I-Nom child-Dat rice-Acc eat-Caus-Pst-Dec')

     trans_text = tier_text(trans(self.igt))
     self.assertEqual(trans_text, 'I made the child eat rice')
Exemplo n.º 8
0
def convert_pml(aln_path, out_path, hindi=True):
    """
    Build a XIGT corpus from a PML alignment file and write it as Xigt-XML.

    The alignment file names two documents (``document_a`` and
    ``document_b``), which are loaded relative to the alignment file's
    directory. Exactly one of them must carry glosses. For every sentence
    alignment whose id is present in the retrieved IGT data, an instance is
    built with a normalized tier, phrase and word tiers, POS tags,
    dependency tiers for both sides and the manual word alignment.

    :param aln_path: Path of the PML alignment file.
    :param out_path: Path the resulting corpus is written to (UTF-8).
    :param hindi: Selects ``retrieve_hindi()`` instead of
        ``retrieve_naacl()`` as the IGT data source. When true, tokens, text
        and POS tags are taken from the loaded sentences rather than from
        the IGT data.
    """
    igt_data = retrieve_hindi() if hindi else retrieve_naacl()

    aln_root = load_xml(aln_path)
    href_a = aln_root.find(".//reffile[@name='document_a']").get('href')
    href_b = aln_root.find(".//reffile[@name='document_b']").get('href')

    # The referenced documents are resolved relative to the alignment file.
    doc_a = os.path.join(os.path.dirname(aln_path), href_a)
    doc_b = os.path.join(os.path.dirname(aln_path), href_b)

    # Load the sentences for each document.
    a_sents, a_glossed = load_sents(doc_a)
    b_sents, b_glossed = load_sents(doc_b)

    sent_alignments = aln_root.findall(".//body/LM")

    assert bool(a_glossed) != bool(b_glossed), "Only one file should have glosses"

    def process_postags(sent, tokens):
        # One entry per token, looked up by 1-based order in the sentence;
        # None where the sentence has no word at that order.
        tags = []
        for order, _ in enumerate(tokens, start=1):
            word = sent.getorder(order)
            tags.append(word.pos if word is not None else None)
        return tags

    xc = XigtCorpus()

    for sent_alignment in sent_alignments:

        # The sentence id is everything after the first '-' of the alignment id.
        aln_id = sent_alignment.attrib.get('id')
        a_snt_id = re.search('^.+?-(.*)$', aln_id).group(1)
        if a_snt_id not in igt_data:
            continue

        # Text and tokens as given by the IGT data.
        _pre_txt, lang_txt, gloss_txt, trans_txt = igt_data[a_snt_id]
        lang_tokens = lang_txt.split()
        gloss_tokens = gloss_txt.split()
        trans_tokens = trans_txt.split()

        a_snt_ref = sent_alignment.find('./tree_a.rf').text.split('#')[1]
        b_snt_ref = sent_alignment.find('./tree_b.rf').text.split('#')[1]

        word_alignments = sent_alignment.findall('./node_alignments/LM')

        a_snt, a_edges = a_sents[a_snt_ref]
        b_snt, b_edges = b_sents[b_snt_ref]

        assert isinstance(a_snt, Sentence)
        assert isinstance(b_snt, Sentence)

        # Skip the pair when either sentence is falsy.
        if not (a_snt and b_snt):
            continue

        # -------------------------------------------
        # Decide which document is the glossed (language) side.
        # -------------------------------------------
        if a_glossed:
            gloss_snt, gloss_indices = a_snt, a_edges
            trans_snt, trans_indices = b_snt, b_edges
        else:
            trans_snt, trans_indices = a_snt, a_edges
            gloss_snt, gloss_indices = b_snt, b_edges

        # For Hindi, everything is rebuilt from the loaded sentences.
        if hindi:
            lang_tokens = [word.text for word in gloss_snt]
            lang_postags = [word.pos for word in gloss_snt]
            lang_txt = ' '.join(lang_tokens)

            trans_tokens = [word.text for word in trans_snt]
            trans_postags = [word.pos for word in trans_snt]
            trans_txt = ' '.join(trans_tokens)

            gloss_tokens = [word.gloss or 'NULL' for word in gloss_snt]
            gloss_postags = lang_postags
            gloss_txt = ' '.join(gloss_tokens)

        # -------------------------------------------
        # Instance with its normalized tier.
        # -------------------------------------------
        inst = Igt(id=re.sub('s-', 'igt', a_snt_ref))
        norm_tier = Tier(type=ODIN_TIER_TYPE, id=NORM_ID, attributes={STATE_ATTRIBUTE:NORM_STATE})
        lang_item = Item(id='n1', attributes={ODIN_TAG_ATTRIBUTE:ODIN_LANG_TAG}, text=lang_txt)
        gloss_item = Item(id='n2', attributes={ODIN_TAG_ATTRIBUTE:ODIN_GLOSS_TAG}, text=gloss_txt)
        trans_item = Item(id='n3', attributes={ODIN_TAG_ATTRIBUTE:ODIN_TRANS_TAG}, text=trans_txt)
        norm_tier.extend([lang_item, gloss_item, trans_item])
        inst.append(norm_tier)

        # -------------------------------------------
        # Phrase tiers.
        # -------------------------------------------
        generate_lang_phrase_tier(inst)
        generate_trans_phrase_tier(inst)

        # -------------------------------------------
        # Translation words and their POS tags.
        # -------------------------------------------
        trans_words = create_word_tier(ODIN_TRANS_TAG, trans_tokens, trans_phrase(inst)[0])
        inst.append(trans_words)

        if not hindi:
            trans_postags = process_postags(trans_snt, trans_tokens)

        add_pos_tags(inst, trans_words.id, trans_postags, tag_method=INTENT_POS_MANUAL)

        # -------------------------------------------
        # Language and gloss words.
        # -------------------------------------------
        lang_words = create_word_tier(ODIN_LANG_TAG, lang_tokens, lang_phrase(inst)[0])
        gloss_words = create_word_tier(ODIN_GLOSS_TAG, gloss_tokens, gloss_item)
        inst.extend([lang_words, gloss_words])

        # Each gloss word is aligned with the language word at the same position.
        for lang_word, gloss_word in zip(lang_words, gloss_words):
            gloss_word.alignment = lang_word.id

        if not hindi:
            lang_postags = process_postags(gloss_snt, gloss_tokens)
            gloss_postags = lang_postags

        add_pos_tags(inst, lang_words.id, lang_postags, tag_method=INTENT_POS_MANUAL)
        add_pos_tags(inst, gloss_words.id, gloss_postags, tag_method=INTENT_POS_MANUAL)

        # -------------------------------------------
        # Dependency tiers for both sides.
        # -------------------------------------------
        create_dt_tier(inst, assemble_ds(gloss_snt, gloss_indices), lang_words, INTENT_DS_MANUAL)
        create_dt_tier(inst, assemble_ds(trans_snt, trans_indices), trans_words, INTENT_DS_MANUAL)

        # -------------------------------------------
        # Word alignments, stored as (translation index, language index).
        # -------------------------------------------
        word_aln = Alignment()
        for word_alignment in word_alignments:
            a_ref = word_alignment.find('./a.rf').text.split('#')[1]
            b_ref = word_alignment.find('./b.rf').text.split('#')[1]

            a_word = a_snt.getid(a_ref)
            b_word = b_snt.getid(b_ref)

            if a_word is None or b_word is None:
                continue

            if hindi:
                # 1-based position of the word within its sentence.
                a_idx = a_snt.index(a_word)+1
                b_idx = b_snt.index(b_word)+1
            else:
                a_idx = a_word.order
                b_idx = b_word.order

            # The translation index always goes first in the pair.
            pair = (b_idx, a_idx) if a_glossed else (a_idx, b_idx)
            word_aln.add(pair)

        # The same index pairs are used for the language and the gloss tiers.
        set_bilingual_alignment(inst, trans(inst), lang(inst), word_aln, INTENT_ALN_MANUAL)
        set_bilingual_alignment(inst, trans(inst), gloss(inst), word_aln, INTENT_ALN_MANUAL)

        xc.append(inst)

    with open(out_path, 'w', encoding='utf-8') as out_f:
        xigtxml.dump(out_f, xc)
Exemplo n.º 9
0
    def test_read_ds_tree(self):
        """
        The dependency structure returned for inst1's translation tier must
        be structurally equal to the expected tree.
        """
        actual_tree = get_ds(self.inst1, trans(self.inst1))
        expected_tree = DepTree.fromstring(
            """(ROOT[0] (found[2] (Someone[1]) (them[3]) (boring[4])))""",
            stype=DEPSTR_PTB,
        )

        self.assertTrue(expected_tree.structurally_eq(actual_tree))