Пример #1
0
    def same_type_different_tiers_test(self):
        pos1_id = gen_tier_id(self.i, 'pos', tier_type='pos', alignment='tw')
        self.assertEqual(pos1_id, 'tw-pos')

        self.i.append(Tier(id=pos1_id, type='pos', alignment='tw'))
        pos2_id = gen_tier_id(self.i, 'pos', tier_type='pos', alignment='tw')
        self.assertEqual(pos2_id, 'tw-pos_b')
Пример #2
0
def create_text_tier_from_lines(inst, lines, id_base, state):
    """
    Given a list of lines that are dicts with the attributes 'text' and 'tag', create
    a text tier of the specified type with the provided line items.

    :type lines: list[dict]
    """
    # -------------------------------------------
    # 1) Generate the parent tier.
    tier = Tier(id=gen_tier_id(inst, id_base), type=ODIN_TYPE, attributes={STATE_ATTRIBUTE:state})


    # -------------------------------------------
    # 2) Iterate over the list of lines
    for line in lines:

        # Make sure the line is a dict.
        if not hasattr(line, 'get') or 'text' not in line or 'tag' not in line:
            raise Exception("When constructing tier from lines, must be a list of dicts with keys 'text' and 'tag'.")

        # Construct the list of tags.
        alltags = []
        if line.get('tag') is not None:
            alltags.append(line.get('tag'))
        if line.get('labels') is not None and line.get('labels'):
            alltags.append(line.get('labels'))
        tag_str = '+'.join(alltags)


        # Construct the attributes
        line_attributes = {ODIN_TAG_ATTRIBUTE:tag_str}
        if line.get('judgment') is not None:
            line_attributes[ODIN_JUDGMENT_ATTRIBUTE] = line['judgment']

        # Add the linenumber
        if line.get('lineno'):
            line_attributes['line'] = line.get('lineno', '')


        l = Item(id=gen_item_id(tier.id, len(tier)),
                   attributes=line_attributes,
                   text=line.get('text'))
        tier.append(l)
    return tier
Пример #3
0
def naacl_to_xigt(naacl_path):
    """
    Convert the NAACL format to XIGT.

    :param naacl_path:
    """
    content = open(naacl_path, 'r').read()

    # First, collect all the instances.
    instances = re.findall('Igt_id[\s\S]+?Q6.*Answer', content)

    xc = XigtCorpus()

    for instance_txt in instances:
        # id = re.search('Igt_id=([\S]+)', instance_txt).group(1)
        inst = Igt(id='i{}'.format(len(xc)))

        lang_raw, gloss_raw, trans_raw = instance_txt.split('\n')[1:4]

        # Now, create the raw tier...
        raw_tier = Tier(id=gen_tier_id(inst, 'r'), type='odin', attributes={STATE_ATTRIBUTE:RAW_STATE})
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=lang_raw, attributes={ODIN_TAG_ATTRIBUTE:ODIN_LANG_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=gloss_raw, attributes={ODIN_TAG_ATTRIBUTE:ODIN_GLOSS_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=trans_raw, attributes={ODIN_TAG_ATTRIBUTE:ODIN_TRANS_TAG}))

        inst.append(raw_tier)
        xc.append(inst)

        # Generate the clean/normal tiers, but without any cleaning.
        generate_normal_tier(inst, clean=False)

        # Lang Dependency representation handling...
        lang_ds_str = re.search('Q6:([\s\S]+?)Q6:', instance_txt).group(1)
        lang_ds_lines = lang_ds_str.split('\n')[5:-3]

        try:
            lang_dt = parse_naacl_dep(lang(inst), lang_ds_lines)
            create_dt_tier(inst, lang_dt, lang(inst), parse_method=INTENT_POS_MANUAL)
        except TreeError as te:
            pass
        except IndexError as ie:
            pass

        # Eng DS handling...
        eng_ds_str = re.search('Q3:([\s\S]+?)Q3:', instance_txt).group(1)
        eng_ds_lines = eng_ds_str.split('\n')[2:-3]

        try:
            eng_dt = parse_naacl_dep(trans(inst), eng_ds_lines)
            create_dt_tier(inst, eng_dt, trans(inst), parse_method=INTENT_POS_MANUAL)
        except TreeError as te:
            pass
        except IndexError as ie:
            pass
        except ValueError as ve:
            pass

        # Add Alignment...
        biling_aln_str = re.search('Q5:([\s\S]+?)Q5:', instance_txt).group(1)
        biling_aln_lines = biling_aln_str.split('\n')[4:-3]

        trans_offset = trans_raw.startswith(' ')
        gloss_offset = gloss_raw.startswith(' ')

        try:
            a = Alignment()
            for line in biling_aln_lines:
                gloss_s, trans_s = line.split()[0:2]

                if '.' in gloss_s:
                    continue

                gloss_i = int(gloss_s)

                for trans_token in trans_s.split(','):
                    trans_i = int(trans_token)
                    if trans_i == 0:
                        continue
                    else:
                        if trans_offset:
                            trans_i -= 1
                        if gloss_offset:
                            gloss_i -= 1
                        a.add((trans_i, gloss_i))
        except:
            pass

        set_bilingual_alignment(inst, trans(inst), gloss(inst), a, aln_method=INTENT_ALN_MANUAL)

    return xc
Пример #4
0
 def unique_tier_test(self):
     tw_id = gen_tier_id(self.i, 'w', tier_type='words', alignment='t', no_hyphenate=True)
     print(tw_id)