예제 #1
0
def generate_phrase_tier(inst, tag, id, type) -> Tier:
    """
    Return the phrase tier for ``tag`` in ``inst``, creating and attaching it if absent.

    A tier of the requested ``type`` whose ODIN tags contain ``tag`` is looked up
    with ``xigt_find``. If none is found, a new tier is built with the given
    ``id`` and ``type``, populated with one item per line returned by
    ``retrieve_normal_lines(inst, tag)``, and appended to ``inst``.

    :param inst: The instance to search and, when needed, extend.
    :param tag: The ODIN tag the phrase tier must carry.
    :param id: ID given to a newly created tier.
    :param type: Tier type to search for, and to use for a newly created tier.
    :return: The existing or newly created phrase tier.
    """

    def has_tag(candidate):
        return tag in odin_tags(candidate)

    existing = xigt_find(inst, type=type, others=[has_tag])
    if existing is not None:
        return existing

    # Nothing found: build the phrase tier on top of the normalized tier.
    norm = generate_normal_tier(inst)
    phrase_tier = Tier(id=id, type=type, content=norm.id)

    for line in retrieve_normal_lines(inst, tag):
        # Carry the judgment attribute over from the line to the phrase item,
        # but only when the line actually has one.
        judgment = line.attributes.get(ODIN_JUDGMENT_ATTRIBUTE)
        attrs = {} if judgment is None else {ODIN_JUDGMENT_ATTRIBUTE: judgment}

        new_item = Item(id=ask_item_id(phrase_tier), content=line.id, attributes=attrs)
        phrase_tier.append(new_item)

    inst.append(phrase_tier)
    return phrase_tier
예제 #2
0
def words_to_morph_tier(tier, type, id, aln_attribute):
    """
    Build a morpheme tier whose items refer back into the words of ``tier``.

    :param tier: The word tier whose items are split into morphemes.
    :type tier: Tier
    :param type: Tier type for the new tier.
    :param id: ID for the new tier.
    :param aln_attribute: Name of the reference attribute set on the new tier
                          (pointing at ``tier.id``) and on each of its items.
    :return: The newly created tier.
    """
    morph_tier = Tier(id=id, attributes={aln_attribute: tier.id}, type=type)

    for word in tier:
        pieces = tokenize_item(word, morpheme_tokenizer)

        for piece in pieces:
            # A word that tokenizes to exactly one morph is referenced by its
            # bare id; otherwise each morph gets a span expression into the word.
            ref = (word.id if len(pieces) == 1
                   else create_aln_expr(word.id, piece.start, piece.stop))

            morph_item = Item(id=gen_item_id(morph_tier.id, len(morph_tier)),
                              attributes={aln_attribute: ref})
            morph_tier.append(morph_item)

    return morph_tier
예제 #3
0
def create_words_tier(cur_item, word_id, word_type, aln_attribute=SEGMENTATION, tokenizer=sentence_tokenizer):
    """
    Tokenize an item and return a new words tier that references it.

    :param cur_item: The phrase or line item whose text is tokenized into words.
    :param word_id: The ID for the new tier.
    :type word_id: str
    :param word_type: Tier type for the new tier.
    :type word_type: str
    :param aln_attribute: Name of the reference attribute set on the tier
                          (pointing at ``cur_item.tier.id``) and on every word item.
    :param tokenizer: Tokenizer passed through to ``tokenize_item``.
    :return: The newly created tier, with one item per token.
    """
    text = cur_item.value()

    # Edge case: the line is defined but empty (or whitespace only), in which
    # case the tier is created without any word items.
    if text is not None and text.strip():
        tokens = tokenize_item(cur_item, tokenizer=tokenizer)
    else:
        tokens = []

    word_tier = Tier(id=word_id,
                     type=word_type,
                     attributes={aln_attribute: cur_item.tier.id},
                     igt=cur_item.igt)

    for token in tokens:
        # Each word is expressed as a span of the source item, not as its own text.
        item_id = gen_item_id(word_tier.id, len(word_tier))
        span = create_aln_expr(cur_item.id, token.start, token.stop)
        word_tier.append(Item(id=item_id, attributes={aln_attribute: span}, tier=word_tier))

    return word_tier
예제 #4
0
def create_word_tier(tag, words, src_item=None):
    """
    Build the words tier for a translation, gloss, or language line.

    The tier's id, type and reference attributes are chosen from ``tag``. When
    ``src_item`` is given, every word becomes an item whose reference attribute
    is a span into ``src_item``; the words are located left to right, each
    search starting where the previous word ended. Without ``src_item`` each
    word becomes an item carrying the word as its text.

    For the gloss tag, ``add_word_level_info(wt, INTENT_GLOSS_WORD)`` is applied
    to the finished tier.

    :param tag: One of ``ODIN_TRANS_TAG``, ``ODIN_GLOSS_TAG`` or ``ODIN_LANG_TAG``.
    :param words: The word strings, in order of appearance.
    :type words: list[str]
    :param src_item: Optional item the words are drawn from.
    :type src_item: xigt.model.Item
    :return: The newly created words tier.
    :raises ValueError: If ``tag`` is not one of the supported tags, or (raised
                        by ``str.index``) if a word is not found in the
                        remaining source text.
    """

    if tag == ODIN_TRANS_TAG:
        wt = Tier(id=TRANS_WORD_ID, type=TRANS_WORD_TYPE, segmentation=TRANS_PHRASE_ID)
        aln_attr = SEGMENTATION
    elif tag == ODIN_GLOSS_TAG:
        wt = Tier(id=GLOSS_WORD_ID, type=GLOSS_WORD_TYPE, content=NORM_ID, alignment=LANG_WORD_ID)
        aln_attr = CONTENT
    elif tag == ODIN_LANG_TAG:
        wt = Tier(id=LANG_WORD_ID, type=LANG_WORD_TYPE, segmentation=LANG_PHRASE_ID)
        aln_attr = SEGMENTATION
    else:
        # Without this branch `wt` would be unbound and the function would fail
        # later with an UnboundLocalError that hides the real cause.
        raise ValueError("Unsupported tag for word tier creation: {!r}".format(tag))

    segmenting = bool(src_item)
    src_text = src_item.value() if segmenting else None

    # Absolute position in src_text just past the previously matched word.
    cursor = 0

    for w in words:
        if segmenting:
            # Search only at or after the previous match so repeated words map
            # to successive occurrences rather than all to the first one.
            start = src_text.index(w, cursor)
            cursor = start + len(w)
            i = Item(id=ask_item_id(wt),
                     attributes={aln_attr: create_aln_expr(src_item.id, start, cursor)})
        else:
            i = Item(id=ask_item_id(wt), text=w)

        wt.append(i)

    if tag == ODIN_GLOSS_TAG:
        add_word_level_info(wt, INTENT_GLOSS_WORD)

    return wt
예제 #5
0
 def test_append(self):
     """Check what Tier.append rejects and that accepted items keep their order."""
     tier = Tier()

     # None of these model objects may be appended to a tier.
     for wrong_type in (Tier, Igt, XigtCorpus, Metadata, Meta):
         with self.assertRaises(XigtStructureError):
             tier.append(wrong_type())
     self.assertEqual(len(tier), 0)

     tier.append(Item(id='t1'))
     self.assertEqual(len(tier), 1)

     # A second item reusing an existing id is refused.
     with self.assertRaises(XigtError):
         tier.append(Item(id='t1'))

     tier.append(Item(id='t2'))
     self.assertEqual(len(tier), 2)
     self.assertEqual(tier[0].id, 't1')
     self.assertEqual(tier[1].id, 't2')
예제 #6
0
def create_text_tier_from_lines(inst, lines, id_base, state):
    """
    Create an ODIN-type text tier holding one item per entry of ``lines``.

    Each entry must be dict-like and contain the keys 'text' and 'tag'. The
    optional keys 'labels', 'judgment' and 'lineno' are copied into the item's
    attributes when present. The tier is returned without being appended to
    ``inst``; ``inst`` is only used to generate the tier id.

    :param inst: Instance passed to ``gen_tier_id`` to pick the tier id.
    :param lines: The line descriptions to convert.
    :type lines: list[dict]
    :param id_base: Base string passed to ``gen_tier_id``.
    :param state: Value stored under ``STATE_ATTRIBUTE`` on the tier.
    :return: The newly created tier.
    :raises Exception: If an entry is not dict-like or lacks 'text' or 'tag'.
    """
    text_tier = Tier(id=gen_tier_id(inst, id_base),
                     type=ODIN_TYPE,
                     attributes={STATE_ATTRIBUTE: state})

    for entry in lines:

        # Reject anything that is not dict-like or is missing a required key.
        if not (hasattr(entry, 'get') and 'text' in entry and 'tag' in entry):
            raise Exception("When constructing tier from lines, must be a list of dicts with keys 'text' and 'tag'.")

        # The tag attribute is the tag, followed by the labels when there are
        # any, joined with '+'.
        tag = entry.get('tag')
        labels = entry.get('labels')
        tag_parts = [] if tag is None else [tag]
        if labels:
            tag_parts.append(labels)

        attrs = {ODIN_TAG_ATTRIBUTE: '+'.join(tag_parts)}

        judgment = entry.get('judgment')
        if judgment is not None:
            attrs[ODIN_JUDGMENT_ATTRIBUTE] = judgment

        # The line number is recorded only when it is truthy.
        lineno = entry.get('lineno')
        if lineno:
            attrs['line'] = lineno

        text_tier.append(Item(id=gen_item_id(text_tier.id, len(text_tier)),
                              attributes=attrs,
                              text=entry.get('text')))

    return text_tier
예제 #7
0
    def test_resolve_ref(self):
        """Step resolve_ref through each failure mode, then the successful lookups."""
        item_b = Item(id='b1')

        # The item has no reference attribute at all.
        with self.assertRaises(KeyError):
            item_b.resolve_ref('alignment')

        # Reference attribute is set, but the item is not contained by a tier.
        item_b.alignment = 'a1'
        with self.assertRaises(XigtStructureError):
            item_b.resolve_ref('alignment')

        # Item is in a tier, but the tier has no reference attribute.
        tier_b = Tier(id='b', items=[item_b])
        with self.assertRaises(KeyError):
            item_b.resolve_ref('alignment')

        # Tier has the reference attribute, but is not contained by an Igt.
        tier_b.alignment = 'a'
        with self.assertRaises(XigtStructureError):
            item_b.resolve_ref('alignment')

        # Tier is in an Igt, but the referred tier does not exist.
        igt = Igt(tiers=[tier_b])
        with self.assertRaises(XigtStructureError):
            item_b.resolve_ref('alignment')

        # Referred tier exists, but holds no item matching the item's alignment.
        tier_a = Tier(id='a')
        igt.append(tier_a)
        with self.assertRaises(XigtStructureError):
            item_b.resolve_ref('alignment')

        # Referred item exists but has no value, which resolves to ''.
        item_a = Item(id='a1')
        tier_a.append(item_a)
        self.assertEqual(item_b.resolve_ref('alignment'), '')

        # Referred item now has a value.
        item_a.text = 'text'
        self.assertEqual(item_b.resolve_ref('alignment'), 'text')

        # Items stored on the test case.
        with self.assertRaises(KeyError):
            self.i1.resolve_ref('alignment')

        with self.assertRaises(KeyError):
            self.i2.resolve_ref('alignment')

        self.assertEqual(self.i_ac.resolve_ref('alignment'), 'text')
        self.assertEqual(self.i_ac.resolve_ref('content'), 'te')

        self.assertEqual(self.i_s.resolve_ref('segmentation'), 'xt')

        self.assertEqual(self.i_t.resolve_ref('content'), 'text')
예제 #8
0
파일: naacl.py 프로젝트: rgeorgi/intent
def naacl_to_xigt(naacl_path):
    """
    Convert a file in the NAACL format to a XIGT corpus.

    Each instance is the text matched from ``Igt_id`` through the ``Q6 ... Answer``
    marker. Lines 2-4 of an instance are taken as the raw language, gloss and
    translation lines. For every instance this function:

    * builds the raw tier and the (uncleaned) normal tier,
    * tries to attach dependency structures parsed from the ``Q6`` (language)
      and ``Q3`` (translation) sections, skipping them if parsing fails,
    * reads the bilingual alignment from the ``Q5`` section and stores it with
      ``set_bilingual_alignment``.

    :param naacl_path: Path of the NAACL-format file to read.
    :return: The corpus containing one ``Igt`` per instance found.
    :raises AttributeError: If an instance lacks a ``Q6``, ``Q3`` or ``Q5``
                            section (the corresponding ``re.search`` returns None).
    """
    # Read inside a context manager so the file handle is always closed.
    with open(naacl_path, 'r') as naacl_file:
        content = naacl_file.read()

    # First, collect all the instances.
    instances = re.findall(r'Igt_id[\s\S]+?Q6.*Answer', content)

    xc = XigtCorpus()

    for instance_txt in instances:
        inst = Igt(id='i{}'.format(len(xc)))

        lang_raw, gloss_raw, trans_raw = instance_txt.split('\n')[1:4]

        # Now, create the raw tier...
        raw_tier = Tier(id=gen_tier_id(inst, 'r'), type='odin', attributes={STATE_ATTRIBUTE: RAW_STATE})
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=lang_raw, attributes={ODIN_TAG_ATTRIBUTE: ODIN_LANG_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=gloss_raw, attributes={ODIN_TAG_ATTRIBUTE: ODIN_GLOSS_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=trans_raw, attributes={ODIN_TAG_ATTRIBUTE: ODIN_TRANS_TAG}))

        inst.append(raw_tier)
        xc.append(inst)

        # Generate the clean/normal tiers, but without any cleaning.
        generate_normal_tier(inst, clean=False)

        # Lang dependency representation handling...
        lang_ds_str = re.search(r'Q6:([\s\S]+?)Q6:', instance_txt).group(1)
        lang_ds_lines = lang_ds_str.split('\n')[5:-3]

        # Best effort: an instance whose dependency structure cannot be parsed
        # is kept, just without that tier.
        try:
            lang_dt = parse_naacl_dep(lang(inst), lang_ds_lines)
            create_dt_tier(inst, lang_dt, lang(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError):
            pass

        # Eng DS handling...
        eng_ds_str = re.search(r'Q3:([\s\S]+?)Q3:', instance_txt).group(1)
        eng_ds_lines = eng_ds_str.split('\n')[2:-3]

        try:
            eng_dt = parse_naacl_dep(trans(inst), eng_ds_lines)
            create_dt_tier(inst, eng_dt, trans(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError, ValueError):
            pass

        # Add alignment...
        biling_aln_str = re.search(r'Q5:([\s\S]+?)Q5:', instance_txt).group(1)
        biling_aln_lines = biling_aln_str.split('\n')[4:-3]

        # A raw line starting with a space shifts that line's indices down by one.
        trans_offset = trans_raw.startswith(' ')
        gloss_offset = gloss_raw.startswith(' ')

        # Created outside the try so `a` is always this instance's alignment,
        # never unbound or left over from the previous instance.
        a = Alignment()
        try:
            for line in biling_aln_lines:
                gloss_s, trans_s = line.split()[0:2]

                if '.' in gloss_s:
                    continue

                # Apply the gloss offset once per line. Doing it inside the
                # loop below would decrement it again for every translation
                # token the gloss index is aligned to.
                gloss_i = int(gloss_s)
                if gloss_offset:
                    gloss_i -= 1

                for trans_token in trans_s.split(','):
                    trans_i = int(trans_token)
                    if trans_i == 0:
                        continue
                    if trans_offset:
                        trans_i -= 1
                    a.add((trans_i, gloss_i))
        except Exception:
            # Best effort: keep whatever pairs were read before the malformed
            # line. Unlike a bare `except`, this lets KeyboardInterrupt and
            # SystemExit propagate.
            pass

        set_bilingual_alignment(inst, trans(inst), gloss(inst), a, aln_method=INTENT_ALN_MANUAL)

    return xc
예제 #9
0
파일: test_model.py 프로젝트: xigt/xigt
 def test_append(self):
     """Check what Tier.append rejects and that accepted items keep their order."""
     tier = Tier()

     # None of these model objects may be appended to a tier.
     for wrong_type in (Tier, Igt, XigtCorpus, Metadata, Meta):
         with pytest.raises(XigtStructureError):
             tier.append(wrong_type())
     assert len(tier) == 0

     tier.append(Item(id='t1'))
     assert len(tier) == 1

     # A second item reusing an existing id is refused.
     with pytest.raises(XigtError):
         tier.append(Item(id='t1'))

     tier.append(Item(id='t2'))
     assert len(tier) == 2
     assert tier[0].id == 't1'
     assert tier[1].id == 't2'