Exemplo n.º 1
0
def character_align(word_string, subword_string, skip_re=r'[\-\s]'):
    """
    Given a word-level line and a subword-level line that are character aligned
    (with the exception of the characters to skip), link the tokens of the
    subword-level string to the words of the word-level string.

    Subword tokens are consumed left to right. A token stays with the current
    word for as long as its text (minus the skipped characters) is found,
    case-insensitively, inside that word; the first token that is not found
    closes the current word and starts the next one.

    The result is only printed (the hyphenated phrase and the per-word subword
    lists); nothing is returned.

    :param word_string: The whitespace-separated word-level line.
    :param subword_string: The subword-level line; tokens are separated by
        whitespace and/or hyphens.
    :param skip_re: Regular expression matching the characters that are removed
        from a subword before it is looked up in the current word.
    :return: None
    """
    p = Phrase()

    word_strings = word_string.split()
    cur_subwords = []
    word_idx = 0

    # Iterate through all the whitespace/hyphen-separated subwords. re.split()
    # yields empty strings for leading/trailing separators; those carry no
    # content, so they are dropped.
    subwords = [SubWord(sw)
                for sw in re.split(r'[\s\-]+', subword_string) if sw]
    for subword in subwords:

        # The subword is data, not a pattern: escape it so that characters such
        # as "." or "(" are matched literally instead of being interpreted as
        # (possibly invalid) regex syntax.
        needle = re.escape(re.sub(skip_re, '', subword.string))

        # Once the word-level line is exhausted there is no next word to move
        # to, so any remaining subwords stay attached to the current group.
        out_of_words = word_idx >= len(word_strings)
        if not out_of_words and not re.search(
                needle, word_strings[word_idx], flags=re.IGNORECASE):
            w = Word(subwords=cur_subwords)
            cur_subwords = []
            p.add_word(w)
            word_idx += 1

        cur_subwords.append(subword)

    # Flush the subwords collected for the final word; without this the last
    # word of the line would never be added to the phrase.
    if cur_subwords:
        p.add_word(Word(subwords=cur_subwords))

    print(p.hyphenated)
    print([w.subwords for w in p])
Exemplo n.º 2
0
def create_phrase_from_words_tier(tier: xigt.model.Tier,
                                  id_to_object_mapping: dict,
                                  segment_id_base: str = None,
                                  WordType=Word):
    """
    Build a phrase from a words tier that has no pre-provided segmentation.

    Every item of the tier becomes one ``WordType`` object, registered in
    ``id_to_object_mapping`` under the item's id. When ``segment_id_base`` is
    given, each word is additionally split into subwords, and the subwords of
    the finished phrase are numbered from 1 with ids built from that base and
    registered in the mapping as well.

    :param tier: The words tier to convert.
    :param id_to_object_mapping: Mapping from id strings to the created
        objects; updated in place.
    :param segment_id_base: Id prefix for generated subwords, or a falsy value
        to create the words directly from their string without subwords.
    :param WordType: The class used to wrap each word.
    :return: The phrase made of the created words.
    """
    make_subwords = bool(segment_id_base)

    built_words = []
    for xigt_word_item in tier:  # type: xigt.model.Item
        if make_subwords:
            new_word = WordType(
                subwords=word_str_to_subwords(xigt_word_item.value()),
                id_=xigt_word_item.id)
        else:
            new_word = WordType(xigt_word_item.value(),
                                id_=xigt_word_item.id)
        id_to_object_mapping[xigt_word_item.id] = new_word
        built_words.append(new_word)

    phrase = Phrase(built_words)
    if not make_subwords:
        return phrase

    # Give id strings to the newly created subwords (1-based numbering).
    for position, subword in enumerate(phrase.subwords, start=1):
        subword.id = item_id(segment_id_base, position)
        id_to_object_mapping[subword.id] = subword

    return phrase
Exemplo n.º 3
0
def read_xigt(xigt_path):
    """
    Read a Xigt-XML file and build INTENT phrases for its first instance.

    Only the first instance of the corpus is processed. The language, gloss
    and translation phrases are printed, then every gloss word is aligned with
    the language word at the same position and its alignments are printed.

    NOTE(review): nothing is returned; the function currently only prints.

    :param xigt_path: Path of the Xigt-XML file (read as UTF-8).
    :return: None
    """
    with open(xigt_path, 'r', encoding='utf-8') as xigt_f:
        corpus = xigt.codecs.xigtxml.load(xigt_f)
        first_inst = corpus[0]

        # Language line: morphemes grouped into words.
        lang_phrase = create_lang_phrase(first_inst)

        # Translation line: plain whitespace tokenization.
        trans_text = xigt_find(first_inst, id='t')[0].value()
        trans_phrase = Phrase([Word(token) for token in trans_text.split()])

        # Gloss line: glosses grouped by the word of their aligned morpheme.
        gloss_phrase = create_gloss_phrase(first_inst)

        for phrase in (lang_phrase, gloss_phrase, trans_phrase):
            print(phrase)

        # Pair gloss words and language words positionally; zip() stops at
        # the shorter of the two phrases.
        for gloss_word, lang_word in zip(gloss_phrase, lang_phrase):
            gloss_word.add_alignment(lang_word)
            print(gloss_word.alignments)
Exemplo n.º 4
0
def create_lang_phrase(inst) -> Phrase:
    """
    Build the language phrase of an instance from its morpheme tier ("m").

    Consecutive morphemes that share the same ``segmentation`` value are
    collected into one word; a change of that value starts a new word.

    NOTE(review): a final word is always added, so an empty morpheme tier
    yields a phrase containing one word without subwords -- confirm that this
    is intended.

    :type inst: Igt
    :return: The phrase of words built from the morphemes.
    """
    morphs = xigt_find(inst, id='m')

    phrase = Phrase()
    pending = []
    prev_segmentation = None

    for morph in morphs:
        # A different segmentation target means the previous word is complete.
        if prev_segmentation is not None and morph.segmentation != prev_segmentation:
            phrase.add_word(Word(subwords=pending))
            pending = []
        pending.append(SubWord(morph.text))
        prev_segmentation = morph.segmentation

    # Close the word that was still being collected when the tier ended.
    phrase.add_word(Word(subwords=pending))
    return phrase
Exemplo n.º 5
0
def parse_trans_tier(inst, id_to_object_mapping):
    """
    Parse the translation of an instance into a phrase.

    A words tier segmenting "t" is preferred when present. Otherwise the
    single item of the translations tier is tokenized into words with ids
    "tw1", "tw2", ... which are registered in ``id_to_object_mapping``.

    :type id_to_object_mapping: dict
    :type inst: xigt.model.Igt
    :return: The translation phrase, or None when there is no translation
        tier or its first item has no value.
    :raises ImportException: If the translations tier has more than one item.
    """
    trans_phrases_tier = xigt_find(inst, type='translations')
    trans_words_tier = xigt_find(inst, segmentation='t',
                                 type='words')  # type: Tier

    # A pre-tokenized translation words tier takes precedence.
    if trans_words_tier:
        IMPORT_LOG.debug("trans-words tier found.")
        words_phrase = load_words(id_to_object_mapping,
                                  words_tier=trans_words_tier,
                                  WordType=TransWord)
        words_phrase.id = trans_words_tier.id
        return words_phrase

    # Neither a words tier nor a phrase tier: nothing to parse.
    if not trans_phrases_tier:
        return None

    if len(trans_phrases_tier) > 1:
        raise ImportException('NOT IMPLEMENTED: Multiple Translations!')

    if trans_phrases_tier[0].value() is None:
        return None

    # Tokenize the translation line and wrap each token in a TransWord.
    trans_phrase = Phrase(id_='tw')
    trans_tier_str = trans_phrases_tier[0].value()
    message = 'No trans-word tier found for instance "{}", tokenizing trans phrase: "{}"'
    IMPORT_LOG.debug(message.format(inst.id, trans_tier_str))

    for position, token in enumerate(word_tokenize(trans_tier_str), start=1):
        trans_word = TransWord(token, id_=item_id('tw', position))
        id_to_object_mapping[trans_word.id] = trans_word
        trans_phrase.append(trans_word)

    return trans_phrase
Exemplo n.º 6
0
def parse_odin(xigt_inst, tag, WordType, word_id_base, subword_id_base):
    """
    Convert the normalized ODIN line with the given tag into a phrase.

    The instance's "odin" tier whose state is "normalized" is searched for the
    line tagged ``tag`` (upper-cased). Its text is tokenized into words, which
    are optionally split into subwords, and ids are then assigned to the
    tokens.

    :param xigt_inst: The Xigt instance being parsed.
    :type xigt_inst: xigt.model.Igt
    :param tag: The L/G/T tag for which an ODIN line is being searched for.
    :param WordType: What word class to use to wrap the word objects.
    :param word_id_base: What ID string to use for the word tokens' prefixes.
    :param subword_id_base: What ID string to use for the subword tokens'
        prefixes, or None to not create subword tokens.
    :return: The phrase for the line, or an empty phrase when the normalized
        tier or a non-blank line with that tag does not exist.
    """
    normalized_tier = xigt_find(xigt_inst,
                                type="odin",
                                attributes={'state': 'normalized'})
    if not normalized_tier:
        return Phrase()

    normalized_line = xigt_find(normalized_tier,
                                tag=tag.upper())  # type: xigt.model.Item

    # The line must exist, have a value, and that value must not be blank.
    has_content = bool(normalized_line
                       and normalized_line.value()
                       and normalized_line.value().strip())
    if not has_content:
        return Phrase()

    # Only segment into subwords if the id_base is not None.
    segment_words = subword_id_base is not None

    words = []
    for word_string in word_tokenize(normalized_line.value()):
        assert word_string.strip()

        if segment_words:
            word = WordType(subwords=word_str_to_subwords(word_string))
        else:
            word = WordType(string=word_string)
        words.append(word)

    # Create the phrase, and assign the tokens unique IDs.
    phrase = Phrase(words)
    assign_ids(phrase, word_id_base, subword_id_base)
    return phrase
Exemplo n.º 7
0
def process_toolbox_instance(toolbox_instance):
    r"""
    Instances from toolbox files look like the following:

        \ref 20030701TifolafengBapakAnde1_0003
        \ELANBegin 9.371
        \ELANEnd 15.509
        \ELANParticipant Andirias Padafani
        \sound 20030701TifolafengBapakAnde1.wav 9.371 15.509
        \t heelo              maiye, ni           ni'aduo             hamintaahi,#    ni           yaa pun   namei.#
        \m he-    nil   -o    maiye  ni           ni-          adua   ha-    mintaahi ni           yaa pun   namei
        \g 3.LOC- do.so -PNCT if     1PL.EXCL.AGT 1PL.EXCL.AL- master 3.PAT- pray.CPL 1PL.EXCL.AGT go  field prepare.field
        \p pro-   v.loc -asp  conj   pro          pro-         n      pro-   v.pat    pro          v.0 n     v.0

        \f when it is that time, we pray to the Lord and go work in the fields

    Given this format, attempt to parse into INTENT objects.

    The docstring is a raw string so that the backslash markers above are kept
    literally instead of being read as escape sequences (``\t``, ``\f``,
    ``\r`` ...).

    :type toolbox_instance: str
    :return: None if any of the ``\t``, ``\m``, ``\g`` or ``\f`` lines is
        missing or empty.
    """

    # -- 1a) Define a method to retrieve lines from the instance preceded by the given linetype
    #        (Noting that there can be multiple entries for the same linetype per instance)
    def get_toolbox_line(linetype: str) -> str:
        # The marker is matched literally (re.escape), and a line is accepted
        # whether it is terminated by a newline or by the end of the instance,
        # so a marker on the final, unterminated line is not lost.
        line_matches = re.findall(
            r'\\{}\s+(.*)(?:\n|$)'.format(re.escape(linetype)),
            toolbox_instance)
        # Multiple lines of the same type are joined with single spaces;
        # None signals that no such line exists.
        return None if not line_matches else ' '.join(line_matches)

    # -- 1b) Get all the types of lines from the instance, if they exist.
    trans_string = get_toolbox_line('f')
    gloss_string = get_toolbox_line('g')
    morph_string = get_toolbox_line('m')
    lang_string = get_toolbox_line('t')

    # -- 1c) Abort if we do not have all four lines.
    if not (trans_string and gloss_string and morph_string and lang_string):
        return None

    # -------------------------------------------
    # Parse.
    # -------------------------------------------
    lang_p = Phrase()

    # One column: a run of non-whitespace plus the padding that follows it.
    columnar_re = re.compile(r'\S+\s*(?=\S|$)')

    # Re-attach morphemes that are separated from their hyphen by padding,
    # e.g. "he-    nil   -o" becomes "he-nil-o".
    morph_line_concatenated = re.sub(r'\s*-\s*', '-', morph_string)
Exemplo n.º 8
0
def create_gloss_phrase(inst) -> Phrase:
    """
    Build the gloss phrase of an instance from its gloss tier ("g").

    Each gloss is looked up through its ``alignment`` to the aligned morpheme,
    and consecutive glosses whose morphemes share the same ``segmentation``
    value are collected into one word.

    NOTE(review): a final word is always added, so an empty gloss tier yields
    a phrase containing one word without subwords -- confirm that this is
    intended.

    :type inst: Igt
    :return: The phrase of words built from the glosses.
    """
    glosses = xigt_find(inst, id='g')

    phrase = Phrase()
    pending = []
    prev_word_ref = None

    for gloss in glosses:
        # The word a gloss belongs to is the one segmented by its morpheme.
        word_ref = xigt_find(inst, id=gloss.alignment).segmentation
        if prev_word_ref is not None and word_ref != prev_word_ref:
            phrase.add_word(Word(subwords=pending))
            pending = []
        pending.append(SubWord(gloss.text))
        prev_word_ref = word_ref

    # Close the word that was still being collected when the tier ended.
    phrase.add_word(Word(subwords=pending))
    return phrase
Exemplo n.º 9
0
def parse_lang_tier(inst, id_to_object_mapping):
    """
    Build the language-line phrase of an instance.

    The words tier with id ``LANG_WORD_ID`` is looked up together with the
    tier that segments it. The words are then loaded either from the
    pre-provided segmentation or, when no segmentation tier exists, by
    subword-level tokenization of the words themselves (subword id base "m").

    :type inst: xigt.model.Igt
    :type id_to_object_mapping: dict
    :return: The language phrase with id ``LANG_WORD_ID``; an empty phrase
        with that id when the instance has no such words tier.
    """
    lang_tier = xigt_find(inst, id=LANG_WORD_ID)
    morph_tier = xigt_find(inst, segmentation=LANG_WORD_ID)

    if lang_tier:
        phrase = load_words(id_to_object_mapping,
                            words_tier=lang_tier,
                            segmentation_tier=morph_tier,
                            segment_id_base='m',
                            WordType=LangWord)
        phrase.id = LANG_WORD_ID
    else:
        # No language words at all: hand back an empty, but identified, phrase.
        phrase = Phrase(id_=LANG_WORD_ID)
    return phrase
Exemplo n.º 10
0
def load_words(id_to_object_mapping,
               words_tier=None,
               segmentation_tier=None,
               alignment_tier=None,
               segment_id_base=None,
               WordType=Word):
    """
    Given a words tier (words_tier) and a tier that provides segmentation for
    that tier, but which may be None (segmentation_tier), return a phrase with
    those words/subwords.

    There are four cases:
        A. A words tier and segmentation tier exist
            - Use segmentation tier, check segmentation exists for all words.
        B. A words tier exists, but no segmentation tier
            - Segment the words tier.
        C. No words tier exists, a segmentation tier is aligned with a tier that has words.
            - Group the segments according to the aligned words.
        D. No words tier exists, no segmentation tier exists.
            - Return an empty phrase

    Created words and subwords are registered in ``id_to_object_mapping``
    under the ids of the Xigt items they were created from.

    :type words_tier: xigt.model.Tier
    :type segmentation_tier: xigt.model.Tier
    :type alignment_tier: xigt.model.Tier
    :type id_to_object_mapping: dict
    :param segment_id_base: Id prefix for generated subwords; only used in
        case B.
    :param WordType: The class used to wrap each word.
    :rtype: Phrase
    :raises SegmentationTierException: In case C when no alignment tier is
        given, and in case A when a word has no segments at all.
    """

    # -- C / D) No words tier exists...
    if not words_tier:
        if not segmentation_tier:
            return Phrase()
        if not alignment_tier:
            raise SegmentationTierException(
                'Attempt to create phrase from segmentation tier "{}" in instance "{}" with no word alignments.'
                .format(segmentation_tier.id, segmentation_tier.igt.id))
        IMPORT_LOG.info(
            'Creating words tier "{}" from combination of segmentation "{}" and aligned tier "{}"'
            .format(type_to_id(WordType), segmentation_tier.id,
                    alignment_tier.id))
        return create_phrase_from_segments_alignments(
            id_to_object_mapping, segmentation_tier, alignment_tier,
            WordType)

    # -- B) If there's not a segmentation tier, return the phrase
    #       created by the words tier alone.
    if not segmentation_tier:
        return create_phrase_from_words_tier(words_tier,
                                             id_to_object_mapping,
                                             segment_id_base=segment_id_base,
                                             WordType=WordType)

    # -- A) There is both a words tier and segmentation tier (the only case
    #       left at this point): use the segmentation provided by the
    #       segmentation tier.
    words = []

    # The previously accepted subword is carried across word boundaries; it
    # is handed to the freefloating-hyphen handling below.
    prev_sw = None
    for xigt_word_item in words_tier:  # type: xigt.model.Item

        # For each word in the tier, retrieve the portions of the word
        # that are given as segments.
        morph_segments = [
            morph for morph in segmentation_tier
            if xigt_word_item.id in segmenting_refs(morph)
        ]

        # If the segmentation tier is provided, we assume that every word
        # has some form of segmentation.
        if not morph_segments:
            raise SegmentationTierException(
                'Segmentation tier provided for instance "{}", but no segments for word "{}"'
                .format(words_tier.igt.id, xigt_word_item.id))

        morphs = []
        for xigt_subword in morph_segments:  # type: xigt.model.Item

            sw = subword_str_to_subword(xigt_subword.value(),
                                        id_=xigt_subword.id)
            was_freefloating = handle_freefloating_hyphens(
                sw, prev_sw, segmentation_tier.id,
                segmentation_tier.igt.id, xigt_subword.id)
            # Freefloating hyphens do not become subwords of their own.
            if was_freefloating:
                continue

            id_to_object_mapping[xigt_subword.id] = sw

            # Link the subword to its alignment target, if that target has
            # already been loaded.
            if xigt_subword.alignment and id_to_object_mapping.get(
                    xigt_subword.alignment):
                sw.add_alignment(
                    id_to_object_mapping.get(xigt_subword.alignment))

            morphs.append(sw)
            prev_sw = sw

        # Skip creating a word if it only consisted of an empty hyphen.
        if not morphs:
            IMPORT_LOG.warning(
                'Word "{}" was skipped because it contained only freefloating hyphens.'
                .format(xigt_word_item.id))
            continue

        # Words are renumbered consecutively, so skipped words leave no gap.
        w = WordType(subwords=morphs,
                     id_=item_id(words_tier.id,
                                 len(words) + 1))

        id_to_object_mapping[xigt_word_item.id] = w
        words.append(w)

    return Phrase(words)
Exemplo n.º 11
0
def create_phrase_from_segments_alignments(id_to_object_mapping,
                                           segmentation_tier: xigt.model.Tier,
                                           aligned_tier: xigt.model.Tier,
                                           WordType=Word):
    """
    Create a phrase by using a segmentation tier and the word-level groupings
    provided by the tier with which it is aligned.

    This is useful in the case of glosses which align with morphemes, but
    are not given their own word-level groupings in the data.

    :param id_to_object_mapping: Mapping from id strings to already created
        objects. Alignment targets are looked up in it, and the subwords
        created here are added to it.
    :type id_to_object_mapping: dict
    :param segmentation_tier: The tier whose items become the subwords.
    :param aligned_tier: The tier providing the word-level groupings; used
        here to check the number of groups and in log messages.
    :param WordType: The class used to wrap each created word.
    :return: The phrase of new words, ordered by the index of the words they
        are aligned with.
    :raises ImportException: If an item has no content, does not specify an
        alignment target, or its alignment target is not in the mapping.
    """

    force_token_alignments(segmentation_tier)

    # -- 0) Keep a mapping of word-level groups, and
    #       the subword items that they contain.
    word_to_segment_map = defaultdict(list)

    # -- 1) Iterate over the segmented objects, and
    #       add them to the group map.
    prev_subword = None
    for segment_item in segmentation_tier:  # type: xigt.model.Item

        # The segment item must be specified in some way.
        if segment_item.value() is None:
            raise ImportException('Item "{}" has no content'.format(
                segment_item.id))

        subword_obj = subword_str_to_subword(segment_item.value(),
                                             id_=segment_item.id)

        # Handle freefloating hyphens
        was_freefloating = handle_freefloating_hyphens(
            subword_obj, prev_subword, segmentation_tier.id,
            segmentation_tier.igt.id, segment_item.id)
        if was_freefloating:
            continue

        # Enter the subword obj into the mapping dict.
        id_to_object_mapping[segment_item.id] = subword_obj

        # We assume that there are alignments for every segmentation object
        if not segment_item.alignment:
            # Arguments follow the placeholders: item, its own tier, the tier
            # it is aligned to, and the instance.
            raise ImportException(
                'Item "{}" in tier "{}" aligned to tier "{}" in instance "{}" does not specify alignment target.'
                .format(segment_item.id, segmentation_tier.id,
                        segmentation_tier.alignment,
                        segmentation_tier.igt.id))

        aligned_obj = id_to_object_mapping.get(
            segment_item.alignment)  # type: Union[SubWord, Word]
        if not aligned_obj:
            raise ImportException(
                'Item "{}" in tier "{}" for instance "{}" missing alignment target "{}"'
                .format(segment_item.id, segmentation_tier.id,
                        segmentation_tier.igt.id, segment_item.alignment))
        subword_obj.add_alignment(aligned_obj)

        # Group by word: a subword target is replaced by the word it
        # belongs to.
        aligned_word = aligned_obj if isinstance(aligned_obj,
                                                 Word) else aligned_obj.word

        word_to_segment_map[aligned_word].append(subword_obj)
        prev_subword = subword_obj

    # -- 2) Check that our group map contains the same number of groups as
    #       there exist in the aligned tier. A mismatch is only logged.
    if len(aligned_tier) != len(word_to_segment_map):
        IMPORT_LOG.warning(
            'Mismatch between number of word groups for segmentation tier "{}" and aligned tier "{}" in instance "{}"'
            .format(segmentation_tier.id, aligned_tier.id,
                    aligned_tier.igt.id))

    # -- 3) Now, create words based on the groupings provided by the
    #       group map, in the order of the aligned words.
    word_groups = sorted(word_to_segment_map.keys(),
                         key=lambda word: word.index)
    phrase = Phrase()
    for aligned_word in word_groups:  # type: Word
        # The new word takes its (1-based) number from the aligned word's index.
        new_word = WordType(subwords=word_to_segment_map[aligned_word],
                            id_=item_id(type_to_id(WordType),
                                        aligned_word.index + 1))
        new_word.add_alignment(aligned_word)
        phrase.add_word(new_word)
    return phrase