def character_align(word_string, subword_string, skip_re=r'[\-\s]'):
    """
    Link a character-aligned subword-level line to the words of a word-level
    line.

    The subword line is split on runs of whitespace and hyphens. Subwords are
    collected for the current word until one is found whose text (with the
    ``skip_re`` characters removed) does not occur in that word; at that point
    the collected subwords are emitted as a word and matching moves on to the
    next word of ``word_string``.

    The resulting phrase is only printed (its ``hyphenated`` form, then the
    subwords of each word); nothing is returned.

    :param word_string: The word-level line; words are whitespace-separated.
    :param subword_string: The subword-level line; subwords are separated by
                           whitespace and/or hyphens.
    :param skip_re: Regular expression for the characters that are removed
                    from a subword before it is compared against a word.
    """
    p = Phrase()
    word_strings = word_string.split()
    cur_subwords = []
    word_idx = 0

    # Iterate through all the whitespace/hyphen-separated subwords.
    subwords = [SubWord(sw) for sw in re.split(r'[\s\-]+', subword_string)]
    for subword in subwords:
        # The subword text is data, not a pattern: escape it so characters
        # such as "." or "(" are matched literally instead of being
        # interpreted (or rejected) as regex syntax.
        needle = re.escape(re.sub(skip_re, '', subword.string))
        if not re.search(needle, word_strings[word_idx], flags=re.IGNORECASE):
            p.add_word(Word(subwords=cur_subwords))
            cur_subwords = []
            word_idx += 1
        cur_subwords.append(subword)

    # Emit the subwords collected for the final word; without this the last
    # word of the line never reaches the phrase.
    if cur_subwords:
        p.add_word(Word(subwords=cur_subwords))

    print(p.hyphenated)
    print([w.subwords for w in p])
def create_phrase_from_words_tier(tier: xigt.model.Tier,
                                  id_to_object_mapping: dict,
                                  segment_id_base: str = None,
                                  WordType=Word):
    """
    Given a tier without pre-provided segmentation, return a phrase.

    Every item of the tier becomes a ``WordType`` object that is registered
    in ``id_to_object_mapping`` under the item's id. When ``segment_id_base``
    is truthy, each word is built from the subwords produced by
    ``word_str_to_subwords`` and the phrase's subwords are then numbered from
    1, given ids via ``item_id(segment_id_base, n)``, and registered in the
    mapping as well.

    :param tier: The words tier to convert.
    :param id_to_object_mapping: Mapping of ids to created objects; updated
                                 in place.
    :param segment_id_base: Id prefix for generated subwords, or a falsy
                            value to build unsegmented words.
    :param WordType: The class used to wrap each word.
    :return: The newly created phrase.
    """
    wants_segments = bool(segment_id_base)

    def build_word(item):
        """:type item: Item"""
        text = item.value()
        if wants_segments:
            word = WordType(subwords=word_str_to_subwords(text), id_=item.id)
        else:
            word = WordType(text, id_=item.id)
        id_to_object_mapping[item.id] = word
        return word

    phrase = Phrase([build_word(item) for item in tier])
    if not wants_segments:
        return phrase

    # Give id strings to the newly created subwords.
    for position, subword in enumerate(phrase.subwords, start=1):
        subword.id = item_id(segment_id_base, position)
        id_to_object_mapping[subword.id] = subword
    return phrase
def read_xigt(xigt_path):
    """
    Convert Xigt to INTENT internal structure.

    Only the first instance of the corpus is read. The language, gloss and
    translation phrases are printed, then each gloss word is aligned with the
    language word at the same position and its alignments are printed.

    :param xigt_path: Path of the Xigt XML file to load (read as UTF-8).
    :return: None
    """
    with open(xigt_path, 'r', encoding='utf-8') as xigt_f:
        corpus = xigt.codecs.xigtxml.load(xigt_f)
        first_inst = corpus[0]

        # Language morphemes and words.
        lang_phrase = create_lang_phrase(first_inst)

        # The translation is tokenized on whitespace.
        trans_text = xigt_find(first_inst, id='t')[0].value()
        trans_phrase = Phrase([Word(token) for token in trans_text.split()])

        # Finally, the glosses.
        gloss_phrase = create_gloss_phrase(first_inst)

        for phrase in (lang_phrase, gloss_phrase, trans_phrase):
            print(phrase)

        # Gloss words are paired with language words purely by position.
        for gloss_word, lang_word in zip(gloss_phrase, lang_phrase):
            gloss_word.add_alignment(lang_word)
            print(gloss_word.alignments)
def create_lang_phrase(inst) -> Phrase:
    """
    Build the language phrase of an instance from what ``xigt_find`` returns
    for id ``'m'``.

    Consecutive morphemes that share the same ``segmentation`` value are
    grouped into one word; a change in that value starts a new word.

    :type inst: Igt
    :return: A phrase with one word per run of same-segmentation morphemes.
    """
    morphs = xigt_find(inst, id='m')
    phrase = Phrase()
    pending = []
    prev_segmentation = None

    for morph in morphs:
        word_ref = morph.segmentation
        # A different segmentation target means the previous word is done.
        if prev_segmentation is not None and word_ref != prev_segmentation:
            phrase.add_word(Word(subwords=pending))
            pending = []
        pending.append(SubWord(morph.text))
        prev_segmentation = word_ref

    # Whatever is still pending forms the last word.
    phrase.add_word(Word(subwords=pending))
    return phrase
def parse_trans_tier(inst, id_to_object_mapping):
    """
    Parse the translation tier of an instance into a phrase.

    A words tier segmenting ``t`` is preferred when present. Otherwise the
    single item of the translations tier is tokenized with ``word_tokenize``
    into ``TransWord`` objects with ids ``tw1``-style generated by
    ``item_id('tw', n)``.

    :type id_to_object_mapping: dict
    :type inst: xigt.model.Igt
    :return: The translation phrase, or None when there is no translation
             content to use.
    :raises ImportException: If the translations tier holds more than one
                             item.
    """
    phrases_tier = xigt_find(inst, type='translations')
    words_tier = xigt_find(inst, segmentation='t', type='words')  # type: Tier

    # If there's a translation words tier, use that.
    if words_tier:
        IMPORT_LOG.debug("trans-words tier found.")
        phrase = load_words(id_to_object_mapping,
                            words_tier=words_tier,
                            WordType=TransWord)
        phrase.id = words_tier.id
        return phrase

    # Neither a words tier nor a phrase tier.
    if not phrases_tier:
        return None

    if len(phrases_tier) > 1:
        raise ImportException('NOT IMPLEMENTED: Multiple Translations!')

    trans_text = phrases_tier[0].value()
    if trans_text is None:
        return None

    # Otherwise, tokenize the translation line and build a new phrase.
    phrase = Phrase(id_='tw')
    IMPORT_LOG.debug(
        'No trans-word tier found for instance "{}", tokenizing trans phrase: "{}"'
        .format(inst.id, trans_text))
    for position, token in enumerate(word_tokenize(trans_text), start=1):
        trans_word = TransWord(token, id_=item_id('tw', position))
        id_to_object_mapping[trans_word.id] = trans_word
        phrase.append(trans_word)
    return phrase
def parse_odin(xigt_inst, tag, WordType, word_id_base, subword_id_base):
    """
    Look for the normalized ODIN tier of the given tag type, and convert it
    to words/subwords objects.

    :param xigt_inst: The Xigt instance being parsed.
    :type xigt_inst: xigt.model.Igt
    :param tag: The L/G/T tag for which an ODIN line is being searched for
                (upper-cased before the lookup).
    :param WordType: What word class to use to wrap the word objects.
    :param word_id_base: What ID string to use for the word tokens' prefixes.
    :param subword_id_base: What ID string to use for the subword tokens'
                            prefixes, or None to not create subword tokens.
    :return: The phrase built from the normalized line, or an empty phrase
             when no non-blank normalized line exists.
    """
    tier = xigt_find(xigt_inst,
                     type="odin",
                     attributes={'state': 'normalized'})
    if not tier:
        return Phrase()

    line = xigt_find(tier, tag=tag.upper())  # type: xigt.model.Item

    # Only a (non-blank) normalized line is tokenized.
    if not (line and line.value() and line.value().strip()):
        return Phrase()

    # Only segment into subwords if the id base is not None.
    segment_words = subword_id_base is not None

    words = []
    for token in word_tokenize(line.value()):
        assert token.strip()
        if segment_words:
            word = WordType(subwords=word_str_to_subwords(token))
        else:
            word = WordType(string=token)
        words.append(word)

    # Create the phrase, and assign the tokens unique IDs.
    phrase = Phrase(words)
    assign_ids(phrase, word_id_base, subword_id_base)
    return phrase
def process_toolbox_instance(toolbox_instance):
    r"""
    Attempt to parse a single Toolbox instance into INTENT objects.

    Instances from toolbox files look like the following:

        \ref 20030701TifolafengBapakAnde1_0003
        \ELANBegin 9.371
        \ELANEnd 15.509
        \ELANParticipant Andirias Padafani
        \sound 20030701TifolafengBapakAnde1.wav 9.371 15.509
        \t heelo maiye, ni ni'aduo hamintaahi,# ni yaa pun namei.#
        \m he- nil -o maiye ni ni- adua ha- mintaahi ni yaa pun namei
        \g 3.LOC- do.so -PNCT if 1PL.EXCL.AGT 1PL.EXCL.AL- master 3.PAT- pray.CPL 1PL.EXCL.AGT go field prepare.field
        \p pro- v.loc -asp conj pro pro- n pro- v.pat pro v.0 n v.0
        \f when it is that time, we pray to the Lord and go work in the fields

    The ``\f`` (translation), ``\g`` (gloss), ``\m`` (morpheme) and ``\t``
    (language) lines are extracted; if any of the four is missing or empty
    the instance is skipped.

    (This docstring is a raw string so that the backslash markers above are
    kept literally rather than being read as escape sequences.)

    :type toolbox_instance: str
    :return: None when one of the four required lines is absent. The visible
             body produces no other return value.
    """

    # -- 1a) Define a method to retrieve lines from the instance preceded by the given linetype
    #        (Noting that there can be multiple entries for the same linetype per instance)
    def get_toolbox_line(linetype: str) -> str:
        # A line only counts when it is terminated by a newline; repeated
        # entries of the same type are joined with single spaces. None is
        # returned when there is no match at all.
        line_matches = re.findall(r'\\{}\s+(.*)\n'.format(linetype),
                                  toolbox_instance)
        return None if not line_matches else ' '.join(line_matches)

    # -- 1b) Get all the types of lines from the instance, if they exist.
    trans_string = get_toolbox_line('f')
    gloss_string = get_toolbox_line('g')
    morph_string = get_toolbox_line('m')
    lang_string = get_toolbox_line('t')

    # -- 1c) Abort if we do not have all four lines.
    if not (trans_string and gloss_string and morph_string and lang_string):
        return None

    # -------------------------------------------
    # Parse.
    # -------------------------------------------
    # NOTE(review): the values below are prepared but not consumed by any
    # code visible in this block -- the parsing step looks unfinished.
    lang_p = Phrase()
    # Matches one token together with the whitespace that follows it.
    columnar_re = re.compile(r'\S+\s*(?=\S|$)')
    # Remove the whitespace around hyphens, e.g. "he- nil -o" -> "he-nil-o".
    morph_line_concatenated = re.sub(r'\s*-\s*', '-', morph_string)
def create_gloss_phrase(inst) -> Phrase:
    """
    Build the gloss phrase of an instance from what ``xigt_find`` returns
    for id ``'g'``.

    Each gloss is looked up through its ``alignment`` to find the aligned
    morpheme; consecutive glosses whose aligned morphemes share the same
    ``segmentation`` value are grouped into one word.

    :type inst: Igt
    :return: A phrase with one word per run of glosses aligned to the same
             segmentation target.
    """
    glosses = xigt_find(inst, id='g')
    phrase = Phrase()
    pending = []
    prev_word_ref = None

    for gloss in glosses:
        # Find which word this gloss belongs to via its aligned morpheme.
        word_ref = xigt_find(inst, id=gloss.alignment).segmentation
        if prev_word_ref is not None and word_ref != prev_word_ref:
            phrase.add_word(Word(subwords=pending))
            pending = []
        pending.append(SubWord(gloss.text))
        prev_word_ref = word_ref

    # Whatever is still pending forms the last word.
    phrase.add_word(Word(subwords=pending))
    return phrase
def parse_lang_tier(inst, id_to_object_mapping):
    """
    Given an instance, find the language word tier (``LANG_WORD_ID``) and
    return its words, either made up of pre-provided segmentations from a
    tier that segments them, or with segmentations created by subword-level
    tokenization (both handled by ``load_words``).

    :type inst: xigt.model.Igt
    :type id_to_object_mapping: dict
    :return: The language phrase, with id ``LANG_WORD_ID``; an empty phrase
             when the instance has no language word tier.
    """
    words_tier = xigt_find(inst, id=LANG_WORD_ID)
    segments_tier = xigt_find(inst, segmentation=LANG_WORD_ID)

    if words_tier:
        phrase = load_words(id_to_object_mapping,
                            words_tier=words_tier,
                            segmentation_tier=segments_tier,
                            segment_id_base='m',
                            WordType=LangWord)
        phrase.id = LANG_WORD_ID
    else:
        phrase = Phrase(id_=LANG_WORD_ID)
    return phrase
def load_words(id_to_object_mapping,
               words_tier=None,
               segmentation_tier=None,
               alignment_tier=None,
               segment_id_base=None,
               WordType=Word):
    """
    Given a words tier and a tier that provides segmentation for that tier
    (which may be None), return a phrase with those words/subwords.

    There are four cases:

    A. A words tier and segmentation tier exist
        - Use segmentation tier, check segmentation exists for all words.
    B. A words tier exists, but no segmentation tier
        - Segment the words tier.
    C. No words tier exists, a segmentation tier is aligned with a tier that
       has words.
        - Group the segments according to the aligned words.
    D. No words tier exists, no segmentation tier exists.
        - Return an empty phrase.

    :type words_tier: xigt.model.Tier
    :type segmentation_tier: xigt.model.Tier
    :type alignment_tier: xigt.model.Tier
    :type id_to_object_mapping: dict
    :param segment_id_base: Id prefix handed to
                            ``create_phrase_from_words_tier`` in case B.
    :param WordType: The class used to wrap each word.
    :rtype: Phrase
    :raises SegmentationTierException: In case C when no alignment tier is
        given, and in case A when a word has no segments on the segmentation
        tier.
    """
    # -- C / D) No words tier exists...
    if not words_tier:
        if not segmentation_tier:
            return Phrase()
        if not alignment_tier:
            raise SegmentationTierException(
                'Attempt to create phrase from segmentation tier "{}" in instance "{}" with no word alignments.'
                .format(segmentation_tier.id, segmentation_tier.igt.id))
        IMPORT_LOG.info(
            'Creating words tier "{}" from combination of segmentation "{}" and aligned tier "{}"'
            .format(type_to_id(WordType), segmentation_tier.id,
                    alignment_tier.id))
        return create_phrase_from_segments_alignments(
            id_to_object_mapping, segmentation_tier, alignment_tier, WordType)

    # -- B) If there's not a segmentation tier, return the phrase
    #       created by the words tier alone.
    if not segmentation_tier:
        return create_phrase_from_words_tier(words_tier,
                                             id_to_object_mapping,
                                             segment_id_base=segment_id_base,
                                             WordType=WordType)

    # -- A) There is both a words tier and segmentation tier:
    #       use the segmentation provided by the segmentation tier.
    words = []

    # The previous subword is tracked across word boundaries because the
    # free-floating hyphen handling needs the preceding subword.
    prev_sw = None
    for xigt_word_item in words_tier:  # type: xigt.model.Item
        # Retrieve the portions of the word that are given as segments.
        morph_segments = [
            morph for morph in segmentation_tier
            if xigt_word_item.id in segmenting_refs(morph)
        ]

        # If the segmentation tier is provided, we assume that every word
        # has some form of segmentation.
        if not morph_segments:
            raise SegmentationTierException(
                'Segmentation tier provided for instance "{}", but no segments for word "{}"'
                .format(words_tier.igt.id, xigt_word_item.id))

        morphs = []
        for xigt_subword in morph_segments:  # type: xigt.model.Item
            sw = subword_str_to_subword(xigt_subword.value(),
                                        id_=xigt_subword.id)

            was_freefloating = handle_freefloating_hyphens(
                sw, prev_sw, segmentation_tier.id, segmentation_tier.igt.id,
                xigt_subword.id)
            if was_freefloating:
                continue

            id_to_object_mapping[xigt_subword.id] = sw

            # Link to the alignment target only if it was already created.
            if xigt_subword.alignment and id_to_object_mapping.get(
                    xigt_subword.alignment):
                sw.add_alignment(
                    id_to_object_mapping.get(xigt_subword.alignment))

            morphs.append(sw)
            prev_sw = sw

        # Skip creating a word if it only consisted of an empty hyphen.
        if morphs:
            w = WordType(subwords=morphs,
                         id_=item_id(words_tier.id, len(words) + 1))
            id_to_object_mapping[xigt_word_item.id] = w
            words.append(w)
        else:
            IMPORT_LOG.warning(
                'Word "{}" was skipped because it only consisted of an empty hyphen.'
                .format(xigt_word_item.id))

    return Phrase(words)
def create_phrase_from_segments_alignments(id_to_object_mapping,
                                           segmentation_tier: xigt.model.Tier,
                                           aligned_tier: xigt.model.Tier,
                                           WordType=Word):
    """
    Create a phrase by using a segmentation tier and the word-level
    groupings provided by the tier with which it is aligned.

    This is useful in the case of glosses which align with morphemes, but are
    not given their own word-level groupings in the data.

    :param id_to_object_mapping: Mapping of ids to already-created objects;
        the alignment targets are looked up here and the new subwords are
        added to it.
    :type id_to_object_mapping: dict
    :param segmentation_tier: The tier whose items become the subwords.
    :param aligned_tier: The tier the segments are aligned with; only used
        to check the number of word groups and for log output.
    :param WordType: The class used to wrap each created word.
    :return: A phrase with one word per aligned word, ordered by the aligned
        word's ``index``.
    :rtype: Phrase
    :raises ImportException: If a segment has no content, specifies no
        alignment target, or its alignment target is not (truthily) present
        in ``id_to_object_mapping``.
    """
    force_token_alignments(segmentation_tier)

    # -- 0) Keep a mapping of word-level groups, and
    #       the subword items that they contain.
    word_to_segment_map = defaultdict(list)

    # -- 1) Iterate over the segmented objects, and
    #       add them to the group map.
    prev_subword = None
    for segment_item in segmentation_tier:  # type: xigt.model.Item

        # The segment item must be specified in some way.
        if segment_item.value() is None:
            raise ImportException('Item "{}" has no content'.format(
                segment_item.id))

        subword_obj = subword_str_to_subword(segment_item.value(),
                                             id_=segment_item.id)

        # Handle freefloating hyphens
        was_freefloating = handle_freefloating_hyphens(
            subword_obj, prev_subword, segmentation_tier.id,
            segmentation_tier.igt.id, segment_item.id)
        if was_freefloating:
            continue

        # Enter the subword obj into the mapping dict.
        id_to_object_mapping[segment_item.id] = subword_obj

        # We assume that there are alignments for every segmentation object
        if not segment_item.alignment:
            # Arguments follow the placeholders: item, its own tier, the tier
            # it is aligned to, and the instance.
            raise ImportException(
                'Item "{}" in tier "{}" aligned to tier "{}" in instance "{}" does not specify alignment target.'
                .format(segment_item.id, segmentation_tier.id,
                        segmentation_tier.alignment,
                        segmentation_tier.igt.id))
        elif not (id_to_object_mapping.get(segment_item.alignment)):
            raise ImportException(
                'Item "{}" in tier "{}" for instance "{}" missing alignment target "{}"'
                .format(segment_item.id, segmentation_tier.id,
                        segmentation_tier.igt.id, segment_item.alignment))

        aligned_obj = id_to_object_mapping[
            segment_item.alignment]  # type: Union[SubWord, Word]
        subword_obj.add_alignment(aligned_obj)

        # Group by word: a subword target is resolved to its parent word.
        if isinstance(aligned_obj, Word):
            aligned_word = aligned_obj
        else:
            aligned_word = aligned_obj.word

        word_to_segment_map[aligned_word].append(subword_obj)
        prev_subword = subword_obj

    # -- 2) Check that our group map contains the same number of groups as
    #       there exist in the aligned tier. A mismatch is only logged.
    if len(aligned_tier) != len(word_to_segment_map):
        IMPORT_LOG.warning(
            'Mismatch between number of word groups for segmentation tier "{}" and aligned tier "{}" in instance "{}"'
            .format(segmentation_tier.id, aligned_tier.id,
                    aligned_tier.igt.id))

    # -- 3) Now, create words based on the groupings provided by the
    #       group map.
    word_groups = sorted(word_to_segment_map.keys(),
                         key=lambda word: word.index)

    phrase = Phrase()
    for aligned_word in word_groups:  # type: Word
        new_word = WordType(subwords=word_to_segment_map[aligned_word],
                            id_=item_id(type_to_id(WordType),
                                        aligned_word.index + 1))
        new_word.add_alignment(aligned_word)
        phrase.add_word(new_word)

    return phrase