예제 #1
0
def postprocess(word_as_list, fst):
    """Turn the symbols produced by ``fst`` into the final output string.

    When ``fst.postprocess_req`` is falsy the symbols are joined unchanged.
    Otherwise the handling depends on ``fst.left_subseq`` and ``fst.name``:
    output of machines that are not left-subsequential is reversed, and a
    few languages get extra passes through secondary machines built from
    ``vh_dataset`` entries.

    Args:
        word_as_list: Sequence of symbol strings to post-process.
        fst: Machine object. Reads ``postprocess_req``, ``left_subseq`` and
            ``name``; the Kalenjin branch also reads ``suffix`` and the
            Diola-Fogny branch calls ``fst.step``.

    Returns:
        The processed symbols concatenated into a single ``str``.
    """
    if not fst.postprocess_req:
        return "".join(word_as_list)

    if fst.left_subseq:
        # These languages need one extra pass that runs over the reversed
        # word; the result is flipped back afterwards.
        reversed_pass_keys = {
            "Maasai (Eastern Nilotic) ATR harmony": '28B',
            "Nawuri (North Guang) ATR harmony": '31B',
        }
        if fst.name in reversed_pass_keys:
            # Dataset entries are wrapped in FST(...) before stepping, as
            # every other secondary pass in this module does.
            fst_b = FST(vh_dataset[reversed_pass_keys[fst.name]])
            word_as_list = fst_b.step(word_as_list[::-1])[::-1]
        return "".join(word_as_list)

    # Not left-subsequential: flip the incoming symbols first.
    word_as_list = word_as_list[::-1]

    if fst.name == "Igbo ATR harmony":
        post_fst = FST(vh_dataset['22B'])
        word_as_list = post_fst.step(word_as_list)[::-1]

    elif fst.name == "Diola-Fogny (Jola-Fonyi) ATR harmony":
        # Second pass through the same machine, over the flipped word.
        word_as_list = fst.step(word_as_list)[::-1]

    elif fst.name == \
            "Pasiego vowel harmony (metaphony, raising, and centralization)":
        # Flip back to the incoming order, then chain passes B and C.
        word_as_list = word_as_list[::-1]
        fst_b = FST(vh_dataset['26B'])
        word_as_list = fst_b.step(word_as_list)
        fst_c = FST(vh_dataset['26C'])
        word_as_list = fst_c.step(word_as_list)[::-1]

    elif fst.name == "Kalenjin ATR harmony":
        # Flip back to the incoming order so the first '+' can be used to
        # split the word into two parts.
        word_as_list = word_as_list[::-1]
        if '+' in word_as_list:
            boundary = word_as_list.index('+')
            final_prefixes = word_as_list[boundary:][::-1]
            stem = word_as_list[:boundary][::-1]
        else:
            # NOTE(review): unlike the '+' branch, the list is not flipped
            # here -- confirm that this asymmetry is intended.
            final_prefixes = []
            stem = word_as_list
        if '-' in stem:
            # Drop everything from the first '-' on; the suffix symbols are
            # re-attached from fst.suffix below.
            stem = stem[:stem.index('-')]
        post_fst = FST(vh_dataset['24B'])
        suffix_as_list = []
        for suff in fst.suffix:
            suffix_as_list.extend(suff)
        word_as_list = final_prefixes + post_fst.step(stem + suffix_as_list)

    return "".join(word_as_list)
예제 #2
0
def preprocess(word_as_list, prefix_as_list, stem_as_list, suffix_as_list,
               fst):
    """Prepare a segmented word before it is handed to ``fst``.

    The prefix and suffix lists are always stored on ``fst`` (as
    ``fst.prefix`` / ``fst.suffix``). The rest of the work is selected by
    ``fst.preprocess_req``, ``fst.left_subseq`` and ``fst.name``: trimming
    suffix symbols, adding boundary markers, running a preliminary machine
    built from ``vh_dataset``, or reversing the word for machines that are
    not left-subsequential.

    Several branches modify their list arguments in place: boundary markers
    are inserted into ``stem_as_list`` (Maasai, Nawuri, Hungarian), ``'#'``
    is appended to ``word_as_list`` (Tunica), and delimiters are inserted
    into the Kalenjin affix lists. ``postprocess`` later reads
    ``fst.suffix``, which aliases ``suffix_as_list``, so this in-place
    behaviour is kept on purpose.

    Args:
        word_as_list: The whole word as a list of symbols.
        prefix_as_list: Prefix part; the Kalenjin branch iterates it as a
            list of affix lists.
        stem_as_list: Stem part as a list of symbols.
        suffix_as_list: Suffix part; the Kalmyk and Kalenjin branches index
            it as a list of affix lists.
        fst: Machine object. Reads ``preprocess_req``, ``left_subseq`` and
            ``name`` (and ``alphabet`` for Kalmyk); ``prefix`` and ``suffix``
            are written.

    Returns:
        A ``(flag, symbols)`` tuple. ``flag`` is ``False`` for empty input
        and for Kalmyk stems whose only alphabet symbol is ``'i'``, ``True``
        otherwise -- presumably it tells the caller whether ``symbols``
        still has to go through ``fst``; confirm against the caller.
    """
    fst.prefix = prefix_as_list
    fst.suffix = suffix_as_list

    # Empty input: nothing to do.
    if word_as_list == []:
        return False, []

    # No preprocessing required for this machine.
    if not fst.preprocess_req:
        return True, word_as_list

    name = fst.name

    # Preprocessing for left-subsequential languages
    if fst.left_subseq:
        if name in ("Kisa applicative suffix Vlɑ",
                    "Kisa reversative suffix Vlɑ"):
            # Drop the last three symbols of the word.
            return True, word_as_list[:-3]

        uyghur_dative = 'Uyghur dative suffix -' + U_F_V + 'V'
        if name in {
                "Uyghur backness harmony", "Uyghur plural suffix -lVr",
                uyghur_dative
        }:
            # All Uyghur variants share preliminary pass 5P; the suffix
            # variants first drop the trailing suffix symbols.
            preliminary_fst = FST(vh_dataset['5P'])
            if name == "Uyghur plural suffix -lVr":
                word_as_list = word_as_list[:-3]
            elif name == uyghur_dative:
                word_as_list = word_as_list[:-2]
            return True, preliminary_fst.step(word_as_list)

        if name == "Halh (Mongolic) rounding harmony":
            preliminary_fst = FST(vh_dataset['8P'])
            return True, preliminary_fst.step(word_as_list)

        if name == "Kalmyk (Oirat) harmony":
            # Rules for language 17:
            #   If 'i' is not the only alphabet symbol found in the stem,
            #   pass the word on unchanged.
            #   Otherwise leave the stem alone, run the first suffix (if
            #   any) through 17P and return stem + processed suffix.
            alphabet_as_set = set(fst.alphabet)
            vowels_in_stem = {
                ch for ch in stem_as_list if ch in alphabet_as_set
            }
            if vowels_in_stem != {'i'}:
                return True, word_as_list
            if suffix_as_list != []:
                preliminary_fst = FST(vh_dataset['17P'])
                fst.suffix = preliminary_fst.step(suffix_as_list[0])
            return False, stem_as_list + fst.suffix

        if name in {
                "Maasai (Eastern Nilotic) ATR harmony",
                "Nawuri (North Guang) ATR harmony"
        }:
            # Mark both stem edges (in place, see docstring).
            stem_as_list.insert(0, '!')
            stem_as_list.append('&')
            return True, prefix_as_list + stem_as_list + suffix_as_list

        if name == "Kashaya (Pomoan) translaryngeal harmony":
            # Dataset entries are wrapped in FST(...) before stepping, as
            # the other preliminary passes above do.
            fst_p = FST(vh_dataset['29P'])
            return True, fst_p.step(word_as_list)

        if name == "Standard Hungarian palatal harmony of alternating suffixes":
            # Mark the left stem edge (in place, see docstring).
            stem_as_list.insert(0, '!')
            return True, prefix_as_list + stem_as_list + suffix_as_list

        if name == "Tunica harmony":
            # Append an end-of-word marker (in place, see docstring).
            word_as_list.append('#')
            return True, word_as_list

        return True, word_as_list

    # Preprocessing for right-subsequential languages
    if name == "Kalenjin ATR harmony":
        # NOTE(review): integer key here, while every other lookup in this
        # function uses a string key such as '5P' -- confirm.
        language = vh_dataset[24]

        def add_delimiters(affix_list):
            # Insert the delimiter symbols in place so that fst.prefix and
            # fst.suffix, which alias these lists, see them as well.
            for affix in affix_list:
                if affix in language['n-a_suff']:
                    affix.insert(0, '&')
                elif affix in language['n-a_r&pre']:
                    affix.append('!')
            return affix_list

        prefix_as_list = add_delimiters(prefix_as_list)
        suffix_as_list = add_delimiters(suffix_as_list)
        stem_as_list = add_delimiters(stem_as_list)

        # Flatten the list of prefixes into one list of symbols.
        prefix_flat = []
        for prefix in prefix_as_list:
            prefix_flat.extend(prefix)

        # Only the first suffix is attached before reversing.
        suffix_to_add = suffix_as_list[0] if len(suffix_as_list) >= 1 else []
        # Example form noted by the original author:
        #   h e h i + h i + h i + a w q - a e
        return True, (prefix_flat + stem_as_list + suffix_to_add)[::-1]

    # Every other machine on this side (including the Jingulu languages
    # 9-13, Yoruba ATR harmony and Asturian Lena height harmony) only needs
    # the word reversed.
    return True, word_as_list[::-1]