Пример #1
0
def get_diac(analysis):
    """Return the diacritized forms of all analysed words as one string.

    For every sentence object in ``analysis`` and every word listed in its
    ``words`` attribute, the ``'diac'`` entry of that word's record in the
    sentence's ``analysis`` mapping is collected.  The collected forms are
    joined with single spaces (sentence boundaries are not kept) and the
    result is passed through ``recompose``.
    """
    diac_forms = [
        sentence.analysis[token]['diac']
        for sentence in analysis
        for token in sentence.words
    ]
    joined = ' '.join(diac_forms)
    return recompose(joined)
Пример #2
0
def apply_mle_translit_simple_backoff(sentence, mledict, locmap,
                                      locexceptional):
    """Romanize ``sentence`` by dictionary lookup with a rule-based back-off.

    The sentence is tokenized with ``tokenize_skiphyph`` and split on
    whitespace.  Each token is replaced by its entry in ``mledict``; tokens
    missing from the dictionary get the output of
    ``translit_rules.translit_simple(tok, locmap, locexceptional)`` instead.
    The resulting tokens are joined with spaces and passed through
    ``recompose`` in ``'rom'`` mode.
    """
    # NOTE: the rule-based fallback is built as the default argument of
    # dict.get, so it is computed for every token, not only for misses.
    romanized = [
        mledict.get(
            token,
            translit_rules.translit_simple(token, locmap, locexceptional))
        for token in tokenize_skiphyph(sentence).split()
    ]
    return recompose(' '.join(romanized), mode='rom')
Пример #3
0
def apply_mle_translit_morph_backoff(sentence, mledict, backoff_sentences,
                                     sent_index):
    """Romanize ``sentence`` by dictionary lookup, backing off to another system.

    The sentence is tokenized with ``tokenize_skiphyph`` and split on
    whitespace.  Tokens found in ``mledict`` are replaced by their dictionary
    entry.  Any other token is replaced by the token at the same position in
    ``backoff_sentences[sent_index]`` (tokenized the same way).

    Args:
        sentence: the input sentence string.
        mledict: mapping from input token to its romanization.
        backoff_sentences: indexable collection of back-off sentences.
        sent_index: index of the back-off sentence matching ``sentence``.

    Returns:
        The space-joined output tokens passed through ``recompose`` in
        ``'rom'`` mode.

    Raises:
        IndexError: if a back-off lookup is needed and ``sent_index`` is out
            of range, or the back-off sentence has fewer tokens than the
            position being looked up.
    """
    predict_mle = []
    # The back-off sentence does not change inside the loop, so it is
    # tokenized at most once, and only if some token is actually missing
    # from the dictionary (keeps the no-OOV path free of back-off access).
    backoff_tokens = None
    for tok_idx, tok in enumerate(tokenize_skiphyph(sentence).split()):
        if tok in mledict:
            mle_tok = mledict[tok]
        else:
            if backoff_tokens is None:
                backoff_tokens = tokenize_skiphyph(
                    backoff_sentences[sent_index]).split()
            mle_tok = backoff_tokens[tok_idx]
        predict_mle.append(mle_tok)
    return recompose(' '.join(predict_mle), mode='rom')
Пример #4
0
def apply_mle_translit_simple_backoff(sentence, mledict, locmap,
                                      locexceptional):
    """Romanize ``sentence`` via ``mledict`` with a rule-based back-off.

    Each whitespace-separated token of ``tokenize_skiphyph(sentence)`` is
    replaced by its ``mledict`` entry, or, when absent, by the output of
    ``translit_rules.translit_simple`` called with
    ``single_word_dont_capitalize=True``.  The first output token is then
    passed through ``translit_rules.capitalize_loc``.  The tokens are joined
    with spaces and handed to ``recompose`` in ``'rom'`` mode.
    """
    romanized = []
    for position, token in enumerate(tokenize_skiphyph(sentence).split()):
        # NOTE: the fallback is the default argument of dict.get, so it is
        # evaluated for every token, including those present in mledict.
        candidate = mledict.get(
            token,
            translit_rules.translit_simple(
                token,
                locmap,
                locexceptional,
                single_word_dont_capitalize=True))

        # Only the sentence-initial token is capitalized here.
        is_sentence_initial = position == 0
        if is_sentence_initial:
            candidate = translit_rules.capitalize_loc(candidate)
        romanized.append(candidate)

    joined = ' '.join(romanized)
    return recompose(joined, mode='rom')
Пример #5
0
def apply_mle(sentence, mledict):
    """Romanize ``sentence`` by pure dictionary lookup.

    Each whitespace-separated token of ``tokenize_skiphyph(sentence)`` is
    replaced by its ``mledict`` entry; tokens not in the dictionary become
    the literal string ``'OOV'``.  The result is joined with spaces and
    passed through ``recompose`` in ``'rom'`` mode.
    """
    tokens = tokenize_skiphyph(sentence).split()
    looked_up = [mledict.get(token, 'OOV') for token in tokens]
    return recompose(' '.join(looked_up), mode='rom')
Пример #6
0
def translit_morph(
    mada_sentnece_object,
    loc_mapdict,
    exceptional_spellings,
    logger=rules_logger
):  #TODO: 1) break up into smaller functions. 2) turn transliterator into class with constructor
    '''Romanize one analysed sentence using morphology-aware spelling rules.

    Takes a madaSentenceObject from parse_analyser_output().

    Each token is first rewritten according to its morphological analysis
    (exceptional spellings, "lil" contraction, case-ending removal,
    ta-marbuta in construct state, single-letter proclitic splitting,
    capitalization marking).  Tokens that should be capitalized are tagged
    with a trailing marker character, which is stripped again right before
    transliteration.  Finally every rewritten token is transliterated with
    ``translit`` and the sentence is passed through ``recompose``.

    Args:
        mada_sentnece_object: object exposing ``words`` (list of words),
            ``toks`` (parallel list of diacritized, affix-tokenized words)
            and ``analysis`` (dict mapping each word to its analysis dict;
            the keys read here are 'bw', 'prc0', 'prc1', 'stt', 'lemma',
            'pos' and 'gloss').
        loc_mapdict: character mapping handed to ``translit``.
        exceptional_spellings: mapping from a word to the token that replaces
            it verbatim, bypassing all morphological rules.
        logger: object providing ``count_exceptional_rules`` and
            ``count_morph_rules``; called once per rule that fires.

    Returns:
        The space-joined transliterated tokens, passed through ``recompose``
        in ``'rom'`` mode.
    '''
    capschar = '±'  # trailing marker meaning "capitalize after transliteration"
    words = mada_sentnece_object.words  # list of words
    toks = mada_sentnece_object.toks  # list of words which are diacritized and affix tokenized
    sentence_analysis = mada_sentnece_object.analysis  # dictionary with word keys and analysis values

    modified_words = []
    last_idx = len(toks) - 1

    # loop through the tokens; the word and its analysis are looked up by the
    # same index / by the word itself
    for tidx, tok in enumerate(toks):
        word = words[tidx]
        analysis = sentence_analysis[word]

        # handle exceptional spelling, skipping rest of loop
        if word in exceptional_spellings:
            logger.count_exceptional_rules((word, exceptional_spellings[word]))
            tok = exceptional_spellings[word]
            # capitalization has to be done here as well, because the
            # `continue` below skips the capitalization rules further down
            if tidx == 0 and not tok.endswith(capschar):
                logger.count_morph_rules(('index-0 capitalize', 'capitalized'))
                tok = tok + capschar

            modified_words.append(tok)
            continue

        ## beginning of morph rules

        # look ahead: analysis of the following word, if there is one
        if tidx < last_idx:
            nextanalysis = sentence_analysis[words[tidx + 1]]
        else:
            nextanalysis = None

        bw = analysis['bw']
        bwsplit = bw.split('+')
        bwending = bwsplit[-1]
        bwbeginning = bwsplit[0]

        # rule 1 'lil'
        ## keep lil instead of li-al
        if analysis['prc0'] == 'Al_det' and analysis['prc1'] == 'li_prep':
            logger.count_morph_rules(('li+al', 'lil'))
            find = re.escape('لِ+ال')
            tok = re.sub(find, r'لِل', tok)

        #TODO: fix MADAMIRA bug for la prep thats supposed to be li, e.g: 'لنباتات': {'bw': 'la/PREP+nabAt/NOUN+At/NSUFF_FEM_PL+i/CASE_DEF_GEN',
        #   'gloss': 'plants;vegetation',
        #   'diac': 'لَنَباتاتِ',
        #   'lemma': 'نَبات_1',

        ## remove case endings
        # if word ends with direct object or possessive pronoun do nothing
        if ('DO' in bwending) or ('POSS_PRON' in bwending):
            logger.count_morph_rules(('case-ending', 'kept'))
        # elif word ends with case ending, nominal suffix, or a verb (imperfective, perfective, and command)
        elif ('CASE' in bwending) or ('IV' in bwending) or (
                'PV' in bwending) or ('CV' in bwending) or ('NSUFF'
                                                            in bwending):
            logger.count_morph_rules(('case-ending', 'removed'))
            # remove final diacritic, including alif for tanween; an optional
            # trailing capschar is captured and put back
            tok = re.sub(r'اً(±)?$', r'\1', tok)  #alif tanween must be first
            tok = re.sub(r'[ًٌٍَُِ](±)?$', r'\1', tok)

        ## ta marbuta handling
        # spell ta-marbuta if its in construct state
        if 'ة' in tok:
            logger.count_morph_rules(('ta-marbuta', 'total'))
            if analysis['stt'] == 'c':
                # caveat: cannot be construct if followed by prep (additional rule for handling odd madamira analysis)
                if nextanalysis and 'PREP' in nextanalysis['bw'].split('+')[0]:
                    logger.count_morph_rules(
                        ('ta-marbuta', 'not-construct (followed by prep)'))
                else:
                    logger.count_morph_rules(('ta-marbuta', 'construct'))
                    # put a sukun on the ta-marbuta for transliterator to spell it
                    tok = re.sub(r'ة', r'ةْ', tok)

        ## split single letter proclitic
        # splitprepositions .. currently handling ب and ل only
        if 'PREP' in bwbeginning and len(
                bwsplit
        ) > 1:  # length condition to make sure letters are actual proclitics
            an = analysis['lemma'].split('_')[0]
            if an in {'لِ-', 'بِ'}:
                logger.count_morph_rules(('single-letter clitic', 'split'))
                # NOTE(review): the pattern is not anchored, so every ل / ب
                # in the token gets a hyphen, not only the leading proclitic
                # -- confirm this is intended before changing it.
                tok = re.sub(r'([لب][َُِ]?)', r'\1-', tok)

        # capitalization
        #capitalize sentence initial token
        if tidx == 0 and not tok.endswith(capschar):
            logger.count_morph_rules(('index-0 capitalize', 'capitalized'))
            tok = tok + capschar

        ##conditions for capitalizing non-initial tokens
        elif tidx > 0:
            # look back variable
            before = toks[tidx - 1]

            # capitalize after period, (a simple sentence segmenter) TODO: what happens when periods are not sentence markers such as acronyms etc
            # guarded like every other capitalization site: only one trailing
            # capschar is stripped before transliteration, so a duplicate
            # would leak into the output
            if before == '.' and not tok.endswith(capschar):
                logger.count_morph_rules(('after . capitalize', 'capitalized'))
                tok = tok + capschar

            # capitalize if gloss is capitalized and pos is proper noun or adjective (and word is not arabic punctuation and not capitalized for other reasons)
            if analysis['pos'] in {
                    'noun_prop', 'adj'
            } and analysis['gloss'] and analysis['gloss'][0].isupper(
            ) and word not in {'،', '؛'} and not tok.endswith(capschar):
                logger.count_morph_rules(
                    ('adj/nounprop capitalized gloss', 'capitalized'))
                tok = tok + capschar

            #elif pos in nounprop NOTE: this is to make sure proper nouns are capitalized regardless of gloss and other conditions but maybe better to collapse with previous condition
            elif analysis['pos'] in {'noun_prop'
                                     } and not tok.endswith(capschar):
                logger.count_morph_rules(('nounprop', 'capitalized'))
                tok = tok + capschar

        ## append modified spellings to token holder
        # tokenization marker replacement: each '+' becomes a hyphen that
        # ends the preceding piece, and the pieces become separate tokens
        if '+' in tok:
            modified_words += tok.replace('+', '- ').split(' ')
        else:
            modified_words.append(tok)

    # converting everything
    transliterated_words = []
    for tok in modified_words:
        if not tok:
            # empty pieces (e.g. from a token ending in '+') carry nothing
            # to transliterate
            continue
        if tok.endswith(capschar):
            # strip the marker first so it is not transliterated, then
            # capitalize the transliterated form
            transliterated_tok = capitalize_loc(
                translit(tok[:-1], loc_mapdict))
        else:
            transliterated_tok = translit(tok, loc_mapdict)
        transliterated_words.append(transliterated_tok)

    transliterated_sentence = ' '.join(transliterated_words)

    return recompose(transliterated_sentence, mode='rom')