def get_diac(analysis):
    """Collect the diacritized ('diac') form of every word in every sentence.

    Walks each sentence's word list, pulls the 'diac' field from that
    sentence's analysis dict, and hands the space-joined result to
    recompose() for final assembly.
    """
    diac_words = [
        sent.analysis[word]['diac']
        for sent in analysis
        for word in sent.words
    ]
    return recompose(' '.join(diac_words))
def apply_mle_translit_simple_backoff(sentence, mledict, locmap, locexceptional):
    """Transliterate a sentence token by token via the MLE dictionary,
    backing off to rule-based simple transliteration for OOV tokens.

    Returns the romanized sentence via recompose(..., mode='rom').

    NOTE(review): this definition is shadowed by a later function of the
    same name in this file (which additionally capitalizes the
    sentence-initial token); callers of the module get the later one.

    Fix: the original used dict.get(tok, translit_simple(...)), which
    evaluated the rule-based fallback for EVERY token, even dictionary
    hits. The fallback is now computed only on a miss.
    """
    predict_mle = []
    for tok in tokenize_skiphyph(sentence).split():
        if tok in mledict:
            mle_tok = mledict[tok]
        else:
            # Rule-based backoff only for out-of-vocabulary tokens.
            mle_tok = translit_rules.translit_simple(tok, locmap, locexceptional)
        predict_mle.append(mle_tok)
    return recompose(' '.join(predict_mle), mode='rom')
def apply_mle_translit_morph_backoff(sentence, mledict, backoff_sentences, sent_index):
    """MLE dictionary lookup with a positional morphological backoff.

    Tokens missing from `mledict` are replaced by the token at the same
    position in the pre-computed backoff sentence
    `backoff_sentences[sent_index]`. Returns the romanized sentence via
    recompose(..., mode='rom').

    Assumes the backoff sentence tokenizes to at least as many tokens as
    `sentence` (otherwise an IndexError surfaces) — TODO confirm the
    upstream guarantee.

    Fix: the backoff sentence was re-tokenized inside the loop for every
    OOV token; it is now tokenized lazily, at most once.
    """
    predict_mle = []
    bckf_tokens = None  # tokenized on first miss only, then reused
    for tok_idx, tok in enumerate(tokenize_skiphyph(sentence).split()):
        if tok in mledict:
            mle_tok = mledict[tok]
        else:
            if bckf_tokens is None:
                bckf_tokens = tokenize_skiphyph(
                    backoff_sentences[sent_index]).split()
            mle_tok = bckf_tokens[tok_idx]
        predict_mle.append(mle_tok)
    return recompose(' '.join(predict_mle), mode='rom')
def apply_mle_translit_simple_backoff(sentence, mledict, locmap, locexceptional):
    """Transliterate a sentence token by token via the MLE dictionary,
    backing off to rule-based simple transliteration for OOV tokens, and
    capitalize the sentence-initial token.

    Returns the romanized sentence via recompose(..., mode='rom').

    NOTE(review): this redefines (and shadows) an earlier function of the
    same name in this file; this is the version callers actually get.

    Fix: the original used dict.get(tok, translit_simple(...)), which
    evaluated the rule-based fallback for EVERY token, even dictionary
    hits. The fallback is now computed only on a miss.
    """
    predict_mle = []
    for tok_index, tok in enumerate(tokenize_skiphyph(sentence).split()):
        if tok in mledict:
            mle_tok = mledict[tok]
        else:
            # Rule-based backoff only for out-of-vocabulary tokens;
            # per-word capitalization is suppressed because sentence-level
            # capitalization is applied below.
            mle_tok = translit_rules.translit_simple(
                tok, locmap, locexceptional, single_word_dont_capitalize=True)
        if tok_index == 0:
            # The first token of the sentence is always capitalized.
            mle_tok = translit_rules.capitalize_loc(mle_tok)
        predict_mle.append(mle_tok)
    return recompose(' '.join(predict_mle), mode='rom')
def apply_mle(sentence, mledict):
    """Map each token of the sentence to its MLE prediction, emitting the
    literal string 'OOV' for tokens absent from `mledict`.

    Returns the romanized sentence via recompose(..., mode='rom').
    """
    predictions = [
        mledict.get(token, 'OOV')
        for token in tokenize_skiphyph(sentence).split()
    ]
    return recompose(' '.join(predictions), mode='rom')
def translit_morph(
        mada_sentnece_object, loc_mapdict, exceptional_spellings,
        logger=rules_logger):
    #TODO: 1) break up into smaller functions. 2) turn transliterator into class with constructor
    '''takes a madaSentenceObject from parse_analyser_output()

    Applies morphology-aware respelling rules to each affix-tokenized word
    (exceptional-spelling overrides, the lil contraction, case-ending
    removal, construct-state ta-marbuta spelling, single-letter proclitic
    splitting), marks tokens to be capitalized with a trailing `capschar`,
    then transliterates every token and returns the recomposed romanized
    sentence (recompose(..., mode='rom')).

    NOTE(review): the parameter name "mada_sentnece_object" is a typo
    preserved for caller compatibility. `logger` binds the module-level
    rules_logger at import time.
    '''
    capschar = '±'  # end-of-token marker: "capitalize me after transliteration"
    words = mada_sentnece_object.words  # list of words
    toks = mada_sentnece_object.toks  # list of words which are diacritized and affix tokenized
    sentence_analysis = mada_sentnece_object.analysis  # dictionary with word keys and analysis values
    modified_words = []
    # loop through words in mada_sentence_object (made of lists of
    # words/tokenized words and a dictionary of analyses for each word)
    for tidx in range(len(toks)):  # or len(words)
        # access word and tok (i.e. tokenized word) by index
        word = words[tidx]
        tok = toks[tidx]
        # access analysis by dictionary lookup of word in sentence_analysis
        analysis = sentence_analysis[word]

        # handle exceptional spelling, skipping rest of loop
        if word in exceptional_spellings:
            logger.count_exceptional_rules((word, exceptional_spellings[word]))
            tok = exceptional_spellings[word]
            # capitalization NOTE: has to be done here for exceptional
            # spellings as well as after modifications.
            # capitalize sentence-initial token
            # TODO: check all places where capschar may duplicate
            if tidx == 0 and not tok.endswith(capschar):
                logger.count_morph_rules(('index-0 capitalize', 'capitalized'))
                tok = tok + capschar
            modified_words.append(tok)
            continue

        ## beginning of morph rules
        # look ahead (None sentinels at the last token)
        if tidx < len(toks) - 1:
            nextword = words[tidx + 1]
            nexttok = toks[tidx + 1]
            nextanalysis = sentence_analysis[nextword]
        else:
            nextword = None
            nexttok = None
            nextanalysis = None

        bw = analysis['bw']  # Buckwalter POS string, '+'-separated morphemes
        bwsplit = bw.split('+')
        bwending = bwsplit[-1]  # last morpheme tag (suffix)
        bwbeginning = bwsplit[0]  # first morpheme tag (proclitic)

        # rule 1 'lil': keep lil instead of li-al
        if analysis['prc0'] == 'Al_det' and analysis['prc1'] == 'li_prep':
            logger.count_morph_rules(('li+al', 'lil'))
            find = re.escape('لِ+ال')
            tok = re.sub(find, r'لِل', tok)
        # TODO: fix MADAMIRA bug for la prep that's supposed to be li, e.g.:
        # 'لنباتات': {'bw': 'la/PREP+nabAt/NOUN+At/NSUFF_FEM_PL+i/CASE_DEF_GEN',
        #             'gloss': 'plants;vegetation',
        #             'diac': 'لَنَباتاتِ',
        #             'lemma': 'نَبات_1',

        ## remove case endings
        # if word ends with direct object or possessive pronoun do nothing
        if ('DO' in bwending) or ('POSS_PRON' in bwending):
            logger.count_morph_rules(('case-ending', 'kept'))
            pass
        # elif word ends with case ending, nominal suffix, or a verb
        # (imperfective, perfective, and command)
        elif ('CASE' in bwending) or ('IV' in bwending) or (
                'PV' in bwending) or ('CV' in bwending) or ('NSUFF' in bwending):
            logger.count_morph_rules(('case-ending', 'removed'))
            # remove final diacritic, including alif for tanween
            tok = re.sub(r'اً(±)?$', r'\1', tok)  # alif tanween must be first
            tok = re.sub(r'[ًٌٍَُِ](±)?$', r'\1', tok)

        ## ta marbuta handling
        # spell ta-marbuta if it's in construct state
        if 'ة' in tok:
            logger.count_morph_rules(('ta-marbuta', 'total'))
            if analysis['stt'] == 'c':
                # caveat: cannot be construct if followed by prep
                # (additional rule for handling odd madamira analysis)
                if nextanalysis and 'PREP' in nextanalysis['bw'].split('+')[0]:
                    logger.count_morph_rules(
                        ('ta-marbuta', 'not-construct (followed by prep)'))
                    pass
                else:
                    logger.count_morph_rules(('ta-marbuta', 'construct'))
                    # put a sukun on the ta-marbuta for transliterator to spell it
                    tok = re.sub(r'ة', r'ةْ', tok)

        ## split single letter proclitic
        # split prepositions .. currently handling ب and ل only
        if 'PREP' in bwbeginning and len(
                bwsplit
        ) > 1:  # length condition to make sure letters are actual proclitics
            an = analysis['lemma'].split('_')[0]
            if an in {'لِ-', 'بِ'}:
                logger.count_morph_rules(('single-letter clitic', 'split'))
                tok = re.sub(r'([لب][َُِ]?)', r'\1-', tok)

        # capitalization
        # capitalize sentence-initial token
        # TODO: check all places where capschar may duplicate
        if tidx == 0 and not tok.endswith(capschar):
            logger.count_morph_rules(('index-0 capitalize', 'capitalized'))
            tok = tok + capschar
        ## conditions for capitalizing non-initial tokens
        elif tidx > 0:
            # look back variable
            before = toks[tidx - 1]
            # capitalize after period (a simple sentence segmenter)
            # TODO: what happens when periods are not sentence markers, e.g. acronyms
            if before == '.':
                logger.count_morph_rules(('after . capitalize', 'capitalized'))
                tok = tok + capschar
            # capitalize if gloss is capitalized and pos is proper noun or
            # adjective (and word is not arabic punctuation and not
            # capitalized for other reasons)
            if analysis['pos'] in {
                    'noun_prop', 'adj'
            } and analysis['gloss'] and analysis['gloss'][0].isupper(
            ) and word not in {'،', '؛'} and not tok.endswith(capschar):
                logger.count_morph_rules(
                    ('adj/nounprop capitalized gloss', 'capitalized'))
                tok = tok + capschar
            # elif pos in nounprop NOTE: this is to make sure proper nouns are
            # capitalized regardless of gloss and other conditions, but maybe
            # better to collapse with previous condition
            elif analysis['pos'] in {'noun_prop'
                                     } and not tok.endswith(capschar):
                logger.count_morph_rules(('nounprop', 'capitalized'))
                tok = tok + capschar

        ## append modified spellings to token holder
        # tokenization marker replacement: expand '+' clitic boundaries into
        # separate hyphen-suffixed tokens
        if '+' in tok:
            tmp = tok.replace('+', '- ').split(' ')
            modified_words += tmp
        else:
            modified_words.append(tok)

    # converting everything: transliterate each modified token
    transliterated_words = []
    for tok in modified_words:
        if tok:
            # capitalize NOTE: strip capschar THEN transliterate so the
            # marker itself is never fed to the transliterator
            if tok.endswith(capschar):
                tok = tok[:-1]
                transliterated_tok = translit(tok, loc_mapdict)
                transliterated_tok = capitalize_loc(transliterated_tok)
            else:
                transliterated_tok = translit(tok, loc_mapdict)
            transliterated_words.append(transliterated_tok)
        else:
            # empty tokens are dropped silently; anything else falsy is unexpected
            if tok == '':
                pass
            else:
                print(
                    f'whats this tok?: <{tok}>'
                )  #TODO: remove this since it doesn't seem to be doing anything
    transliterated_sentence = ' '.join(transliterated_words)
    return recompose(transliterated_sentence, mode='rom')