def corpus2dictionary(corpus, project_name):
    """Build a pronunciation dictionary and a phone list from a text corpus.

    Every whitespace-separated word of every line is passed to
    ``phonetise_Arabic.phonetise_word``.  The returned pronunciations are
    grouped under the diacritic-free form of the word (as produced by
    ``arabic_utils.remove_diacritics``) and every phone symbol is collected.

    Two files are written:

    * ``<project_name>.dic`` -- one line per pronunciation; the first one is
      written as ``word<TAB><TAB>phones`` and further ones as
      ``word(N)<TAB><TAB>phones`` with ``N`` starting at 2.
    * ``<project_name>.phone`` -- one phone per line, sorted; always
      contains ``SIL``.

    The function reads the module-level ``args.s_tag`` flag: when it is set,
    only the text between ``<s>`` and ``</s>`` of each line is used.

    Args:
        corpus: Iterable of text lines.
        project_name: Prefix (path without extension) of the output files.
    """
    pronunciation_dict = {}
    phones_list = {'SIL'}

    for line in corpus:
        if args.s_tag:
            match = re.search('<s>(.*)</s>', line)
            if match is None:
                # A line without sentence markup (e.g. a blank line) holds
                # nothing to phonetise; skip it instead of failing on
                # ``None.group``.
                continue
            sentence = match.group(1)
        else:
            sentence = line

        for word in sentence.split():
            pronunciations = phonetise_Arabic.phonetise_word(word)
            for pronunciation in pronunciations:
                phones_list.update(pronunciation.split())

            clean_word = arabic_utils.remove_diacritics(word)
            pronunciation_dict.setdefault(clean_word, set()).update(
                pronunciations)

    print('writing dic file')
    # Use the parameter rather than a module global so the function also
    # works when it is called from outside the script's __main__ section.
    with open(project_name + '.dic', mode='w',
              encoding='utf-8') as dict_writer:
        for w, phones in sorted(pronunciation_dict.items()):
            # Sets have no stable iteration order; sorting makes the variant
            # numbering reproducible from run to run.
            for i, phone in enumerate(sorted(phones)):
                if i == 0:
                    dict_writer.write('{}\t\t{}\n'.format(w, phone))
                else:
                    dict_writer.write('{}({})\t\t{}\n'.format(
                        w, i + 1, phone))

    print('writing phone file')
    with open(project_name + '.phone', mode='w',
              encoding='utf-8') as phone_writer:
        phone_writer.writelines(ph + '\n' for ph in sorted(phones_list))
def corpus2dictionary(corpus, project_name):
    """Build diacritised and diacritic-free pronunciation dictionaries.

    Every whitespace-separated word of every line is passed to
    ``phonetise_Arabic.phonetise_word``.  The returned pronunciations are
    appended (duplicates kept) to two dictionaries: one keyed by the word as
    it appears in the corpus and one keyed by the result of
    ``arabic_utils.remove_diacritics``.  Both dictionaries are then handed to
    ``calculateFrequencies`` and written with ``writeFile``.

    Output files:

    * ``<project_name>_moshakal.dic`` -- dictionary keyed by original words.
    * ``<project_name>_cleaned.dic`` -- dictionary keyed by cleaned words.
    * ``<project_name>.phone`` -- one phone per line, sorted; always
      contains ``SIL``.

    The function reads the module-level ``args.s_tag`` flag: when it is set,
    only the text between ``<s>`` and ``</s>`` of each line is used.

    Args:
        corpus: Iterable of text lines.
        project_name: Prefix (path without extension) of the output files.
    """
    pronunciation_dict = {}
    pronunciation_dict_cleaned = {}
    phones_list = {'SIL'}

    for line in corpus:
        if args.s_tag:
            match = re.search('<s>(.*)</s>', line)
            if match is None:
                # A line without sentence markup (e.g. a blank line) holds
                # nothing to phonetise; skip it instead of failing on
                # ``None.group``.
                continue
            sentence = match.group(1)
        else:
            sentence = line

        for word in sentence.split():
            pronunciations = phonetise_Arabic.phonetise_word(word)
            for pronunciation in pronunciations:
                phones_list.update(pronunciation.split())

            cleaned_word = arabic_utils.remove_diacritics(word)
            # Lists (not sets) on purpose: repeated pronunciations are kept
            # so that they are still present when calculateFrequencies runs.
            pronunciation_dict.setdefault(word, []).extend(pronunciations)
            pronunciation_dict_cleaned.setdefault(cleaned_word, []).extend(
                pronunciations)

    # The return values are not used here, so both calls are presumably
    # expected to update the dictionaries in place -- confirm against
    # calculateFrequencies.
    calculateFrequencies(pronunciation_dict)
    calculateFrequencies(pronunciation_dict_cleaned)

    print('writing 2 dic files')
    # Use the parameter rather than a module global so the function also
    # works when it is called from outside the script's __main__ section.
    writeFile(pronunciation_dict, project_name + '_moshakal.dic')
    writeFile(pronunciation_dict_cleaned, project_name + '_cleaned.dic')

    print('writing phone file')
    with open(project_name + '.phone', mode='w',
              encoding='utf-8') as phone_writer:
        phone_writer.writelines(ph + '\n' for ph in sorted(phones_list))
help='project name', required=True)

# Script entry point: read a pronunciation lexicon whose lines look like
# "<word> <phone> <phone> ...", convert each word with buckwalterToArabic,
# strip its diacritics, and write the result to "<project_name>.dic".
if __name__ == '__main__':
    args = parser.parse_args()
    lines = args.input.readlines()
    proj_name = args.project_name
    cmu_dict = {}  # diacritic-free Arabic word -> set of phone strings
    phones_set = set()  # every distinct phone symbol seen in the input
    # NOTE(review): `ad` is not used in the part of the script visible here.
    ad = AlphabetDetector()
    for line in lines:
        if not line.strip():
            continue  # ignore blank lines
        # First token is the word, the remaining tokens are its phones.
        word = line.split()[0]
        phones = ' '.join(line.split()[1:])
        arabic_word = arabic_utils.remove_diacritics(buckwalterToArabic(word))
        print(word, arabic_word)
        # Different input words may map to the same cleaned word, so the
        # pronunciations are accumulated in a set per cleaned word.
        if arabic_word in cmu_dict:
            cmu_dict[arabic_word].add(phones)
        else:
            cmu_dict[arabic_word] = {phones}
        for ph in phones.split():
            phones_set.add(ph)
    print('writing dic file')
    with open(proj_name + '.dic', mode='w', encoding='utf-8') as dict_writer:
        for w, ph in sorted(cmu_dict.items()):
            # set.pop() removes and returns an arbitrary element.
            # NOTE(review): as visible here both branches write the same
            # single entry, so any further pronunciations left in the set
            # are not written -- confirm whether more code follows.
            if len(ph) == 1:
                dict_writer.write('{}\t\t{}\n'.format(w, ph.pop()))
            else:
                dict_writer.write('{}\t\t{}\n'.format(w, ph.pop()))
def phonetise_word(arabic_word):
    """Return the candidate pronunciations of a single word.

    The word is passed through ``convert`` and each resulting item is
    scanned character by character; every rule below may append entries to
    ``phones``.  The possible readings are then expanded with
    ``get_different_possible_pronounciations`` and de-duplicated.

    Args:
        arabic_word: The word to phonetise.  The rules compare characters
            against ASCII symbols such as ``l``, ``~`` and ``|``, so
            ``convert`` presumably transliterates the input first -- TODO
            confirm against ``convert``.

    Returns:
        A list of strings, each one pronunciation with its phones joined by
        single spaces.  Pronunciations with fewer entries than the
        diacritic-free word has characters are dropped.
    """
    utterances = [arabic_word]
    # From here on `arabic_word` is the diacritic-free form; it is only used
    # for the length filter in the return statement.
    arabic_word = arabic_utils.remove_diacritics(arabic_word)
    result = ''
    # Pronunciations Dictionary
    utterances_pronunciations = []  # Most likely pronunciation for all utterances
    utterances_pronunciations_with_boundaries = []  # Most likely pronunciation for all utterances
    pronunciations = []
    phones = []
    # -----------------------------------------------------------------------------------------------------
    # Loop through utterances------------------------------------------------------------------------------
    # -----------------------------------------------------------------------------------------------------
    # NOTE(review): `utterance_number` and `word_index` are updated but
    # never read in this function.
    utterance_number = 1
    for utterance in utterances:
        utterance_number += 1
        utterances_pronunciations.append('')  # Add empty entry that will hold this utterance's pronunciation
        # Add empty entry that will hold this utterance's pronunciation
        utterances_pronunciations_with_boundaries.append('')
        utterance = convert(utterance)
        # ---------------------------
        word_index = -1
        # Loop through words
        for word in utterance:
            word_index += 1
            if word not in [u'-', u'sil']:
                pronunciations = []  # Start with empty set of possible pronunciations of current word
                # Add fixed irregular pronunciations if possible
                result = isFixedWord2(word, result, word, pronunciations)
                # Indicates whether current character is in an emphatic context or not. Starts with False
                emphaticContext = False
                # This is the end/beginning of word symbol, just for
                # convenience: it lets the loop below read two characters
                # before and after every real character without bounds checks.
                word = u'##' + word + u'##'
                phones = []  # Empty list which will hold individual possible word's pronunciation
                # -----------------------------------------------------------------------------------
                # MAIN LOOP: here is where the Modern Standard Arabic phonetisation rule-set starts--
                # -----------------------------------------------------------------------------------
                # Indices 2 .. len(word) - 3 skip the '##' padding on both sides.
                for index in range(2, len(word) - 2):
                    letter = word[index]  # Current Character
                    nextCharacter = word[index + 1]  # Next Character
                    afterNextCharacter = word[index + 2]  # Next-Next Character
                    previousCharacter = word[index - 1]  # Previous Character
                    beforePreviousCharacter = word[index - 2]  # Before Previous Character
                    emphaticContext = emphatic_context.getState(letter, nextCharacter)
                    if letter in constants.unambiguousConsonantMap:
                        phones.append(constants.unambiguousConsonantMap[letter])
                    # ----------------------------------------------------------------------------------------------------------------
                    if letter == u'l':  # Lam is a consonant which requires special treatment
                        phones += handle_characters.lam(beforePreviousCharacter,
                                                        previousCharacter,
                                                        nextCharacter,
                                                        afterNextCharacter)
                    # ----------------------------------------------------------------------------------------------------------------
                    # shadda just doubles the letter before it
                    if letter == u'~' and previousCharacter not in [
                            u'w', u'y'
                    ] and len(phones) > 0:
                        phones[-1] += phones[-1]
                    # ----------------------------------------------------------------------------------------------------------------
                    if letter == u'|':  # Madda only changes based on emphaticness
                        # NOTE(review): this passes `emphatic_context` (the
                        # object whose getState() is called above), not the
                        # local `emphaticContext` value the other handlers
                        # receive -- confirm this is intended.
                        phones += handle_characters.madda(emphatic_context)
                    # ----------------------------------------------------------------------------------------------------------------
                    if letter == u'p':  # Ta' marboota is determined by the following if it is a diacritic or not
                        phones += handle_characters.p(nextCharacter)
                    if letter in constants.vowelMap:
                        # Waw and Ya are complex they could be consonants or vowels and their gemination is complex as
                        # it could be a combination of a vowel and consonants
                        phones += handle_characters.handle_vowels(
                            previousCharacter, letter, nextCharacter,
                            afterNextCharacter, emphaticContext)
                        # NOTE(review): confirm that the two checks below
                        # are meant to sit under the vowelMap test.
                        # Kasra and Damma could be mildened if before a final silent consonant
                        if letter in [u'u', u'i']:
                            phones += handle_characters.kasra_and_damma(
                                word, letter, emphaticContext, nextCharacter,
                                afterNextCharacter)
                        # Alif could be omitted in definite article and beginning of some words
                        if letter in [u'a', u'A', u'Y']:
                            phones += handle_characters.alef(beforePreviousCharacter,
                                                             previousCharacter,
                                                             letter,
                                                             nextCharacter,
                                                             emphaticContext)
                # Expand the collected phones into the possible readings and
                # add them to whatever isFixedWord2 may have put in the list.
                pronunciations += get_different_possible_pronounciations(phones)
                pronunciations = remove_duplicates(pronunciations)
    # Keep only pronunciations with at least as many entries as the
    # diacritic-free word has characters.
    return [
        ' '.join(item) for item in pronunciations
        if len(item) >= len(arabic_word)
    ]
help='project name', required=True)

# Script entry point: read a pronunciation lexicon whose lines look like
# "<word> <phone> <phone> ...", convert each word with
# phonetise_Arabic.buckwalterToArabic, strip its diacritics, and write the
# result to "<project_name>.dic".
if __name__ == '__main__':
    args = parser.parse_args()
    lines = args.input.readlines()
    proj_name = args.project_name
    cmu_dict = {}  # diacritic-free Arabic word -> set of phone strings
    phones_set = set()  # every distinct phone symbol seen in the input
    # NOTE(review): `ad` is not used in the part of the script visible here.
    ad = AlphabetDetector()
    for line in lines:
        if not line.strip():
            continue  # ignore blank lines
        # First token is the word, the remaining tokens are its phones.
        word = line.split()[0]
        phones = ' '.join(line.split()[1:])
        arabic_word = arabic_utils.remove_diacritics(
            phonetise_Arabic.buckwalterToArabic(word))
        print(word, arabic_word)
        # Different input words may map to the same cleaned word, so the
        # pronunciations are accumulated in a set per cleaned word.
        if arabic_word in cmu_dict:
            cmu_dict[arabic_word].add(phones)
        else:
            cmu_dict[arabic_word] = {phones}
        for ph in phones.split():
            phones_set.add(ph)
    print('writing dic file')
    with open(proj_name + '.dic', mode='w', encoding='utf-8') as dict_writer:
        for w, ph in sorted(cmu_dict.items()):
            # set.pop() removes and returns an arbitrary element.
            # NOTE(review): as visible here both branches write the same
            # single entry, so any further pronunciations left in the set
            # are not written -- confirm whether more code follows.
            if len(ph) == 1:
                dict_writer.write('{}\t\t{}\n'.format(w, ph.pop()))
            else:
                dict_writer.write('{}\t\t{}\n'.format(w, ph.pop()))