def corpus2dictionary(corpus, project_name):
    """Build a pronunciation dictionary and a phone list from a corpus.

    Every whitespace-separated word of every line in *corpus* is phonetised
    with ``phonetise_Arabic.phonetise_word``. The pronunciations are grouped
    under the word with its diacritics removed
    (``arabic_utils.remove_diacritics``) and written to
    ``<project_name>.dic``. Every phone seen, plus ``SIL``, is written to
    ``<project_name>.phone``, one per line in sorted order.

    In the ``.dic`` file the first pronunciation of a word is written as
    ``word<TAB><TAB>phones`` and further ones as ``word(N)<TAB><TAB>phones``
    with ``N`` starting at 2.

    Args:
        corpus: Iterable of text lines. When the module-level ``args.s_tag``
            is truthy, the sentence is the text between ``<s>`` and ``</s>``
            on each line; lines without such a pair are skipped.
        project_name: Path prefix of the two output files.
    """
    pronunciation_dict = {}
    phones_list = {'SIL'}
    for line in corpus:
        if args.s_tag:
            match = re.search('<s>(.*)</s>', line)
            if match is None:
                # Blank or untagged line: nothing to phonetise. Calling
                # .group() on the missing match would raise AttributeError.
                continue
            sentence = match.group(1)
        else:
            sentence = line
        for word in sentence.split():
            pronunciations = phonetise_Arabic.phonetise_word(word)
            for pronunciation in pronunciations:
                phones_list.update(pronunciation.split())
            clean_word = arabic_utils.remove_diacritics(word)
            pronunciation_dict.setdefault(clean_word,
                                          set()).update(pronunciations)

    print('writing dic file')
    # Use the project_name argument rather than a module-level global so the
    # function also works when it is imported and called directly.
    with open(project_name + '.dic', mode='w',
              encoding='utf-8') as dict_writer:
        for w in sorted(pronunciation_dict):
            # Sort the variants: iterating the set directly would number
            # them in an order that can change from run to run.
            for i, phone in enumerate(sorted(pronunciation_dict[w])):
                if i == 0:
                    dict_writer.write('{}\t\t{}\n'.format(w, phone))
                else:
                    dict_writer.write('{}({})\t\t{}\n'.format(
                        w, (i + 1), phone))

    print('writing phone file')
    with open(project_name + '.phone', mode='w',
              encoding='utf-8') as phone_writer:
        for ph in sorted(phones_list):
            phone_writer.write(ph)
            phone_writer.write('\n')
def corpus2dictionary(corpus, project_name):
    """Build diacritised and diacritic-free dictionaries plus a phone list.

    Every whitespace-separated word of every line in *corpus* is phonetised
    with ``phonetise_Arabic.phonetise_word``. The resulting pronunciations
    are appended, duplicates included, to two word -> list mappings: one
    keyed by the word as it appears in the corpus and one keyed by the word
    with its diacritics removed (``arabic_utils.remove_diacritics``).

    Both mappings are passed to ``calculateFrequencies`` and then to
    ``writeFile``, producing ``<project_name>_moshakal.dic`` and
    ``<project_name>_cleaned.dic``. Every phone seen, plus ``SIL``, is
    written to ``<project_name>.phone``, one per line in sorted order.

    Args:
        corpus: Iterable of text lines. When the module-level ``args.s_tag``
            is truthy, the sentence is the text between ``<s>`` and ``</s>``
            on each line; lines without such a pair are skipped.
        project_name: Path prefix of the three output files.
    """
    pronunciation_dict = {}
    pronunciation_dict_cleaned = {}
    phones_list = {'SIL'}
    for line in corpus:
        if args.s_tag:
            match = re.search('<s>(.*)</s>', line)
            if match is None:
                # Blank or untagged line: nothing to phonetise. Calling
                # .group() on the missing match would raise AttributeError.
                continue
            sentence = match.group(1)
        else:
            sentence = line
        for word in sentence.split():
            pronunciations = phonetise_Arabic.phonetise_word(word)
            for pronunciation in pronunciations:
                phones_list.update(pronunciation.split())

            cleaned_word = arabic_utils.remove_diacritics(word)
            # Lists, not sets: repeated pronunciations are kept on purpose
            # because the mappings are handed to calculateFrequencies below.
            pronunciation_dict.setdefault(word, []).extend(pronunciations)
            pronunciation_dict_cleaned.setdefault(cleaned_word,
                                                  []).extend(pronunciations)

    # NOTE(review): the return values are discarded, so calculateFrequencies
    # presumably updates the mappings in place - confirm against its
    # definition.
    calculateFrequencies(pronunciation_dict)
    calculateFrequencies(pronunciation_dict_cleaned)

    print('writing 2 dic files')
    # Use the project_name argument rather than a module-level global so the
    # function also works when it is imported and called directly.
    writeFile(pronunciation_dict, project_name + '_moshakal.dic')
    writeFile(pronunciation_dict_cleaned, project_name + '_cleaned.dic')

    print('writing phone file')
    with open(project_name + '.phone', mode='w',
              encoding='utf-8') as phone_writer:
        for ph in sorted(phones_list):
            phone_writer.write(ph)
            phone_writer.write('\n')
                    help='project name',
                    required=True)

if __name__ == '__main__':
    # Convert a lexicon whose head words are Buckwalter-transliterated into
    # a dictionary keyed by the diacritic-free Arabic spelling.
    # Each non-blank input line is "<word> <phone> <phone> ...".
    args = parser.parse_args()
    lines = args.input.readlines()
    proj_name = args.project_name
    cmu_dict = {}
    phones_set = set()
    ad = AlphabetDetector()
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        word = fields[0]
        phones = ' '.join(fields[1:])
        arabic_word = arabic_utils.remove_diacritics(buckwalterToArabic(word))
        print(word, arabic_word)
        cmu_dict.setdefault(arabic_word, set()).add(phones)
        # Collect phones for every entry, not only the first pronunciation
        # of each word, so the phone inventory is complete.
        phones_set.update(fields[1:])

    print('writing dic file')
    with open(proj_name + '.dic', mode='w', encoding='utf-8') as dict_writer:
        for w in sorted(cmu_dict):
            # Write every pronunciation in a stable order. The first is
            # written as "word", later ones as "word(2)", "word(3)", ...
            for i, pronunciation in enumerate(sorted(cmu_dict[w])):
                if i == 0:
                    dict_writer.write('{}\t\t{}\n'.format(w, pronunciation))
                else:
                    dict_writer.write('{}({})\t\t{}\n'.format(
                        w, i + 1, pronunciation))
Пример #4
0
def phonetise_word(arabic_word):
    """Return the candidate pronunciations of a single word.

    The word is wrapped in a one-element ``utterances`` list, passed through
    ``convert``, and the items of the converted utterance are walked
    character by character. Each character may add phones via the lookup
    tables in ``constants`` and the helpers in ``handle_characters``. The
    collected phones are expanded with
    ``get_different_possible_pronounciations`` and de-duplicated with
    ``remove_duplicates``.

    Args:
        arabic_word: The word to phonetise.

    Returns:
        A list of strings, each one candidate pronunciation with its
        elements joined by single spaces. Candidates with fewer elements
        than the diacritic-free word has characters are dropped.
    """
    utterances = [arabic_word]
    # From here on arabic_word is only used by the length filter in the
    # return statement.
    arabic_word = arabic_utils.remove_diacritics(arabic_word)
    result = ''  # Pronunciations Dictionary
    utterances_pronunciations = [
    ]  # Most likely pronunciation for all utterances
    utterances_pronunciations_with_boundaries = [
    ]  # Most likely pronunciation for all utterances
    pronunciations = []
    phones = []
    # -----------------------------------------------------------------------------------------------------
    # Loop through utterances------------------------------------------------------------------------------
    # -----------------------------------------------------------------------------------------------------
    # NOTE(review): utterance_number, word_index and the two
    # utterances_pronunciations* lists are updated below but never read in
    # this function.
    utterance_number = 1
    for utterance in utterances:
        utterance_number += 1
        utterances_pronunciations.append(
            '')  # Add empty entry that will hold this utterance's pronunciation
        # Add empty entry that will hold this utterance's pronunciation
        utterances_pronunciations_with_boundaries.append('')

        utterance = convert(utterance)
        # ---------------------------
        word_index = -1

        # Loop through words
        for word in utterance:
            word_index += 1
            if word not in [u'-', u'sil']:
                pronunciations = [
                ]  # Start with empty set of possible pronunciations of current word
                # Add fixed irregular pronunciations if possible
                result = isFixedWord2(word, result, word, pronunciations)
                # Indicates whether current character is in an emphatic context or not. Starts with False
                emphaticContext = False
                # End/beginning-of-word padding. The two characters on each
                # side keep word[index - 2] and word[index + 2] in range in
                # the character loop below.
                word = u'##' + word + u'##'

                phones = [
                ]  # Empty list which will hold individual possible word's pronunciation

        # -----------------------------------------------------------------------------------
        # MAIN LOOP: here is where the Modern Standard Arabic phonetisation rule-set starts--
        # -----------------------------------------------------------------------------------
        # NOTE(review): this loop is indented at utterance level, after the
        # word loop has finished, so it only sees the value `word` held when
        # that loop ended. That value is padded with '##' only if the last
        # item was not u'-' or u'sil'. Confirm this is intended and not an
        # indentation slip.
        for index in range(2, len(word) - 2):
            letter = word[index]  # Current Character
            nextCharacter = word[index + 1]  # Next Character
            afterNextCharacter = word[index + 2]  # Next-Next Character
            previousCharacter = word[index - 1]  # Previous Character
            beforePreviousCharacter = word[index -
                                           2]  # Before Previous Character

            emphaticContext = emphatic_context.getState(letter, nextCharacter)
            if letter in constants.unambiguousConsonantMap:
                phones.append(constants.unambiguousConsonantMap[letter])
            # ----------------------------------------------------------------------------------------------------------------
            if letter == u'l':  # Lam is a consonant which requires special treatment
                phones += handle_characters.lam(beforePreviousCharacter,
                                                previousCharacter,
                                                nextCharacter,
                                                afterNextCharacter)
            # ----------------------------------------------------------------------------------------------------------------
            # shadda just doubles the letter before it
            # (skipped after u'w' / u'y' and when no phone has been collected yet)
            if letter == u'~' and previousCharacter not in [
                    u'w', u'y'
            ] and len(phones) > 0:
                phones[-1] += phones[-1]
            # ----------------------------------------------------------------------------------------------------------------
            # NOTE(review): this passes `emphatic_context` (the object whose
            # getState() is called above), not the local `emphaticContext`
            # flag that the other handlers receive - confirm which one
            # madda() expects.
            if letter == u'|':  # Madda only changes based on emphaticness
                phones += handle_characters.madda(emphatic_context)
            # ----------------------------------------------------------------------------------------------------------------
            if letter == u'p':  # Ta' marboota is determined by the following if it is a diacritic or not
                phones += handle_characters.p(nextCharacter)

            if letter in constants.vowelMap:
                # Waw and Ya are complex they could be consonants or vowels and their gemination is complex as
                # it could be a combination of a vowel and consonants
                phones += handle_characters.handle_vowels(
                    previousCharacter, letter, nextCharacter,
                    afterNextCharacter, emphaticContext)
                # Kasra and Damma could be mildened if before a final silent consonant
                if letter in [u'u', u'i']:
                    phones += handle_characters.kasra_and_damma(
                        word, letter, emphaticContext, nextCharacter,
                        afterNextCharacter)
                # Alif could be omitted in definite article and beginning of some words
                if letter in [u'a', u'A', u'Y']:
                    phones += handle_characters.alef(beforePreviousCharacter,
                                                     previousCharacter, letter,
                                                     nextCharacter,
                                                     emphaticContext)
    # Expand the collected phones into the possible pronunciations and add
    # them to the list that isFixedWord2 was given for the last processed word.
    pronunciations += get_different_possible_pronounciations(phones)
    pronunciations = remove_duplicates(pronunciations)

    # Keep only candidates with at least as many elements as the
    # diacritic-free word has characters.
    return [
        ' '.join(item) for item in pronunciations
        if len(item) >= len(arabic_word)
    ]
                    help='project name',
                    required=True)

if __name__ == '__main__':
    # Convert a lexicon whose head words are Buckwalter-transliterated into
    # a dictionary keyed by the diacritic-free Arabic spelling.
    # Each non-blank input line is "<word> <phone> <phone> ...".
    args = parser.parse_args()
    lines = args.input.readlines()
    proj_name = args.project_name
    cmu_dict = {}
    phones_set = set()
    ad = AlphabetDetector()
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        word = fields[0]
        phones = ' '.join(fields[1:])
        arabic_word = arabic_utils.remove_diacritics(
            phonetise_Arabic.buckwalterToArabic(word))
        print(word, arabic_word)
        cmu_dict.setdefault(arabic_word, set()).add(phones)
        # Collect phones for every entry, not only the first pronunciation
        # of each word, so the phone inventory is complete.
        phones_set.update(fields[1:])

    print('writing dic file')
    with open(proj_name + '.dic', mode='w', encoding='utf-8') as dict_writer:
        for w in sorted(cmu_dict):
            # Write every pronunciation in a stable order. The first is
            # written as "word", later ones as "word(2)", "word(3)", ...
            for i, pronunciation in enumerate(sorted(cmu_dict[w])):
                if i == 0:
                    dict_writer.write('{}\t\t{}\n'.format(w, pronunciation))
                else:
                    dict_writer.write('{}({})\t\t{}\n'.format(
                        w, i + 1, pronunciation))