Python Epitran примеры использования

Язык программирования: Python

Пространство имен/Пакет: epitran

Класс/Тип: Epitran

Примеров на hotexamples.com: 13

Python Epitran — найдено 13 примеров. Это лучшие примеры Python-кода для epitran.Epitran, взятые из проектов с открытым исходным кодом. Вы можете оценивать каждый пример, чтобы помочь нам улучшить качество подборки.

Основные методы

Показать Скрыть

Epitran(12)

transliterate(4)

trans_list(1)

word_to_tuples(1)

Пример #1

Показать файл

Файл: tests.py Проект: FlavioAMiceli/phonemes

def epi_test():
    """Print two transliterations of the word 'Werkt'.

    The first uses an ``Epitran`` instance built for 'nld-Latn'; the second
    uses a ``Backoff`` instance built for ['eng-Latn', 'nld-Latn'].
    """
    # Dutch phonemes only.
    dutch_only = Epitran('nld-Latn')
    dutch_result = dutch_only.transliterate('Werkt')
    print(dutch_result)

    # English phonemes are tried first, Dutch is also considered.
    # This also shows whether flite works.
    english_then_dutch = Backoff(['eng-Latn', 'nld-Latn'])
    backoff_result = english_then_dutch.transliterate('Werkt')
    print(backoff_result)

Пример #2

Показать файл

Файл: tests.py Проект: FlavioAMiceli/phonemes

def compare_epi_to_dict():
    """Print how many dictionary entries differ from Epitran's output.

    Loads the JSON mapping stored in
    'g2p_dictionary/dutch_dic_to_phonetic.1.json', transliterates every key
    with ``Epitran('nld-Latn')`` and prints the number of keys whose stored
    value is not equal to that transliteration.
    """
    # The context manager closes the file even when json.load raises;
    # previously the handle was opened and never closed.
    with open('g2p_dictionary/dutch_dic_to_phonetic.1.json', 'r') as dict_file:
        g2p_dict = json.load(dict_file)

    epi = Epitran('nld-Latn')
    n = 0
    for key, dic_word in g2p_dict.items():
        if dic_word != epi.transliterate(key):
            n += 1
    print(n)

Пример #3

Показать файл

    def __init__(self, code, space_names):
        """Constructs VectorWithIPASpace object

        A VectorWithIPASpace object takes orthographic words, via the
        word_to_segs method, and returns a list of tuples consisting of category
        (letter or punctuation), letter case, orthographic form, phonetic form,
        id within an IPA space, and articulatory feature vector.

        Args:
            code (str): ISO 639-3 code joined to ISO 15924 code with "-"
            space_names (list): list of space names consisting of ISO 639-3
                                codes joined to ISO 15924 codes with "-"
        """
        # Epitran instance built from the language-script code.
        self.epi = Epitran(code)
        # Space instance built from the same code plus the requested space names.
        self.space = Space(code, space_names)

Пример #4

Показать файл

def get_tgt_code_and_transcriber(
        target: str,
        pron_dict: Optional[dict] = None,
        need_transcriber: bool = True) -> Tuple[str, G2P_func]:
    """Return the target language code and a grapheme-to-phoneme callable.

    Args:
        target: language identifier; 'roa-opt' is mapped to 'roa_opt', any
            other value is resolved through ``lookup(target).alpha_3``.
        pron_dict: optional mapping keyed by ``(tgt_code, token)``. When
            given, the returned callable looks tokens up in it and yields
            ``None`` for missing entries.
        need_transcriber: when false, no callable is built and ``None`` is
            returned in its place.

    Returns:
        A ``(tgt_code, tgt_g2p)`` pair.

    Raises:
        ValueError: if a transcriber is needed, no pronunciation dictionary
            is given and the language is not one of the codes listed below.
    """
    tgt_code = 'roa_opt' if target == 'roa-opt' else lookup(target).alpha_3

    if not need_transcriber:
        return tgt_code, None

    # Use pronunciation dictionary; a missing entry yields None.
    if pron_dict is not None:
        return tgt_code, lambda token: pron_dict.get((tgt_code, token), None)

    # Use epitran.
    epitran_languages = [
        'ita', 'spa', 'por', 'fra', 'cat', 'ron', 'deu', 'nld', 'swe'
    ]
    if tgt_code not in epitran_languages:
        raise ValueError(f'language {target} not supported.')
    return tgt_code, Epitran(f'{tgt_code}-Latn').transliterate

Пример #5

Показать файл

Файл: n_gram.py Проект: FlavioAMiceli/phonemes

	def get_data_stream(self, min_count=1, min_length=1):
		"""Yield the transliterated form of every accepted line of the corpus.

		Each line of ``self._file`` is passed to ``self.transliterate_line``
		together with the lower-cased JSON dictionary loaded from
		``self._dict_file`` and an ``Epitran`` built from ``self._epi_code``.
		The result is yielded only when ``self.validate_line`` accepts it
		for the given ``min_count`` and ``min_length``.
		"""
		with open(self._file, 'r') as corpus, \
			open(self._dict_file, 'r') as dict_handle:
			raw_entries = json.load(dict_handle)
			# Dictionary keys are lower-cased before being handed on.
			lowered_dict = {}
			for word, phonetic in raw_entries.items():
				lowered_dict[word.lower()] = phonetic
			transliterator = Epitran(self._epi_code)

			for raw_line in corpus:
				phonetic_words = self.transliterate_line(raw_line, lowered_dict, transliterator)
				if not self.validate_line(phonetic_words, min_count, min_length):
					continue
				yield phonetic_words

Пример #6

Показать файл

Файл: n_gram.py Проект: FlavioAMiceli/phonemes

	def set_corpus(self):
		"""Count every item produced by transliterating the corpus file.

		Each line of ``self._file`` is passed to ``self.transliterate_line``
		together with the lower-cased JSON dictionary loaded from
		``self._dict_file`` and an ``Epitran`` built from ``self._epi_code``.
		Every item of the result increments its entry in
		``self._count_table``.
		"""
		with open(self._file, 'r') as corpus, \
			open(self._dict_file, 'r') as dict_handle:
			lowered_dict = {
				word.lower(): phonetic
				for word, phonetic in json.load(dict_handle).items()
			}
			transliterator = Epitran(self._epi_code)

			for raw_line in corpus:
				for token in self.transliterate_line(raw_line, lowered_dict, transliterator):
					self._count_table[token] += 1

Пример #7

Показать файл

def main():
    """Read words from stdin and print the IPA symbols for each one.

    Command-line arguments: a positional epitran language code, an optional
    ``--print-word`` flag that prints the word before its IPA, and an
    optional ``--sep`` string placed between IPA symbols.
    """
    cli = argparse.ArgumentParser(prog="words2ipa.py")
    cli.add_argument("language", help="epitran language code (e.g., eng-Latn)")
    cli.add_argument(
        "--print-word", action="store_true", help="Print word before IPA"
    )
    cli.add_argument("--sep", help="Separator between IPA symbols (default: none)")
    options = cli.parse_args()

    transliterator = Epitran(options.language)
    # A missing or empty --sep means the symbols are joined with nothing.
    joiner = options.sep if options.sep else ""

    print("Reading words from stdin...", file=sys.stderr)
    for raw_line in sys.stdin:
        token = raw_line.strip()
        if not token:
            # Blank lines are skipped.
            continue

        if options.print_word:
            print(token, end=" ")

        symbols = transliterator.trans_list(token)
        print(joiner.join(symbols))

Пример #8

Показать файл

class VectorsWithIPASpace(object):
    """Pair an Epitran transliterator with an IPA space for segment lookup."""

    def __init__(self, code, space_names):
        """Construct a VectorsWithIPASpace object.

        The object takes orthographic words, via the word_to_segs method, and
        returns a list of tuples consisting of category (letter or
        punctuation), letter case, orthographic form, phonetic form, id
        within an IPA space, and articulatory feature vector.

        Args:
            code (str): ISO 639-3 code joined to ISO 15924 code with "-"
            space_names (list): list of space names consisting of ISO 639-3
                                codes joined to ISO 15924 codes with "-"
        """
        self.epi = Epitran(code)
        self.space = Space(code, space_names)

    def word_to_segs(self, word, normpunc=False):
        """Return feature vectors, etc. for segments and punctuation in a word.

        Args:
            word (unicode): Unicode string representing a word in the
                            orthography specified when the class is
                            instantiated
            normpunc (bool): normalize punctuation

        Returns:
            list: a list of tuples, each representing an IPA segment or a
                  punctuation character. Tuples consist of <category,
                  lettercase, orthographic_form, phonetic_form, id,
                  feature_vector>.

                  Category consists of the standard Unicode classes (e.g. 'L'
                  for letter and 'P' for punctuation). Case is binary: 1 for
                  uppercase and 0 for lowercase.

                  The id is looked up in the space by segment first, then by
                  orthographic form; it is -1 when neither is present.
        """
        result = []
        for cat, case, orth, phon, id_vec_list in self.epi.word_to_tuples(word, normpunc):
            # Items without a phonetic form get their orthographic form
            # normalized when normalization is requested and a mapping exists.
            if not phon and normpunc and orth in self.epi.puncnorm:
                orth = self.epi.puncnorm[orth]
            for segment, vector in id_vec_list:
                # Prefer the segment itself, fall back to the orthographic form.
                for candidate in (segment, orth):
                    if candidate in self.space:
                        id_ = int(self.space[candidate])
                        break
                else:
                    id_ = -1
                result.append((cat, case, orth, phon, id_, vector))
        return result

Пример #9

Показать файл

Файл: space.py Проект: wcr2000/OCR_LPN

    def __init__(self, code, space_names):
        """Construct a Space object

        Space objects take strings (corresponding to segments) and return
        integers, placing them in an integer space that can be translated into
        a one-hot vector.

        The resulting object has a dictionary-like interface that supports
        indexing and iteration over "keys".

        Args:
            code (str): ISO 639-3 code joined to ISO 15924 code with "-"
            space_names (list): list of space names consisting of ISO 639-3
                                codes joined to ISO 15924 codes with "-"
        """
        # Epitran instance built from the language-script code.
        self.epi = Epitran(code)
        # Whatever _load_space returns for the requested space names is kept
        # as the backing store of this object.
        self.dict = self._load_space(space_names)

Пример #10

Показать файл

def ger_to_ipa(text: str) -> str:
    """Transliterate *text* with the cached 'deu-Latn' Epitran instance.

    The instance is created on first use and stored in ``_epitran_cache``
    under the ``Language.GER`` key, so later calls reuse it.
    """
    cache_key = Language.GER
    if cache_key not in _epitran_cache:
        _epitran_cache[cache_key] = Epitran('deu-Latn')
    return _epitran_cache[cache_key].transliterate(text)

Пример #11

Показать файл

def en_to_ipa(text: str) -> str:
    """Transliterate *text* with the cached 'eng-Latn' Epitran instance.

    The instance is created on first use and stored in ``_epitran_cache``
    under the ``Language.ENG`` key, so later calls reuse it.
    """
    cache_key = Language.ENG
    if cache_key not in _epitran_cache:
        _epitran_cache[cache_key] = Epitran('eng-Latn')
    return _epitran_cache[cache_key].transliterate(text)

Пример #12

Показать файл

from epitran import Epitran
from util import lang2ISO
import codecs
import argparse

if __name__ == '__main__':
    # Script entry point: read a tab-separated file of (lemma, word form,
    # tags) rows and report rows that are purely numeric or for which
    # Epitran returns an empty transliteration.
    parser = argparse.ArgumentParser("Clean Data")
    for positional in ('fn', 'lang'):
        parser.add_argument(positional, metavar=positional)

    args = parser.parse_args()
    fn, lang = args.fn, args.lang

    with codecs.open(fn, "r", encoding='utf-8') as file:
        lines = [row.strip().split('\t') for row in file]

        iso = lang2ISO(lang)
        epi = Epitran(iso)
        for lemma, wf, tags in lines:
            if lemma.isdigit() or wf.isdigit():
                print("Digit! %s, %s" % (lemma, wf))
                continue
            # Either form transliterating to something falsy is reported.
            if not epi.transliterate(lemma) or not epi.transliterate(wf):
                print("Cannot transliterate! %s, %s" % (lemma, wf))

Пример #13

Показать файл

     to_rectify = [('g', 'ɡ'), ('gʷ', 'ɡʷ'), ('h', 'x'), ('hʷ', 'xʷ'),
                   ('ɛ', 'e'), ('ɣ', 'ɡ'), ('ɔ', 'o')]
     non_transcriber = OldNorseTranscription()
     desc[ipa_col] = desc[form_col].apply(
         lambda s: non_transcriber.transcribe(s).strip('[]')).apply(
             i2t).apply(lambda lst: [replace(x, to_rectify) for x in lst])
 elif lang in ['it', 'es', 'fr', 'uk', 'pl', 'ru']:
     lang2code = {
         'it': 'ita-Latn',
         'es': 'spa-Latn',
         'fr': 'fra-Latn',
         'ru': 'rus-Cyrl',
         'uk': 'ukr-Cyrl',
         'pl': 'pol-Latn'
     }
     transcriber = Epitran(lang2code[lang])
     ipa_col = f'{lang}_ipa'
     form_col = 'desc_form'
     # Italian doesn't have phonemic diphthongs.
     merge_vowels = lang != 'it'
     desc[ipa_col] = desc[form_col].apply(
         lambda s: i2t(transcriber.transliterate(s).replace('ˈ', '').
                       replace('ˌ', '').replace("'", ''),
                       merge_vowels=merge_vowels))
     to_normalize = list()
     if lang == 'ru':
         to_normalize = [('á', 'a'), ('ó', 'o'), ('é', 'e'), ('ú', 'u'),
                         ('ɨ́', 'ɨ'), ('í', 'i'), ('t͡ɕʲ', 't͡ɕ'),
                         ('ʂʲ', 'ʂ')]
     elif lang == 'uk':
         to_normalize = [('ɑ́', 'ɑ'), ('ɔ́', 'ɔ'), ('ɛ́', 'ɛ'), ('í', 'i'),