def __init__(self, code, preproc=True, postproc=True, ligatures=False,
             rev=False, rev_preproc=True, rev_postproc=True):
    """Constructs the backend object epitran uses for most languages.

    Args:
        code (str): ISO 639-3 code and ISO 15924 code joined with a hyphen
        preproc (bool): if True, apply preprocessor
        postproc (bool): if True, apply postprocessors
        ligatures (bool): if True, use phonetic ligatures for affricates
                          instead of standard IPA
        rev (bool): if True, load reverse transliteration
        rev_preproc (bool): if True, apply preprocessor when reverse
                            transliterating
        rev_postproc (bool): if True, apply postprocessor when reverse
                             transliterating
    """
    self.rev = rev
    # Forward mapping table and the regex that greedily matches its keys.
    self.g2p = self._load_g2p_map(code, False)
    self.regexp = self._construct_regex(self.g2p.keys())
    # Shared helpers for normalization and feature extraction.
    self.puncnorm = PuncNorm()
    self.ft = panphon.FeatureTable()
    self.num_panphon_fts = len(self.ft.names)
    self.preprocessor = PrePostProcessor(code, 'pre', False)
    self.postprocessor = PrePostProcessor(code, 'post', False)
    self.strip_diacritics = StripDiacritics(code)
    # Per-call behavior toggles.
    self.preproc, self.postproc = preproc, postproc
    self.ligatures = ligatures
    self.rev_preproc, self.rev_postproc = rev_preproc, rev_postproc
    if rev:
        # Reverse (IPA -> orthography) machinery is loaded only on demand.
        self.rev_g2p = self._load_g2p_map(code, True)
        self.rev_regexp = self._construct_regex(self.rev_g2p.keys())
        self.rev_preprocessor = PrePostProcessor(code, 'pre', True)
        self.rev_postprocessor = PrePostProcessor(code, 'post', True)
    # Tally of characters that could not be mapped.
    self.nils = defaultdict(int)
def __init__(self, lang_script_codes, cedict_file=None):
    """Construct a Backoff object.

    Args:
        lang_script_codes (list): codes for languages to try, starting
                                  with the highest priority languages
        cedict_file (str): path to the CC-CEdict dictionary file
                           (necessary only when cmn-Hans or cmn-Hant are
                           used)
    """
    # One Epitran backend and one diacritic stripper per candidate code,
    # kept in priority order.
    self.langs = [_epitran.Epitran(lang_code, cedict_file=cedict_file)
                  for lang_code in lang_script_codes]
    self.dias = [StripDiacritics(lang_code)
                 for lang_code in lang_script_codes]
    # \p{Number} requires the third-party `regex`-style engine bound to
    # the module-level name `re`.
    self.num_re = re.compile(r'\p{Number}+')
    self.ft = panphon.featuretable.FeatureTable()
    self.xsampa = XSampa()
    self.puncnorm = PuncNorm()
class SimpleEpitran(object):
    """Grapheme-to-phoneme transliteration backend driven by a CSV mapping
    table, with optional language-specific pre- and post-processing."""

    def __init__(self, code, preproc=True, postproc=True, ligatures=False):
        """Constructs the backend object epitran uses for most languages.

        Args:
            code (str): ISO 639-3 code and ISO 15924 code joined with a hyphen
            preproc (bool): if True, apply preprocessor
            postproc (bool): if True, apply postprocessors
            ligatures (bool): if True, use phonetic ligatures for affricates
                              instead of standard IPA
        """
        self.g2p = self._load_g2p_map(code)
        self.regexp = self._construct_regex()
        self.puncnorm = PuncNorm()
        self.ft = panphon.FeatureTable()
        self.num_panphon_fts = len(self.ft.names)
        self.preprocessor = PrePostProcessor(code, 'pre')
        self.postprocessor = PrePostProcessor(code, 'post')
        self.strip_diacritics = StripDiacritics(code)
        self.preproc = preproc
        self.postproc = postproc
        self.ligatures = ligatures
        # Tally of input characters that had no G2P mapping; reported by
        # __exit__ when the object is used as a context manager.
        self.nils = defaultdict(int)

    def __enter__(self):
        return self

    def __exit__(self, type_, val, tb):
        # Report each unmapped character and how often it was encountered.
        for nil, count in self.nils.items():
            sys.stderr.write('Unknown character "{}" occured {} times.\n'.format(nil, count))

    def _one_to_many_gr_by_line_map(self, gr_by_line):
        """Return the first (grapheme, values) pair whose value list does not
        have exactly one element, or None if every grapheme has exactly one."""
        for g, ls in gr_by_line.items():
            if len(ls) != 1:
                return (g, ls)
        return None

    def _load_g2p_map(self, code):
        """Load the code table for the specified language.

        Args:
            code (str): ISO 639-3 code plus "-" plus ISO 15924 code for the
                        language/script to be loaded

        Returns:
            defaultdict: maps each grapheme to a (singleton) list of phonemes

        Raises:
            DatafileError: the mapping file is missing or malformed
            MappingError: a grapheme is mapped on more than one line
        """
        g2p = defaultdict(list)
        gr_by_line = defaultdict(list)
        try:
            path = os.path.join('data', 'map', code + '.csv')
            path = pkg_resources.resource_filename(__name__, path)
        except IndexError:
            raise DatafileError('Add an appropriately-named mapping to the data/maps directory.')
        with open(path, 'rb') as f:
            reader = csv.reader(f, encoding='utf-8')
            next(reader)  # Skip the header row.
            for (i, fields) in enumerate(reader):
                try:
                    graph, phon = fields
                except ValueError:
                    # i is 0-based and the header occupies line 1, hence i + 2.
                    raise DatafileError('Map file is not well formed at line {}.'.format(i + 2))
                graph = unicodedata.normalize('NFC', graph)
                phon = unicodedata.normalize('NFC', phon)
                g2p[graph].append(phon)
                gr_by_line[graph].append(i)
        if self._one_to_many_gr_by_line_map(g2p):
            graph, lines = self._one_to_many_gr_by_line_map(gr_by_line)
            lines = [num + 2 for num in lines]  # Convert to 1-based file lines.
            raise MappingError('One-to-many G2P mapping for "{}" on lines {}'.format(graph, ', '.join(map(str, lines))).encode('utf-8'))
        return g2p

    def _load_punc_norm_map(self):
        """Load the map table for normalizing 'down' punctuation."""
        path = os.path.join('data', 'puncnorm.csv')
        path = pkg_resources.resource_filename(__name__, path)
        with open(path, 'rb') as f:
            reader = csv.reader(f, encoding='utf-8', delimiter=str(','), quotechar=str('"'))
            next(reader)
            return {punc: norm for (punc, norm) in reader}

    def _construct_regex(self):
        """Build a regular expression that will greedily match segments from
        the mapping table (longest graphemes are tried first)."""
        graphemes = sorted(self.g2p.keys(), key=len, reverse=True)
        return re.compile(r'({})'.format(r'|'.join(graphemes)), re.I)

    def general_trans(self, text, filter_func, normpunc=False, ligatures=False):
        """Transliterates a word into IPA, filtering with filter_func.

        Args:
            text (str): word to transcribe; unicode string
            filter_func (function): function for filtering segments; takes
                                    a <segment, is_ipa> tuple and returns a
                                    boolean
            normpunc (bool): normalize punctuation
            ligatures (bool): use precomposed ligatures instead of standard IPA

        Returns:
            unicode: IPA string, filtered by filter_func
        """
        text = unicode(text)
        text = self.strip_diacritics.process(text)
        text = unicodedata.normalize('NFC', text.lower())
        if self.preproc:
            text = self.preprocessor.process(text)
        # Consume the input greedily; matched graphemes map to their first
        # phoneme, anything else passes through flagged as non-IPA.
        tr_list = []
        while text:
            m = self.regexp.match(text)
            if m:
                source = m.group(0)
                try:
                    target = self.g2p[source][0]
                except (KeyError, IndexError):
                    # BUG FIX: g2p is a defaultdict(list), so a miss (e.g. a
                    # case-insensitive regex match whose lowercased form is
                    # not a key) raises IndexError on the empty list, never
                    # KeyError; catch both so the fall-through to the raw
                    # grapheme actually happens.
                    logging.debug("source = '{}'".format(source))
                    logging.debug("self.g2p[source] = '{}'"
                                  .format(self.g2p[source]))
                    target = source
                tr_list.append((target, True))
                text = text[len(source):]
            else:
                tr_list.append((text[0], False))
                # BUG FIX: count each unknown character once (was += 2,
                # which doubled the counts reported by __exit__).
                self.nils[text[0]] += 1
                text = text[1:]
        text = ''.join([s for (s, _) in filter(filter_func, tr_list)])
        if self.postproc:
            text = self.postprocessor.process(text)
        if ligatures or self.ligatures:
            text = ligaturize(text)
        if normpunc:
            text = self.puncnorm.norm(text)
        return text

    def transliterate(self, text, normpunc=False, ligatures=False):
        """Transliterates/transcribes a word into IPA.

        Passes unmapped characters through to output unchanged.

        Args:
            text (str): word to transcribe; unicode string
            normpunc (bool): normalize punctuation
            ligatures (bool): use precomposed ligatures instead of standard IPA

        Returns:
            unicode: IPA string with unrecognized characters included
        """
        return self.general_trans(text, lambda x: True, normpunc, ligatures)

    def strict_trans(self, text, normpunc=False, ligatures=False):
        """Transliterates/transcribes a word into IPA.

        Ignores unmapped characters.

        Args:
            text (str): word to transcribe; unicode string
            normpunc (bool): normalize punctuation
            ligatures (bool): use precomposed ligatures instead of standard IPA

        Returns:
            unicode: IPA string
        """
        return self.general_trans(text, lambda x: x[1], normpunc, ligatures)

    def word_to_tuples(self, word, normpunc=False):
        """Given a word, returns a list of tuples corresponding to IPA
        segments.

        Args:
            word (unicode): word to transliterate
            normpunc (bool): If True, normalizes punctuation to ASCII
                             inventory

        Returns:
            list: A list of (category, lettercase, orthographic_form,
                  phonetic_form, feature_vectors) tuples. The "feature
                  vectors" form a list consisting of (segment, vector) pairs.
                  For IPA segments, segment is a substring of phonetic_form
                  such that the concatenation of all segments in the list is
                  equal to the phonetic_form. The vectors are a sequence of
                  integers drawn from the set {-1, 0, 1} where -1 corresponds
                  to '-', 0 corresponds to '0', and 1 corresponds to '+'.
        """
        def cat_and_cap(c):
            # unicodedata.category returns a two-letter code such as 'Lu';
            # the second letter is 'u' exactly for uppercase letters.
            cat, case = tuple(unicodedata.category(c))
            case = 1 if case == 'u' else 0
            return unicode(cat), case

        def recode_ft(ft):
            try:
                return {'+': 1, '0': 0, '-': -1}[ft]
            except KeyError:
                return None

        def vec2bin(vec):
            return list(map(recode_ft, vec))

        def to_vector(seg):
            return seg, vec2bin(self.ft.segment_to_vector(seg))

        def to_vectors(phon):
            if phon == '':
                # Placeholder for spans with no phonetic realization.
                return [(-1, [0] * self.num_panphon_fts)]
            else:
                return [to_vector(seg) for seg in self.ft.ipa_segs(phon)]

        tuples = []
        word = unicode(word)
        word = self.strip_diacritics.process(word)
        word = unicodedata.normalize('NFC', word)
        if self.preproc:
            word = self.preprocessor.process(word)
        while word:
            match = self.regexp.match(word)
            if match:
                span = match.group(1)
                cat, case = cat_and_cap(span[0])
                phon = self.g2p[span.lower()][0]
                vecs = to_vectors(phon)
                tuples.append(('L', case, span, phon, vecs))
                word = word[len(span):]
            else:
                span = word[0]
                span = self.puncnorm.norm(span) if normpunc else span
                cat, case = cat_and_cap(span)
                cat = 'P' if normpunc and cat in self.puncnorm else cat
                phon = ''
                vecs = to_vectors(phon)
                tuples.append((cat, case, span, phon, vecs))
                word = word[1:]
        return tuples

    def ipa_segs(self, ipa):
        """Given an IPA string, decompose it into a list of segments.

        Args:
            ipa (unicode): a Unicode IPA string

        Returns:
            list: a list of unicode strings corresponding to segments
                  (consonants and vowels) in the input string
        """
        return self.ft.segs(ipa)
class SimpleEpitran(object):
    """Grapheme-to-phoneme transliteration backend driven by a CSV mapping
    table, with optional reverse (IPA-to-orthography) transliteration."""

    def __init__(self, code, preproc=True, postproc=True, ligatures=False,
                 rev=False, rev_preproc=True, rev_postproc=True):
        """Constructs the backend object epitran uses for most languages.

        Args:
            code (str): ISO 639-3 code and ISO 15924 code joined with a hyphen
            preproc (bool): if True, apply preprocessor
            postproc (bool): if True, apply postprocessors
            ligatures (bool): if True, use phonetic ligatures for affricates
                              instead of standard IPA
            rev (bool): if True, load reverse transliteration
            rev_preproc (bool): if True, apply preprocessor when reverse
                                transliterating
            rev_postproc (bool): if True, apply postprocessor when reverse
                                 transliterating
        """
        self.rev = rev
        self.g2p = self._load_g2p_map(code, False)
        self.regexp = self._construct_regex(self.g2p.keys())
        self.puncnorm = PuncNorm()
        self.ft = panphon.FeatureTable()
        self.num_panphon_fts = len(self.ft.names)
        self.preprocessor = PrePostProcessor(code, 'pre', False)
        self.postprocessor = PrePostProcessor(code, 'post', False)
        self.strip_diacritics = StripDiacritics(code)
        self.preproc = preproc
        self.postproc = postproc
        self.ligatures = ligatures
        self.rev_preproc = rev_preproc
        self.rev_postproc = rev_postproc
        if rev:
            # Reverse machinery is only loaded when requested.
            self.rev_g2p = self._load_g2p_map(code, True)
            self.rev_regexp = self._construct_regex(self.rev_g2p.keys())
            self.rev_preprocessor = PrePostProcessor(code, 'pre', True)
            self.rev_postprocessor = PrePostProcessor(code, 'post', True)
        # Tally of input characters that had no G2P mapping; reported by
        # __exit__ when the object is used as a context manager.
        self.nils = defaultdict(int)

    def __enter__(self):
        return self

    def __exit__(self, type_, val, tb):
        # Report each unmapped character and how often it was encountered.
        for nil, count in self.nils.items():
            sys.stderr.write('Unknown character "{}" occured {} times.\n'.format(nil, count))

    def _one_to_many_gr_by_line_map(self, gr_by_line):
        """Return the first (grapheme, values) pair whose value list does not
        have exactly one element, or None if every grapheme has exactly one."""
        for g, ls in gr_by_line.items():
            if len(ls) != 1:
                return (g, ls)
        return None

    def _load_g2p_map(self, code, rev):
        """Load the code table for the specified language.

        Args:
            code (str): ISO 639-3 code plus "-" plus ISO 15924 code for the
                        language/script to be loaded
            rev (boolean): True for reversing the table (for reverse
                           transliterating)

        Returns:
            defaultdict: maps each grapheme to a (singleton) list of phonemes

        Raises:
            DatafileError: the mapping file is missing or malformed
            MappingError: a grapheme is mapped on more than one line
        """
        g2p = defaultdict(list)
        gr_by_line = defaultdict(list)
        code += '_rev' if rev else ''  # Reverse tables use a '_rev' suffix.
        try:
            path = os.path.join('data', 'map', code + '.csv')
            path = pkg_resources.resource_filename(__name__, path)
        except IndexError:
            raise DatafileError('Add an appropriately-named mapping to the data/maps directory.')
        with open(path, 'rb') as f:
            reader = csv.reader(f, encoding='utf-8')
            next(reader)  # Skip the header row.
            for (i, fields) in enumerate(reader):
                try:
                    graph, phon = fields
                except ValueError:
                    # i is 0-based and the header occupies line 1, hence i + 2.
                    raise DatafileError('Map file is not well formed at line {}.'.format(i + 2))
                graph = unicodedata.normalize('NFC', graph)
                phon = unicodedata.normalize('NFC', phon)
                g2p[graph].append(phon)
                gr_by_line[graph].append(i)
        if self._one_to_many_gr_by_line_map(g2p):
            graph, lines = self._one_to_many_gr_by_line_map(gr_by_line)
            lines = [num + 2 for num in lines]  # Convert to 1-based file lines.
            raise MappingError('One-to-many G2P mapping for "{}" on lines {}'.format(graph, ', '.join(map(str, lines))).encode('utf-8'))
        return g2p

    def _load_punc_norm_map(self):
        """Load the map table for normalizing 'down' punctuation."""
        path = os.path.join('data', 'puncnorm.csv')
        path = pkg_resources.resource_filename(__name__, path)
        with open(path, 'rb') as f:
            reader = csv.reader(f, encoding='utf-8', delimiter=str(','), quotechar=str('"'))
            next(reader)
            return {punc: norm for (punc, norm) in reader}

    def _construct_regex(self, g2p_keys):
        """Build a regular expression that will greedily match segments from
        the mapping table (longest graphemes are tried first)."""
        graphemes = sorted(g2p_keys, key=len, reverse=True)
        return re.compile(r'({})'.format(r'|'.join(graphemes)), re.I)

    def general_trans(self, text, filter_func, normpunc=False, ligatures=False):
        """Transliterates a word into IPA, filtering with filter_func.

        Args:
            text (str): word to transcribe; unicode string
            filter_func (function): function for filtering segments; takes
                                    a <segment, is_ipa> tuple and returns a
                                    boolean
            normpunc (bool): normalize punctuation
            ligatures (bool): use precomposed ligatures instead of standard IPA

        Returns:
            unicode: IPA string, filtered by filter_func
        """
        text = unicode(text)
        text = self.strip_diacritics.process(text)
        text = unicodedata.normalize('NFC', text.lower())
        if self.preproc:
            text = self.preprocessor.process(text)
        # Consume the input greedily; matched graphemes map to their first
        # phoneme, anything else passes through flagged as non-IPA.
        tr_list = []
        while text:
            m = self.regexp.match(text)
            if m:
                source = m.group(0)
                try:
                    target = self.g2p[source][0]
                except (KeyError, IndexError):
                    # BUG FIX: g2p is a defaultdict(list), so a miss raises
                    # IndexError on the empty list, never KeyError; catch
                    # both so the fall-through to the raw grapheme works.
                    logging.debug("source = '{}'".format(source))
                    logging.debug("self.g2p[source] = '{}'"
                                  .format(self.g2p[source]))
                    target = source
                tr_list.append((target, True))
                text = text[len(source):]
            else:
                tr_list.append((text[0], False))
                # BUG FIX: count each unknown character once (was += 2,
                # which doubled the counts reported by __exit__).
                self.nils[text[0]] += 1
                text = text[1:]
        text = ''.join([s for (s, _) in filter(filter_func, tr_list)])
        if self.postproc:
            text = self.postprocessor.process(text)
        if ligatures or self.ligatures:
            text = ligaturize(text)
        if normpunc:
            text = self.puncnorm.norm(text)
        return text

    def transliterate(self, text, normpunc=False, ligatures=False):
        """Transliterates/transcribes a word into IPA.

        Passes unmapped characters through to output unchanged.

        Args:
            text (str): word to transcribe; unicode string
            normpunc (bool): normalize punctuation
            ligatures (bool): use precomposed ligatures instead of standard IPA

        Returns:
            unicode: IPA string with unrecognized characters included
        """
        return self.general_trans(text, lambda x: True, normpunc, ligatures)

    def general_reverse_trans(self, ipa):
        """Reconstructs word from IPA. Does the reverse of transliterate().

        Unmapped characters are passed through to the output unchanged.

        Args:
            ipa (str): word transcription in ipa; unicode string

        Returns:
            unicode: reconstructed word
        """
        text = unicode(ipa)
        if self.rev_preproc:
            text = self.rev_preprocessor.process(text)
        tr_list = []
        while text:
            m = self.rev_regexp.match(text)
            if m:
                source = m.group(0)
                try:
                    target = self.rev_g2p[source][0]
                except (KeyError, IndexError):
                    # BUG FIX: rev_g2p is a defaultdict(list); a miss raises
                    # IndexError, never KeyError — catch both. Also log the
                    # reverse table entry, not the forward one (was
                    # self.g2p[source], which inspected the wrong table).
                    logging.debug("source = '{}'".format(source))
                    logging.debug("self.rev_g2p[source] = '{}'"
                                  .format(self.rev_g2p[source]))
                    target = source
                tr_list.append((target, True))
                text = text[len(source):]
            else:
                tr_list.append((text[0], False))
                # BUG FIX: count each unknown character once (was += 2).
                self.nils[text[0]] += 1
                text = text[1:]
        text = ''.join([s for (s, _) in tr_list])
        if self.rev_postproc:
            text = self.rev_postprocessor.process(text)
        return text

    def reverse_transliterate(self, ipa):
        """Reconstructs word from IPA. Does the reverse of transliterate().

        Args:
            ipa (str): word transcription in ipa; unicode string

        Returns:
            unicode: reconstructed word

        Raises:
            ValueError: the object was constructed with rev=False
        """
        if not self.rev:
            raise ValueError('This Epitran object was initialized with no reverse transliteration loaded')
        return self.general_reverse_trans(ipa)

    def strict_trans(self, text, normpunc=False, ligatures=False):
        """Transliterates/transcribes a word into IPA.

        Ignores unmapped characters.

        Args:
            text (str): word to transcribe; unicode string
            normpunc (bool): normalize punctuation
            ligatures (bool): use precomposed ligatures instead of standard IPA

        Returns:
            unicode: IPA string
        """
        return self.general_trans(text, lambda x: x[1], normpunc, ligatures)

    def word_to_tuples(self, word, normpunc=False):
        """Given a word, returns a list of tuples corresponding to IPA
        segments.

        Args:
            word (unicode): word to transliterate
            normpunc (bool): If True, normalizes punctuation to ASCII
                             inventory

        Returns:
            list: A list of (category, lettercase, orthographic_form,
                  phonetic_form, feature_vectors) tuples. The "feature
                  vectors" form a list consisting of (segment, vector) pairs.
                  For IPA segments, segment is a substring of phonetic_form
                  such that the concatenation of all segments in the list is
                  equal to the phonetic_form. The vectors are a sequence of
                  integers drawn from the set {-1, 0, 1} where -1 corresponds
                  to '-', 0 corresponds to '0', and 1 corresponds to '+'.
        """
        def cat_and_cap(c):
            # unicodedata.category returns a two-letter code such as 'Lu';
            # the second letter is 'u' exactly for uppercase letters.
            cat, case = tuple(unicodedata.category(c))
            case = 1 if case == 'u' else 0
            return unicode(cat), case

        def recode_ft(ft):
            try:
                return {'+': 1, '0': 0, '-': -1}[ft]
            except KeyError:
                return None

        def vec2bin(vec):
            return list(map(recode_ft, vec))

        def to_vector(seg):
            return seg, vec2bin(self.ft.segment_to_vector(seg))

        def to_vectors(phon):
            if phon == '':
                # Placeholder for spans with no phonetic realization.
                return [(-1, [0] * self.num_panphon_fts)]
            else:
                return [to_vector(seg) for seg in self.ft.ipa_segs(phon)]

        tuples = []
        word = unicode(word)
        word = self.strip_diacritics.process(word)
        word = unicodedata.normalize('NFC', word)
        if self.preproc:
            word = self.preprocessor.process(word)
        while word:
            match = self.regexp.match(word)
            if match:
                span = match.group(1)
                cat, case = cat_and_cap(span[0])
                phon = self.g2p[span.lower()][0]
                vecs = to_vectors(phon)
                tuples.append(('L', case, span, phon, vecs))
                word = word[len(span):]
            else:
                span = word[0]
                span = self.puncnorm.norm(span) if normpunc else span
                cat, case = cat_and_cap(span)
                cat = 'P' if normpunc and cat in self.puncnorm else cat
                phon = ''
                vecs = to_vectors(phon)
                tuples.append((cat, case, span, phon, vecs))
                word = word[1:]
        return tuples

    def ipa_segs(self, ipa):
        """Given an IPA string, decompose it into a list of segments.

        Args:
            ipa (unicode): a Unicode IPA string

        Returns:
            list: a list of unicode strings corresponding to segments
                  (consonants and vowels) in the input string
        """
        return self.ft.segs(ipa)