Пример #1
0
def orthography(args):  # pragma: no cover
    """
    Create (or overwrite) an orthography profile for a dataset.

    Counts the Unicode graphemes occurring in the dataset's raw lexemes
    and writes them to ``orthography.tsv`` in the dataset directory, one
    row per grapheme with its frequency and an initial IPA mapping that
    simply repeats the grapheme. If a profile already exists, the user
    is asked before it is overwritten.
    """
    dataset = get_dataset(args)
    profile_path = dataset.dir.joinpath('orthography.tsv')
    # Ask before clobbering an existing profile; bail out on "no".
    if profile_path.exists() and not confirm(
            'There already is an orthography profile for this dataset. Overwrite?',
            default=False):
        return

    counts = Counter()
    for lexeme in dataset.iter_raw_lexemes():
        counts.update(grapheme_pattern.findall(lexeme))

    with UnicodeWriter(profile_path, delimiter='\t') as writer:
        writer.writerow(['graphemes', 'frequency', 'IPA'])
        # Most frequent graphemes first; the grapheme itself seeds the IPA column.
        for grapheme, count in counts.most_common():
            writer.writerow([grapheme, '{0}'.format(count), grapheme])

    log_dump(profile_path, log=args.log)
Пример #2
0
    def grapheme_clusters(self, word):
        """
        Tokenize a string into Unicode grapheme clusters.

        Segmentation follows Unicode Standard Annex #29, UNICODE TEXT
        SEGMENTATION (http://www.unicode.org/reports/tr29/), implemented
        via the "\X" regular expression.

        Parameters
        ----------
        word : str
            The Unicode string to split into graphemes.

        Returns
        -------
        list
            The grapheme clusters of `word`, in NFD.

        """
        # grapheme_pattern is the pre-compiled "\X" cluster matcher.
        clusters = grapheme_pattern.findall(word)
        return clusters
Пример #3
0
    def from_text(cls, text, mapping='mapping'):
        """
        Build a Profile from the Unicode graphemes occurring in `text`.

        Parameters
        ----------
        text : str
            Text to scan for grapheme clusters.
        mapping : str
            Name of the column holding the (initially identity)
            grapheme mapping.

        Returns
        -------
        A Profile instance.

        """
        frequencies = Counter(grapheme_pattern.findall(text))
        # One spec per grapheme, most frequent first; the mapping column
        # starts out as the grapheme itself.
        specs = []
        for grapheme, count in frequencies.most_common():
            specs.append(OrderedDict([
                (cls.GRAPHEME_COL, grapheme),
                ('frequency', count),
                (mapping, grapheme),
            ]))
        return cls(*specs)