Example #1
    def _get_process_pron(
        self,
        stress: bool,
        syllable_boundaries: bool,
        segment: bool,
        tone: bool,
    ) -> Callable[[Pron], Pron]:
        processors = []
        if not stress:
            processors.append(functools.partial(re.sub, r"[ˈˌ]", ""))
        if not syllable_boundaries:
            processors.append(functools.partial(re.sub, r"\.", ""))
        if not tone:
            processors.append(functools.partial(re.sub, _PARENS_REGEX, ""))
            processors.append(functools.partial(re.sub, _TONES_REGEX, ""))
        if segment:
            processors.append(functools.partial(segments.Tokenizer(),
                                                ipa=True))
        prosodic_markers = frozenset(["ˈ", "ˌ", "."])

        def wrapper(pron):
            for processor in processors:
                pron = processor(pron)
            # GH-59: Skip prons that are empty, or have only stress marks or
            # syllable boundaries.
            if any(ch not in prosodic_markers for ch in pron):
                return pron

        return wrapper
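
A minimal standalone sketch of the same processor chain (the pronunciation string is illustrative; the `_PARENS_REGEX`/`_TONES_REGEX` tone handling and the class context are omitted):

import functools
import re

import segments

# same building blocks as above: strip stress marks, strip syllable dots,
# then space-separate the IPA segments
processors = [
    functools.partial(re.sub, r"[ˈˌ]", ""),
    functools.partial(re.sub, r"\.", ""),
    functools.partial(segments.Tokenizer(), ipa=True),
]

pron = "ˈtʃæs.tən"  # made-up example pronunciation
for processor in processors:
    pron = processor(pron)
# pron is now a space-separated segment string with the prosodic marks removed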
Example #2
    def __init__(self, language, logger=get_logger()):
        self.logger = logger
        self.logger.info('initializing backend %s-%s', self.name(),
                         self.version())

        profile = self._load_g2p_profile(language)
        self.tokenizer = segments.Tokenizer(profile=profile)
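
The backend's tokenizer then maps graphemes through the loaded profile. A sketch of the same pattern with a tiny in-memory profile (the rules and the 'mapping' column are illustrative, not the actual data returned by `_load_g2p_profile`):

import segments

profile = segments.Profile(
    {"Grapheme": "ch", "mapping": "tʃ"},
    {"Grapheme": "a", "mapping": "a"},
    {"Grapheme": "t", "mapping": "t"},
)
tokenizer = segments.Tokenizer(profile=profile)
print(tokenizer("chat", column="mapping"))  # "tʃ a t"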
Example #3
    def _init_language(self, language):
        # load the grapheme to phoneme mapping
        profile = self._load_g2p_profile(language)
        self._tokenizer = segments.Tokenizer(profile=profile)

        # this is the language code
        return pathlib.Path(language).stem
Example #4
    def _get_process_pron(self, no_stress: bool, no_syllable_boundaries: bool,
                          no_segment: bool) -> Callable[[Pron], Pron]:
        # segments v2.1.2 oddly sets a global logging configuration
        # that interferes with downstream logging.
        # See: https://github.com/cldf/segments/issues/47
        import segments

        processors = []
        if no_stress:
            processors.append(functools.partial(re.sub, r"[ˈˌ]", ""))
        if no_syllable_boundaries:
            processors.append(functools.partial(re.sub, r"\.", ""))
        if not no_segment:
            processors.append(functools.partial(segments.Tokenizer(),
                                                ipa=True))
        prosodic_markers = frozenset(["ˈ", "ˌ", "."])

        def wrapper(pron):
            for processor in processors:
                pron = processor(pron)
            # GH-59: Skip prons that are empty, or have only stress marks or
            # syllable boundaries.
            if any(ch not in prosodic_markers for ch in pron):
                return pron

        return wrapper
Example #5
    def trim(self, ipa_col=IPA_COLUMN):
        # Make a copy of the profile (so we don't modify it in place)
        new_profile = collections.OrderedDict()
        for g, entry in self.graphemes.items():
            spec = copy.copy(entry)
            spec[self.GRAPHEME_COL] = g
            new_profile[g] = spec

        # Collect all keys so we can check (and possibly remove) them one by
        # one; graphemes anchored with ^ and/or $ go first
        graphemes = list(new_profile.keys())
        bound_graphemes = [
            grapheme for grapheme in graphemes
            if grapheme[0] == "^" and grapheme[-1] == "$"
        ]
        bound_graphemes += [
            grapheme for grapheme in graphemes
            if grapheme[0] == "^" and grapheme[-1] != "$"
        ]
        bound_graphemes += [
            grapheme for grapheme in graphemes
            if grapheme[0] != "^" and grapheme[-1] == "$"
        ]

        check_graphemes = bound_graphemes + sorted(
            [
                grapheme for grapheme in graphemes
                if len(grapheme) > 1 and grapheme not in bound_graphemes
            ],
            key=len,
            reverse=True,
        )

        # For each candidate rule, build a tokenizer from the profile without
        # that rule and re-tokenize the grapheme; if the output is unchanged,
        # the rule is redundant and can be dropped (still expensive, but far
        # cheaper than copying the whole profile on every iteration)
        removed = 0
        for grapheme in check_graphemes:
            if grapheme in new_profile:
                ipa = new_profile[grapheme][ipa_col]
                # Obtain the segments without the current rule
                t = segments.Tokenizer(profile=Profile(*[
                    copy.copy(s) for g, s in new_profile.items()
                    if g != grapheme
                ]))
                if t(grapheme, column=ipa_col) == ipa:
                    # If the resulting `segments` match the `ipa` reference, we can delete the rule:
                    removed += 1
                    del new_profile[grapheme]

        for g in set(self.graphemes.keys()) - set(new_profile.keys()):
            del self.graphemes[g]

        self.recreate_tree()
        return removed
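
The core idea of `trim` is that a multi-character rule is redundant when the remaining rules already produce the same IPA for it. A toy illustration, assuming the profile's IPA column is named 'IPA' (as `IPA_COLUMN` suggests):

import segments

full = segments.Tokenizer(profile=segments.Profile(
    {"Grapheme": "t", "IPA": "t"},
    {"Grapheme": "h", "IPA": "h"},
    {"Grapheme": "th", "IPA": "t h"},
))
without_th = segments.Tokenizer(profile=segments.Profile(
    {"Grapheme": "t", "IPA": "t"},
    {"Grapheme": "h", "IPA": "h"},
))
# the "th" rule is redundant: dropping it does not change the output
print(full("th", column="IPA") == without_th("th", column="IPA"))  # True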
Example #6
    def __init__(self, language,
                 punctuation_marks=Punctuation.default_marks(),
                 preserve_punctuation=False,
                 logger=get_logger()):
        self.logger = logger
        self.logger.info(
            'initializing backend %s-%s', self.name(), self.version())

        # load the grapheme to phoneme mapping
        profile = self._load_g2p_profile(language)
        self.tokenizer = segments.Tokenizer(profile=profile)

        # setup punctuation processing
        self.preserve_punctuation = preserve_punctuation
        self._punctuator = Punctuation(punctuation_marks)
Example #7
    def augment(self, forms, clts=None, ipa_col=IPA_COLUMN):
        """
        Apply the profile to a list of forms, adding frequency counts and
        example forms (and, if `clts` is given, SCA classes) to each grapheme.
        """
        self.column_labels.add('FREQUENCY')
        if clts:
            self.column_labels.add('SCA')
        self.column_labels.add('EXAMPLES')
        freqs = collections.Counter()
        ex = collections.defaultdict(list)
        t = segments.Tokenizer(profile=self)
        for form in forms:
            graphemes = t(self.segmentable_form(form)).split()
            freqs.update(graphemes)
            for g in graphemes:
                ex[g].append(form[1:-1])
        for g, spec in self.graphemes.items():
            spec['FREQUENCY'] = freqs.get(g, 0)
            spec['EXAMPLES'] = ";".join(ex.get(g, [])[:5])
            if clts:
                spec['SCA'] = ipa2sca(spec[ipa_col], clts)
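
Stripped of the profile class, the frequency-counting core of `augment` can be sketched with a plain tokenizer (forms and counts are illustrative):

import collections

import segments

tok = segments.Tokenizer()
freqs = collections.Counter()
for form in ["baba", "abba"]:
    # tokenize into graphemes and tally how often each one occurs
    freqs.update(tok(form).split())
# freqs == Counter({'a': 4, 'b': 4})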
Example #8
import hashlib
import typing as t
from pathlib import Path

import pycldf
import pyclts
import segments
import cldfbench
import cldfcatalog

import lingpy
import lingpy.compare.partial

clts_path = cldfcatalog.Config.from_file().get_clone("clts")
clts = cldfbench.catalogs.CLTS(clts_path)
bipa = clts.api.bipa

tokenizer = segments.Tokenizer()


def sha1(path):
    return hashlib.sha1(str(path).encode("utf-8")).hexdigest()[:12]


def clean_segments(segment_string: t.List[str]) -> t.Iterable[pyclts.models.Symbol]:
    """Reduce the row's segments to not contain empty morphemes.

    This function removes all unknown sound segments (/0/) from the segments
    string it is passed, and removes empty morphemes by collapsing subsequent
    morpheme boundary markers (_#◦+→←) into one.

    >>> segments = "+ _ t a + 0 + a t"
    >>> c = clean_segments(segments)
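
Since the module-level `tokenizer` above is created without a profile, it falls back to plain grapheme-cluster tokenization; a quick illustration (the output comment is indicative):

import segments

tok = segments.Tokenizer()
# without a profile, each grapheme cluster becomes its own
# space-separated token
print(tok("abcd"))  # "a b c d"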
Example #9
    def get_wordlist(
            self,
            doculect='base',
            profile=False,
            ref='crossid',
            lexstat=True,
            threshold=0.4):
        """
        Return a classical wordlist from the data.
        """
        if profile:
            profile = segments.Tokenizer(profile)
            tokenize = lambda x: profile('^' + x + '$', column='IPA').split()  # noqa: E731
        else:
            tokenize = lingpy.ipa2tokens

        D = {
            0: [
                'doculect',
                'concept',
                'concept_in_source',
                'concept_type',
                'form',
                'tokens',
                'occurrences',
                'word_forms',
                'gloss_forms',
                'phrase_example',
                'gloss_example',
                'references',
            ]
        }
        idx = 1
        for ctype in ['lexicon', 'grammar']:
            concepts = self.get_concepts(ctype=ctype)
            concordance = self._concordances[ctype]
            for concept, entries in concepts.items():
                for form, lid, cis, freq in entries:
                    # retrieve the concordance
                    pidx, sA, sB = concordance[form, concept, cis, lid][0]
                    txt = self[pidx].phrase
                    gls = self[pidx].gloss
                    word, fgls = self[pidx, sA]
                    tokens = tokenize(form)
                    references = ' '.join(
                        ['{0}:{1}:{2}'.format(a, b, c)
                         for a, b, c in concordance[form, concept, cis, lid]])
                    # check tokens
                    try:
                        lingpy.tokens2class(tokens, 'sca')
                        check = True
                    except:  # noqa: E722, # pragma: no cover
                        check = False
                    if concept.strip() and check:
                        D[idx] = [
                            doculect if self.monolingual else lid,
                            concept,
                            cis,
                            ctype,
                            form,
                            tokens,
                            freq,
                            word,
                            fgls,
                            txt,
                            gls,
                            references]
                        idx += 1
                    else:
                        print('[!] Problem with "{0}" / [{1}] [{2}] / {3} {4} {5}'.format(
                            concept,
                            form,
                            tokens,
                            pidx,
                            sA,
                            sB,
                        ))
        wl = lingpy.Wordlist(D)

        if lexstat:
            wl = lingpy.LexStat(D)
            wl.cluster(method='sca', threshold=threshold, ref=ref)
        else:
            wl.add_entries('cog', 'concept,form', lambda x, y: x[y[0]] + '-' + x[y[1]])
            wl.renumber('cog', ref)
        return wl
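
The profile-based `tokenize` lambda wraps each form in `^...$` so that boundary-anchored profile rules can apply; a toy profile with made-up rules and an 'IPA' column shows the effect:

import segments

prof = segments.Profile(
    {"Grapheme": "^", "IPA": "#"},   # explicit rules for the boundary marks
    {"Grapheme": "$", "IPA": "#"},
    {"Grapheme": "ch", "IPA": "tʃ"},
    {"Grapheme": "a", "IPA": "a"},
)
tok = segments.Tokenizer(profile=prof)
print(tok("^cha$", column="IPA").split())  # ['#', 'tʃ', 'a', '#']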