Пример #1
0
    def __str__(self):
        """
        Build the reference string for this sound.

        Note
        ----
        A sound that was not generated is resolved through the transcription
        system: a non-alias is returned by its own grapheme, an alias by the
        string form of the sound registered under its feature set. A generated
        sound is assembled from the best matching base grapheme, surrounded by
        the graphemes registered for those of its features that the base does
        not already cover.
        """
        # generated sounds are always rebuilt, everything else is looked up
        if not self.generated:
            if self.alias:
                if self.featureset in self.ts.features:
                    return str(self.ts.features[self.featureset])
            elif self.grapheme in self.ts.sounds:
                return self.grapheme
            # this can usually not happen, as we catch these errors when loading a ts!
            raise ValueError('Orphaned alias {0}'.format(
                self.grapheme))  # pragma: no cover

        # Try ever shorter tails of the feature list (always ending in the
        # sound type); the last tail that matches a registered sound wins.
        candidates = [
            feat for feat in self._features() if feat not in EXCLUDE_FEATURES]
        candidates.append(self.type)
        stem = self.base or '<?>'
        for start in range(len(candidates)):
            hit = self.ts.features.get(frozenset(candidates[start:]))
            if hit:
                stem = hit.grapheme

        # Features already expressed by the base sound must not be written
        # a second time.
        if stem == '<?>':
            covered = set()
        else:
            name_parts = self.ts.sounds[stem].name.split(' ')
            covered = {
                self.ts._feature_values[part] for part in name_parts[:-1]}

        def rendered(slots):
            """Return the normalized graphemes for the given write-order slots."""
            symbols = []
            for slot in slots:
                if slot in covered:
                    continue
                value = getattr(self, slot, '')
                if value in self._features():
                    symbols.append(
                        norm(self.ts.features[self.type].get(value, '<!>')))
            return symbols

        pieces = rendered(self._write_order['pre'])
        pieces.append(stem)
        pieces.extend(rendered(self._write_order['post']))
        return ''.join(pieces)
Пример #2
0
 def _norm(self, string):
     """Extended normalization of a grapheme string.

     The string is first passed through ``norm``. If the result contains a
     "/", it is split at that character and only the part after the slash
     is kept. The selected text is then handed to ``self.normalize``.

     :param string: the string to normalize.
     :returns: the value returned by ``self.normalize`` for the selected
         part of the string.
     :raises ValueError: if the string contains more than one "/".
     """
     nstring = norm(string)
     if "/" in nstring:
         # Split the already normalized text, so the part that is kept has
         # gone through ``norm`` as well instead of bypassing it.
         parts = nstring.split("/")
         if len(parts) != 2:
             # Same exception type as the former tuple-unpacking failure,
             # but with a message that names the actual problem.
             raise ValueError(
                 'expected exactly one "/" in {0!r}'.format(string))
         nstring = parts[1]
     return self.normalize(nstring)
Пример #3
0
    def __init__(self, id_):
        """
        Load a transcription system from the packaged data files.

        Reads the diacritics table, one sound table per sound class and the
        normalization table of the system, validates them, and fills the
        lookup dictionaries (``features``, ``diacritics``, ``sounds``,
        ``sound_classes``, ``columns``, ``_feature_values``, ``_normalize``)
        on the instance.

        :param id_: Name of the transcription system. It is resolved with \
        ``pkg_path('transcriptionsystems', id_)`` and must point to an \
        existing directory.
        :raises AssertionError: if ``id_`` is falsy.
        :raises ValueError: if the system directory does not exist, if a \
        grapheme or a feature set occurs twice, if a feature value is not \
        listed in ``features.json``, or if an alias has no non-alias \
        counterpart.
        """
        if hasattr(self, 'features'):
            # Only initialize, if this is really a new instance!
            # NOTE(review): this implies that an already initialized object
            # can be passed through __init__ again (presumably instances are
            # reused by code outside this method) -- confirm.
            return
        # NOTE(review): ``assert`` is skipped when Python runs with -O.
        assert id_
        system = pkg_path('transcriptionsystems', id_)
        if not (system.exists() and system.is_dir()):
            raise ValueError('unknown system: {0}'.format(id_))

        # The table group is created from the metadata file shared by all
        # systems and then pointed at the chosen system's directory.
        # NOTE(review): presumably overriding ``_fname`` makes the table
        # paths resolve relative to that directory -- confirm.
        self.system = TableGroup.from_file(
            pkg_path('transcriptionsystems',
                     'transcription-system-metadata.json'))
        self.system._fname = system / 'metadata.json'

        # Per sound type: feature value -> grapheme of the diacritic. Further
        # below the same dictionary also receives feature set -> sound
        # entries for every non-alias sound.
        self.features = {'consonant': {}, 'vowel': {}, 'tone': {}}
        # dictionary for feature values, checks when writing elements from
        # write_order to make sure no output is doubled
        self._feature_values = {}

        # load the general features
        features = jsonlib.load(
            pkg_path('transcriptionsystems', 'features.json'))

        # Per sound type: diacritic grapheme -> feature value.
        self.diacritics = dict(consonant={},
                               vowel={},
                               click={},
                               diphthong={},
                               tone={},
                               cluster={})
        for dia in itertable(self.system.tabledict['diacritics.tsv']):
            # Only rows that are neither aliases nor typographic variants
            # provide the grapheme used for writing a feature value.
            # NOTE(review): ``self.features`` only has the keys consonant,
            # vowel and tone, so such a row with another type would raise a
            # KeyError here -- confirm that the data never contains one.
            if not dia['alias'] and not dia['typography']:
                self.features[dia['type']][dia['value']] = dia['grapheme']
            # assign feature values to the dictionary
            self._feature_values[dia['value']] = dia['feature']
            self.diacritics[dia['type']][dia['grapheme']] = dia['value']

        self.sound_classes = {}
        self.columns = {}  # the basic column structure, to allow for rendering
        self.sounds = {}  # Sounds by grapheme
        self._covered = {}
        # check for unresolved aliased sounds
        aliases = []
        for cls in [Consonant, Vowel, Tone, Marker]:  # noqa: F405
            # The lower-cased class name doubles as sound type and as the
            # stem of the table file name ("<type>s.tsv").
            type_ = cls.__name__.lower()
            self.sound_classes[type_] = cls
            # store information on column structure to allow for rendering of a
            # sound in this form, which will make it easier to insert it when
            # finding generated sounds
            self.columns[type_] = [
                c['name'].lower()
                for c in self.system.tabledict['{0}s.tsv'.format(
                    type_)].asdict()['tableSchema']['columns']
            ]
            # ``l`` is the zero-based row index; ``l + 2`` is reported in
            # error messages (presumably the line number in the file,
            # counting a header line -- confirm).
            for l, item in enumerate(
                    itertable(
                        self.system.tabledict['{0}s.tsv'.format(type_)])):
                if item['grapheme'] in self.sounds:
                    raise ValueError(
                        'duplicate grapheme in {0}:{1}: {2}'.format(
                            type_ + 's.tsv', l + 2, item['grapheme']))
                sound = cls(ts=self, **item)
                # Register every feature value seen for the first time and
                # validate it against features.json (markers are exempt).
                # Values already registered are skipped.
                for key, value in item.items():
                    if key not in {'grapheme', 'note', 'alias'} and \
                            value and value not in self._feature_values:
                        self._feature_values[value] = key
                        if type_ != 'marker' and value not in features[type_][
                                key]:
                            raise ValueError(
                                "Unrecognized features ({0}: {1}, line {2}))".
                                format(key, value, l + 2))

                self.sounds[item['grapheme']] = sound
                if not sound.alias:
                    # Each feature set may belong to one non-alias sound only.
                    if sound.featureset in self.features:
                        raise ValueError(
                            'duplicate features in {0}:{1}: {2}'.format(
                                type_ + 's.tsv', l + 2, sound.name))
                    self.features[sound.featureset] = sound
                else:
                    # Aliases are checked once all tables have been read.
                    aliases += [(l, sound.type, sound.featureset)]
        # check for consistency of aliases: if an alias has no counterpart, it
        # is orphaned and needs to be deleted or given an accepted non-aliased
        # sound
        if [x for x in aliases
                if x[2] not in self.features]:  # pragma: no cover
            # The message lists every orphaned alias as "<line>/<sound type>".
            error = ', '.join(
                text_type(x[0] + 2) + '/' + text_type(x[1]) for x in aliases
                if x[2] not in self.features)
            raise ValueError('Orphaned aliases in line(s) {0}'.format(error))

        # basic regular expression, used to match the basic sounds in the system.
        self._regex = None
        self._update_regex()

        # normalization data: ``norm``-ed source string -> ``norm``-ed target
        self._normalize = {
            norm(r['source']): norm(r['target'])
            for r in itertable(self.system.tabledict['normalize.tsv'])
        }