Пример #1
0
    def tags_for_symbol(self, symbol):
        """Return the list of tags stored for ``symbol`` in this entry's data.

        A missing or empty entry yields ``[]``. An entry made up only of
        strings is returned as-is (after ``wrap_as_list``). An entry made up
        of lists is treated as several alternative tagsets: one is picked
        with ``pick_random``, copied, and remembered per symbol so that
        repeated queries for the same symbol give the same answer.
        """
        all_tags = self.__data.get('tags') or {}
        raw_entry = all_tags.get(symbol)
        # TODO: just change wrap_as_list to check for None... but that would affect a LOT of other code
        tagsets = wrap_as_list(raw_entry) if raw_entry else []

        # Plain strings only (or nothing at all): nothing to choose between.
        if not any(type(entry) is not str for entry in tagsets):
            return tagsets

        # a workaround to choose one of multiple tagsets from data
        assert (tagsets)  # an empty list never reaches this point
        assert (not any(type(entry) is not list for entry in tagsets))

        # cache so that repeated queries are NOT inconsistent.
        # should be okay, since VERBSET_BANK re-instantiates for every query...
        picked = self.__randomly_picked_symbol_tags
        if symbol not in picked:
            picked[symbol] = pick_random(tagsets).copy()

        return picked[symbol]
Пример #2
0
    def __pre_or_post_words(self, lang, kind):
        """Build a symbol -> word mapping of prewords or postwords for ``lang``.

        ``kind`` must be ``'prewords'`` or ``'postwords'``. Each value in the
        per-language data is either a string (used directly) or a list of
        strings (one is chosen with ``pick_random``). Any other value raises.
        """
        assert (kind in ('prewords', 'postwords'))

        entries = self.__data['langs'][lang].get(kind, {})
        assert (type(entries) is dict)

        chosen = {}
        for symbol, options in entries.items():
            options_type = type(options)
            if options_type is list:
                # several alternatives: every one must be a string
                assert (not any(type(option) is not str for option in options))
                chosen[symbol] = pick_random(options)
            elif options_type is str:
                chosen[symbol] = options
            else:
                assert (options_type is dict)  # YAML
                raise Exception('preword/postword data should be list or str')

        return chosen
Пример #3
0
    def word(self, lang):
        """Return the word for ``lang``, choosing it once and caching the choice.

        The per-language data comes from ``self._get_word_data(lang)``. A
        string is used directly; a non-empty list of strings has one entry
        picked with ``pick_random``. The result is stored in ``self.__words``
        so later calls for the same language return the same word.
        """
        cache = self.__words
        if lang not in cache:
            data = self._get_word_data(lang)
            data_type = type(data)
            if data_type is str:
                cache[lang] = data
            elif data_type is list:
                # if wordset has multiple entries [man, person, ...], just pick
                # one at random for WordSet's lifetime
                assert (data)
                assert (not any(type(entry) is not str for entry in data))
                cache[lang] = pick_random(data)
            else:
                assert (data_type is dict)  # YAML
                raise Exception(
                    'per-lang verb data should be string or list of strings')

        chosen = cache[lang]
        assert (type(chosen) is str)
        return chosen
Пример #4
0
    def transformation_for_symbol(self, symbol):
        """Return the list of transformations stored for ``symbol``.

        A missing or empty entry yields ``[]``. An entry made up only of
        strings is returned as-is (after ``wrap_as_list``). An entry made up
        of lists is treated as several alternatives: one is picked with
        ``pick_random``, copied, and remembered per symbol so repeated
        queries stay consistent.
        """
        all_transforms = self._data().get('transformations', {})
        raw_entry = all_transforms.get(symbol)
        options = wrap_as_list(raw_entry) if raw_entry else []

        # Plain strings only (or nothing at all): nothing to choose between.
        if not any(type(option) is not str for option in options):
            return options

        assert (options)
        assert (not any(type(option) is not list for option in options))

        picked = self.__randomly_picked_symbol_transformations
        if symbol not in picked:
            picked[symbol] = pick_random(options).copy()

        return picked[symbol]
Пример #5
0
    def tags(self):
        """Return this entry's semantic tags as a list of strings.

        The tags are read from ``self._data().get('tags')`` on first use and
        cached in ``self.__tags``:

        * missing/empty data -> ``[]``
        * a single string    -> ``[that_string]``
        * a list of strings  -> that list
        * a list of lists    -> one of the inner lists, chosen by ``pick_random``

        Raises:
            Exception: ``('malformed tag list', tag_data)`` when the list mixes
                types or holds anything other than str/list items, and
                ``('malformed tags', tag_data)`` when the data is neither a
                string nor a list (e.g. a YAML mapping). Nothing is cached in
                either case.
        """
        # don't have to worry about multiple synsets here, since semantic tags apply to all words in the synset
        if self.__tags is None:
            tag_data = self._data().get('tags')
            if not tag_data:
                # if you try to return wrap(data.get('tags', [])), you can wind up with [[]]
                self.__tags = []
            elif type(tag_data) is str:
                self.__tags = [tag_data]
            elif type(tag_data) is list:
                if all(type(item) is str for item in tag_data):
                    self.__tags = tag_data  # single-list
                elif all(type(item) is list for item in tag_data):
                    self.__tags = pick_random(tag_data)  # list of lists
                else:
                    # Report the offending data itself; the previous code
                    # referenced an undefined name here and masked this error.
                    raise Exception('malformed tag list', tag_data)
            else:
                # Typically a dict coming from YAML, but any non-str/non-list
                # value is equally malformed.
                raise Exception('malformed tags', tag_data)

        assert (all(type(tag) is str for tag in self.__tags))
        return self.__tags
Пример #6
0
 def ppform(self, lang):
     """Return the 'ppform' for ``lang``, choosing it once and caching it.

     Candidates come from the ``'ppforms'`` mapping in ``self._data()``;
     a language with no entry falls back to ``['standard']``. One candidate
     is chosen with ``pick_random`` and stored in ``self.__ppform``.
     """
     if lang in self.__ppform:
         return self.__ppform[lang]

     per_lang_forms = self._data().get('ppforms', {})
     candidates = wrap_as_list(per_lang_forms.get(lang, ['standard']))
     self.__ppform[lang] = pick_random(candidates)
     return self.__ppform[lang]
Пример #7
0
    def _generate_determiner(self, node):
        """Generate the text for a determiner node and hand it to ``_generate_node_text``.

        The node must have exactly one lexical target, which must be a noun
        whose 'tags' option contains 'object'. Starting from the determiner
        base string:

        * for a singular target, a space and a measure word are appended. The
          measure word comes from the noun's ``'M'`` entry in
          ``self._noun_form_bank`` (default '个'); when the entry is not
          '个', it is used with probability ~0.9 and '个' otherwise.
        * for any other number, '些' is appended directly.

        Raises:
            Exception: ``('M: expected str or list (YAML)', pinyin)`` when the
                noun's ``'M'`` entry is neither a string nor a list.
        """
        lexical_targets = node.lexical_targets()
        # `==`, not `is`: identity comparison against an int literal only works
        # by interpreter accident and triggers a SyntaxWarning on Python 3.8+.
        assert (len(lexical_targets) == 1)
        assert (lexical_targets[0].type() == 'noun')

        words = self._get_det_base(
            node)  # string instead of list, to enable segmentation antics

        target = lexical_targets[0]
        assert (
            'object' in target._get_option('tags')
        )  # so that "DT 些" has a plural meaning (*这 些 水). so horribly brittle...

        if target.number() == 'singular':
            noun = self._get_noun_base(target)
            noun_form = self._noun_form_bank.get(noun)

            # workaround to allow skipping some less important data entry for now
            if utility.CHECK_DATABASE:
                assert (
                    noun_form
                )  # would call None.get() if noun is missing from nouns_zh.yml
                measure_words_from_file = noun_form.get('M', '个')
            elif noun_form:
                measure_words_from_file = noun_form.get('M', '个')
            else:
                assert (noun_form is None)
                measure_words_from_file = '个'

            # TODO: allow measure word omission (e.g. 这 世界 - only allowed for some words?)
            if measure_words_from_file == '个':
                measure_word = '个'
            else:
                # this should occur here and not in NounSet, because it's zh-specific, and I'm trying to keep all language-specific code in Generators
                # but unfortunately, it's also data-specific code...
                if type(measure_words_from_file) is str:
                    candidates = [measure_words_from_file]
                elif type(measure_words_from_file) is list:
                    assert (all(
                        type(item) is str for item in measure_words_from_file))
                    candidates = measure_words_from_file
                else:
                    # Anything else (dict, None, number, bool, ...) is a data
                    # error; always report it with the descriptive exception.
                    raise Exception('M: expected str or list (YAML)',
                                    noun_form.get('pinyin'))

                if utility.rand() <= 0.9:
                    # Pick from the normalised candidate list, not the raw
                    # value: a plain-string entry must be chosen whole rather
                    # than being treated as a sequence of characters.
                    # (assumes utility.pick_random selects one element of a
                    # sequence - confirm against utility)
                    measure_word = utility.pick_random(
                        candidates)  # allows multiple M's per word
                else:
                    measure_word = '个'

            assert (type(measure_word) is str)
            words += ' ' + measure_word

        else:
            words += '些'

        self._generate_node_text(node, words)