Пример #1
0
    def test_Example(self):
        """Check that setting a field on an ``Example`` changes its ``id``."""
        from dictionaria.lib.ingest import Example

        e = Example([('tx', 'a'), ('ft', 'b')])
        id_ = e.id
        e.set('ref', 'x')
        # ``assertNotEquals`` is a deprecated alias which was removed in
        # Python 3.12; ``assertNotEqual`` is the supported spelling.
        self.assertNotEqual(id_, e.id)
Пример #2
0
    def __call__(self, entry):
        """
        Replace the example-related markers of an entry with ``xref`` items.

        Markers listed in ``self.example_props`` are not copied to the result.
        Instead they are collected into ``Example`` objects: ``xv`` opens an
        example, ``xe`` closes it, ``rf`` is remembered and prepended to the
        example when it is closed, and any other example marker is appended
        under the name it maps to in ``self.example_props``. Each closed
        example is replaced by one ``('xref', self.xref(example))`` item.
        Problems are reported by writing to ``self.log``.

        :param entry: iterable of ``(marker, content)`` pairs.
        :return: a new instance of the class of ``entry``, built from the \
        remaining and the newly created items.
        """
        example = None
        lx = None
        rf = None
        items = []

        for marker, content in entry:
            if marker == 'lx':
                # remember the headword, it is only used in log messages.
                lx = content

            if marker not in self.example_props:
                # everything which is not example-related is passed through.
                items.append((marker, content))
                continue

            if marker == 'rf':
                rf = content
            elif marker == 'xv':
                # new example starts
                if example:
                    # but last one is unfinished
                    self.log.write(
                        '# incomplete example in lx: %s - missing xe:\n%s\n\n'
                        % (lx, example))
                example = Example([('tx', content)])
            elif marker == 'xe':
                # example ends
                if example:
                    if rf:
                        example.insert(0, ('rf', rf))
                    example.append(('ft', content))
                    items.append(('xref', self.xref(example)))
                    rf = None
                    example = None
                else:
                    self.log.write(
                        '# incomplete example in lx: %s - missing xv\n' % lx)
            elif not example:
                # NOTE(review): unlike the other messages this one carries no
                # leading '# ' - left untouched, confirm whether intended.
                self.log.write('incomplete example in lx: %s - missing xv\n' % lx)
            else:
                example.append((self.example_props[marker], content))

        if example:
            # The entry ended while an example was still open. It cannot be
            # turned into an xref, so report it rather than dropping it silently.
            self.log.write(
                '# incomplete example in lx: %s - missing xe:\n%s\n\n'
                % (lx, example))
        return entry.__class__(items)
Пример #3
0
    def get_words(self):
        """
        Split the entry into words and yield them one by one.

        The entry is iterated as a sequence of ``(marker, value)`` pairs. A new
        word starts with each ``lx`` or ``se`` marker; the markers which follow
        are attached to the current word, its current meaning or the current
        example. Each word is passed through ``self.checked_word`` together
        with its last meaning before it is yielded.

        :return: generator for the words contained within the entry.
        """
        word = None
        # if an entry has only one \ps marker but multiple words, the value of \ps is used
        # as part-of-speech for all words.
        pos = None

        example = None
        meaning = None

        # flag signaling whether we are dealing with the first meaning of a word or
        # subsequent ones.
        # NOTE(review): the flag is never reset when a new word starts, so it
        # only protects the very first \sn of the whole entry - confirm whether
        # this is intended.
        first_meaning = True

        # now we loop over the (marker, value) pairs of the entry:
        for k, v in self:
            # individual words are identified by \lx or \se (sub-entry) markers.
            if k in ('lx', 'se'):
                if word:
                    yield self.checked_word(word, meaning)
                word = Word(v)
                if pos:
                    word.ps = pos
                meaning = Meaning()
            elif k == 'sn':  # a new sense number: initialize a new Meaning.
                if not first_meaning:
                    # called for its effect on word, the return value is not needed.
                    self.checked_word(word, meaning)
                    meaning = Meaning()
                first_meaning = False
            # meaning-specific markers:
            elif k in ('de', 'ge'):
                # FIXME: we must support multiple meanings expressed by
                # semicolon-separated \ge values, e.g. "jump ; jump at"
                setattr(meaning, k, v)
            elif k == 'sd':
                meaning.sd.append(v)
            elif k == 'xv':
                if example:
                    # a repeated \xv continues the text of the open example.
                    example.xv += ' %s' % v
                else:
                    example = Example(v)
            elif k in ('xvm', 'xeg'):
                if example is None:
                    # no example is open: report it like a stray \xe instead of
                    # failing with an AttributeError on None.
                    print('%s without xv for word %s' % (k, word.form))
                else:
                    # repeated markers are joined with a blank.
                    if getattr(example, k):
                        v = getattr(example, k) + ' ' + v
                    setattr(example, k, v)
            elif k == 'xe':
                if example:
                    example.xe = v
                    # plain truth test instead of assert/except AssertionError:
                    # assert statements are removed when running with -O.
                    if meaning:
                        meaning.x.append(example)
                    else:
                        print(
                            'no meanings for (sense or subentry of) word %s' % word.form)
                    example = None
                else:
                    print('xe without xv for word %s' % word.form)
            elif k == 'xref':
                meaning.xref.append(v)
            # word-specific markers:
            elif k in ('hm', 'ph'):
                if getattr(word, k) is None:
                    # only record first occurrence of the marker!
                    setattr(word, k, v)
            elif k == 'ps':
                pos = word.ps = v
            elif k in ('cf', 'mn'):
                # comma-separated list of related items; empty parts are skipped.
                for vv in v.split(','):
                    if vv.strip():
                        word.rel.append((k, vv.strip()))
            elif k == 'gxx':
                word.non_english_meanings[k].extend(sfm.FIELD_SPLITTER_PATTERN.split(v))
            else:
                # any other marker is kept as additional data of the word.
                word.data[k].append(v)
        if word:
            yield self.checked_word(word, meaning)