Пример #1
0
class PinyinICUTest(NeedsDatabaseTest, unittest.TestCase):
    """
    Test Pinyin tonemark conversion on ICU transformation rule.

    The results of the ``ReadingFactory`` conversion between Pinyin with
    diacritics and Pinyin with tone numbers are compared with the output of
    ICU's ``Latin-NumericPinyin`` transliterator. The optional ``PyICU``
    module is needed; without it the test is reported as skipped.
    """
    CONVERSION_DIRECTION = ('Pinyin', 'Pinyin')

    def setUp(self):
        """
        Set up the reading factory and, if ``PyICU`` can be imported, the
        transliterators for both directions.
        """
        NeedsDatabaseTest.setUp(self)
        self.f = ReadingFactory(dbConnectInst=self.db)

        try:
            import PyICU
        except ImportError:
            # PyICU is optional; testToneMarkPlacement() detects the missing
            # transliterators and skips itself.
            return

        self.toNumeric = PyICU.Transliterator.createInstance(
            "Latin-NumericPinyin", PyICU.UTransDirection.UTRANS_FORWARD)
        self.fromNumeric = self.toNumeric.createInverse()

    def testToneMarkPlacement(self):
        """Test Pinyin tonemark conversion on ICU transformation rule."""
        if not hasattr(self, 'toNumeric'):
            # report the test as skipped instead of silently passing
            self.skipTest("PyICU is not available")

        numberOptions = {'toneMarkType': 'numbers',
            'missingToneMark': 'fifth'}

        # entities with diacritics that are left out of the comparison
        ignoredDiacriticEntities = (u'hn\u0304g', u'h\u0144g', u'h\u0148g',
            u'h\u01f9g', u'n\u0304g', u'\u0144g', u'\u0148g', u'\u01f9g')
        # entities with tone numbers that are left out of the comparison
        ignoredNumberEntities = ('hng1', 'hng2', 'hng3', 'hng4', 'ng1', 'ng2',
            'ng3', 'ng4', u'ê1', u'ê2', u'ê3', u'ê4')

        # diacritics -> tone numbers
        for readingEntity in self.f.getReadingEntities('Pinyin'):
            if readingEntity in ignoredDiacriticEntities:
                continue
            targetEntity = self.f.convert(readingEntity, 'Pinyin', 'Pinyin',
                targetOptions=numberOptions)
            self.assertEqual(targetEntity,
                self.toNumeric.transliterate(readingEntity))

        # tone numbers -> diacritics
        for readingEntity in self.f.getReadingEntities('Pinyin',
            **numberOptions):
            if readingEntity in ignoredNumberEntities:
                continue
            targetEntity = self.f.convert(readingEntity, 'Pinyin', 'Pinyin',
                sourceOptions=numberOptions)
            self.assertEqual(targetEntity,
                self.fromNumeric.transliterate(readingEntity))
Пример #2
0
class CharacterLookupReadingMethodsTest(CharacterLookupTest,
                                        unittest.TestCase):
    """
    Consistency checks for the reading related methods of the
    :class:`~cjklib.characterlookup.CharacterLookup` class.

    .. todo::
        * Impl: include script table from Unicode 5.2.0 to get character ranges
          for Hangul and Kana
    """
    # additional dialect option dicts per reading; every reading is always
    #   checked with the empty (default) option dict as well
    DIALECTS = {}

    # entities per reading, used for readings whose operator class has no
    #   ``getReadingEntities`` attribute
    SPECIAL_ENTITY_LIST = {}

    def setUp(self):
        """Set up the character lookup fixture and a reading factory."""
        CharacterLookupTest.setUp(self)
        self.f = ReadingFactory(dbConnectInst=self.db)

    def _getEntities(self, reading, dialect):
        """
        Return the entities to check for the given reading and dialect, or
        ``None`` if no entity list can be obtained.
        """
        operatorClass = self.f.getReadingOperatorClass(reading)
        if hasattr(operatorClass, 'getReadingEntities'):
            return self.f.getReadingEntities(reading, **dialect)
        if reading in self.SPECIAL_ENTITY_LIST:
            return self.SPECIAL_ENTITY_LIST[reading]
        return None

    def testReadingMappingAvailability(self):
        """
        Test if the readings under
        ``CharacterLookup.CHARARACTER_READING_MAPPING`` are available for
        conversion.
        """
        lookup = self.characterLookup
        mapping = lookup.CHARARACTER_READING_MAPPING

        # pretend that every table named in the mapping exists
        tables = [table for table, _ in list(mapping.values())]
        lookup.db.engine = EngineMock(lookup.db.engine, mockTables=tables)

        # with all tables "present" both directions have to be reported
        for reading in mapping:
            self.assertTrue(lookup.hasMappingForReadingToCharacter(reading))
            self.assertTrue(lookup.hasMappingForCharacterToReading(reading))

        # every known reading has to yield a plain yes/no answer
        booleans = [True, False]
        for reading in self.f.getSupportedReadings():
            toCharacter = lookup.hasMappingForReadingToCharacter(reading)
            self.assertTrue(toCharacter in booleans)
            toReading = lookup.hasMappingForCharacterToReading(reading)
            self.assertTrue(toReading in booleans)

    @attr('slow')
    def testGetCharactersForReadingAcceptsAllEntities(self):
        """Test if ``getCharactersForReading`` accepts all reading entities."""
        lookup = self.characterLookup
        ignoredErrors = (exception.UnsupportedError,
            exception.ConversionError)

        for reading in self.f.getSupportedReadings():
            if not lookup.hasMappingForReadingToCharacter(reading):
                continue

            dialects = [{}]
            dialects.extend(self.DIALECTS.get(reading, []))

            for dialect in dialects:
                entities = self._getEntities(reading, dialect)
                if entities is None:
                    continue

                # shared tail of both failure messages
                context = ' (reading %s, dialect %s)' % (reading, dialect)

                for entity in entities:
                    try:
                        results = lookup.getCharactersForReading(entity,
                            reading, **dialect)
                    except ignoredErrors:
                        continue

                    self.assertEqual(type(results), type([]),
                        "Method getCharactersForReading() doesn't return"
                        + " a list for entity %s " % repr(entity)
                        + context)

                    for entry in results:
                        self.assertEqual(len(entry), 1,
                            "Entry %s in result for %s has length != 1"
                            % (repr(entry), repr(entity))
                            + context)
Пример #3
0
class CharacterLookupReadingMethodsTest(CharacterLookupTest, unittest.TestCase):
    """
    Runs consistency checks on the reading methods of the
    :class:`~cjklib.characterlookup.CharacterLookup` class.

    .. todo::
        * Impl: include script table from Unicode 5.2.0 to get character ranges
          for Hangul and Kana
    """
    # maps a reading to a list of additional dialect option dicts to check
    #   besides the default (empty) options
    DIALECTS = {}

    # maps a reading to the entities to check, used if the reading's operator
    #   class offers no ``getReadingEntities``
    SPECIAL_ENTITY_LIST = {}

    def setUp(self):
        """Set up the character lookup fixture and a reading factory."""
        CharacterLookupTest.setUp(self)
        self.f = ReadingFactory(dbConnectInst=self.db)

    def testReadingMappingAvailability(self):
        """
        Test if the readings under
        ``CharacterLookup.CHARARACTER_READING_MAPPING`` are available for
        conversion.
        """
        # mock to simulate availability of all tables in
        #   characterLookup.CHARARACTER_READING_MAPPING
        tables = [table for table, _ \
            in self.characterLookup.CHARARACTER_READING_MAPPING.values()]
        self.characterLookup.db.engine = EngineMock(
                self.characterLookup.db.engine, mockTables=tables)

        for reading in self.characterLookup.CHARARACTER_READING_MAPPING:
            self.assertTrue(
                self.characterLookup.hasMappingForReadingToCharacter(reading))
            self.assertTrue(
                self.characterLookup.hasMappingForCharacterToReading(reading))

        # test proper checking for all known readings
        for reading in self.f.getSupportedReadings():
            self.assertTrue(
                self.characterLookup.hasMappingForReadingToCharacter(reading) \
                in [True, False])
            self.assertTrue(
                self.characterLookup.hasMappingForCharacterToReading(reading) \
                in [True, False])

    @attr('slow')
    def testGetCharactersForReadingAcceptsAllEntities(self):
        """Test if ``getCharactersForReading`` accepts all reading entities."""
        for reading in self.f.getSupportedReadings():
            if not self.characterLookup.hasMappingForReadingToCharacter(
                reading):
                continue

            dialects = [{}]
            if reading in self.DIALECTS:
                dialects.extend(self.DIALECTS[reading])

            for dialect in dialects:
                if hasattr(self.f.getReadingOperatorClass(reading),
                    'getReadingEntities'):
                    entities = self.f.getReadingEntities(reading, **dialect)
                elif reading in self.SPECIAL_ENTITY_LIST:
                    entities = self.SPECIAL_ENTITY_LIST[reading]
                else:
                    continue

                for entity in entities:
                    # only the lookup itself may legitimately refuse an
                    #   entity; keep the assertions outside of the try block
                    try:
                        results = self.characterLookup.getCharactersForReading(
                            entity, reading, **dialect)
                    except (exception.UnsupportedError,
                        exception.ConversionError):
                        continue

                    self.assertEqual(type(results), type([]),
                        "Method getCharactersForReading() doesn't return" \
                            + " a list for entity %s " % repr(entity) \
                        + ' (reading %s, dialect %s)' % (reading, dialect))

                    for entry in results:
                        self.assertEqual(len(entry), 1,
                            "Entry %s in result for %s has length != 1" \
                                % (repr(entry), repr(entity)) \
                            + ' (reading %s, dialect %s)' \
                            % (reading, dialect))
Пример #4
0
def main():
    language, output_encoding = locale.getdefaultlocale()

    if len(sys.argv) == 2:
        modus = sys.argv[1]
        if modus not in modi:
            print "invalid modus, choose one out of: " + ", ".join(modi.keys())
            sys.exit(1)
    else:
        print "give a modus, choose one out of: " + ", ".join(modi.keys())
        sys.exit(1)

    fromReading, toReading, entryFunc, readingOpt = modi[modus]

    initialRules = INITIAL_RULES[(fromReading, toReading)]
    finialRules = FINAL_RULES[(fromReading, toReading)]
    extraSyllables = EXTRA_SYLLABLES[(fromReading, toReading)]

    # entry set
    global entrySet
    entrySet = set()
    # build table and use scheme with almost perfect grouping according to
    #   pronunciation, then use headers to get the initial's and final's
    #   pronunciation.
    op = ReadingFactory().createReadingOperator(fromReading, **readingOpt)

    # get splitted syllables, finals in first row, initials in first column
    for syllable in op.getReadingEntities():
        initial, final = op.getOnsetRhyme(syllable)
        # only apply rules if syllable isn't given an extra mapping in
        #   EXTRA_SYLLABLES
        if not syllable in extraSyllables:
            # check if we have rules
            if initialRules[initial] != None and finialRules[final] != None:
                # check for ambiguous mappings
                if type(initialRules[initial]) == type({}):
                    initialFeatures = initialRules[initial].keys()
                else:
                    initialFeatures = [None]
                if type(finialRules[final]) == type({}):
                    finalFeatures = finialRules[final].keys()
                else:
                    finalFeatures = [None]

                # go through all mappings
                for initialFeature in initialFeatures:
                    for finalFeature in finalFeatures:
                        if initialFeature:
                            targetInitial \
                                = initialRules[initial][initialFeature]
                        else:
                            targetInitial = initialRules[initial]

                        if finalFeature:
                            targetFinal = finialRules[final][finalFeature]
                        else:
                            targetFinal = finialRules[final]

                        entry = entryFunc(syllable, targetInitial, targetFinal,
                            initialFeature, finalFeature)
                        if entry != None:
                            entrySet.add(entry)
            else:
                print >> sys.stderr, ("missing rule(s) for syllable '" \
                    + syllable + "' with initial/final '" + initial + "'/'" \
                    + final + "'").encode(output_encoding)

    # print extra syllables
    for syllable in extraSyllables:
        if extraSyllables[syllable]:
            initialRule, finalRule = extraSyllables[syllable]
            # check for ambiguous mappings
            if type(initialRule) == type({}):
                initialFeatures = initialRule.keys()
            else:
                initialFeatures = [None]
            if type(finalRule) == type({}):
                finalFeatures = finalRule.keys()
            else:
                finalFeatures = [None]

            # go through all mappings
            for initialFeature in initialFeatures:
                for finalFeature in finalFeatures:
                    if initialFeature:
                        targetInitial = initialRule[initialFeature]
                    else:
                        targetInitial = initialRule

                    if finalFeature:
                        targetFinal = finalRule[finalFeature]
                    else:
                        targetFinal = finalRule

                    entry = entryFunc(syllable, targetInitial, targetFinal,
                        initialFeature, finalFeature)
                    if entry != None:
                        entrySet.add(entry)

    notIncludedSyllables = [syllable for syllable in extraSyllables \
        if not extraSyllables[syllable]]
    if notIncludedSyllables:
        print >> sys.stderr, ("Syllables not included in table: '" \
            + "', '".join(sorted(notIncludedSyllables)) + "'")\
            .encode(output_encoding)

    entryList = list(entrySet)
    entryList.sort()
    print "\n".join(entryList).encode(output_encoding)