Пример #1
0
    def test_parse_xml(self):
        """Parse the XML at ``self.XML_PATH`` and spot-check the result.

        Checks the version and revision strings, the first link and the
        link count, the second grammeme and the grammeme count, and the
        lexeme stored under key ``'14'``.
        """
        parsed = parse_opencorpora_xml(self.XML_PATH)

        # Header values are compared as strings, not numbers.
        assert parsed.version == '0.92'
        assert parsed.revision == '389440'

        links = parsed.links
        assert links[0] == ('5', '6', '1')
        assert len(links) == 12

        grammemes = parsed.grammemes
        assert grammemes[1] == ('NOUN', 'POST', 'СУЩ', 'имя существительное')
        assert len(grammemes) == 111

        # Lexemes are looked up by a string key.
        assert parsed.lexemes['14'] == [('ёжиться', 'INFN,impf,intr')]
Пример #2
0
    def test_parse_xml(self):
        """Verify a handful of known values in the dictionary parsed from
        ``self.XML_PATH``: header strings, links, grammemes and one lexeme.
        """
        result = parse_opencorpora_xml(self.XML_PATH)

        # -- header --
        assert result.version == '0.92'
        assert result.revision == '389440'

        # -- links: first entry and total count --
        first_link = result.links[0]
        assert first_link == ('5', '6', '1')
        assert len(result.links) == 12

        # -- grammemes: entry at index 1 and total count --
        second_grammeme = result.grammemes[1]
        assert second_grammeme == (
            'NOUN',
            'POST',
            'СУЩ',
            'имя существительное',
        )
        assert len(result.grammemes) == 111

        # -- lexemes: entry stored under the string key '14' --
        lexeme = result.lexemes['14']
        assert lexeme == [('ёжиться', 'INFN,impf,intr')]
Пример #3
0
def make_test_suite(opencorpora_dict_path, out_path, word_limit=100):
    """
    Build a test suite from an OpenCorpora .xml dictionary and save it.

    The dictionary at ``opencorpora_dict_path`` is parsed, word parses are
    collected from its lexemes and passed through ``_add_ee_parses``, and
    ``_get_test_suite`` then selects the suite using ``word_limit``
    (described upstream as at least ``word_limit`` words for each distinct
    gram. tag). The suite is written to ``out_path`` together with the
    dictionary revision. Progress is reported at DEBUG level.
    """
    logger.debug('loading dictionary to memory...')
    dictionary = parse_opencorpora_xml(opencorpora_dict_path)

    logger.debug('preparing...')
    word_parses = _get_word_parses(dictionary.lexemes)
    parse_count = len(word_parses)
    logger.debug('dictionary size: %d', parse_count)

    logger.debug('handling umlauts...')
    # Rebind the same name so the previous collection is not kept alive.
    word_parses = _add_ee_parses(word_parses)
    parse_count = len(word_parses)
    logger.debug('dictionary size: %d', parse_count)

    logger.debug('building test suite...')
    test_suite = _get_test_suite(word_parses, word_limit)
    logger.debug('test suite size: %d', len(test_suite))

    logger.debug('saving...')
    _save_test_suite(out_path, test_suite, dictionary.revision)
Пример #4
0
def make_test_suite(opencorpora_dict_path, out_path, word_limit=100):
    """
    Extract test data from an OpenCorpora .xml dictionary and write it to
    ``out_path``.

    ``word_limit`` is forwarded to ``_get_test_suite``; the upstream
    description is "at least ``word_limit`` words for each distinct
    gram. tag". The parsed dictionary's revision is saved with the suite.
    """
    # Step 1: parse the whole dictionary.
    logger.debug('loading dictionary to memory...')
    oc_dict = parse_opencorpora_xml(opencorpora_dict_path)

    # Step 2: collect word parses from the lexemes.
    logger.debug('preparing...')
    all_parses = _get_word_parses(oc_dict.lexemes)
    logger.debug('dictionary size: %d', len(all_parses))

    # Step 3: extend the parses via _add_ee_parses and log the new size.
    logger.debug('handling umlauts...')
    all_parses = _add_ee_parses(all_parses)
    logger.debug('dictionary size: %d', len(all_parses))

    # Step 4: pick the test cases.
    logger.debug('building test suite...')
    selected = _get_test_suite(all_parses, word_limit)
    logger.debug('test suite size: %d', len(selected))

    # Step 5: persist the suite along with the dictionary revision.
    logger.debug('saving...')
    _save_test_suite(out_path, selected, oc_dict.revision)