Example #1
# Standard-library imports this class relies on; DeepMorphy-specific names
# (CONFIG, RNN, Tester) are assumed to come from the project's own modules.
import os
import pickle
import logging
from shutil import copyfile
from xml.etree import ElementTree as etree  # lxml.etree would also work here
from xml.etree.ElementTree import ElementTree


class Releaser:
    def __init__(self):
        self.config = CONFIG
        self.dataset_path = self.config['dict_path']
        self.model_key = self.config['model_key']
        self.chars = self.config['chars']
        self.gram_types = self.config['grammemes_types']
        self.rnn = RNN(True)
        self.tester = Tester()
        self.pd_publish_paths = [
            os.path.join(path, f"frozen_model_{self.model_key}.pb")
            for path in self.config['publish_net_paths']
        ]
        self.xml_publish_paths = [
            os.path.join(path, f"release_{self.model_key}.xml")
            for path in self.config['publish_net_paths']
        ]
        self.xml_gram_paths = [
            os.path.join(path, "grams.xml")
            for path in self.config['publish_gramm_paths']
        ]
        self.xml_numbers_paths = [
            os.path.join(path, "numbers.xml")
            for path in self.config['publish_numbers_paths']
        ]
        self.xml_tags_paths = [
            os.path.join(path, "tags.xml")
            for path in self.config['publish_tags_paths']
        ]
        self.test_result_paths = [
            os.path.join(path, "test_info.txt")
            for path in self.config['test_results_paths']
        ]
        self.publish_dataset_info_paths = [
            os.path.join(path, "dataset_info.txt")
            for path in self.config['publish_dataset_info_paths']
        ]
        self.public_inflect_templates_paths = [
            os.path.join(path, "inflect_templates.xml")
            for path in self.config['public_inflect_templates_paths']
        ]
        self.classes_dic = self.config['main_classes']
        self.rev_classes_dic = {
            self.classes_dic[key]:
            ",".join([key for key in list(key) if key is not None])
            for key in self.classes_dic
        }
        with open(CONFIG['tags_path'], 'rb') as f:
            self.tags = pickle.load(f)

        with open(CONFIG['numb_data_path'], 'rb') as f:
            self.numb_data = pickle.load(f)

        with open(self.config['inflect_templates_path'], 'rb') as f:
            self.inflect_templates = pickle.load(f)

    def release_model(self):
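        # Freeze the trained RNN into a .pb graph, copy it to every publish
        # path, then regenerate the XML and documentation artifacts that ship
        # with the model.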
        pd_release_path, gram_ops, out_ops = self.rnn.release()
        for path in self.pd_publish_paths:
            copyfile(pd_release_path, path)

        self.__release_test_metrics__()
        self.__release_numbers_xml__()
        self.__release_gramm_docs__()
        self.__release_inflect_docs__()
        self.__release_grams_xml__()
        self.__release_tags_xml__()
        self.__release_dataset_info__()
        self.__release_model_xml__(out_ops, gram_ops)

    def __release_test_metrics__(self):
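        # Run the tester and write its report to every test_info.txt path.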
        results = self.tester.test()
        for path in self.test_result_paths:
            with open(path, 'w+') as f:
                f.write(results)

    def __release_model_xml__(self, out_ops, gram_ops):
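        # Build release_{model_key}.xml: the ops reported by rnn.release() as
        # root attributes, the character table, the per-grammeme probability
        # ops and the inflection templates.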
        root = etree.Element('Root')
        for key in out_ops:
            root.set(key, out_ops[key])

        chars_el = etree.Element('Chars')
        chars_el.set("start_char", str(self.config['start_token']))
        chars_el.set("end_char", str(self.config['end_token']))
        for index, value in enumerate(self.chars):
            char_el = etree.Element("Char")
            char_el.set('index', str(index))
            char_el.set('value', value)
            chars_el.append(char_el)
        root.append(chars_el)

        grams_el = etree.Element('Grams')
        for gram in self.gram_types:
            gram_el = etree.Element("G")
            gram_el.set('key', gram)
            gram_el.set('op', gram_ops[gram]['prob'])
            grams_el.append(gram_el)
        root.append(grams_el)

        inflect_el = etree.Element("Inflect")
        for main_key in self.inflect_templates:
            temp_el = etree.Element("Im")
            temp_el.set('i', str(self.classes_dic[main_key]))
            inflect_el.append(temp_el)
            for form in self.inflect_templates[main_key]:
                form_el = etree.Element("I")
                form_el.set('i', str(self.classes_dic[form]))
                temp_el.append(form_el)
        root.append(inflect_el)

        tree = ElementTree(root)
        for path in self.xml_publish_paths:
            with open(path, 'wb+') as f:
                tree.write(f, xml_declaration=True, encoding='utf-8')

        logging.info("Model released")

    def __release_grams_xml__(self):
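        # Build grams.xml: every grammatical category with its classes; the
        # 'post' category additionally gets the dictionary-only and other
        # part-of-speech types from the config.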
        nn_types = self.config['grammemes_types']
        dict_post_types = self.config['dict_post_types']
        other_types = self.config['other_post_types']
        root = etree.Element('Grams')
        for gram in nn_types:
            gram_el = etree.Element("G")
            gram_el.set('index', str(nn_types[gram]['index']))
            gram_el.set('key_en', gram)
            gram_el.set('key_ru', nn_types[gram]['key_ru'])
            root.append(gram_el)
            gr_dic = nn_types[gram]['classes']
            for key_en in gr_dic:
                item = gr_dic[key_en]
                cls_el = etree.Element("C")
                cls_el.set('key_en', key_en)
                cls_el.set('key_ru', str(item['key_ru']))
                cls_el.set('nn_index', str(item['index']))
                gram_el.append(cls_el)

            if gram == "post":
                for key_en in dict_post_types:
                    item = dict_post_types[key_en]
                    cls_el = etree.Element("C")
                    cls_el.set('key_en', key_en)
                    cls_el.set('key_ru', str(item['key_ru']))
                    gram_el.append(cls_el)

                for key_en in other_types:
                    item = other_types[key_en]
                    cls_el = etree.Element("C")
                    cls_el.set('key_en', key_en)
                    cls_el.set('key_ru', str(item['key_ru']))
                    gram_el.append(cls_el)

        tree = ElementTree(root)
        for path in self.xml_gram_paths:
            with open(path, 'wb+') as f:
                tree.write(f, xml_declaration=True, encoding='utf-8')

    def __release_numbers_xml__(self):
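        # Build numbers.xml from the pickled numeral data: the matching regex,
        # the lemma class ids, and per-number word templates plus the
        # 'nar_end' endings.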
        root = etree.Element('NumbData')
        root.set("reg", self.numb_data['regex'])
        root.set("l",
                 ','.join([str(i) for i in self.numb_data['lemma_cls_ids']]))
        for val in self.numb_data['numbers']:
            n_el = etree.Element("N")
            n_el.set('v', str(val))
            for tp in self.numb_data['numbers'][val]:
                if tp == 'nar_end' or tp == 'lemma':
                    continue

                for tpl in self.numb_data['numbers'][val][tp]:
                    w_el = etree.Element("W")
                    w_el.set('t', tpl[0])
                    w_el.set('i', str(tpl[1]))
                    w_el.set('k', tp)
                    n_el.append(w_el)

            nar_ends = self.numb_data['numbers'][val]['nar_end']
            for cls in nar_ends:
                w_el = etree.Element("E")
                w_el.set('t', nar_ends[cls])
                w_el.set('i', str(cls))
                n_el.append(w_el)

            root.append(n_el)

        tree = ElementTree(root)
        for path in self.xml_numbers_paths:
            with open(path, 'wb+') as f:
                tree.write(f, xml_declaration=True, encoding='utf-8')

    def __release_tags_xml__(self):
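        # Build tags.xml: one <T> element per tag with its index, the
        # comma-joined grammeme values and the remaining attributes kept in
        # the tags pickle.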
        root = etree.Element('Tags')
        for tag in self.tags:
            val = self.tags[tag]
            cls_el = etree.Element("T")
            cls_el.set('i', str(val['i']))
            cls_el.set(
                'v', ",".join([key if key is not None else '' for key in tag]))
            cls_el.set('p', val['p'])
            cls_el.set('o', str(val['o']))

            if val['l']:
                cls_el.set('l', '1')
            root.append(cls_el)

        tree = ElementTree(root)
        for path in self.xml_tags_paths:
            with open(path, 'wb+') as f:
                tree.write(f, xml_declaration=True, encoding='utf-8')

    def __release_dataset_info__(self):
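        # Read the version/revision attributes from the dictionary XML root
        # element and write them to every dataset_info.txt path.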
        doc = etree.iterparse(self.dataset_path, events=('start', 'end'))
        itr = iter(doc)
        event, element = next(itr)
        while not (event == 'start' and element.tag == 'dictionary'):
            event, element = next(itr)

        version = element.attrib['version']
        revision = element.attrib['revision']
        for path in self.publish_dataset_info_paths:
            with open(path, 'w+') as f:
                f.write(f"dictionary\nversion={version}\nrevision={revision}")

    def __release_gramm_docs__(self):
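        # Generate the Markdown document (in Russian) listing the supported
        # grammatical categories and grammemes.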
        mds = [
            "# Поддерживамые грамматические категории и граммемы",
            "В DeepMorphy используется слегка измененное подмножество граммем и грамматичеких категорий из словарей [OpenCorpora](http://opencorpora.org/dict.php?act=gram)."
        ]

        for gram_cat_key in self.gram_types:
            gram_cat = self.gram_types[gram_cat_key]
            mds.append(
                f"- **{gram_cat['name'].capitalize()}** (ru='{gram_cat['key_ru']}', en='{gram_cat_key}') :"
            )

            classes = dict(gram_cat['classes'])
            if gram_cat_key == 'post':
                classes.update(self.config['dict_post_types'])
                classes.update(self.config['other_post_types'])

            for gram in classes:
                gram_obj = classes[gram]
                mds.append(
                    f"    - {gram_obj['name_ru']} (ru='{gram_obj['key_ru']}',en='{gram}')"
                )

        mds = "\n".join(mds)
        with open(self.config['publish_gram_doc_path'], 'w+') as f:
            f.write(mds)

    def __release_inflect_docs__(self):
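        # Generate the Markdown list of supported inflection templates,
        # grouped by part of speech (and by gender for nouns).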
        mds = [
            "# Список поддерживаемых словоизменений",
            "Словоизменение возможно только в рамках выделенных жирным категорий:"
        ]

        post_index = self.gram_types['post']['index']
        gndr_index = self.gram_types['gndr']['index']

        en_ru_dict = {}
        for gram_cat in self.gram_types:
            for cls in self.gram_types[gram_cat]['classes']:
                cls_data = self.gram_types[gram_cat]['classes'][cls]
                en_ru_dict[cls] = cls_data['key_ru']

        def create_tag_text(tag):
            tag_text = [
                en_ru_dict[key] for key in list(tag) if key is not None
            ]
            tag_text = ",".join(tag_text)
            return f"    - {tag_text}"

        for main_tpl in sorted(self.inflect_templates):
            post = main_tpl[post_index]
            gndr = main_tpl[gndr_index]
            if post == "infn":
                header_text = "Глаголы и глагольные формы"
            elif post == "adjf":
                header_text = "Прилагательные"
            elif post == "noun" and gndr == 'masc':
                header_text = "Существительные мужского рода"
            elif post == "noun" and gndr == 'femn':
                header_text = "Существительные женского рода"
            elif post == "noun" and gndr == 'neut':
                header_text = "Существительные среднего рода"
            elif post == "noun" and gndr == 'msf':
                header_text = "Существительные общего рода"
            else:
                raise NotImplementedError()

            mds.append(f"- **{header_text}**:")
            items = [(item, self.tags[item]['o'])
                     for item in self.inflect_templates[main_tpl]]
            items.append((main_tpl, self.tags[main_tpl]['o']))
            tags = sorted(items, key=lambda x: x[1], reverse=True)
            for tag in tags:
                mds.append(create_tag_text(tag[0]))

        mds = "\n".join(mds)
        with open(self.config['publish_inflect_doc_path'], 'w+') as f:
            f.write(mds)

    @staticmethod
    def __build_bad_words__(tester):
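        # Dump the words the tester misclassified to wrong_words.pkl.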
        words = tester.get_bad_words()
        logging.info(f"Wrong words count {len(words)}")
        with open("wrong_words.pkl", 'wb+') as f:
            pickle.dump(words, f)
Example #2
# Standard-library and third-party imports this class relies on;
# DeepMorphy-specific names (config, RNN, Tester) are assumed to come from
# the project's own modules.
import os
import pickle
import logging
import numpy as np
from shutil import copyfile
from xml.etree import ElementTree as etree  # lxml.etree would also work here
from xml.etree.ElementTree import ElementTree


class Releaser:
    def __init__(self):
        self.config = config()
        self.dataset_path = self.config['dict_path']
        self.model_key = self.config['model_key']
        self.chars = self.config['chars']
        self.gram_types = self.config['grammemes_types']
        self.rnn = RNN(True)
        self.pd_publish_paths = [
            os.path.join(path, f"frozen_model_{self.model_key}.pb")
            for path in self.config['publish_net_paths']
        ]
        self.xml_publish_paths = [
            os.path.join(path, f"release_{self.model_key}.xml")
            for path in self.config['publish_net_paths']
        ]
        self.xml_gram_paths = [
            os.path.join(path, "grams.xml")
            for path in self.config['publish_gramm_paths']
        ]
        self.test_result_paths = [
            os.path.join(path, "test_info.txt")
            for path in self.config['test_results_paths']
        ]
        self.publish_dataset_info_paths = [
            os.path.join(path, "dataset_info.txt")
            for path in self.config['publish_dataset_info_paths']
        ]
        self.tests_results_paths = self.config['publish_test_paths']
        self.classes_dic = self.config['main_classes']
        self.rev_classes_dic = {
            self.classes_dic[key]:
            ",".join([key for key in list(key) if key is not None])
            for key in self.classes_dic
        }

    def release_model(self):
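        # Freeze and publish the RNN graph, regenerate the docs and XML
        # artifacts, export the XML test datasets, then run the tester and
        # save its report together with the misclassified words.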
        pd_release_path, gram_ops, out_ops = self.rnn.release()
        for path in self.pd_publish_paths:
            copyfile(pd_release_path, path)

        self.__release_gramm_docs__()
        self.__release_grams_xml__()
        self.__release_test_files__()
        self.__release_dataset_info__()
        self.__release_model_xml__(out_ops, gram_ops)

        testr = Tester()
        self.__release_test_results__(testr)
        self.__build_bad_words__(testr)

    def __release_model_xml__(self, out_ops, gram_ops):
        root = etree.Element('Root')
        for key in out_ops:
            root.set(key, out_ops[key])

        chars_el = etree.Element('Chars')
        chars_el.set("start_char", str(self.config['start_token']))
        chars_el.set("end_char", str(self.config['end_token']))
        for index, value in enumerate(self.chars):
            char_el = etree.Element("Char")
            char_el.set('index', str(index))
            char_el.set('value', value)
            chars_el.append(char_el)
        root.append(chars_el)

        grams_el = etree.Element('Grams')
        for gram in self.gram_types:
            gram_el = etree.Element("G")
            gram_el.set('key', gram)
            gram_el.set('op', gram_ops[gram]['prob'])
            grams_el.append(gram_el)
        root.append(grams_el)

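        # Tags listed in config['lemma_same_word'] are turned into tuple keys
        # so the matching classes below can be flagged with lsw='1'.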
        lemma_same_words = []
        for cls in self.config['lemma_same_word']:
            key = tuple(cls[gram] if gram in cls else None
                        for gram in self.config['grammemes_types'])
            lemma_same_words.append(key)

        classes_el = etree.Element('Classes')
        for cls in self.classes_dic:
            cls_el = etree.Element("C")
            cls_el.set('i', str(self.classes_dic[cls]))
            cls_el.set(
                'v', ",".join([key if key is not None else '' for key in cls]))
            if cls in lemma_same_words:
                cls_el.set('lsw', '1')

            classes_el.append(cls_el)

        root.append(classes_el)
        tree = ElementTree(root)
        for path in self.xml_publish_paths:
            with open(path, 'wb+') as f:
                tree.write(f, xml_declaration=True, encoding='utf-8')

        logging.info("Model released")

    def __release_grams_xml__(self):
        nn_types = self.config['grammemes_types']
        dict_post_types = self.config['dict_post_types']
        other_types = self.config['other_post_types']
        root = etree.Element('Grams')
        for gram in nn_types:
            gram_el = etree.Element("G")
            gram_el.set('index', str(nn_types[gram]['index']))
            gram_el.set('key_en', gram)
            gram_el.set('key_ru', nn_types[gram]['key_ru'])
            root.append(gram_el)
            gr_dic = nn_types[gram]['classes']
            for key_en in gr_dic:
                item = gr_dic[key_en]
                cls_el = etree.Element("C")
                cls_el.set('key_en', key_en)
                cls_el.set('key_ru', str(item['key_ru']))
                cls_el.set('nn_index', str(item['index']))
                gram_el.append(cls_el)

            if gram == "post":
                for key_en in dict_post_types:
                    item = dict_post_types[key_en]
                    cls_el = etree.Element("C")
                    cls_el.set('key_en', key_en)
                    cls_el.set('key_ru', str(item['key_ru']))
                    gram_el.append(cls_el)

                for key_en in other_types:
                    item = other_types[key_en]
                    cls_el = etree.Element("C")
                    cls_el.set('key_en', key_en)
                    cls_el.set('key_ru', str(item['key_ru']))
                    gram_el.append(cls_el)

        tree = ElementTree(root)
        for path in self.xml_gram_paths:
            with open(path, 'wb+') as f:
                tree.write(f, xml_declaration=True, encoding='utf-8')

    def __release_dataset_info__(self):
        doc = etree.iterparse(self.dataset_path, events=('start', 'end'))
        itr = iter(doc)
        event, element = next(itr)
        while not (event == 'start' and element.tag == 'dictionary'):
            event, element = next(itr)

        version = element.attrib['version']
        revision = element.attrib['revision']
        for path in self.publish_dataset_info_paths:
            with open(path, 'w+') as f:
                f.write(f"dictionary\nversion={version}\nrevision={revision}")

    def __release_test_results__(self, tester):
        results = tester.test()
        for path in self.test_result_paths:
            with open(path, 'w+') as f:
                f.write(results)

    def __release_test_files__(self):
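        # Export one XML test file per grammatical category, plus the main
        # classification and lemmatization test sets.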
        for gram in self.gram_types:
            cls = self.gram_types[gram]['classes']
            dic = {cls[g_key]['index']: g_key for g_key in cls}
            self.__release_cls_tests__(gram, dic)

        self.__release_cls_tests__('main', self.rev_classes_dic)
        self.__release_lemma_tests__()

    def __release_cls_tests__(self, key, cls_dic):
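        # Load the pickled test dataset for `key` and write <T x=... y=...>
        # entries, mapping one-hot targets back to class names via cls_dic.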
        path = os.path.join(self.config['dataset_path'],
                            f"{key}_test_dataset.pkl")
        with open(path, 'rb') as f:
            words = pickle.load(f)

        root = etree.Element('Tests')
        for word in words:
            y = np.argwhere(word['y'] == 1).ravel()
            y = ';'.join([cls_dic[index] for index in y])
            test = etree.Element("T")
            test.set('x', word['src'])
            test.set('y', y)
            root.append(test)

        for dir_path in self.tests_results_paths:
            rez_path = os.path.join(dir_path, f'{key}.xml')
            tree = ElementTree(root)
            with open(rez_path, 'wb+') as f:
                tree.write(f, xml_declaration=True, encoding='utf-8')

    def __release_lemma_tests__(self):
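        # Export the lemmatization test pairs (x_src -> y_src) as lem.xml in
        # every test results directory.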
        path = os.path.join(self.config['dataset_path'],
                            "lemma_test_dataset.pkl")
        with open(path, 'rb') as f:
            words = pickle.load(f)

        root = etree.Element('Tests')
        for word in words:
            test = etree.Element("T")
            test.set('x', word['x_src'])
            test.set('y', word['y_src'])
            root.append(test)

        for dir_path in self.tests_results_paths:
            rez_path = os.path.join(dir_path, 'lem.xml')
            tree = ElementTree(root)
            with open(rez_path, 'wb+') as f:
                tree.write(f, xml_declaration=True, encoding='utf-8')

    def __release_gramm_docs__(self):
        mds = [
            "# Поддерживамые грамматические категории и граммемы",
            "В DeepMorphy используется слегка измененное подмножество граммем и грамматичеких категорий из словарей [OpenCorpora](http://opencorpora.org/dict.php?act=gram)."
        ]

        for gram_cat_key in self.gram_types:
            gram_cat = self.gram_types[gram_cat_key]
            mds.append(
                f"- **{gram_cat['name'].capitalize()}** (ru='{gram_cat['key_ru']}', en='{gram_cat_key}') :"
            )

            classes = dict(gram_cat['classes'])
            if gram_cat_key == 'post':
                classes.update(self.config['dict_post_types'])
                classes.update(self.config['other_post_types'])

            for gram in classes:
                gram_obj = classes[gram]
                mds.append(
                    f"    - {gram_obj['name_ru']} (ru='{gram_obj['key_ru']}',en='{gram}')"
                )

        mds = "\n".join(mds)
        with open(self.config['publish_gram_doc_path'], 'w+') as f:
            f.write(mds)

    @staticmethod
    def __build_bad_words__(tester):
        words = tester.get_bad_words()
        logging.info(f"Wrong words count {len(words)}")
        with open("wrong_words.pkl", 'wb+') as f:
            pickle.dump(words, f)
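

# Minimal usage sketch (assumption: the project's config, RNN and Tester are
# importable and a trained checkpoint is available for rnn.release()).
if __name__ == '__main__':
    releaser = Releaser()
    releaser.release_model()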