예제 #1
0
    def initialize_detector(self):
        """Load every resource the detector needs.

        Loads, in order: the kenlm statistical language model, the word/char
        frequency dicts, the custom confusion set, the custom segmentation
        dicts (user words, person names, place names, stopwords), the
        tokenizer, and the BERT masked-LM model.

        Raises:
            ImportError: if the optional ``kenlm`` package is not installed.
        """
        t1 = time.time()
        try:
            import kenlm
        except ImportError:
            # BUGFIX: the original message concatenated fragments without
            # separating spaces and misspelled "cygwin".
            raise ImportError(
                'mypycorrector dependencies are not fully installed, '
                'they are required for statistical language model. '
                'Please use "pip install kenlm" to install it. '
                'If you are on Windows, please install kenlm in cygwin.')

        self.lm = kenlm.Model(self.language_model_path)
        logger.debug('Loaded language model: %s, spend: %s s' %
                     (self.language_model_path, str(time.time() - t1)))

        # Word and char frequency dicts
        t2 = time.time()
        self.word_freq = self.load_word_freq_dict(self.word_freq_path)
        self.char_freq = self.load_char_freq_dict(self.char_freq_path)
        t3 = time.time()
        logger.debug(
            'Loaded word freq, char freq file: %s, size: %d, spend: %s s' %
            (self.word_freq_path, len(self.word_freq), str(t3 - t2)))
        # Custom confusion set
        self.custom_confusion = self._get_custom_confusion_dict(
            self.custom_confusion_path)
        t4 = time.time()
        logger.debug('Loaded confusion file: %s, size: %d, spend: %s s' %
                     (self.custom_confusion_path, len(
                         self.custom_confusion), str(t4 - t3)))
        # Custom segmentation dicts
        self.custom_word_freq = self.load_word_freq_dict(
            self.custom_word_freq_path)
        self.person_names = self.load_word_freq_dict(self.person_name_path)
        self.place_names = self.load_word_freq_dict(self.place_name_path)
        self.stopwords = self.load_word_freq_dict(self.stopwords_path)
        # Merge segmentation dicts and the custom dict
        self.custom_word_freq.update(self.person_names)
        self.custom_word_freq.update(self.place_names)
        self.custom_word_freq.update(self.stopwords)

        self.word_freq.update(self.custom_word_freq)
        t5 = time.time()
        # BUGFIX: log the custom word dict path, not the confusion path.
        logger.debug('Loaded custom word file: %s, size: %d, spend: %s s' %
                     (self.custom_word_freq_path, len(
                         self.custom_word_freq), str(t5 - t4)))
        self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                                   custom_word_freq_dict=self.custom_word_freq,
                                   custom_confusion_dict=self.custom_confusion)
        # BERT pretrained model
        t6 = time.time()
        self.bert_tokenizer = BertTokenizer(vocab_file=self.bert_model_vocab)
        self.MASK_TOKEN = "[MASK]"
        self.MASK_ID = self.bert_tokenizer.convert_tokens_to_ids(
            [self.MASK_TOKEN])[0]
        # Prepare model
        self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
        logger.debug("Loaded model ok, path: %s, spend: %.3f s." %
                     (self.bert_model_dir, time.time() - t6))
        self.initialized_detector = True
예제 #2
0
 def initialize_rule_bert_corrector(self):
     """Load the confusion tables used by the rule-based BERT corrector."""
     start = time.time()
     # Characters that share a pronunciation (pinyin).
     self.same_pinyin = load_same_pinyin(self.same_pinyin_text_path)
     # Characters with similar stroke structure.
     self.same_stroke = load_same_stroke(self.same_stroke_text_path)
     logger.debug(
         "Loaded same pinyin file: %s, same stroke file: %s, spend: %.3f s."
         % (self.same_pinyin_text_path, self.same_stroke_text_path,
            time.time() - start))
     self.initialized_rule_bert_corrector = True
예제 #3
0
 def initialize_bert_detector(self):
     """Load the BERT tokenizer and masked-LM model used for detection."""
     start = time.time()
     self.bert_tokenizer = BertTokenizer(vocab_file=self.bert_model_vocab)
     self.MASK_TOKEN = "[MASK]"
     # Resolve the vocabulary id of the [MASK] token once, up front.
     (self.MASK_ID,) = self.bert_tokenizer.convert_tokens_to_ids(
         [self.MASK_TOKEN])
     # Prepare model
     self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
     logger.debug("Loaded model ok, path: %s, spend: %.3f s." %
                  (self.bert_model_dir, time.time() - start))
     self.initialized_bert_detector = True
 def initialize_corrector(self):
     """Load the char set and confusion tables used by the rule corrector."""
     start = time.time()
     # Common Chinese character set.
     self.cn_char_set = load_char_set(self.common_char_path)
     # Characters that share a pronunciation (pinyin).
     self.same_pinyin = load_same_pinyin(self.same_pinyin_text_path)
     # Characters with similar stroke structure.
     self.same_stroke = load_same_stroke(self.same_stroke_text_path)
     logger.debug(
         "Loaded same pinyin file: %s, same stroke file: %s, spend: %.3f s."
         % (self.same_pinyin_text_path, self.same_stroke_text_path,
            time.time() - start))
     self.initialized_corrector = True
예제 #5
0
 def set_custom_word(self, path):
     """Load a word-frequency dict from *path* and merge it into the detector.

     Updates the custom and global frequency dicts, recreates the tokenizer
     with the merged dictionaries, and registers each word's frequency.
     """
     self.check_detector_initialized()
     word_freqs = self.load_word_freq_dict(path)
     # Fold the new entries into the custom dict ...
     self.custom_word_freq.update(word_freqs)
     # ... and propagate the merged custom dict into the main dict.
     self.word_freq.update(self.custom_word_freq)
     # Recreate the tokenizer so it picks up the updated dictionaries.
     self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                                custom_word_freq_dict=self.custom_word_freq,
                                custom_confusion_dict=self.custom_confusion)
     for word, freq in word_freqs.items():
         self.set_word_frequency(word, freq)
     logger.debug('Loaded custom word path: %s, size: %d' %
                  (path, len(word_freqs)))
예제 #6
0
    def __init__(self, bert_model_dir=config.bert_model_dir,
                 bert_config_path=config.bert_config_path,
                 bert_model_path=config.bert_model_path,
                 hanzi_ssc_path=config.hanzi_ssc_path):
        """Load all BertCorrector resources: the hanzi SSC dict, the
        fill-mask BERT pipeline, the KNN scorer with its training split,
        and the neural-network scoring model."""
        super(BertCorrector, self).__init__()
        self.name = 'bert_corrector'

        # Hanzi SSC dictionary.
        start = time.time()
        self.hanziSSCDict = self._getHanziSSCDict(hanzi_ssc_path)
        logger.debug('Loaded ssc dict: %s, spend: %.3f s.' % (hanzi_ssc_path, time.time() - start))

        # Fill-mask BERT pipeline.
        start = time.time()
        self.model = pipeline('fill-mask',
                              model=bert_model_path,
                              config=bert_config_path,
                              tokenizer=bert_model_dir)
        if self.model:
            self.mask = self.model.tokenizer.mask_token
            logger.debug('Loaded bert model: %s, spend: %.3f s.' % (bert_model_dir, time.time() - start))

        # KNN classifier and its training split.
        start = time.time()
        self.knn = KNearestNeighbor()
        self.knnTrainingset = self.knn.loadDataset(filename=config.score_2013_data_path, split=0.75)
        logger.debug('Loaded knn training data: %s, spend: %.3f s.' % (config.score_2013_data_path, time.time() - start))

        # Neural-network scoring model.
        start = time.time()
        self.neural_model = neural_network_utils.load_model(config.neural_network_model_path)
        logger.debug('Loaded neural network: %s, spend: %.3f s.' % (config.neural_network_model_path, time.time() - start))
예제 #7
0
    def initialize_detector(self):
        """Load every resource the detector needs.

        Loads, in order: the kenlm statistical language model, the word/char
        frequency dicts, the custom confusion set, the custom segmentation
        dicts (user words, person names, place names, stopwords), and the
        tokenizer.

        Raises:
            ImportError: if the optional ``kenlm`` package is not installed.
        """
        t1 = time.time()
        try:
            import kenlm
        except ImportError:
            # BUGFIX: the original message concatenated fragments without
            # separating spaces and misspelled "cygwin".
            raise ImportError(
                'mypycorrector dependencies are not fully installed, '
                'they are required for statistical language model. '
                'Please use "pip install kenlm" to install it. '
                'If you are on Windows, please install kenlm in cygwin.')

        self.lm = kenlm.Model(self.language_model_path)
        logger.debug('Loaded language model: %s, spend: %s s' %
                     (self.language_model_path, str(time.time() - t1)))

        # Word and char frequency dicts
        t2 = time.time()
        self.word_freq = self.load_word_freq_dict(self.word_freq_path)
        self.char_freq = self.load_char_freq_dict(self.char_freq_path)
        t3 = time.time()
        logger.debug(
            'Loaded word freq, char freq file: %s, size: %d, spend: %s s' %
            (self.word_freq_path, len(self.word_freq), str(t3 - t2)))
        # Custom confusion set
        self.custom_confusion = self._get_custom_confusion_dict(
            self.custom_confusion_path)
        t4 = time.time()
        logger.debug('Loaded confusion file: %s, size: %d, spend: %s s' %
                     (self.custom_confusion_path, len(
                         self.custom_confusion), str(t4 - t3)))
        # Custom segmentation dicts
        self.custom_word_freq = self.load_word_freq_dict(
            self.custom_word_freq_path)
        self.person_names = self.load_word_freq_dict(self.person_name_path)
        self.place_names = self.load_word_freq_dict(self.place_name_path)
        self.stopwords = self.load_word_freq_dict(self.stopwords_path)
        # Merge segmentation dicts and the custom dict
        self.custom_word_freq.update(self.person_names)
        self.custom_word_freq.update(self.place_names)
        self.custom_word_freq.update(self.stopwords)

        self.word_freq.update(self.custom_word_freq)
        t5 = time.time()
        # BUGFIX: log the custom word dict path, not the confusion path.
        logger.debug('Loaded custom word file: %s, size: %d, spend: %s s' %
                     (self.custom_word_freq_path, len(
                         self.custom_word_freq), str(t5 - t4)))
        self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                                   custom_word_freq_dict=self.custom_word_freq,
                                   custom_confusion_dict=self.custom_confusion)
        self.initialized_detector = True
예제 #8
0
    def __init__(
        self,
        d_mdel_dir=os.path.join(
            pwd_path,
            "../data/electra_models/chinese_electra_base_discriminator_pytorch/"
        ),
        g_model_dir=os.path.join(
            pwd_path,
            "../data/electra_models/chinese_electra_base_generator_pytorch/"),
    ):
        """Load the ELECTRA generator (fill-mask pipeline) and discriminator.

        Args:
            d_mdel_dir: discriminator model directory. The misspelled name
                is kept for backward compatibility with keyword callers.
            g_model_dir: generator model directory used for the fill-mask
                pipeline.
        """
        super(ElectraCorrector, self).__init__()
        self.name = 'electra_corrector'
        t1 = time.time()
        # BUGFIX: the pipeline previously loaded config.bert_model_dir and
        # ignored g_model_dir entirely, even though the log line below
        # reports g_model_dir as the loaded model.
        self.g_model = pipeline("fill-mask",
                                model=g_model_dir,
                                tokenizer=g_model_dir)
        self.d_model = ElectraForPreTraining.from_pretrained(d_mdel_dir)

        if self.g_model:
            self.mask = self.g_model.tokenizer.mask_token
            logger.debug('Loaded electra model: %s, spend: %.3f s.' %
                         (g_model_dir, time.time() - t1))
예제 #9
0
 def set_custom_confusion_dict(self, path):
     """Load a confusion dict from *path* and merge it into the current one."""
     self.check_detector_initialized()
     loaded = self._get_custom_confusion_dict(path)
     # Merge into (rather than replace) the existing confusion set.
     self.custom_confusion.update(loaded)
     logger.debug('Loaded confusion path: %s, size: %d' %
                  (path, len(loaded)))
예제 #10
0
 def set_language_model_path(self, path):
     """Replace the statistical language model with the one at *path*."""
     # Make sure detector state exists before swapping its language model.
     self.check_detector_initialized()
     # Deferred import: kenlm is an optional, heavyweight dependency.
     import kenlm
     self.lm = kenlm.Model(path)
     logger.debug('Loaded language model: %s' % path)