def initialize_detector(self):
    """Load every resource the detector needs: the kenlm statistical
    language model, word/char frequency dicts, the custom confusion set,
    custom segmentation dicts, the tokenizer, and the BERT masked-LM.

    Sets ``self.initialized_detector`` to True when everything is loaded.

    Raises:
        ImportError: if the optional ``kenlm`` dependency is not installed.
    """
    t1 = time.time()
    try:
        import kenlm
    except ImportError:
        # BUG FIX: the original concatenated message lacked separator
        # spaces between sentences ("model.Please", "it.if").
        raise ImportError(
            'mypycorrector dependencies are not fully installed, '
            'they are required for statistical language model. '
            'Please use "pip install kenlm" to install it. '
            'If you are on Windows, please install kenlm in cygwin.')
    self.lm = kenlm.Model(self.language_model_path)
    logger.debug('Loaded language model: %s, spend: %s s' %
                 (self.language_model_path, str(time.time() - t1)))

    # word / char frequency dicts
    t2 = time.time()
    self.word_freq = self.load_word_freq_dict(self.word_freq_path)
    self.char_freq = self.load_char_freq_dict(self.char_freq_path)
    t3 = time.time()
    logger.debug(
        'Loaded word freq, char freq file: %s, size: %d, spend: %s s' %
        (self.word_freq_path, len(self.word_freq), str(t3 - t2)))

    # user-defined confusion set
    self.custom_confusion = self._get_custom_confusion_dict(
        self.custom_confusion_path)
    t4 = time.time()
    logger.debug('Loaded confusion file: %s, size: %d, spend: %s s' %
                 (self.custom_confusion_path, len(self.custom_confusion),
                  str(t4 - t3)))

    # user-defined segmentation dicts
    self.custom_word_freq = self.load_word_freq_dict(
        self.custom_word_freq_path)
    self.person_names = self.load_word_freq_dict(self.person_name_path)
    self.place_names = self.load_word_freq_dict(self.place_name_path)
    self.stopwords = self.load_word_freq_dict(self.stopwords_path)
    # merge the segmentation dict and the custom dicts
    self.custom_word_freq.update(self.person_names)
    self.custom_word_freq.update(self.place_names)
    self.custom_word_freq.update(self.stopwords)
    self.word_freq.update(self.custom_word_freq)
    t5 = time.time()
    # BUG FIX: the original logged self.custom_confusion_path here even
    # though the dict just loaded came from self.custom_word_freq_path.
    logger.debug('Loaded custom word file: %s, size: %d, spend: %s s' %
                 (self.custom_word_freq_path, len(self.custom_word_freq),
                  str(t5 - t4)))

    self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                               custom_word_freq_dict=self.custom_word_freq,
                               custom_confusion_dict=self.custom_confusion)

    # BERT pretrained model used for masked-LM scoring
    t6 = time.time()
    self.bert_tokenizer = BertTokenizer(vocab_file=self.bert_model_vocab)
    self.MASK_TOKEN = "[MASK]"
    self.MASK_ID = self.bert_tokenizer.convert_tokens_to_ids(
        [self.MASK_TOKEN])[0]
    # Prepare model
    self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
    logger.debug("Loaded model ok, path: %s, spend: %.3f s." %
                 (self.bert_model_dir, time.time() - t6))
    self.initialized_detector = True
def initialize_rule_bert_corrector(self):
    """Load the same-pinyin and same-stroke confusion resources used by
    the rule-based BERT corrector, then mark it initialized."""
    start = time.time()
    # characters that share a pinyin reading
    self.same_pinyin = load_same_pinyin(self.same_pinyin_text_path)
    # characters with visually similar strokes
    self.same_stroke = load_same_stroke(self.same_stroke_text_path)
    logger.debug(
        "Loaded same pinyin file: %s, same stroke file: %s, spend: %.3f s." %
        (self.same_pinyin_text_path, self.same_stroke_text_path,
         time.time() - start))
    self.initialized_rule_bert_corrector = True
def initialize_bert_detector(self):
    """Load the BERT tokenizer and masked-LM model used for detection,
    then mark the detector initialized."""
    start = time.time()
    self.bert_tokenizer = BertTokenizer(vocab_file=self.bert_model_vocab)
    self.MASK_TOKEN = "[MASK]"
    mask_ids = self.bert_tokenizer.convert_tokens_to_ids([self.MASK_TOKEN])
    self.MASK_ID = mask_ids[0]
    # Prepare model
    self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
    logger.debug("Loaded model ok, path: %s, spend: %.3f s." %
                 (self.bert_model_dir, time.time() - start))
    self.initialized_bert_detector = True
def initialize_corrector(self):
    """Load the common-char set plus the same-pinyin and same-stroke
    confusion resources, then mark the corrector initialized."""
    start = time.time()
    # set of common Chinese characters used as the candidate space
    self.cn_char_set = load_char_set(self.common_char_path)
    # characters that share a pinyin reading
    self.same_pinyin = load_same_pinyin(self.same_pinyin_text_path)
    # characters with visually similar strokes
    self.same_stroke = load_same_stroke(self.same_stroke_text_path)
    logger.debug(
        "Loaded same pinyin file: %s, same stroke file: %s, spend: %.3f s." %
        (self.same_pinyin_text_path, self.same_stroke_text_path,
         time.time() - start))
    self.initialized_corrector = True
def set_custom_word(self, path):
    """Merge a user-supplied word-frequency file into the detector.

    Args:
        path: path of a word-frequency file in the format understood by
            ``load_word_freq_dict``.
    """
    self.check_detector_initialized()
    word_freqs = self.load_word_freq_dict(path)
    # fold the new entries into the custom dict, then into the main dict
    self.custom_word_freq.update(word_freqs)
    self.word_freq.update(self.custom_word_freq)
    # rebuild the tokenizer so it picks up the merged vocabulary
    self.tokenizer = Tokenizer(
        dict_path=self.word_freq_path,
        custom_word_freq_dict=self.custom_word_freq,
        custom_confusion_dict=self.custom_confusion)
    for word, freq in word_freqs.items():
        self.set_word_frequency(word, freq)
    logger.debug('Loaded custom word path: %s, size: %d' %
                 (path, len(word_freqs)))
def __init__(self,
             bert_model_dir=config.bert_model_dir,
             bert_config_path=config.bert_config_path,
             bert_model_path=config.bert_model_path,
             hanzi_ssc_path=config.hanzi_ssc_path):
    """Build a BertCorrector: sound-shape-code dict, fill-mask pipeline,
    kNN scorer and neural-network scorer."""
    super(BertCorrector, self).__init__()
    self.name = 'bert_corrector'

    start = time.time()
    # sound-shape-code dict used for hanzi similarity scoring
    self.hanziSSCDict = self._getHanziSSCDict(hanzi_ssc_path)
    logger.debug('Loaded ssc dict: %s, spend: %.3f s.' %
                 (hanzi_ssc_path, time.time() - start))

    start = time.time()
    # NOTE(review): the tokenizer is taken from bert_model_dir while the
    # weights/config come from bert_model_path/bert_config_path — confirm
    # this split is intentional and the vocabularies match.
    self.model = pipeline('fill-mask',
                          model=bert_model_path,
                          config=bert_config_path,
                          tokenizer=bert_model_dir)
    if self.model:
        self.mask = self.model.tokenizer.mask_token
        logger.debug('Loaded bert model: %s, spend: %.3f s.' %
                     (bert_model_dir, time.time() - start))

    start = time.time()
    self.knn = KNearestNeighbor()
    self.knnTrainingset = self.knn.loadDataset(
        filename=config.score_2013_data_path, split=0.75)
    logger.debug('Loaded knn training data: %s, spend: %.3f s.' %
                 (config.score_2013_data_path, time.time() - start))

    start = time.time()
    self.neural_model = neural_network_utils.load_model(
        config.neural_network_model_path)
    logger.debug('Loaded neural network: %s, spend: %.3f s.' %
                 (config.neural_network_model_path, time.time() - start))
def initialize_detector(self):
    """Load the kenlm statistical language model, word/char frequency
    dicts, the custom confusion set, custom segmentation dicts and the
    tokenizer; mark the detector initialized.

    Raises:
        ImportError: if the optional ``kenlm`` dependency is not installed.
    """
    t1 = time.time()
    try:
        import kenlm
    except ImportError:
        # BUG FIX: the original concatenated message lacked separator
        # spaces between sentences ("model.Please", "it.if").
        raise ImportError(
            'mypycorrector dependencies are not fully installed, '
            'they are required for statistical language model. '
            'Please use "pip install kenlm" to install it. '
            'If you are on Windows, please install kenlm in cygwin.')
    self.lm = kenlm.Model(self.language_model_path)
    logger.debug('Loaded language model: %s, spend: %s s' %
                 (self.language_model_path, str(time.time() - t1)))

    # word / char frequency dicts
    t2 = time.time()
    self.word_freq = self.load_word_freq_dict(self.word_freq_path)
    self.char_freq = self.load_char_freq_dict(self.char_freq_path)
    t3 = time.time()
    logger.debug(
        'Loaded word freq, char freq file: %s, size: %d, spend: %s s' %
        (self.word_freq_path, len(self.word_freq), str(t3 - t2)))

    # user-defined confusion set
    self.custom_confusion = self._get_custom_confusion_dict(
        self.custom_confusion_path)
    t4 = time.time()
    logger.debug('Loaded confusion file: %s, size: %d, spend: %s s' %
                 (self.custom_confusion_path, len(self.custom_confusion),
                  str(t4 - t3)))

    # user-defined segmentation dicts
    self.custom_word_freq = self.load_word_freq_dict(
        self.custom_word_freq_path)
    self.person_names = self.load_word_freq_dict(self.person_name_path)
    self.place_names = self.load_word_freq_dict(self.place_name_path)
    self.stopwords = self.load_word_freq_dict(self.stopwords_path)
    # merge the segmentation dict and the custom dicts
    self.custom_word_freq.update(self.person_names)
    self.custom_word_freq.update(self.place_names)
    self.custom_word_freq.update(self.stopwords)
    self.word_freq.update(self.custom_word_freq)
    t5 = time.time()
    # BUG FIX: the original logged self.custom_confusion_path here even
    # though the dict just loaded came from self.custom_word_freq_path.
    logger.debug('Loaded custom word file: %s, size: %d, spend: %s s' %
                 (self.custom_word_freq_path, len(self.custom_word_freq),
                  str(t5 - t4)))

    self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                               custom_word_freq_dict=self.custom_word_freq,
                               custom_confusion_dict=self.custom_confusion)
    self.initialized_detector = True
def __init__(
        self,
        d_mdel_dir=os.path.join(
            pwd_path,
            "../data/electra_models/chinese_electra_base_discriminator_pytorch/"
        ),
        g_model_dir=os.path.join(
            pwd_path,
            "../data/electra_models/chinese_electra_base_generator_pytorch/"),
):
    """Build an ElectraCorrector from a generator (fill-mask) model and a
    discriminator model.

    Args:
        d_mdel_dir: directory holding the ELECTRA discriminator weights.
            (Parameter name keeps the original typo for caller
            compatibility.)
        g_model_dir: directory holding the ELECTRA generator weights.
    """
    super(ElectraCorrector, self).__init__()
    self.name = 'electra_corrector'
    t1 = time.time()
    # BUG FIX: the original ignored g_model_dir and loaded
    # config.bert_model_dir instead, so the parameter was dead and the
    # log line below reported a model that was never loaded.
    self.g_model = pipeline("fill-mask",
                            model=g_model_dir,
                            tokenizer=g_model_dir)
    self.d_model = ElectraForPreTraining.from_pretrained(d_mdel_dir)
    if self.g_model:
        self.mask = self.g_model.tokenizer.mask_token
        logger.debug('Loaded electra model: %s, spend: %.3f s.' %
                     (g_model_dir, time.time() - t1))
def set_custom_confusion_dict(self, path):
    """Load a confusion file from *path* and merge it into the current
    custom confusion set."""
    self.check_detector_initialized()
    extra_confusion = self._get_custom_confusion_dict(path)
    self.custom_confusion.update(extra_confusion)
    logger.debug('Loaded confusion path: %s, size: %d' %
                 (path, len(extra_confusion)))
def set_language_model_path(self, path):
    """Replace the detector's kenlm language model with one loaded
    from *path*."""
    self.check_detector_initialized()
    # imported lazily: kenlm is an optional dependency
    import kenlm
    self.lm = kenlm.Model(path)
    logger.debug('Loaded language model: %s' % path)