import json

import hanlp
from hanlp.common.trie import Trie

# split_sents and merge_parts are the helper functions defined in the snippets below.


def tokenizer_pku():
    # Load the custom dictionary and index it in a Trie for longest-match lookup.
    with open('dict.json', 'r', encoding='utf-8') as f:
        user_dict = json.load(f)
    trie = Trie()
    trie.update(user_dict)
    print(type(trie))

    text = 'NLP统计模型没有加规则,聪明人知道自己加。英文、数字、自定义词典统统都是规则。'
    print(split_sents(text, trie))

    # Wrap the PKU tokenizer in a pipeline: split on dictionary hits first,
    # tokenize the uncovered spans, then merge the dictionary words back in.
    tokenizer = hanlp.load('PKU_NAME_MERGED_SIX_MONTHS_CONVSEG')
    tokenizer = hanlp.pipeline() \
        .append(split_sents, output_key=('parts', 'offsets', 'words'), trie=trie) \
        .append(tokenizer, input_key='parts', output_key='tokens') \
        .append(merge_parts, input_key=('tokens', 'offsets', 'words'), output_key='merged')
    print(tokenizer(text))
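# tokenizer_pku() expects dict.json to hold a JSON object mapping each custom
# word to an arbitrary value (the Trie only keys on the words). A hypothetical
# way to create such a file and run the demo, once the split_sents and
# merge_parts helpers below are defined:
#
# with open('dict.json', 'w', encoding='utf-8') as f:
#     json.dump({'自定义词典': 'custom_dict', '聪明人': 'smart'}, f, ensure_ascii=False)
# tokenizer_pku()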
import hanlp
from hanlp.common.trie import Trie
from nltk.corpus import stopwords


class hanLPTokenizer:
    def __init__(self, hanlp_tokenizer, hanlp_tagger, user_dict_path,
                 stop_words_path, consider_tags_path, ignore_tag='-'):
        self.hanlp_tokenizer = hanlp_tokenizer
        self.tagger = hanlp_tagger
        self.ignore_tag = ignore_tag
        self.stop_words = self.load_stop_words(stop_words_path)
        self.considered_tags = self.load_consider_tags(consider_tags_path)
        self.user_dict = self.load_user_dict(user_dict_path)
        self.trie = Trie()
        self.trie.update(self.user_dict)
        self.tokenizer = hanlp.pipeline() \
            .append(self.split_sentences, output_key=('parts', 'offsets', 'words')) \
            .append(self.hanlp_tokenizer, input_key='parts', output_key='tokens') \
            .append(self.merge_parts, input_key=('tokens', 'offsets', 'words'), output_key='merged')

    def split_sentences(self, text: str):
        # Cut the text at every longest match from the user dictionary, keeping
        # the offsets of the uncovered spans and the matched words.
        words = self.trie.parse_longest(text)
        sentences = []
        pre_start = 0
        offsets = []
        for word, value, start, end in words:
            if pre_start != start:
                sentences.append(text[pre_start:start])
                offsets.append(pre_start)
            pre_start = end
        if pre_start != len(text):
            sentences.append(text[pre_start:])
            offsets.append(pre_start)
        return sentences, offsets, words

    @staticmethod
    def merge_parts(parts, offsets, words):
        # Interleave the tokenized spans with the dictionary words by start offset.
        items = [(i, p) for (i, p) in zip(offsets, parts)]
        items += [(start, [word]) for (word, value, start, end) in words]
        # In case you need the tag, use the following line instead
        # items += [(start, [(word, value)]) for (word, value, start, end) in words]
        return [each for x in sorted(items) for each in x[1]]

    def tokenize(self, text):
        """
        :param text: str
        :return: list of tokens, with user-dictionary words merged back in
        """
        return self.tokenizer(text)['merged']

    def tag(self, tokens):
        """
        :param tokens: list
        :return: part-of-speech tags, one per token
        """
        return self.tagger(tokens)

    def tag_stop_words(self, tokens, tags):
        # Replace the tag of every stop word with the ignore tag.
        new_tags = []
        for i in range(len(tokens)):
            if tokens[i] in self.stop_words:
                new_tags.append(self.ignore_tag)
            else:
                new_tags.append(tags[i])
        return new_tags

    def tag_unconsidered_tags(self, tags):
        # Keep only the tags listed in the consider-tags file; ignore the rest.
        new_tags = []
        for tag in tags:
            if tag.lower() in self.considered_tags:
                new_tags.append(tag)
            else:
                new_tags.append(self.ignore_tag)
        return new_tags

    def tokenize_filter(self, text):
        tokens = self.tokenize(text)
        tags = self.tag(tokens)
        tags = self.tag_stop_words(tokens, tags)  # remove stop words
        tags = self.tag_unconsidered_tags(tags)  # tag filter
        tagged_tokens = []
        for i in range(len(tags)):
            tagged_tokens.append((tokens[i], tags[i]))
        return tagged_tokens

    @staticmethod
    def load_txt_data(path, mode='utf-8-sig', origin=False):
        """
        Read a text file and return its lines as a list.

        :param origin: keep empty lines if True
        :param path: path where the file is stored
        :param mode: encoding used to decode the file
        :type path: str
        :return: lines of the file
        :rtype: list
        """
        if type(path) != str:
            raise TypeError
        res = []
        file = open(path, 'rb')
        lines = file.read().decode(mode, errors='ignore')
        for line in lines.split('\n'):
            line = line.strip()
            if origin:
                res.append(line)
            else:
                if line:
                    res.append(line)
        file.close()
        return res

    def load_user_dict(self, path):
        raw = self.load_txt_data(path)
        user_word_dict = {}
        for i in range(len(raw)):
            word = raw[i].split('\t')[0]
            if word not in user_word_dict:
                user_word_dict[word] = ' '
        return user_word_dict

    def load_stop_words(self, path):
        return set(self.load_txt_data(path) + stopwords.words('english'))

    def load_consider_tags(self, path):
        return set(
            [x.split('\t')[0].lower() for x in self.load_txt_data(path)])
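# A minimal usage sketch for hanLPTokenizer. The segmenter name follows the
# demos in this file; the POS model name and the three file paths are
# assumptions (plain-text files, one entry per line, tab-separated word/tag).
cws = hanlp.load('PKU_NAME_MERGED_SIX_MONTHS_CONVSEG')  # tokenizer, as used above
pos = hanlp.load('CTB5_POS_RNN_FASTTEXT_ZH')            # POS tagger (assumed model name)

tok = hanLPTokenizer(
    hanlp_tokenizer=cws,
    hanlp_tagger=pos,
    user_dict_path='user_dict.txt',          # hypothetical path
    stop_words_path='stop_words.txt',        # hypothetical path
    consider_tags_path='consider_tags.txt',  # hypothetical path
)
print(tok.tokenize_filter('NLP统计模型没有加规则,聪明人知道自己加。'))
# -> [(token, tag), ...]; stop words and unconsidered tags are tagged '-'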
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 21:25
from hanlp.common.trie import Trie

import hanlp

tokenizer = hanlp.load('LARGE_ALBERT_BASE')
text = 'NLP统计模型没有加规则,聪明人知道自己加。英文、数字、自定义词典统统都是规则。'
print(tokenizer(text))

trie = Trie()
trie.update({'自定义': 'custom', '词典': 'dict', '聪明人': 'smart'})


def split_sents(text: str, trie: Trie):
    words = trie.parse_longest(text)
    sents = []
    pre_start = 0
    offsets = []
    for word, value, start, end in words:
        if pre_start != start:
            sents.append(text[pre_start: start])
            offsets.append(pre_start)
        pre_start = end
    if pre_start != len(text):
        sents.append(text[pre_start:])
        offsets.append(pre_start)
    return sents, offsets, words
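# Shape of split_sents' return value for the demo text and trie above
# (the exact spans and offsets depend on the input):
sents, offsets, words = split_sents(text, trie)
print(sents)    # text spans not covered by the custom dictionary
print(offsets)  # start offset of each span in the original text
print(words)    # [(word, value, start, end), ...] for every dictionary hit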
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 21:25
from hanlp.common.trie import Trie

import hanlp

tokenizer = hanlp.load('LARGE_ALBERT_BASE')
text = 'NLP统计模型没有加规则,聪明人知道自己加。英文、数字、自定义词典统统都是规则。'
print(tokenizer(text))

trie = Trie()
trie.update({'自定义词典': 'custom_dict', '聪明人': 'smart'})


def split_sents(text: str, trie: Trie):
    words = trie.parse_longest(text)
    sents = []
    pre_start = 0
    offsets = []
    for word, value, start, end in words:
        if pre_start != start:
            sents.append(text[pre_start:start])
            offsets.append(pre_start)
        pre_start = end
    if pre_start != len(text):
        sents.append(text[pre_start:])
        offsets.append(pre_start)
    return sents, offsets, words
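# tokenizer_pku() above also references a free-standing merge_parts; here is a
# sketch mirroring hanLPTokenizer.merge_parts, which interleaves the tokenized
# spans with the dictionary hits by their start offsets:
def merge_parts(parts, offsets, words):
    items = [(i, p) for (i, p) in zip(offsets, parts)]
    items += [(start, [word]) for (word, value, start, end) in words]
    return [each for x in sorted(items) for each in x[1]]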
# .append(tokenizer, output_key='tokens') \
# .append(tagger, output_key='part_of_speech_tags') \
# .append(syntactic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='syntactic_dependencies') \
# .append(semantic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='semantic_dependencies')

# Build the custom dictionary, i.e. the num_keys_list + units_list from above.
custom_dict = {}
keys_list = num_keys_list + units_list
keys_list.sort(key=lambda x: len(x), reverse=True)
for key in keys_list:
    custom_dict[key] = key

trie = Trie()
# trie.update({'自定义词典': 'custom_dict', '聪明人': 'smart'})
trie.update(custom_dict)


def split_sents(text: str, trie: Trie):
    words = trie.parse_longest(text)
    # https://github.com/hankcs/HanLP/blob/master/tests/demo/zh/demo_cws_trie.py
    # Adaptation of the official custom-dictionary demo to handle this case:
    # "千米/时" should be kept as one whole match instead of also yielding "米/时".
    keys_rm_list = []
    for i, key_i in enumerate(words):
        for j, key_j in enumerate(words):
            if (i != j) and (key_i[3] == key_j[3]) and (key_i[2] < key_j[2]):
                keys_rm_list.append((key_j[2], key_j[3]))
            elif (i != j) and (key_i[3] == key_j[3]) and (key_i[2] > key_j[2]):
                keys_rm_list.append((key_i[2], key_i[3]))
    # The original snippet breaks off above; the rest is a completion sketch:
    # drop the shorter overlapping matches, then split exactly as in the demos.
    words = [w for w in words if (w[2], w[3]) not in keys_rm_list]
    sents = []
    pre_start = 0
    offsets = []
    for word, value, start, end in words:
        if pre_start != start:
            sents.append(text[pre_start:start])
            offsets.append(pre_start)
        pre_start = end
    if pre_start != len(text):
        sents.append(text[pre_start:])
        offsets.append(pre_start)
    return sents, offsets, words
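# A minimal sketch of wiring the number/unit dictionary into a pipeline,
# following the same pattern as tokenizer_pku() above; the tokenizer variable
# (a loaded HanLP segmenter) and the sample sentence are only illustrative.
pipeline = hanlp.pipeline() \
    .append(split_sents, output_key=('parts', 'offsets', 'words'), trie=trie) \
    .append(tokenizer, input_key='parts', output_key='tokens') \
    .append(merge_parts, input_key=('tokens', 'offsets', 'words'), output_key='merged')
print(pipeline('该路段限速120千米/时。')['merged'])  # "千米/时" should come out as one token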