def peredata(content_data):
    data = content_data.tolist()
    train_label = []  # 1/0 labels marking whether each sentence pair is similar
    word_index_dic = {}  # maps every word seen in the corpus to its integer id
    # tokenization results for the first and the second sentence of each pair
    sentences1_seg_list = [[] for index in range(len(data))]
    sentences2_seg_list = [[] for index in range(len(data))]
    is_get_data_info = False  # flag set
    if is_get_data_info:
        get_data_info(data)
    qinghuaSeg = pkuseg.pkuseg()
    for i in range(len(data)):
        sen1 = data[i][0].split(',')[0]
        sen2 = data[i][0].split(',')[1]
        train_label.append(int(data[i][0].split(',')[2]))
        seg_content_data1 = qinghuaSeg.cut(sen1)
        seg_content_data2 = qinghuaSeg.cut(sen2)
        # store the tokenization results in the corresponding lists
        sentences1_seg_list[i] = seg_content_data1
        sentences2_seg_list[i] = seg_content_data2
        # assign ids to new words and record the word-to-id mapping in word_index_dic
        for word1 in seg_content_data1:
            if word1 not in word_index_dic:
                word_index_dic[word1] = len(word_index_dic) + 1
        for word2 in seg_content_data2:
            if word2 not in word_index_dic:
                word_index_dic[word2] = len(word_index_dic) + 1
        if i % 1000 == 0:
            print('progress:', i / len(data))
    if not os.path.exists('word_index_dict.json'):
        with open('word_index_dict.json', 'w', encoding='utf-8') as f:
            json.dump(word_index_dic, f, ensure_ascii=False)
    # save label info
    if not os.path.exists("train_label.npy"):
        np.save("train_label.npy", np.array(train_label))
    print('tokenization finished with %d words; the words and their ids are '
          'saved in word_index_dict.json' % len(word_index_dic))  # 6872
    print(len(train_label))
    print(train_label[0])
    sentences1_data = []
    sentences2_data = []
    for ssl1 in range(len(sentences1_seg_list)):
        word_vec_list1 = []
        for word in sentences1_seg_list[ssl1]:
            word_vec_list1.append(word_index_dic[word])
        sentences1_data.append(word_vec_list1)
    for ssl2 in range(len(sentences2_seg_list)):
        word_vec_list2 = []
        # the second sentences must be read from sentences2_seg_list
        for word in sentences2_seg_list[ssl2]:
            word_vec_list2.append(word_index_dic[word])
        sentences2_data.append(word_vec_list2)
    # check for and save under the same file names
    if not os.path.exists("train_data1.npy"):
        np.save("train_data1.npy", sentences1_data)
    if not os.path.exists("train_data2.npy"):
        np.save("train_data2.npy", sentences2_data)
    return sentences1_data, sentences2_data, train_label
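# A minimal sketch (not part of the original code) of how a new sentence pair
# could be encoded with the vocabulary that peredata() writes to
# word_index_dict.json; encode_pair is a hypothetical helper, and unknown
# words fall back to id 0 as in the test-data path below.
import json
import pkuseg

def encode_pair(sen1, sen2, vocab_path='word_index_dict.json'):
    with open(vocab_path, 'r', encoding='utf-8') as f:
        word_index_dic = json.load(f)
    seg = pkuseg.pkuseg()
    ids1 = [word_index_dic.get(w, 0) for w in seg.cut(sen1)]
    ids2 = [word_index_dic.get(w, 0) for w in seg.cut(sen2)]
    return ids1, ids2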
def word_cut(self):
    seg = pkuseg.pkuseg()
    word_set = set()
    for text in self.texts:
        for word in seg.cut(text):
            word_set.add(word)  # a set ignores duplicates
    self.word_set = word_set
def __init__(self, ner_dict_path, stopword_file_path):
    self.ner_dict_path = ner_dict_path
    self.stopwords = set()
    self.pku = pkuseg.pkuseg()
    with open(stopword_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            self.stopwords.add(line.strip())
    # self.ner = ner()
    jieba.load_userdict(ner_dict_path)
def __cut_corpus(self):
    if self.seg == 'jieba' or self.seg is None:
        jieba.load_userdict(self.user_word_path)
    elif self.seg == 'pkuseg':
        self.myseg = pkuseg.pkuseg(user_dict=self.my_word_list)
    corpus_cut = []
    for s in self.corpus:
        corpus_cut.append(self.__cut_str(s))
    return corpus_cut
def pere_test_data(content_data):
    data = content_data.tolist()
    test_label = []  # 1/0 labels marking whether each sentence pair is similar
    word_index_dic = {}  # maps every word seen in the corpus to its integer id
    # tokenization results for the first and the second sentence of each pair
    sentences1_seg_list = [[] for index in range(len(data))]
    sentences2_seg_list = [[] for index in range(len(data))]
    is_get_data_info = False  # flag SET
    if is_get_data_info:
        get_data_info(data)
    # for the test data, reuse the word_index_dict built from the training data
    with open('word_index_dict.json', 'r', encoding='utf-8') as f:
        word_index_dic = json.load(f)
    qinghuaSeg = pkuseg.pkuseg()
    for i in range(len(data)):
        sen1 = data[i][0].split(',')[0]
        sen2 = data[i][0].split(',')[1]
        test_label.append(int(data[i][0].split(',')[2]))
        seg_content_data1 = qinghuaSeg.cut(sen1)
        seg_content_data2 = qinghuaSeg.cut(sen2)
        # store the tokenization results in the corresponding lists
        sentences1_seg_list[i] = seg_content_data1
        sentences2_seg_list[i] = seg_content_data2
        if i % 1000 == 0:
            print('progress:', i / len(data))
    # save label info
    if not os.path.exists("test_label.npy"):
        np.save("test_label.npy", np.array(test_label))
    print(len(test_label))
    print(test_label[0])
    sentences1_data = []
    sentences2_data = []
    for ssl1 in range(len(sentences1_seg_list)):
        word_vec_list1 = []
        for word in sentences1_seg_list[ssl1]:
            if word in word_index_dic:
                word_vec_list1.append(word_index_dic[word])
            else:
                word_vec_list1.append(0)
        sentences1_data.append(word_vec_list1)
    for ssl2 in range(len(sentences2_seg_list)):
        word_vec_list2 = []
        # the second sentences must be read from sentences2_seg_list
        for word in sentences2_seg_list[ssl2]:
            if word in word_index_dic:
                word_vec_list2.append(word_index_dic[word])
            else:
                word_vec_list2.append(0)
        sentences2_data.append(word_vec_list2)
    if not os.path.exists("test_data1.npy"):
        np.save("test_data1.npy", sentences1_data)
    if not os.path.exists("test_data2.npy"):
        np.save("test_data2.npy", sentences2_data)
    return sentences1_data, sentences2_data, test_label
def __init__(self, spo_files, pku_model_name='default', predicate=False):
    self.predicate = predicate
    self.spo_file_paths = [config.KGS.get(f, f) for f in spo_files]
    self.lookup_table = self._create_lookup_table()
    self.segment_vocab = list(self.lookup_table.keys()) + config.NEVER_SPLIT_TAG
    self.tokenizer = pkuseg.pkuseg(model_name=pku_model_name,
                                   postag=False,
                                   user_dict=self.segment_vocab)
    self.special_tags = set(config.NEVER_SPLIT_TAG)
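# pkuseg's user_dict is typically a file path with one word per line; the
# constructor above passes an in-memory vocabulary list instead, which some
# pkuseg versions accept. A version-agnostic sketch (build_tokenizer is a
# hypothetical helper) writes the vocabulary to a temporary file first:
import tempfile
import pkuseg

def build_tokenizer(words, model_name='default'):
    tmp = tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False,
                                      encoding='utf-8')
    tmp.write('\n'.join(words))  # one word per line
    tmp.close()
    return pkuseg.pkuseg(model_name=model_name, postag=False,
                         user_dict=tmp.name)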
def load_pkuseg_model(path):
    # nested helper: `self` is captured from the enclosing method's scope
    try:
        import pkuseg
    except ImportError:
        if self.use_pkuseg:
            raise ImportError(
                "pkuseg not installed. To use this model, " +
                _PKUSEG_INSTALL_MSG)
    if path.exists():
        self.pkuseg_seg = pkuseg.pkuseg(path)
def __init__(
        self,
        opList,  # list of [original name, possible names] entries
        user_dict=None):  # user-defined dictionary
    # use the default dictionary unless a user dictionary is supplied
    if user_dict is None:
        self.seg = pkuseg.pkuseg(postag=False)
    else:
        self.seg = pkuseg.pkuseg(user_dict=user_dict, postag=False)
    self.instructions = []
    corpus = []
    for row in opList:
        question = row[0]
        corpusRow = []
        for word in self.seg.cut(question):
            corpusRow.append(word)
        corpus.append(corpusRow)
        self.instructions.append(question)
    self.bm25Model = bm25.BM25(corpus)
    self.corpus = corpus
def zh_word_seg_by_pku(list_of_sentences, user_dict=[]):
    """
    Tokenize Chinese words by pkuseg

    :params
        list_of_sentences (list): [ sentence_a (str), sentence_b (str), ... ]
        user_dict (list): customized dictionary, e.g., [ '你好', '朋友', ... ]
    """
    user_dict = user_dict if user_dict else 'default'
    # user_dict must be passed by keyword: pkuseg.pkuseg's first positional
    # parameter is model_name, not user_dict
    seg = pkuseg.pkuseg(user_dict=user_dict)
    return list(map(lambda x: seg.cut(x), list_of_sentences))
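# A small usage sketch for zh_word_seg_by_pku (sentences invented for
# illustration); the result is one token list per input sentence.
if __name__ == '__main__':
    for tokens in zh_word_seg_by_pku(['我爱北京天安门', '今天天气不错'],
                                     user_dict=['天安门']):
        print(tokens)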
def _prepare(self):
    # part-of-speech preprocessing
    # tag set reference: https://github.com/lancopku/pkuseg-python/blob/master/tags.txt
    # jio.util.pkuseg_postag_loader()
    self.pos_name = set(pkuseg_postag_loader().keys())
    self.pos_exception = set(['u', 'p', 'c', 'y', 'e', 'o', 'w'])
    self.loose_pos_name = self.pos_name - self.pos_exception
    self.strict_pos_name = ['a', 'n', 'j', 'nr', 'ns', 'nt', 'nx', 'nz',
                            'ad', 'an', 'vn', 'vd', 'vx']

    # rules for discarding redundant phrases
    # discard a phrase if it contains any of these characters
    # (the hyphen is escaped so it is matched literally, not as a range)
    self.redundent_strict_pattern = re.compile(r'[*|`;:丨\-<>]')
    # discard a phrase if it consists entirely of these characters
    self.redundent_loose_pattern = re.compile(r'[/\d\.\-:=a-z+,%]+')
    self.extra_date_ptn = re.compile(r'\d{1,2}[月日]')  # e.g. "3月", "15日"

    # load idf values and use the median as the default for OOV words
    self.idf_dict = idf_loader()
    self.median_idf = sorted(self.idf_dict.values())[len(self.idf_dict) // 2]
    self.seg = pkuseg.pkuseg(postag=True)  # PKU segmenter

    # phrase-length weight dict: keeps most phrases between 2 and 6 words.
    # The lengths of manually extracted key phrases follow a Poisson-like
    # distribution, while the idf/lda-probability results follow another;
    # these weights adjust the extracted length distribution toward the
    # manual one.
    self.phrases_length_control_dict = {
        1: 1, 2: 5.6, 3: 1.1, 4: 2.0, 5: 0.7, 6: 0.9, 7: 0.48,
        8: 0.43, 9: 0.24, 10: 0.15, 11: 0.07, 12: 0.05}
    self.phrases_length_control_none = 0.01  # used when the length exceeds 7

    # phrase part-of-speech combination weights
    with open(os.path.join(DIR_PATH, 'pos_combine_weights.json'),
              'r', encoding='utf8') as f:
        self.pos_combine_weights_dict = json.load(f)

    # load the stop-word list
    self.stop_words = stopwords_loader()

    # load the lda model parameters
    self._lda_prob_matrix()
def __init__(self, pretrain_path, max_length, hidden_size, att_dim,
             is_zh=False):
    super(AttBiLSTMEncoder, self).__init__()
    self.is_zh = is_zh
    if not self.is_zh:
        # English
        self.token2idx = json.load(
            open(os.path.join(pretrain_path, "att-bi-lstm",
                              "token2idx.json"), "r"))
        word_vec = torch.from_numpy(
            np.load(os.path.join(pretrain_path, "att-bi-lstm",
                                 "word_vec.npy")))
    else:
        # Chinese
        self.token2idx = json.load(
            open(os.path.join(pretrain_path, "att-bi-lstm-zh",
                              "token2idx.json"), "r"))
        word_vec = torch.from_numpy(
            np.load(os.path.join(pretrain_path, "att-bi-lstm-zh",
                                 "word_vec.npy")))
        import pkuseg
        self.seg = pkuseg.pkuseg()
    self.word_count, self.word_vec_dim = word_vec.shape[0], word_vec.shape[1]
    # Unknown, Blank
    self.unk_idx, self.blk_idx = self.word_count, self.word_count + 1
    unk = torch.randn(1, self.word_vec_dim,
                      dtype=torch.double) / math.sqrt(self.word_vec_dim)
    blk = torch.zeros(1, self.word_vec_dim, dtype=torch.double)
    # Embedding layer
    self.word_embedding = nn.Embedding(self.word_count + 2,
                                       self.word_vec_dim,
                                       padding_idx=self.blk_idx)
    self.word_embedding.weight.data.copy_(torch.cat((word_vec, unk, blk), 0))
    # Self-Att Bi-LSTM layer
    self.bilstm = nn.LSTM(self.word_vec_dim, hidden_size, batch_first=True,
                          bidirectional=True)
    self.att1 = nn.Linear(2 * hidden_size, att_dim, bias=False)
    self.att2 = nn.Linear(att_dim, 1, bias=False)
    self.max_length = max_length
def load_file(path):
    en = []
    cn = []
    seg = pkuseg.pkuseg()
    with open(path, 'r') as f:
        for line in f.readlines():
            line = line.strip().split('\t')
            en.append(["BOS"] + nltk.word_tokenize(line[0].lower()) + ["EOS"])
            cn.append(["BOS"] + seg.cut(line[1]) + ["EOS"])
    return en, cn
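# A sketch of the tab-separated input that load_file() expects (the file name
# and sentence pair are invented; nltk's 'punkt' data must be downloaded for
# word_tokenize to work):
with open('toy_pairs.txt', 'w', encoding='utf-8') as f:
    f.write('Hello world!\t你好,世界!\n')
en, cn = load_file('toy_pairs.txt')
print(en[0])  # ['BOS', 'hello', 'world', '!', 'EOS']
print(cn[0])  # BOS/EOS-wrapped pkuseg tokens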
def getcityname(text):
    seg = pkuseg.pkuseg()  # load the model with the default configuration
    srcArray = seg.cut(text)  # tokenize
    print(srcArray)
    for val in cityname:
        if val in srcArray:
            print('found ' + val)
            return val
    print('not found')
    return None
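# getcityname reads a module-level `cityname` iterable that is not shown
# above; a toy sketch with an invented city list:
cityname = ['北京', '上海', '广州']
print(getcityname('我明天要去上海出差'))  # '上海', if pkuseg emits it as one token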
def __init__(self, config):
    self.config = config
    self.seg = pkuseg.pkuseg() if self.config.use_word else None
    # load the vocab if it exists, otherwise build it
    vocab_path = config.vocab_path
    print('Loading vocab from', vocab_path, '...')
    self.vocab = (pkl.load(open(vocab_path, 'rb'))
                  if os.path.exists(vocab_path) else self.build_vocab())
    print('Complete! Vocab size: {}'.format(len(self.vocab)))
def calcTop50():
    npath = os.getcwd()
    filename = npath + '\\ntlk1\\area1.txt'
    txt = open(filename, encoding="utf-8").read()
    seg = pkuseg.pkuseg()
    # ctxt = jieba.lcut(txt)
    ctxt = seg.cut(txt)
    items = getTop(ctxt)
    # printTop(items, 50)
    ShowImgHotWord(ctxt, 50)
def __init__(self, config, norm=True, postag=True, **kwargs):
    """
    @param: user_dict
    """
    self.config = config
    self.norm = norm
    self.postag = postag
    self.en_handler = spacy.load("en_core_web_sm")
    self.seghandler = pkuseg.pkuseg(
        user_dict=config['tokenizer']['user_dict'],
        postag=postag, **kwargs)
    self.init()
def make_articles_from_contents(article_names: List[str],
                                article_contents: List[str]):
    seg = pkuseg.pkuseg()
    cutter = lambda sentence: seg.cut(sentence)
    articles = list()
    for i in tqdm(range(len(article_names))):
        articles.append(Article(name=article_names[i],
                                terms=cutter(article_contents[i])))
    return articles
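# Usage sketch (article name and content invented; assumes Article exposes
# the name and terms it was constructed with):
arts = make_articles_from_contents(['示例文章'], ['今天天气不错,适合出门。'])
print(arts[0].name, arts[0].terms)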
def test_segment_sentence_list():
    # general behaviour
    seg_tool = pkuseg.pkuseg(postag=True)
    seg = WordSegmentation(is_lower=False, is_use_stop_words=False,
                           is_use_word_tags_filter=False)
    expected = []
    for sentence in SENTENCE_LIST:
        sentence_cutted = seg_tool.cut(sentence)
        sentence_cutted = [item[0] for item in sentence_cutted]
        expected.append(sentence_cutted)
    assert seg.segment_sentence_list(SENTENCE_LIST) == expected

    # stop-word filtering
    stop_words_vocab = ["根据", "了", ")", "("]
    seg = WordSegmentation(is_lower=False, is_use_stop_words=True,
                           is_use_word_tags_filter=False,
                           stop_words_vocab=stop_words_vocab)
    expected = []
    for sentence in SENTENCE_LIST:
        sentence_cutted = seg_tool.cut(sentence)
        sentence_cutted = [item[0] for item in sentence_cutted
                           if item[0] not in stop_words_vocab]
        expected.append(sentence_cutted)
    assert seg.segment_sentence_list(SENTENCE_LIST) == expected

    # POS-based filtering
    allow_word_tags = ["n", "v"]
    seg = WordSegmentation(is_lower=False, is_use_stop_words=False,
                           is_use_word_tags_filter=True,
                           allow_word_tags=allow_word_tags)
    expected = []
    for sentence in SENTENCE_LIST:
        sentence_cutted = seg_tool.cut(sentence)
        sentence_cutted = [item[0] for item in sentence_cutted
                           if item[1] in allow_word_tags]
        expected.append(sentence_cutted)
    assert seg.segment_sentence_list(SENTENCE_LIST) == expected

    # lower-casing
    seg = WordSegmentation(is_lower=True)
    expected = []
    for sentence in SENTENCE_LIST:
        sentence_cutted = seg_tool.cut(sentence)
        sentence_cutted = [item[0].lower() for item in sentence_cutted]
        expected.append(sentence_cutted)
    assert seg.segment_sentence_list(SENTENCE_LIST) == expected
def pkuseg_pos(string):
    print('PkuSeg tokenization and POS tagging:')
    num = len(string)
    print(num)
    # load the model once, outside the timing loop (a user dict may be given)
    seg = pkuseg.pkuseg(postag=True)
    start_time = datetime.now()
    for s in string:
        pos_list = seg.cut(s)
    all_time = (datetime.now() - start_time).total_seconds()
    avg = all_time / num
    print('pos_tag time used: {} sec'.format(avg))
    print('\n\n')
def from_bytes(self, data, **kwargs):
    pkuseg_features_b = b""
    pkuseg_weights_b = b""
    pkuseg_processors_data = None

    def deserialize_pkuseg_features(b):
        nonlocal pkuseg_features_b
        pkuseg_features_b = b

    def deserialize_pkuseg_weights(b):
        nonlocal pkuseg_weights_b
        pkuseg_weights_b = b

    def deserialize_pkuseg_processors(b):
        nonlocal pkuseg_processors_data
        pkuseg_processors_data = srsly.msgpack_loads(b)

    deserializers = OrderedDict((
        ("cfg", lambda b: self._set_config(srsly.json_loads(b))),
        ("pkuseg_features", deserialize_pkuseg_features),
        ("pkuseg_weights", deserialize_pkuseg_weights),
        ("pkuseg_processors", deserialize_pkuseg_processors),
    ))
    util.from_bytes(data, deserializers, [])

    if pkuseg_features_b and pkuseg_weights_b:
        with tempfile.TemporaryDirectory() as tempdir:
            tempdir = Path(tempdir)
            with open(tempdir / "features.pkl", "wb") as fileh:
                fileh.write(pkuseg_features_b)
            with open(tempdir / "weights.npz", "wb") as fileh:
                fileh.write(pkuseg_weights_b)
            try:
                import pkuseg
            except ImportError:
                raise ImportError(
                    "pkuseg not installed. To use this model, " +
                    _PKUSEG_INSTALL_MSG)
            self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
        if pkuseg_processors_data:
            (
                user_dict,
                do_process,
                common_words,
                other_words,
            ) = pkuseg_processors_data
            self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
            self.pkuseg_seg.postprocesser.do_process = do_process
            self.pkuseg_seg.postprocesser.common_words = set(common_words)
            self.pkuseg_seg.postprocesser.other_words = set(other_words)
    return self
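# Round-trip sketch for the serialization pair this method belongs to
# (spaCy's Chinese tokenizer; `nlp` and `nlp2` are assumed pipelines):
# data = nlp.tokenizer.to_bytes()
# nlp2.tokenizer.from_bytes(data)  # restores pkuseg weights and processors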
def __init__(self, ner_dict_path, stopword_file_path):
    self.ner_dict_path = ner_dict_path
    self.stopwords = set()
    self.pku = pkuseg.pkuseg()
    with open(stopword_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            self.stopwords.add(line.strip())
    # self.ner = ner()
    jieba.load_userdict(ner_dict_path)
    jieba.analyse.set_stop_words(stopword_file_path)
    self.not_word = ('[\n\t,,。`……·\u200b!!??“”""'
                     '~::;;{}+-——=、/.()(|)%^&*@#$ <>《》【】[]\\]')
    self.key_word_pos = ('ns', 'n', 'vn', 'v', 'l', 'j', 'nr', 'nrt', 'nt',
                         'nz', 'nrfg', 'an', 's')
def tokenize_words(doc, filter_exist=None):
    # input: doc is a long string
    #        filter_exist: "jieba" or "pkuseg"
    # output: dict_doc = {word: count, word: count, ...}
    #         cws_model: the word-splitting model (jieba or pkuseg)
    if filter_exist == "pkuseg":
        print("Use pkuseg tokenizer ...")
        cws_model = pkuseg.pkuseg()
    else:
        print("Use jieba tokenizer ...")
        cws_model = jieba
    dict_doc = dict(Counter(list(cws_model.cut(doc))))
    return dict_doc, cws_model
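# Usage sketch for tokenize_words (document text invented):
counts, model = tokenize_words('今天天气不错,适合出门散步。',
                               filter_exist='pkuseg')
print(sorted(counts.items(), key=lambda kv: -kv[1]))  # most frequent first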
def __init__(self, print_seg=False):
    self.seg = pkuseg.pkuseg(postag=True)
    self.print_seg = print_seg
    with open(os.path.join(os.path.dirname(__file__), "xdic.pkl"), "rb") as f:
        self.dic = pickle.load(f)
    with open(os.path.join(os.path.dirname(__file__), "postag"), "r") as f:
        self.postag = dict((k.split(" ")[0], k.split(" ")[2])
                           for k in f.read().splitlines())
    with open(os.path.join(os.path.dirname(__file__), "t2s"), "r") as f:
        self.t2s = dict((k.split("\t")[0], k.split("\t")[1])
                        for k in f.read().splitlines())
def pkuseg_cut(self):
    model_path = r'D:\Anaconda\python\envs\spider\Lib\site-packages\pkuseg\models\default\tourism'
    seg = pkuseg.pkuseg(model_name=model_path)
    lines = []
    df = pd.read_excel(self.excel_path)
    sentences = df['sentences']
    for sentence in sentences:
        if pd.notna(sentence):  # identity checks against np.nan are unreliable
            # print(sentence)
            cut = seg.cut(sentence)
            lines.append(' '.join(cut))
    # print(lines)
    self.save_to_excel(lines)
def get_pkuseg_result(sentences):
    """
    Ref to: https://github.com/lancopku/pkuseg-python
    Install by: `pip3 install pkuseg`
    Note that pkuseg-python only supports python3
    """
    import pkuseg
    seg = pkuseg.pkuseg()
    preds = []
    for sentence in sentences:
        sent_seg = " ".join(seg.cut(sentence))
        sent_seg = to_unicode(sent_seg)
        preds.append(sent_seg)
    return preds
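# Usage sketch (sentence invented; to_unicode must be importable in this
# module for the call to succeed):
preds = get_pkuseg_result(['南京市长江大桥'])
print(preds[0])  # space-joined tokens, e.g. '南京市 长江大桥'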
def encode_test_data1(data):
    '''
    Builds 100 test examples for the bert/lstm/pooling/mlp models.
    :param data:
    :return: test_data1.npy test_data2.npy test_label.npy
    '''
    test_label = []  # 1/0 labels marking whether each sentence pair is similar
    # tokenization results for the first and the second sentence of each pair
    sentences1_seg_list = [[] for index in range(len(data))]
    sentences2_seg_list = [[] for index in range(len(data))]
    word_index_dic = {}
    qinghuaSeg = pkuseg.pkuseg()
    for i in range(len(data)):
        sen1 = data[i][0].split(',')[0]
        sen2 = data[i][0].split(',')[1]
        test_label.append(int(data[i][0].split(',')[2]))
        seg_content_data1 = qinghuaSeg.cut(sen1)
        seg_content_data2 = qinghuaSeg.cut(sen2)
        # store the tokenization results in the corresponding lists
        sentences1_seg_list[i] = seg_content_data1
        sentences2_seg_list[i] = seg_content_data2
        # print('progress:', i / 100)
    # save label info
    if not os.path.exists("test_label.npy"):
        np.save("test_label.npy", np.array(test_label))
    with open('word_index_dict.json', 'r', encoding='utf-8') as f:
        word_index_dic = json.load(f)
    sentences1_data = []
    sentences2_data = []
    for ssl1 in range(len(sentences1_seg_list)):
        word_vec_list1 = []
        for word in sentences1_seg_list[ssl1]:
            if word in word_index_dic:
                word_vec_list1.append(word_index_dic[word])
            else:
                word_vec_list1.append(0)
        sentences1_data.append(word_vec_list1)
    for ssl2 in range(len(sentences2_seg_list)):
        word_vec_list2 = []
        # the second sentences must be read from sentences2_seg_list
        for word in sentences2_seg_list[ssl2]:
            if word in word_index_dic:
                word_vec_list2.append(word_index_dic[word])
            else:
                word_vec_list2.append(0)
        sentences2_data.append(word_vec_list2)
    if not os.path.exists("test_data1.npy"):
        np.save("test_data1.npy", sentences1_data)
    if not os.path.exists("test_data2.npy"):
        np.save("test_data2.npy", sentences2_data)
def init_word_tokenizers(main, lang, word_tokenizer='default'):
    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'
    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    # NLTK
    if word_tokenizer.startswith('nltk_'):
        if word_tokenizer == 'nltk_nist':
            if 'nltk_nist_tokenizer' not in main.__dict__:
                main.nltk_nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()
        elif word_tokenizer == 'nltk_nltk':
            if 'nltk_nltk_tokenizer' not in main.__dict__:
                main.nltk_nltk_tokenizer = nltk.NLTKWordTokenizer()
        elif word_tokenizer == 'nltk_penn_treebank':
            if 'nltk_treebank_tokenizer' not in main.__dict__:
                main.nltk_treebank_tokenizer = nltk.TreebankWordTokenizer()
        elif word_tokenizer == 'nltk_tok_tok':
            if 'nltk_toktok_tokenizer' not in main.__dict__:
                main.nltk_toktok_tokenizer = nltk.ToktokTokenizer()
        elif word_tokenizer == 'nltk_twitter':
            if 'nltk_tweet_tokenizer' not in main.__dict__:
                main.nltk_tweet_tokenizer = nltk.TweetTokenizer()
    # Sacremoses
    elif word_tokenizer == 'sacremoses_moses':
        lang_sacremoses = wl_conversion.remove_lang_code_suffixes(
            main, wl_conversion.to_iso_639_1(main, lang))
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)
        if f'sacremoses_moses_tokenizer_{lang}' not in main.__dict__:
            main.__dict__[f'sacremoses_moses_tokenizer_{lang}'] = \
                sacremoses.MosesTokenizer(lang=lang_sacremoses)
    # spaCy
    elif word_tokenizer.startswith('spacy_'):
        init_spacy_models(main, lang)
    # Chinese
    elif word_tokenizer == 'pkuseg_zho':
        if 'pkuseg_word_tokenizer' not in main.__dict__:
            main.pkuseg_word_tokenizer = pkuseg.pkuseg()
    # Chinese & Japanese
    elif word_tokenizer.startswith('wordless_'):
        init_spacy_models(main, 'eng_us')
        init_spacy_models(main, 'other')
    # Japanese
    elif word_tokenizer.startswith('sudachipy_jpn'):
        if 'sudachipy_word_tokenizer' not in main.__dict__:
            main.sudachipy_word_tokenizer = sudachipy.Dictionary().create()
    # Tibetan
    elif word_tokenizer == 'botok_bod':
        if 'botok_word_tokenizer' not in main.__dict__:
            main.botok_word_tokenizer = botok.WordTokenizer()
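# Sketch of how the pkuseg tokenizer cached above might be used afterwards
# (`main` and the language code are placeholders, not from the original):
# init_word_tokenizers(main, lang='zho_cn', word_tokenizer='pkuseg_zho')
# tokens = main.pkuseg_word_tokenizer.cut('今天天气不错')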
def read_test_corpus(file_path):
    """Read the corpus.

    :param file_path:
    :return:
    """
    src_data = []
    seg = pkuseg.pkuseg()
    with codecs.open(file_path, 'r', encoding='utf-8',
                     errors='ignore') as fin:
        for line in tqdm(fin.readlines(), desc='reading corpus'):
            line = line.strip()
            if line:  # skip empty lines; readlines() never yields None
                src_data.append(seg.cut(line))
    return src_data
def cut_news():
    mp_news_word_lst = {}
    # the domain-specific model is downloaded automatically
    seg = pkuseg.pkuseg(model_name='news')
    for news_id, (news_title, news_content,
                  news_time) in tqdm(mp_news_txt.items()):
        text = news_title + " " + news_content  # use both title and body
        # text = news_title  # title only
        word_list = list(
            filter(lambda x: len(x) > 0 and x not in stop_words,
                   map(my_utils.clean_word, seg.cut(text))))
        mp_news_word_lst[news_id] = word_list
    my_utils.write_pkl(mp_news_word_lst,
                       config['DEFAULT']['path_all_news_word_list'])
def segment_file(src_file, tgt_file):
    seg = pkuseg.pkuseg()
    with open(src_file) as src:
        all_json_data = [json.loads(line) for line in src]
    with multiprocessing.Pool(processes=8) as pool:
        segmented = list(
            pool.map(partial(segment_single_item, seg),
                     all_json_data, chunksize=1024))
    with open(tgt_file, 'w') as tgt:
        for s in segmented:
            tgt.write(json.dumps(s, ensure_ascii=False) + '\n')
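# segment_single_item is not shown above; a plausible shape (an assumption,
# not the original) segments one record's text field:
def segment_single_item(seg, item):
    item['text'] = ' '.join(seg.cut(item['text']))  # hypothetical field name
    return item
# Note: pool.map pickles its arguments, so the pkuseg model must be picklable
# for this pattern; if it is not, construct the segmenter inside each worker.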
import jieba_fast as jieba
import pkuseg

seg = pkuseg.pkuseg()


class docA(object):
    def __init__(self, title: str, content: str, *args, author: str,
                 year: int, category: str, tags: str,
                 stem_mode: str = 'jieba', **kw):
        self.title = title
        self.content = content
        self.author = author
        self.year = year
        self.category = category
        self.tags = tags
        self.stem = self.stemInit(stem_mode)
        # self.misc = misc

    def stemInit(self, stem_mode):
        if stem_mode == 'jieba':
            return ' '.join(jieba.cut_for_search(self.content))
        elif stem_mode == 'pkuseg':
            return ' '.join(seg.cut(self.content))