def cut_word(self, sentences):
    """Tokenize a sentence or a list of sentences."""
    words = []
    if isinstance(sentences, list):
        if get_chinese_ratio(sentences) > 0.5:
            # Mostly Chinese: segment each sentence with jieba in exact mode.
            for sentence in sentences:
                words.extend(
                    jieba.Tokenizer.lcut(tokenizer, sentence, cut_all=False))
        else:
            # Otherwise split each sentence on spaces.
            for sentence in sentences:
                words.extend(re.split(" ", sentence))
    else:
        if get_chinese_ratio(sentences) > 0.5:
            words = jieba.Tokenizer.lcut(tokenizer, sentences, cut_all=False)
        else:
            words = re.split(" ", sentences)
    # Strip separator characters and drop tokens that become empty.
    pattern = "[ |&| |、]"
    words = [
        re.sub(pattern, "", word) for word in words
        if re.sub(pattern, "", word) != ""
    ]
    return words
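# A minimal, self-contained sketch of the branching idea in cut_word: jieba for
# mostly-Chinese text, whitespace splitting otherwise. `_chinese_ratio` below is
# a simplified stand-in for this module's get_chinese_ratio helper and is an
# assumption, not the project's implementation.
import re as _re

import jieba as _jieba


def _chinese_ratio(text):
    # Share of CJK characters in the string (stand-in for get_chinese_ratio).
    return sum('\u4e00' <= ch <= '\u9fa5' for ch in text) / max(len(text), 1)


def _demo_cut(sentence):
    if _chinese_ratio(sentence) > 0.5:
        return _jieba.lcut(sentence, cut_all=False)  # exact-mode segmentation
    return [w for w in _re.split(" ", sentence) if w.strip()]

# _demo_cut("我爱自然语言处理")          # typically ['我', '爱', '自然语言', '处理']
# _demo_cut("natural language search")  # ['natural', 'language', 'search']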
def analysis(self, sentences, word, num, pattern=None):
    if pattern is None:
        pattern = re.compile("[ ](" + '|'.join([word]) + ")[ ]")
    else:
        pattern = re.compile(f"{pattern.pattern[:4]}{word}|{pattern.pattern[4:]}")
    is_chinese = get_chinese_ratio(word) > 0.5
    all_words = []
    for sentence in sentences:
        sentence_list = sentence[3:]
        for sentence in sentence_list:
            if is_chinese:
                # jieba.add_word(word)
                # s_cut = jieba.lcut(sentence)
                # jieba.del_word(word)
                if word not in sentence:
                    # str.index below would raise ValueError otherwise.
                    continue
                s_cut = []
                # Cut the text around the target word so jieba cannot split it apart.
                start = sentence.index(word)
                end = start + len(word)
                prefix_s = sentence[:start]
                suffix_s = sentence[end:]
                s_cut.extend(jieba.lcut(prefix_s))
                s_cut.append(word)
                s_cut.extend(jieba.lcut(suffix_s))
            else:
                s_cut = []
                padded = f" {sentence} "
                phrase_iterator = pattern.finditer(padded)
                last_end = 0
                for phrase_sre in phrase_iterator:
                    start = phrase_sre.start()
                    end = phrase_sre.end()
                    # Slice the same padded string that was searched so offsets line up.
                    phrase_prefix_str = padded[last_end:start]
                    phrase_prefix = [
                        w for w in phrase_prefix_str.split(" ") if w.strip() != ""
                    ]
                    s_cut.extend(phrase_prefix)
                    phrase = phrase_sre.group().strip()
                    s_cut.append(phrase)
                    last_end = end
                phrase_suffix_str = padded[last_end:]
                phrase_suffix = [
                    w for w in phrase_suffix_str.split(" ") if w.strip() != ""
                ]
                s_cut.extend(phrase_suffix)
            if word not in s_cut:
                continue
            s_cut = [s for s in s_cut if s.strip() != ""]
            index = s_cut.index(word)
            # Keep `num` tokens on either side of the target word.
            s = max(index - num, 0)
            e = index + num
            words = s_cut[s:e]
            words.remove(word)
            all_words.extend(words)
    # Count the co-occurring tokens, drop empties, and sort by ascending count.
    dict_word_count = Counter(all_words)
    dict_word_count = {k: v for k, v in dict_word_count.items() if k != ""}
    dic_word_count_sort = sorted(
        dict_word_count.items(), key=lambda d: d[1], reverse=False)
    return dic_word_count_sort
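# Sketch of the context-window counting that analysis() performs once a sentence
# is tokenized: take the tokens around the target word, drop the word itself,
# and tally the neighbours. Names here are illustrative, not from the project.
from collections import Counter as _Counter


def _window_counts(token_lists, word, num):
    neighbours = []
    for tokens in token_lists:
        if word not in tokens:
            continue
        i = tokens.index(word)
        window = tokens[max(i - num, 0):i + num]  # same slice bounds as analysis()
        window.remove(word)
        neighbours.extend(window)
    # Ascending by frequency, matching the reverse=False sort above.
    return sorted(_Counter(neighbours).items(), key=lambda d: d[1])

# _window_counts([["全文", "检索", "引擎", "索引"]], "检索", 2)
# -> [('全文', 1), ('引擎', 1)]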
def analyze_word(texts):
    # Text preprocessing
    # pattern = re.compile(u'―|、|\r|\t|\n|\.|-|:|;|\)|\(|\?|《|》|\[|\]|"|,|,| |。|?|;|#|“|”|%|…|.|【|】|:')  # regex of characters to strip
    # string_data = re.sub(pattern, '', text)  # remove characters matching the pattern
    string_data = pretreatment_texts(texts)
    # Tokenization
    if get_chinese_ratio(string_data) > 0.5:
        seg_list_exact = jieba.lcut(string_data, cut_all=False)  # exact-mode segmentation
    else:
        seg_list_exact = re.split(" ", string_data)
    # Word-frequency statistics
    word_counts = collections.Counter(seg_list_exact)  # count each token
    return dict(word_counts)
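# A small standalone check of the frequency step in analyze_word; the real
# function first cleans the text with pretreatment_texts (defined elsewhere)
# and segments Chinese input with jieba before counting.
def _demo_word_counts():
    import collections
    tokens = "full text search full text".split(" ")
    return dict(collections.Counter(tokens))  # {'full': 2, 'text': 2, 'search': 1}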
def add_text3(self, datas, show=True):
    """
    Process the input data.
    :param datas: list of sentences, e.g. ["sentence 1", "sentence 2"]
    :return:
    """
    def console(datas, **kwargs):
        for data in datas:
            yield data

    if show:
        console = tqdm
    for word in console(datas, desc='Reading data'):
        words = word.strip()
        if len(words) == 0:
            continue
        is_chinese = get_chinese_ratio(words) > 0.5  # is the line mainly Chinese?
        pattern = '[^\u4e00-\u9fa50-9a-zA-Z]' if is_chinese else '[^\u4e00-\u9fa50-9a-zA-Zа-яА-Я ]'
        # Split on anything outside the CJK range, letters and digits
        # (plus Cyrillic and spaces for non-Chinese text).
        for lines in re.split(pattern, words):
            if is_chinese:
                match = list(jieba.cut(lines))
            else:
                match = re.split(" ", lines)
            lens = len(match)
            self.all_words_len += lens
            for i in range(lens):
                for j in range(1, self.max_split + 1):
                    if i + j <= lens:
                        k = ''.join(match[i:i + j]) if is_chinese else ' '.join(
                            match[i:i + j])
                        if len(match[i:i + j]) < 2:
                            continue
                        if k in self.vocab:
                            w = self.vocab[k]
                        else:
                            w = [0, 0, set(), set()]
                            self.vocab[k] = w
                        w[0] += 1
                        w[1] = w[0] / self.all_words_len
                        if i != 0:
                            w[2].add(match[i - 1])
                        if i + j != lens:
                            w[3].add(match[i + j])
                    else:
                        # Stop as soon as the candidate would run past the end of this sentence.
                        break
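# Illustration of the vocab entries add_text3 builds for each candidate n-gram:
# [count, count / total tokens seen, left-neighbour set, right-neighbour set].
# The loops below mirror the i/j enumeration above for a single token list;
# names and the per-list denominator are illustrative assumptions only.
def _demo_candidates(tokens, max_split, joiner=''):
    vocab = {}
    total = len(tokens)
    for i in range(total):
        for j in range(2, max_split + 1):       # candidates of length >= 2
            if i + j > total:
                break
            key = joiner.join(tokens[i:i + j])
            entry = vocab.setdefault(key, [0, 0.0, set(), set()])
            entry[0] += 1
            entry[1] = entry[0] / total
            if i != 0:
                entry[2].add(tokens[i - 1])     # token seen to the left
            if i + j != total:
                entry[3].add(tokens[i + j])     # token seen to the right
    return vocab

# _demo_candidates(['全文', '检索', '引擎'], max_split=3)
# -> {'全文检索': [1, 1/3, set(), {'引擎'}],
#     '全文检索引擎': [1, 1/3, set(), set()],
#     '检索引擎': [1, 1/3, {'全文'}, set()]}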
def search(self, query):
    now_index_time = os.path.getmtime(self.index_path)
    if now_index_time != self.index_time:
        # Re-read the index if the index file has been modified.
        self.read_index()
    if get_chinese_ratio(query) > 0.5:
        words = jieba.Tokenizer.lcut(tokenizer, query)
    else:
        words = []
        padded = f" {query} "
        phrase_iterator = self.pattern.finditer(padded)
        last_end = 0
        for phrase in phrase_iterator:
            start = phrase.start()
            end = phrase.end()
            # Slice the same padded string that was searched so offsets line up.
            texts_prefix = padded[last_end:start]
            words_prefix = [
                w for w in texts_prefix.split(" ") if w.strip() != ""
            ]
            words.extend(words_prefix)
            word = phrase.group().strip()
            words.append(word)
            last_end = end
        texts_suffix = padded[last_end:]
        words_suffix = [
            w for w in texts_suffix.split(" ") if w.strip() != ""
        ]
        words.extend(words_suffix)
    words = set(words)
    results = {}
    for word in words:
        if word in self.index:
            results[word] = self.index[word]
        # else:
        #     results[word] = []
    return results
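# Minimal sketch of the lookup step in search(): tokenize the query, then pull
# the posting list for each token that exists in the inverted index. The index
# layout {token: [document ids]} is an assumption for illustration only.
def _demo_search(index, query_tokens):
    return {w: index[w] for w in set(query_tokens) if w in index}

# _demo_search({'检索': [1, 3], '引擎': [3]}, ['检索', '引擎', '未登录'])
# -> {'检索': [1, 3], '引擎': [3]}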