Example #1
 def cut_word(self, sentences):
     # Tokenize the input; `sentences` may be a single string or a list of strings.
     words = []
     if isinstance(sentences, list):
         if get_chinese_ratio(sentences) > 0.5:
             # Mostly Chinese: segment each sentence with the shared jieba tokenizer.
             for sentence in sentences:
                 words.extend(tokenizer.lcut(sentence, cut_all=False))
         else:
             # Mostly non-Chinese: split each sentence on spaces.
             for sentence in sentences:
                 words.extend(re.split(" ", sentence))
     else:
         if get_chinese_ratio(sentences) > 0.5:
             words = tokenizer.lcut(sentences, cut_all=False)
         else:
             words = re.split(" ", sentences)
     # Strip separator characters (space, '|', '&', '、') and drop tokens that become empty.
     pattern = "[ |&| |、]"
     words = [re.sub(pattern, "", word) for word in words]
     words = [word for word in words if word != ""]
     return words
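Every example on this page branches on a project helper named get_chinese_ratio, which is not shown here, and cut_word also relies on a module-level jieba tokenizer. A minimal sketch of what those two pieces might look like (both are assumptions, not the project's actual code):

    import jieba

    tokenizer = jieba.Tokenizer()  # assumed module-level tokenizer used by cut_word / search

    def get_chinese_ratio(text):
        # Hypothetical helper: fraction of characters in the CJK range.
        if isinstance(text, list):
            text = "".join(text)
        if not text:
            return 0.0
        chinese = sum(1 for ch in text if '\u4e00' <= ch <= '\u9fa5')
        return chinese / len(text)

    # get_chinese_ratio("你好 world") -> 0.25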
Example #2
 def analysis(self, sentences, word, num, pattern=None):
     if pattern is None:
         # Default pattern: the target word delimited by single spaces.
         pattern = re.compile("[ ](" + word + ")[ ]")
     else:
         # Add the target word as an extra alternative inside the caller-supplied "[ ](...)[ ]" pattern.
         pattern = re.compile(f"{pattern.pattern[:4]}{word}|{pattern.pattern[4:]}")
     is_chinese = get_chinese_ratio(word) > 0.5
     all_words = []
     for record in sentences:
         # Each record carries its sentence strings from index 3 onward.
         sentence_list = record[3:]
         for sentence in sentence_list:
             if is_chinese:
                 # jieba.add_word(word)
                 # s_cut = jieba.lcut(sentence)
                 # jieba.del_word(word)
                 if word not in sentence:
                     continue
                 # Segment the text on either side of the target word so the
                 # word itself survives as a single token.
                 s_cut = []
                 start = sentence.index(word)
                 end = start + len(word)
                 prefix_s = sentence[:start]
                 suffix_s = sentence[end:]
                 s_cut.extend(jieba.lcut(prefix_s))
                 s_cut.append(word)
                 s_cut.extend(jieba.lcut(suffix_s))
             else:
                 s_cut = []
                 # Pad the sentence with spaces so phrases at either end still match,
                 # and slice the padded text so finditer offsets line up with it.
                 padded = f" {sentence} "
                 phrase_iterator = pattern.finditer(padded)
                 last_end = 0
                 for phrase_sre in phrase_iterator:
                     start = phrase_sre.start()
                     end = phrase_sre.end()
                     phrase_prefix_str = padded[last_end:start]
                     phrase_prefix = [w for w in phrase_prefix_str.split(" ") if w.strip() != ""]
                     s_cut.extend(phrase_prefix)
                     phrase = phrase_sre.group()
                     phrase = phrase.strip()
                     s_cut.append(phrase)
                     last_end = end
                 phrase_suffix_str = padded[last_end:]
                 phrase_suffix = [w for w in phrase_suffix_str.split(" ") if w.strip() != ""]
                 s_cut.extend(phrase_suffix)
             if word not in s_cut:
                 continue
             s_cut = [s for s in s_cut if s.strip() != ""]
             # Keep a window of tokens around the target word and drop the word itself.
             index = s_cut.index(word)
             s = max(index - num, 0)
             e = index + num
             words = s_cut[s:e]
             words.remove(word)
             all_words.extend(words)
     dict_word_count = Counter(all_words)
     dict_word_count = {k: v for k, v in dict_word_count.items() if k != ""}
     dic_word_count_sort = sorted(dict_word_count.items(), key=lambda d: d[1], reverse=False)
     return dic_word_count_sort
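The windowing step in the middle of analysis is easy to miss among the two tokenisation branches; a stripped-down, self-contained sketch of just that step (the function name and sample tokens below are illustrative, not from the project):

    from collections import Counter

    def context_window_counts(tokens, word, num):
        # Count tokens that appear within `num` positions of `word`,
        # mirroring the s_cut[s:e] window used by analysis().
        counts = Counter()
        if word in tokens:
            index = tokens.index(word)
            window = tokens[max(index - num, 0):index + num]
            window.remove(word)
            counts.update(window)
        return counts

    print(context_window_counts(["the", "cat", "sat", "on", "the", "mat"], "cat", 2))
    # Counter({'the': 1, 'sat': 1})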
Example #3
def analyze_word(texts):
    # Text preprocessing
    # pattern = re.compile(u'―|、|\r|\t|\n|\.|-|:|;|\)|\(|\?|《|》|\[|\]|"|,|,| |。|?|;|#|“|”|%|…|.|【|】|:')  # regex of punctuation characters to strip
    # string_data = re.sub(pattern, '', text)  # remove characters matching the pattern
    string_data = pretreatment_texts(texts)

    # Tokenization
    if get_chinese_ratio(string_data) > 0.5:
        seg_list_exact = jieba.lcut(string_data, cut_all=False)  # accurate-mode segmentation
    else:
        seg_list_exact = re.split(" ", string_data)

    # Word-frequency statistics
    word_counts = collections.Counter(seg_list_exact)  # count occurrences of each token
    return dict(word_counts)
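A usage sketch for analyze_word, reusing the get_chinese_ratio sketch from Example #1; pretreatment_texts is another project helper that is not shown here, so the stand-in below is hypothetical:

    import collections
    import re
    import jieba

    def pretreatment_texts(texts):
        # Hypothetical stand-in for the project's preprocessing helper.
        return texts if isinstance(texts, str) else " ".join(texts)

    print(analyze_word("to be or not to be"))
    # {'to': 2, 'be': 2, 'or': 1, 'not': 1}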
Example #4
    def add_text3(self, datas, show=True):
        """
        處理數據
        :param datas: 句子 list ["句子1", "句子2"]
        :return:
        """
        def console(datas, **kwargs):
            for data in datas:
                yield data

        if show:
            console = tqdm
        for word in console(datas, desc='读取数据进度条'):  # desc: "data-loading progress bar"
            words = word.strip()
            if len(words) == 0:
                continue
            is_chinese = get_chinese_ratio(words) > 0.5  # is the text predominantly Chinese?
            pattern = '[^\u4e00-\u9fa50-9a-zA-Z]' if is_chinese else '[^\u4e00-\u9fa50-9a-zA-Zа-яА-Я ]'
            # Split on anything outside the Chinese character range, letters and digits
            # (Russian letters and spaces are also kept for non-Chinese text).
            for lines in re.split(pattern, words):
                if is_chinese:
                    match = list(jieba.cut(lines))
                else:
                    match = re.split(" ", lines)
                lens = len(match)
                self.all_words_len += lens
                for i in range(lens):
                    for j in range(1, self.max_split + 1):
                        if i + j <= lens:
                            # Candidate phrase of j tokens: joined without spaces for
                            # Chinese, joined with spaces otherwise.
                            k = ''.join(match[i:i + j]) if is_chinese else ' '.join(match[i:i + j])
                            if len(match[i:i + j]) < 2:
                                continue
                            if k in self.vocab:
                                w = self.vocab[k]
                            else:
                                # [count, frequency, left-neighbour set, right-neighbour set]
                                w = [0, 0, set(), set()]
                                self.vocab[k] = w
                            w[0] += 1
                            w[1] = w[0] / self.all_words_len
                            if i != 0:
                                w[2].add(match[i - 1])
                            if i + j != lens:
                                w[3].add(match[i + j])
                            else:  # stop once the candidate reaches the end of the sentence
                                break
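add_text3 grows self.vocab, mapping each candidate phrase to [count, count / all_words_len, left-neighbour set, right-neighbour set]. A stand-alone sketch of just that bookkeeping, dropping the frequency field and the Chinese/space-joining distinction (names and the sample sentence are illustrative):

    from collections import defaultdict

    def collect_ngram_neighbours(tokens, max_split=3):
        # For every candidate phrase of 2..max_split tokens, record its count and
        # the tokens seen immediately to its left and right.
        vocab = defaultdict(lambda: [0, set(), set()])  # count, left set, right set
        for i in range(len(tokens)):
            for j in range(2, max_split + 1):
                if i + j > len(tokens):
                    break
                entry = vocab[' '.join(tokens[i:i + j])]
                entry[0] += 1
                if i != 0:
                    entry[1].add(tokens[i - 1])
                if i + j != len(tokens):
                    entry[2].add(tokens[i + j])
        return dict(vocab)

    print(collect_ngram_neighbours("machine learning is fun".split())["learning is"])
    # [1, {'machine'}, {'fun'}]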
Example #5
 def search(self, query):
     now_index_time = os.path.getmtime(self.index_path)
     if now_index_time != self.index_time:
         # Re-read the index if the index file has been modified on disk.
         self.read_index()
     if get_chinese_ratio(query) > 0.5:
         words = tokenizer.lcut(query)  # tokenizer is the shared jieba.Tokenizer instance
     else:
         words = []
         # Pad the query with spaces so phrases at either end still match, and
         # slice the padded text so finditer offsets line up with it.
         padded = f" {query} "
         phrase_iterator = self.pattern.finditer(padded)
         last_end = 0
         for phrase in phrase_iterator:
             start = phrase.start()
             end = phrase.end()
             texts_prefix = padded[last_end:start]
             words_prefix = [
                 w for w in texts_prefix.split(" ") if w.strip() != ""
             ]
             words.extend(words_prefix)
             word = phrase.group()
             word = word.strip()
             words.append(word)
             last_end = end
         texts_suffix = padded[last_end:]
         words_suffix = [
             w for w in texts_suffix.split(" ") if w.strip() != ""
         ]
         words.extend(words_suffix)
     words = set(words)  # de-duplicate query terms before the index lookup
     results = {}
     for word in words:
         if word in self.index:
             results[word] = self.index[word]
         # else:
         #     results[word] = []
     return results
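search re-reads the inverted index whenever the index file's modification time changes; read_index presumably stores the new mtime in self.index_time in the full class. A minimal, self-contained sketch of that reload guard (the class name, attributes and file handling below are placeholders, not the project's code):

    import os

    class IndexCache:
        def __init__(self, index_path):
            self.index_path = index_path
            self.index_time = None
            self.index = {}

        def ensure_fresh(self):
            # Reload only when the file on disk is newer than what was last read.
            now_index_time = os.path.getmtime(self.index_path)
            if now_index_time != self.index_time:
                self.index_time = now_index_time
                self.read_index()

        def read_index(self):
            # Placeholder: the project's on-disk index format is not shown on this page.
            self.index = {}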