Example #1
def get_source(source, validate, character_number):
    print(">>开始生成词库...")
    jieba.enable_paddle()
    exist_name = dict()
    if validate:
        print(">>正在加载验证词库...")
        get_name_valid('Chinese_Names', exist_name)

    print(">>正在加载姓名词库...")
    names = set()

    # Default (Chinese_Names dictionary)
    if source == 0:
        if validate:
            names = exist_name
        else:
            get_name_dat('Chinese_Names', names, character_number)
    # Shijing (Book of Songs)
    elif source == 1:
        get_name_json('诗经/shijing', names, 'content', character_number)
    # Chuci (Songs of Chu)
    elif source == 2:
        get_name_txt('楚辞', names, character_number)
    # Lunyu (Analects)
    elif source == 3:
        get_name_json("论语/lunyu", names, 'paragraphs', character_number)
    # Zhouyi (Book of Changes)
    elif source == 4:
        get_name_txt('周易', names, character_number)
    # Tang poems
    elif source == 5:
        for i in range(0, 58000, 1000):
            get_name_json('唐诗/poet.tang.' + str(i), names, 'paragraphs',
                          character_number)
    # Song poems
    elif source == 6:
        for i in range(0, 255000, 1000):
            get_name_json('宋诗/poet.song.' + str(i), names, 'paragraphs',
                          character_number)
    # Song ci (lyric poetry)
    elif source == 7:
        for i in range(0, 22000, 1000):
            get_name_json('宋词/ci.song.' + str(i), names, 'paragraphs',
                          character_number)
    # Custom word list
    elif source == 8:
        get_name_txt('自定义', names, character_number)
    else:
        print("词库序号输入错误")

    # Check whether the names exist and add gender
    if validate:
        print(">>正在验证姓名词库...")
        if source != 0:
            names = get_intersect(names, exist_name)

    print(">>正在筛选名字...")
    return names
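
A hedged call sketch (the get_name_* loader helpers, the corpus files and the validation dictionary come from the surrounding project and are not shown here; the argument values below are made up):

names = get_source(source=1, validate=True, character_number=2)  # source=1 selects the 诗经 (Shijing) corpus
print(len(names))
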
Example #2
 def segment(self):
     jieba.enable_paddle()
     for s in self.sentences_list:
         words = jieba.cut(s, use_paddle=True)
         for word in words:
             if word in self.word_count_dic:
                 self.word_count_dic[word] += 1
             else:
                 self.word_count_dic[word] = 1
 def __init__(self):
     jieba.enable_paddle()
     f = open("data/stop_words_utf-8.txt", mode='r',
              encoding='utf-8')
     self.stop_words = [line.strip() for line in f.readlines()]
     f.close()
     print("停用词加载成功")
     self.classifier = fasttext.load_model(
         'model/data_dim100_lr00.5_iter5.model')
     print("模型加载成功")
 def __init__(self, key_to_cut:str, dic:str=None, userdict:str=None):
     
     if dic is not None:
         jieba.set_dictionary(dic)
     
     if userdict is not None:
         jieba.load_userdict(userdict)
     
     self.key_to_cut = key_to_cut
     
     jieba.enable_paddle()  # enable the paddle feature
Example #5
def readfile():
    jieba.enable_paddle()
    csvFile = open("weibo.csv", "r", encoding='UTF-8')
    reader = csv.reader((line.replace('\0','') for line in csvFile))
    result = []
    for item in reader:
        # skip the first (header) row
        if reader.line_num == 1:
            continue
        result.append(item[1:])
    csvFile.close()
    return result
Example #6
    def segment(self):
        """词性标注"""
        import jieba
        import jieba.posseg  # posseg is a submodule and must be imported explicitly
        jieba.enable_paddle()
        seg_count: Dict[str, int] = {}
        for word, seg in jieba.posseg.cut(self._text, use_paddle=True):

            if seg in seg_count:
                seg_count[seg] += 1
            else:
                seg_count[seg] = 1
        return seg_count
 def _paddle_cut(self, corpus):
     '''paddle mode
     '''
     #enable paddle
     jieba.enable_paddle() 
     
     out = []
     for single_review in corpus[self.key_to_cut]:
         out.append([word for word in JiebaCutingClass.cut_single_sentence(single_review, use_paddle=True)])
     
     corpus['cut'] = out  # store the segmented output in corpus under the 'cut' column
     
     return corpus
Example #8
def preprocess_with_stopwords():
    print("started to filter and split corpus with stop words")
    jieba.enable_paddle()
    for name in names:
        jieba.suggest_freq(name, tune=True)

    print("started to loading stop words dictionary")
    stop_words = {'的'}
    with open('stopwords.txt', encoding='utf-8') as f:
        while True:
            stop_word = f.readline()
            if stop_word == '':
                break

            stop_word = stop_word.strip()
            stop_words.add(stop_word)
    print("stop words dictionary has been loaded")

    for name in stop_words:
        jieba.suggest_freq(name, tune=True)

    with codecs.open("corpus_without_stopword.txt", 'w', 'utf-8') as standard:
        standard.seek(0)
        standard.truncate()

        for novel in os.listdir('resources/'):
            path = 'resources/' + novel
            print("novel " + novel + " start loading")

            with open(path, 'r', encoding='utf-8') as f:
                text = f.read()
                sentences = re.split(r"(。|!|\!|\.|?|\?)", text)
                print("there are " + str(len(sentences)) +
                      " sentences in this novel")

                new_sents = []
                for i in range(int(len(sentences) / 2)):
                    sent = sentences[2 * i] + sentences[2 * i + 1]
                    new_sents.append(remove_punctuation(sent))

                for sent in new_sents:
                    if sent != '':
                        split_sentence = jieba.cut(sent, use_paddle=True)
                        out_sentence = ''
                        for word in split_sentence:
                            if word not in stop_words:
                                out_sentence += word_mapping(word)
                                out_sentence += ' '
                        standard.write(out_sentence + '\n')
            print("novel " + novel + " finished")
Example #9
 def partOfSpeechAnalysis(self, sentences):
     pos_lists = []
     jieba.enable_paddle()
     for sentence in sentences:
         sentence = sentence.replace(',', '')
         sentence = sentence.replace('。', '')
         sentence = sentence.replace('?', '')
         sentence = sentence.replace('!', '')
         sentence = sentence.replace('、', '')
         pos_list = []
         for x in psg.cut(sentence, use_paddle=True):
             pos_list.append([x.word, x.flag])
         pos_lists.append(pos_list)
     return pos_lists
Example #10
def get_clerk(lines: List[str]) -> str:
    import jieba
    import jieba.posseg as pseg
    jieba.enable_paddle()
    for line in reversed(lines):
        line = re.sub(r'[ \s+]', '', line)
        # if '书记员' in line:
        if re.search(r'书记[员长]', line) is not None:
            # line = re.sub(r'[ \s+]', '', line)
            clerk = re.split(r'书记[员长]', line)[1]
            seg_list = pseg.cut(clerk, use_paddle=True)
            for seg in seg_list:
                if seg.flag == 'PER' or seg.flag == 'nr':
                    clerk = re.sub(r'[,,::;。、"]', '', seg.word)
                    return clerk
    return 'Not found'
Example #11
def init_dict(dictfile='tc_min.dict'):
    # load a custom user dictionary into jieba
    jieba.load_userdict(dictfile)
    jieba.enable_paddle()

    # load word-frequency data and return it
    domain = int(2**31 - 1)
    freq_dict = {}
    with open(dictfile, 'r', encoding='utf8') as f:
        for line in f:
            segs = line.split(' ')
            token = segs[0]
            freq = int(segs[1])
            freq_dict[token] = float(freq / domain)  # must stay inside the loop so every token is recorded

    return freq_dict
Example #12
 def word_cut(self):
     jieba.enable_paddle()
     df = self.getCompanyData()
     df = df[["business_scope"]]
     arr = np.array(df)
     stopwords = self.stopwordslist("nlp/stop_words.txt")
     with open("nlp/cut_words.txt", "w+", encoding='utf-8') as fw:
         for i in range(len(arr)):
             text = self.removePunctuation(arr[i][0])  # avoid shadowing the built-in list
             item = jieba.cut(text, use_paddle=True)  # use paddle mode
             santi_words = [
                 x for x in item if len(x) > 1 and x not in stopwords
             ]
             fw.writelines(santi_words)
             fw.write("\r\n")
             print(santi_words)
Example #13
    def __init__(self, tweet_text, user_dict, stopword):
        self.tweet_text = tweet_text
        self.user_dict = user_dict
        self.stopword = stopword
        
        output = ' '.join(self.tweet_text)
        jieba.enable_paddle()
        jieba.re_han_default = re.compile(r"([\u4E00-\u9FD5a-zA-Z0-9+#&\._% ]+)", re.U)
        jieba.load_userdict(self.user_dict)

        text = self._remove_urls(output)
        text = self._remove_at(text)
        for hashtag in self._get_hashtags(text):
            jieba.add_word(hashtag)

        segs = "/".join(jieba.cut(text))
        self._wordlist = self._strip_word(segs)
Example #14
def extract_keyword():
    word, weight = cal_TF_IDF('./Resource/TitleData.txt')
    X, Y = weight.shape
    word_weight = []
    top_k = 3
    for i in range(X):
        temp = []
        top_k_Y = np.argsort(-weight[i])[0:top_k]
        for j in top_k_Y:
            temp.append(word[j])
        word_weight.append(temp)

    # dimensionality reduction with the t-SNE algorithm
    tsne = TSNE(n_components=2)
    decomposition_data = tsne.fit_transform(weight)
    label = K_Means(decomposition_data)

    # total number of browsing records
    alllen = len(label)
    cataword = [[], [], [], [], [], []]
    for i in range(alllen):
        if label[i] == 0:
            cataword[0].append(word_weight[i])
        elif label[i] == 1:
            cataword[1].append(word_weight[i])
        elif label[i] == 2:
            cataword[2].append(word_weight[i])
        elif label[i] == 3:
            cataword[3].append(word_weight[i])
        elif label[i] == 4:
            cataword[4].append(word_weight[i])
        elif label[i] == 5:
            cataword[5].append(word_weight[i])
        else:
            break

    enable_paddle()  # load the part-of-speech dictionary (enable paddle mode)
    for i in range(6):
        with open('./Resource/KMeansData/{}.txt'.format(i),
                  'w',
                  encoding='utf-8') as f:
            for line in cataword[i]:
                for word in line:
                    flag = posseg.lcut(word, use_paddle=True)[0].flag
                    if 'n' in flag:
                        f.write(word + ',')
Example #15
    def _preprocess(self, text_input):
        # input:  text (mainly Chinese): str
        # output: 0. original text: str
        #         1. cut text: list
        #         2. slot seq: list
        if self.paddle_mode:
            jieba.enable_paddle()  # enable paddle mode (supported since jieba 0.40; earlier versions do not support it)
        words = pseg.cut(text_input, use_paddle=self.paddle_mode)  # paddle mode
        
        ret2 = []
        ret3 = []
        for word, flag in words:
            ret2.append(word)
            ret3.append(flag)

        
        return text_input,ret2,ret3
Example #16
    def parse_cv(sentence, flag_list):
        # initialize manually; without this call, jieba loads lazily
        jieba.initialize()

        # enable paddle mode
        jieba.enable_paddle()

        # part-of-speech tagging
        content = {}
        cv = {}
        for w, f in pseg.cut(sentence, use_paddle=True):
            content[w] = f

            # extract resume (CV) elements
            if f in flag_list:
                cv[w] = f

        print("\n词性标注:", content)
        return cv
Example #17
def tokenize(name_file_to_tokenize, name_tokenized_file='Tokenized_word.txt'):
    '''
        This function takes a Chinese document and writes a file containing the words found in it.
        Input: name of the file to tokenize
        Output: tokenized word document
    '''
    # LOADING THE DOCUMENT AS STRING #
    document_text = open(name_file_to_tokenize, 'r', encoding='utf8')
    text_string = document_text.read()
    # TOKENIZING CHARACTERS #
    jieba.enable_paddle()
    seg_list = list(jieba.cut_for_search(
        text_string))  # search-engine mode also emits the shorter sub-words
    # ADDING A NEW LINE AFTER EACH WORD #
    tokenized_words = concat(seg_list, '\n')
    # SAVING AS TEXT FILE #
    text_file_alphabet = open(name_tokenized_file, 'w', encoding='utf8')
    text_file_alphabet.write(listToString(tokenized_words))
    text_file_alphabet.close()
    print(f'Task complete, Tokenized file saved as {name_tokenized_file}')
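
A hedged usage sketch (the input file name is made up; the concat and listToString helpers used inside tokenize come from the surrounding script and are not shown here):

tokenize('chinese_article.txt', name_tokenized_file='Tokenized_word.txt')
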
Example #18
    def partOfSpeech(self):
        pos_lists = []
        jieba.enable_paddle()
        sentence = self.sentence
        sentence = sentence.replace(',', '')
        sentence = sentence.replace('。', '')
        sentence = sentence.replace('?', '')
        sentence = sentence.replace('!', '')
        sentence = sentence.replace('、', '')
        pos_list = []
        for x in psg.cut(sentence, use_paddle=True):
            pos_list.append([x.word, x.flag])
        pos_lists.append(pos_list)
        for pos_list in pos_lists:
            for x in pos_list:
                print(x[0] + "[", end="")
                print(x[1] + "]", end="")
                print("/", end="")

        print("\n")
        pass
Example #19
def makeData():
    jieba.enable_paddle()  # enable paddle mode (supported since jieba 0.40; earlier versions do not support it)
    data = loadDataFromExcel('../data/总表6.23-1.xlsx', '区市县派', [6, 7, 8, 11])
    big_class = [
        '环境保护', '市容秩序与广告招牌', '规划执法', '住房执法', '环境卫生', '市政设施', '景观照明与功能性照明', '其他'
    ]
    give_class = {}
    with open('../data/class.txt', 'r') as fc:
        for line in fc.readlines():
            give_class[line.split('\t')[0]] = line.split('\t')[1].strip('\n')
    fina_class = []
    for item in data:
        if item[0] in big_class:
            now_class = '-'.join(item[:3])
            if now_class in give_class:
                class_num = give_class[now_class]
                seg_list = jiebaCut(item[3])
                fina_class.append(''.join(seg_list) + '\t' + class_num)
    with open('../data/cut_strs.txt', 'w') as fc:
        for line in fina_class:
            fc.write(line + '\n')
    print("Done!")
Example #20
def test_jieba(zhaiyao, logdir='./jieba_log.txt'):
    import jieba
    jieba.enable_paddle()
    word_count = {}
    for i in zhaiyao:
        t1 = jieba.cut(i, use_paddle=True)
        for j in t1:
            word_count[j] = word_count.get(j, 0) + 1
            # print(j)
            pass
    # print(word_count)
    # for i in word_count:
    #     print(i,'\t',word_count[i])
    from pandas import Series
    s = Series(word_count)
    s = s.sort_values(ascending=False)
    for i in s.index:
        print(i, '\t', s[i])
        logline = str(i) + '\t' + str(s[i])
        wtlog(logline, logdir)
    # print(s)
    return  # word_count
def pick_specific_type_words(text: list,
                             types: list,
                             paddle=True,
                             stop_words=None):
    """
    挑选出指定词性的词
    :param text: 文档的列表,每一个文档(留言文本)为一个字符串
    :param types: 指定词性标签组成的列表
    :param paddle: 是否打开paddle
    :param stop_words: 停用词表,一般来说不需要加载停用词(因为已经指定了特定的词性,
    如果结果中存在大量停用词可以附上停用词表
    :return: list of list(specific words in a document)
    """
    if paddle:
        jieba.enable_paddle()  # enable paddle (PaddlePaddle) mode

    stop_words = [] if stop_words is None else stop_words

    # keep words whose POS tag is in `types` (and which are not in the stop-word list)
    res = [[
        pair[0] for pair in [list(p) for p in pseg.cut(line, use_paddle=paddle)]
        if pair[1] in types and pair[0] not in stop_words
    ] for line in text]
    return res
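
A hedged usage sketch (assumes import jieba and import jieba.posseg as pseg at module level, as the function above requires; the sample sentences and tag list are made up and the exact output depends on the installed jieba/paddle versions):

docs = ['小明在北京上学', '今天天气很好']
print(pick_specific_type_words(docs, types=['n', 'nr', 'PER'], paddle=True))  # one sub-list of kept words per input document
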
Example #22
class JiebaTokenizer(BaseTokenizer):
    """ 结巴中文分词 """
    jieba.enable_paddle()  # 启动paddle模式
    name = '结巴中文分词'
    tokenizer = jieba

    def __call__(self, corpus: Corpus,  callback: Callable = None):
        if callback is None:
            callback = dummy_callback
        callback(0, "Tokenizing...")
        self.tokenize_sents(corpus)
        return self._store_tokens_from_documents(corpus, callback)

    def _preprocess(self, string):
        return list(filter(lambda x: x != '', self.tokenizer.cut(string, use_paddle=True)))

    def tokenize_sents(self, corpus):
        return [self._preprocess(string) for string in corpus.documents]
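
A standalone sketch of the _preprocess step above, outside the Corpus pipeline (assumes jieba plus its paddle dependency are installed; the sample sentence is made up):

import jieba
jieba.enable_paddle()
tokens = list(filter(lambda x: x != '', jieba.cut('我来到北京清华大学', use_paddle=True)))
print(tokens)
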
Example #23

    score_file = score_path + data + '_score'
    time_statistic_dir = 'data/time_statistic/'
    time_statistic_path = time_statistic_dir + segment_tool + '_time_statistic.txt'

    mk_dir(seg_dir)
    mk_dir(time_statistic_dir)
    mk_dir(score_path)

    # eval('test_path' + sys.argv[2])  # turn the string into a variable lookup
    # eval('seg_path' + sys.argv[2])

    print('test_path=', test_path)
    print('segment_path=', seg_path)

    if segment_tool == 'jieba':
        jieba.enable_paddle()  # enable paddle mode (supported since jieba 0.40; earlier versions do not support it)

    segment_func = segment_tool + '_segment'
    start = time.perf_counter()

    if 'thulac' in segment_func:
        thu = thulac.thulac(seg_only=True)
    if 'hanlp' in segment_func:
        han_tokenizer = hanlp.load('PKU_NAME_MERGED_SIX_MONTHS_CONVSEG')
    with open(test_path, 'r', encoding='utf-8-sig') as f_r:  # note the encoding: a plain utf-8 read of a BOM file would leave \ufeff at the start and cause subtle problems
        with open(seg_path, 'w', encoding='utf-8') as f_w:
            for line_sentence in f_r:
                if 'thulac' in segment_func:
                    line_sentence = getattr(Segment(), segment_func)(thu, line_sentence)  # call the segmentation tool selected by the parameter
                elif 'hanlp' in segment_func:
                    line_sentence = getattr(Segment(), segment_func)(han_tokenizer, line_sentence)
Example #24
import os
import re
from opencc import OpenCC
import nltk
from nltk.tokenize import word_tokenize
import jieba
from zhon.hanzi import punctuation
jieba.enable_paddle()

cc = OpenCC('s2t')
jieba.set_dictionary('dict.txt.small')
en, cn = [], []
with open('cmn.txt', 'r') as f:
    for line in f:
        sentence = re.split('\t', line)
        sentence = list(filter(None, sentence))
        en_sentence = ''
        for word in word_tokenize(sentence[0]):
            en_sentence += word.lower() + ' '
        en.append(en_sentence)
        cn_sentence = ''
        #for word in list(jieba.cut(sentence[1], use_paddle=True)):
        for word in list(jieba.cut(sentence[1])):
            word = re.sub(r'[ \n\t\r]', '', word)
            if word == '':
                continue
            cn_sentence += cc.convert(word) + ' '
        cn.append(cn_sentence)

with open('en.txt', 'w') as f:
    for sentence in en:
Example #25
def networkx_analysis(dir='美的(Midea)JSQ22-L1(Y)_comment_负面.csv'):
    '''
    Input: path of the csv file to analyse
    Output: networkx network-analysis results
    '''
    pattern = re.compile(r'([^/\\:]+)\.csv')
    data = pattern.search(dir)
    title = data.group(1)  # file name without the .csv extension
    # initialization
    num=30
    G = nx.Graph()
    plt.figure(figsize=(20,14))
    plt.rcParams['axes.unicode_minus'] = False    # display minus signs correctly
    plt.rcParams['font.sans-serif'] = ['SimHei']   # display Chinese labels correctly

    # read the file
    fn = pd.read_csv(dir, encoding='utf-8', engine='python')  # open the file
    string_data = fn['comment']  # read the comment column
    print(len(string_data))

    string_all = ''
    # text preprocessing
    pattern = re.compile(u'\t|。|,|:|;|!|)|(|?|、|“|”')  # regex of punctuation to strip
    for i in range(len(string_data)):
        string_all += re.sub(pattern, '', string_data[i])

    # string_data = re.sub(pattern, '', string_data)  # remove characters matching the pattern

    # word segmentation
    jieba.enable_paddle()
    seg_list_exact = pseg.cut(string_all, use_paddle=True)  # POS segmentation in paddle mode
    object_list = []
    stop_words = []
    file = open('stopwords.txt', 'r', encoding='utf-8').readlines()  # custom stop-word list
    for each_line in file:
        each_line = each_line.strip('\n')
        stop_words.append(each_line)

    for word, flag in seg_list_exact:  # iterate over each segmented word
        if word not in stop_words and (flag == 'n' or flag == 'a' or flag == 'vn' or flag == 'ad'):  # if not in the stop-word list
            object_list.append(word)  # append the word to the list

    print('object_list的个数:',len(object_list))
    # word-frequency statistics
    word_counts = collections.Counter(object_list)  # count word frequencies
    word_counts_top = word_counts.most_common(num)  # take the most frequent words
    word = pd.DataFrame(word_counts_top, columns=['关键词', '次数'])

    net = pd.DataFrame(np.mat(np.zeros((num,num))),columns=word.iloc[:,0])

    print('==='*30)

    k = 0
    # build the semantic co-occurrence matrix
    for i in range(len(string_data)):
        seg_list_exact = jieba.cut(string_data[i], cut_all=False, use_paddle=False)  # exact-mode segmentation
        object_list2 = []
        for words in seg_list_exact:  # iterate over each segmented word
            if words not in stop_words:  # if not in the stop-word list
                object_list2.append(words)  # append the word to the list
        if not len(object_list2)==0:
            word_counts2 = collections.Counter(object_list2)
            word_counts_top2 = word_counts2.most_common(num)  # most frequent words in this comment
            word2 = pd.DataFrame(word_counts_top2)
            word2_T = pd.DataFrame(word2.values.T,columns=word2.iloc[:, 0])

            relation = list(0 for x in range(num))
            # check whether this comment's top words appear in the overall top-word list
            for j in range(num):
                for p in range(len(word2)):
                    if word.iloc[j,0] == word2.iloc[p,0]:
                        relation[j] = 1
                        break
                    # for top words that co-occur in the same comment, add their counts to the corresponding matrix cells
            for j in range(num):
                if relation[j] == 1:
                    for q in range(num):
                        if relation[q] == 1:

                            net.iloc[j, q] = net.iloc[j, q] + word2_T.loc[1, word.iloc[q, 0]]
                            net.iloc[q, j] = net.iloc[j, q] + word2_T.loc[1, word.iloc[q, 0]]
    net.to_excel(title + '_net.xls')
    print(net)
    # finish building the semantic co-occurrence matrix with the last segment
    max_weight = net.values.max()  # DataFrame.get_values() was removed in recent pandas
    # normalize the data
    for i in range(num):
        for j in range(num):
            net.iloc[i, j] = net.iloc[i, j]/max_weight
            if net.iloc[i,j] < 0.03:
                net.iloc[i,j] = 0
    n = len(word)
    # edge: start node, end node, weight
    for i in range(n):
        for j in range(i,n):
            G.add_weighted_edges_from([(word.iloc[i,0],word.iloc[j,0],net.iloc[i,j])])
    nx.draw_networkx(G,
                    pos=nx.circular_layout(G),
    #                 line width scales with edge weight; line width, edge colour and layout can all be tuned
                    width=[float(v['weight']*3) for (r,c,v) in G.edges(data=True)],
                    edge_color='orange',
    #                node size scales with the word's frequency
                   node_size = [float(net.iloc[i,i]*2000) for i in np.arange(20)],
                   node_color='#87CEEB',
                     font_size=15,
                     font_weight='1000',
                     )

    plt.axis('off')
    plt.savefig(title + " _Graph.png", format ="PNG")
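
A hedged call sketch (any CSV with a 'comment' column plus a stopwords.txt in the working directory should work; the path below is simply the function's default):

networkx_analysis(dir='美的(Midea)JSQ22-L1(Y)_comment_负面.csv')
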
Example #26
 def __init__(self):
     super().__init__()
     # self.taggings = ['n', 'PER']  # n: noun (e.g. 儿子), PER: person name (e.g. 贾宝玉)
     self.neo = NeoDB()
     jieba.enable_paddle()
Example #27
def do_task():
    r = connector.get_connection()
    key_list = r.keys()
    group_list = []
    for i in key_list:
        if "chat_content" in i:
            group_list.append(i[:i.find("_")])
    # print(group_list)

    mk = imageio.imread("/root/word_cloud_bot/circle.png")
    # build and configure the word-cloud object w; the scale parameter improves sharpness
    w = wordcloud.WordCloud(width=800,
                            height=800,
                            background_color='white',
                            font_path='/root/word_cloud_bot/font.ttf',
                            mask=mk,
                            scale=5)

    for group in group_list:
        try:
            print("当前处理的群组:" + str(group))
            start_time = float(time.time())
            # generate the word-cloud image
            jieba.enable_paddle()  # enable paddle mode (supported since jieba 0.40; earlier versions do not support it)
            words = pseg.cut(r.get("{}_chat_content".format(group)),
                             use_paddle=True)  # paddle mode
            word_list = []
            for word, flag in words:
                # print(word + "\t" + flag)
                if flag in [
                        "n", "nr", "nz", "PER", "f", "ns", "LOC", "s", "nt",
                        "ORG", "nw"
                ]:
                    # keep the word only if it is valid and not just whitespace
                    if re.match(r"^\s+?$", word) is None:
                        word_list.append(word)
            # print(word_list)

            # analyse high-frequency words
            word_amount = {}
            # print(word_amount)
            for word in word_list:
                if word in [
                        "。", ",", "!", ":", "“", "”", "!", ".", ",", ":", '"',
                        "+", "-", "[", "]", "\\", "/"
                ]:
                    continue
                # check whether the word has been seen before
                if word_amount.get(word) is not None:
                    word_amount[word] = word_amount.get(word) + 1
                else:
                    word_amount[word] = 1
            # print(word_amount)
            word_amount = sorted(word_amount.items(),
                                 key=lambda kv: (kv[1]),
                                 reverse=True)
            # print("排序后的热词:" + str(word_amount))
            hot_word_string = ""
            for i in range(min(5, len(word_amount))):
                hot_word_string += "\t\t\t\t\t\t\t\t" + "`" + str(
                    word_amount[i][0]) + "`" + ": " + str(
                        word_amount[i][1]) + "\n"
            # print(hot_word_string)
            # get the total message count
            total_message_amount = r.get(
                "{}_total_message_amount".format(group))

            # print("总发言数: " + total_message_amount)

            # number of users who sent messages
            user_amount = len(r.hkeys("{}_user_message_amount".format(group)))
            # dictionary of per-user message counts
            user_message_amount = r.hgetall(
                "{}_user_message_amount".format(group))
            user_message_amount = sorted(user_message_amount.items(),
                                         key=lambda kv: (kv[1]),
                                         reverse=True)
            # print("排序后的用户:" + str(user_message_amount))
            top_5_user = ""
            for i in range(min(5, len(user_message_amount))):
                top_5_user += "\t\t\t\t\t\t\t\t" + "🎖`" + str(
                    user_message_amount[i][0]) + "`" + " 贡献: " + str(
                        user_message_amount[i][1]) + "\n"
            # print(top_5_user)
            string = " ".join(word_list)
            # pass the string to w.generate() to feed text into the word cloud
            w.generate(string)
            # export the word-cloud image to the current directory
            w.to_file('{}_chat_word_cloud.png'.format(group))
            bot.send_message(chat_id=group,
                             text="🎤 今日话题榜 🎤\n"
                             "📅 {}\n"
                             "⏱ 截至今天{}\n"
                             "🗣️ 本群{}位朋友共产生{}条发言\n"
                             "🤹‍ 大家今天讨论最多的是:\n\n"
                             "{}\n"
                             "看下有没有你感兴趣的话题? 👏".format(
                                 time.strftime("%Y年%m月%d日", time.localtime()),
                                 time.strftime("%H:%M",
                                               time.localtime()), user_amount,
                                 total_message_amount, hot_word_string),
                             parse_mode="Markdown")

            bot.send_message(chat_id=group,
                             text="🏵 今日活跃用户排行榜 🏵\n"
                             "📅 {}\n"
                             "⏱ 截至今天{}\n\n"
                             "{}\n"
                             "感谢这些朋友今天的分享! 👏 \n"
                             "遇到问题,向他们请教说不定有惊喜😃".format(
                                 time.strftime("%Y年%m月%d日", time.localtime()),
                                 time.strftime("%H:%M", time.localtime()),
                                 top_5_user),
                             parse_mode="Markdown")

            bot.send_photo(chat_id=group,
                           photo=open("{}_chat_word_cloud.png".format(group),
                                      "rb"))

            os.remove("{}_chat_word_cloud.png".format(group))

            stop_time = float(time.time())
            print("当前群组处理耗时:" + str(stop_time - start_time))
        except Exception as e:
            print(e)
            continue
Example #28
async def group_word(context):
    imported_1 = False
    if len(context.parameter) >= 1:
        imported_1 = True
    if not imported:
        try:
            await context.edit("支持库 `jieba` 未安装...\n正在尝试自动安装...")
            await execute(f'{executable} -m pip install jieba')
            await sleep(10)
            result = await execute(f'{executable} -m pip show jieba')
            if len(result) > 0:
                await context.edit('支持库 `jieba` 安装成功...\n正在尝试自动重启...')
                await context.client.disconnect()
            else:
                await context.edit(f"自动安装失败..请尝试手动安装 `{executable} -m pip install jieba` 随后,请重启 PagerMaid-Modify 。")
                return
        except:
            return
    if not imported_ and imported_1:
        try:
            await context.edit("支持库 `paddlepaddle-tiny` 未安装...\n正在尝试自动安装...")
            await execute(f'{executable} -m pip install paddlepaddle-tiny')
            await sleep(10)
            result = await execute(f'{executable} -m pip show paddlepaddle-tiny')
            if len(result) > 0 and not 'WARNING' in result:
                await context.edit('支持库 `paddlepaddle-tiny` 安装成功...\n正在尝试自动重启...')
                await context.client.disconnect()
            else:
                await context.edit(f"自动安装失败,可能是系统不支持..\nAI 分词不可用,切换到基础分词。\n"
                                   f"您可以尝试手动安装 `{executable} -m pip install paddlepaddle-tiny` 。")
                await sleep(4)
        except:
            return
    try:
        await context.edit('正在生成中。。。')
    except:
        return
    if not exists("plugins/groupword"):
        makedirs("plugins/groupword")
    if not exists("plugins/groupword/wqy-microhei.ttc"):
        await context.edit('正在拉取中文字体文件。。。(等待时间请评估你的服务器)')
        r = get('https://cdn.jsdelivr.net/gh/anthonyfok/fonts-wqy-microhei/wqy-microhei.ttc')
        with open("plugins/groupword/wqy-microhei.ttc", "wb") as code:
            code.write(r.content)
    words = defaultdict(int)
    count = 0
    try:
        if imported_ and imported_1:
            try:
                jieba.enable_paddle()
            except:
                imported_1 = False
        async for msg in context.client.iter_messages(context.chat, limit=500):
            if msg.id == context.id:
                continue
            if msg.text and not msg.text.startswith('/') and not msg.text.startswith('-') and not '//' in msg.text:
                try:
                    if imported_ and imported_1:
                        for word in jieba.cut(msg.text.translate(punctuation), use_paddle=True):
                            word = word.lower()
                            words[word] += 1
                    else:
                        for word in jieba.cut(msg.text.translate(punctuation)):
                            word = word.lower()
                            words[word] += 1
                    count += 1
                except:
                    pass
    except:
        if count == 0:
            try:
                await context.edit('您已被 TG 官方限制。')
                return
            except:
                return
    try:
        image = WordCloud(font_path="plugins/groupword/wqy-microhei.ttc", width=800,
                          height=400).generate_from_frequencies(
            words).to_image()
        stream = BytesIO()
        image.save(stream, 'PNG')
    except:
        await context.edit('词云生成失败。')
        return
    try:
        await context.client.send_message(context.chat, f'对最近的 {count} 条消息进行了分析。', file=stream.getvalue())
        await context.delete()
    except:
        return
Example #29
 def __init__(self, extra_dictionary_path=None):
     super().__init__()
     jieba.enable_paddle()
     if extra_dictionary_path is not None:
         jieba.set_dictionary(extra_dictionary_path)
     jieba.initialize()
Example #30
from jieba import posseg
import jieba
import pandas as pd
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models.keyedvectors import KeyedVectors
from tqdm import tqdm
import csv
import os
import pickle
from collections import defaultdict
import numpy as np
import datetime
import logging
jieba.enable_paddle()  # activate paddle mode (jieba is already imported above)

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO)
logger = logging.getLogger(__name__)


# read a file
def read_file(file_path):
    lines = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            lines.append(line.strip('\n'))
    return lines