Example #1
def preprocessing(single_comment):
    """
    这是一个预处理过程,包括分词,去停词,去除数字,去除特殊符号
    :param single_comment: 一条单独的文档(注意:不是整篇大文档,类似于分析购物评论中的一条评论)
    :return: [['单词', '单词', '单词',...], ['单词', '单词', '单词',...], ['', '', '',...],......]
    """
    jieba.load_userdict(r'D:\Pycharm\PycharmProjects\Class/jieba_dict/dict.txt')
    jieba.load_userdict(
        r'D:\Pycharm\PycharmProjects\Class/jieba_dict/coal_dict.txt')
    jieba.load_userdict(
        r'D:\Pycharm\PycharmProjects\Class/jieba_dict/user_dictionary.txt')
    comment0 = re.sub('\u3000', '', single_comment)  # strip ideographic spaces (\u3000)
    comment1 = re.sub(r'&[a-z]*', '', comment0)  # strip HTML entity remnants such as &nbsp
    comment2 = re.sub(r'\ufffd', '', comment1)  # strip the Unicode replacement character
    comment3 = re.sub('\u3000', '', comment2)
    comment4 = re.sub(
        r'\d+MM|\d+mm|\d+CM|\d+cm|\d+V|\d+v|\d+A|\d+m|\d+M|\d+w|\d+W', 'param',
        comment3)  # collapse measurements (mm/cm/V/A/m/W) into the token 'param'
    comment5 = re.sub(r'\d+\.\d+|\d+', 'num', comment4)  # collapse remaining numbers into 'num'
    comment6 = SnowNLP(comment5).han  # convert traditional characters to simplified
    comment7 = re.sub(r'博世|博士|Bosch|BOSCH|bosch', '博世', comment6)  # unify brand spellings: Bosch -> 博世
    comment8 = re.sub(r'小威|WORX|威克士|worx|wx|WX', '威克士', comment7)  # unify brand spellings: WORX -> 威克士
    comment2words = jieba.lcut(comment8)
    with open('./stop_words.txt', 'r', encoding='utf-8') as f:
        stop_words = {line.strip() for line in f}  # strip newlines so membership tests work
    # iterate in reverse so popping does not shift the unvisited indices
    for i in range(len(comment2words) - 1, -1, -1):
        if comment2words[i] in stop_words:  # remove stop words
            comment2words.pop(i)
        elif comment2words[i].isdigit():  # remove bare digits
            comment2words.pop(i)
    return comment2words
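A minimal usage sketch; the sample review below is made up, and the dictionary files plus stop_words.txt must exist at the paths above:

import re

import jieba
from snownlp import SnowNLP

sample = "博士电钻\u3000功率500W,夹头12mm,很好用&nbsp"  # hypothetical review text
print(preprocessing(sample))
# e.g. ['博世', '电钻', '功率', 'param', '夹头', 'param', '好用'] (depends on the dictionaries)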
def load_data(in_file):
    """Load a tab-separated parallel corpus: English sentence, then Chinese sentence."""
    cn = []
    en = []
    with open(in_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip().split("\t")
            # wrap each sentence in BOS/EOS boundary tokens
            en.append(["BOS"] + nltk.word_tokenize(line[0].lower()) + ["EOS"])
            cn.append(["BOS"] + jieba.lcut(line[1]) + ["EOS"])
    return en, cn
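A usage sketch; 'train.txt' is a hypothetical tab-separated file with lines like "Hello world\t你好世界", and nltk's punkt tokenizer must be downloaded once:

import jieba
import nltk  # run nltk.download('punkt') once beforehand

en_sents, cn_sents = load_data('train.txt')
print(en_sents[0])  # ['BOS', 'hello', 'world', 'EOS']
print(cn_sents[0])  # ['BOS', '你好', '世界', 'EOS'] (segmentation may vary)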
Example #3
def Get_cloud(items):
    # items = Get_txt()
    with open('test.txt', 'w') as file:
        for item in items:
            y = str(item)
            # tokenize once per record; y[16:-7] drops a fixed-size prefix and suffix
            tokens = jieba.lcut(y[16:-7])
            for token in tokens:
                # keep multi-character words that pass the Check_word filter
                if len(token) > 1 and Check_word(token) == 0:
                    file.write(token + " ")
            file.write("\n")

    with open('test.txt', 'r') as sentence:
        # print(type(sentence.read()))
        wc = wordcloud.WordCloud(font_path='STXINGKA.TTF',
                                 width=671,
                                 height=400,
                                 prefer_horizontal=0.8,
                                 max_words=50)
        wc.generate(sentence.read())
        wc.to_file("cloud.png")
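A hedged usage sketch: Check_word is the author's filter and is not shown here, so a permissive stub stands in; the STXINGKA.TTF font must be available for the word-cloud step:

import jieba
import wordcloud

def Check_word(word):
    return 0  # stub for the author's filter: 0 means "keep this word"

# made-up record shaped so that y[16:-7] isolates the text payload
records = ["<text created=1>这里是一段需要分词的中文文本</text>"]
Get_cloud(records)  # writes test.txt, then renders cloud.png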
Example #4
def cut(sentence):
    """
    分词,去除停用词

    :param sentence: 评论
    :return: 词列表
    """
    jieba.load_userdict('./jieba_dict/user_dictionary.txt')
    with open('./stopwords.txt', 'r', encoding='UTF-8') as input_file:
        stopwords = {line.strip() for line in input_file}  # strip newlines so membership tests work
    words = []
    for word in jieba.lcut(sentence):
        if word not in stopwords:
            words.append(word)
    return words
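Usage sketch (the review text is made up; the user dictionary and stopword file must exist at the relative paths above):

import jieba

print(cut("这款电钻的手感很好,就是电池续航一般"))
# e.g. ['电钻', '手感', '电池', '续航'] (exact tokens depend on the dictionaries)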
def lda(lines, stopwords):
    """Run LDA topic modeling over the tokenized lines."""
    sentences = []
    for line in lines:
        try:
            text = line[1].replace("\n", "").replace(" ", "").replace("\t", "")
            segs = jieba.lcut(text)
            # keep multi-character tokens that are not stop words
            segs = [seg for seg in segs if len(seg) > 1 and seg not in stopwords]
            sentences.append(segs)
        except Exception as e:
            print(e)

    # bag-of-words model
    dictionary = corpora.Dictionary(sentences)
    corpus = [dictionary.doc2bow(_sentence) for _sentence in sentences]
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)

    # print the topics and accumulate each word's weight across all topics
    print(lda.print_topics())
    word_weights = {}
    for topic in lda.print_topics():
        # each topic is rendered like '0.018*"词" + 0.012*"词" + ...'
        for term in topic[1].split("+"):
            weight, word = [part.replace(" ", "").replace("\"", "") for part in term.split("*")]
            word_weights[word] = word_weights.get(word, 0) + float(weight)
    word_weights = {x: float('%.3f' % y) for x, y in word_weights.items()}

    # merge words: rank the accumulated weights in a DataFrame
    data_dic = {'count': word_weights}
    data_df = pd.DataFrame(data_dic)
    data_df = data_df.reset_index().sort_values(by=["count"], ascending=False)
    print(data_df[:10]["index"])
    print(data_df[:10].index)
    print(data_df[:10]["count"])

    number = numpy.array(data_df[:10]["count"].values*1000)
    work_type = data_df[:10]["index"].values


    labels = tuple(work_type)
    fracs = number

    print(labels)
    plt.pie(x=fracs, labels=labels, autopct='%.0f%%')  # autopct shows each slice's percentage
    plt.show()
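A sketch of how lda might be called; the file names and the two-column line format are assumptions inferred from the line[1] indexing above:

import jieba
import gensim
import numpy
import pandas as pd
import matplotlib.pyplot as plt
from gensim import corpora

with open('stopwords.txt', encoding='utf-8') as f:  # hypothetical stopword list
    stopwords = {line.strip() for line in f}
with open('comments.tsv', encoding='utf-8') as f:   # hypothetical file: id<TAB>comment
    lines = [line.rstrip('\n').split('\t') for line in f]
lda(lines, stopwords)  # prints the topics and shows the pie chart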
def word_count(lines, stopwords):
    # word-frequency statistics
    segment = []
    for line in lines:
        try:
            text = line[1].replace("\n", "").replace(" ", "").replace("\t", "")
            segs = jieba.lcut(text)
            for seg in segs:
                if len(seg) > 1 and seg != '\r\n' and seg not in stopwords:
                    segment.append(seg)
            # print(segment)
        except Exception as e:
            print(e)

    words_df = pd.DataFrame({'segment': segment})
    words_stat = words_df.groupby(by=['segment'])['segment'].agg(["size"])
    words_stat = words_stat[1300:]
    words_stat = words_stat.reset_index().sort_values(by=["size"], ascending=False)
    print(words_stat[:1500])
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white", max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_stat.head(1500).values}
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.show()
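word_count consumes the same lines and stopwords; continuing the sketch above:

from wordcloud import WordCloud

word_count(lines, stopwords)  # prints the frequency table and displays the word cloud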
Example #7
def test2word(value):
    """Tokenize a string; return an empty list for non-string input."""
    if isinstance(value, str):
        return jieba.lcut(value)
    return []  # non-string input yields no tokens
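Quick check (the expected tokens are illustrative; segmentation may vary):

import jieba

print(test2word("机器学习很有趣"))  # e.g. ['机器', '学习', '很', '有趣']
print(test2word(123))  # [] for non-string input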
Example #8
import re

import jieba

s = "好用!性价比高,续航一般(测试)123~456。"  # hypothetical sample; the original snippet never defines s
m = re.sub(r"[!%\[\],。()\-~]", " ", s)  # replace the listed punctuation with spaces
'''
extract the digits
'''
num = re.sub(r"\D", " ", m)  # every non-digit becomes a space
'''
extract the text
'''
b = re.sub(r"\d", " ", m)  # every digit becomes a space
# print(b.strip())
# print(b.lstrip())
# print(list(b.strip()))
c = b.strip()
newlist = []

new = jieba.lcut(c)
# print(type(new))
print(new)
"""  
不可使用for循环删除空格,如果尾向也是空格则无法删除
需使用:
1, while '' in test:
        test.remove('')
2, mytest = [i for i in test if i != '']
"""
# mytest = [i for i in new if i != ' ']
# print(mytest)

for i in new:
    if i != ' ':
        newlist.append(i)
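A small demonstration of the pitfall described in the comment block above (test is a made-up list):

test = ['a', '', 'b', '', '']
for item in test:  # unsafe: removing while iterating skips elements
    if item == '':
        test.remove(item)
print(test)  # ['a', 'b', ''] (the trailing '' survives)

test = ['a', '', 'b', '', '']
while '' in test:  # safe alternative from the comment above
    test.remove('')
print(test)  # ['a', 'b']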
Example #9
"""
import jieba

content = "现如今,机器学习和深度学习带动人工智能飞速的发展,并在图片处理、语音识别领域取得巨大成功。"
# precise mode (the default)
segs_1 = jieba.cut(content, cut_all=False)
print("*".join(segs_1))
# full mode
segs_2 = jieba.cut(content, cut_all=True)
print("*".join(segs_2))

# search-engine mode
segs_3 = jieba.cut_for_search(content)
print("*".join(segs_3))
# lcut wraps the result in a list
segs_5 = jieba.lcut(content)
print(segs_5)

# part-of-speech tagging
import jieba.posseg as psg

print([(x.word, x.flag) for x in psg.lcut(content)])

# count how many times each word occurs
from collections import Counter

top5 = Counter(segs_5).most_common(5)
print(top5)

#
text = "铁甲网是中国最大的工程机械交易平台。"