Example #1
def getData(topK=20, withFlag=True, withWeight=False):
    jiebaAnalyse.set_stop_words("../resources/stopWord.txt")
    stopWords = set()
    with open("../resources/stopWord.txt", 'r', encoding='utf8') as f:
        for word in f:
            stopWords.add(word.strip())  # strip the newline so membership checks below can match

    data = []
    with open(data_file, 'r', encoding='utf8') as f:
        for no, line in enumerate(f):
            data.append(line.strip())
            # data.extend(jiebaAnalyse.extract_tags(line, topK=50, allowPOS=['n']))
    # text, weigth = jiebaAnalyse.extract_tags(" ".join(data), topK=20, allowPOS=['n'], withWeight=True)
    # print(text)

    # Re-implement jieba's TF-IDF extract_tags by hand so that raw word counts
    # can be returned alongside the ranked keywords

    allowPOS = frozenset(['n'])
    words = jieba.posseg.dt.cut(" ".join(data))
    idf_freq, median_idf = IDFLoader(DEFAULT_IDF).get_idf()
    freq = {}
    word_count = {}
    for w in words:
        if allowPOS:
            if w.flag not in allowPOS:
                continue
            elif not withFlag:
                w = w.word
        wc = w.word if allowPOS and withFlag else w
        if len(wc.strip()) < 2 or wc.lower() in stopWords:
            continue
        freq[w] = freq.get(w, 0.0) + 1.0
        word_count[wc] = word_count.get(wc, 0) + 1
    total = sum(freq.values())
    for k in freq:
        kw = k.word if allowPOS and withFlag else k
        freq[k] *= idf_freq.get(kw, median_idf) / total

    if withWeight:
        tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
    else:
        tags = sorted(freq, key=freq.__getitem__, reverse=True)
    if topK:
        res = []
        for item in tags[:topK]:
            # tags holds (key, weight) pairs when withWeight is set, bare keys otherwise
            w = item[0] if withWeight else item
            word = w.word if allowPOS and withFlag else w
            res.append((word, word_count[word]))

        res = sorted(res, key=itemgetter(1), reverse=True)
        return res
    else:
        return tags
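
A minimal usage sketch for getData above, assuming the module-level names from Example #4 (data_file, the stop-word file, the jiebaAnalyse alias) are in place; the call is illustrative only:

# Hypothetical driver; getData() returns (word, count) pairs sorted by count
for word, count in getData(topK=10, withFlag=True, withWeight=True):
    print(word, count)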
Example #2
def extract_tag():
    ## Word segmentation with jieba_fast
    jieba.load_userdict(dictionary)

    data = []
    jiebaAnalyse.set_stop_words("../resources/stopWord.txt")
    with open(data_file, 'r', encoding='utf8') as f:
        for no, line in enumerate(f):
            data.append(
                jiebaAnalyse.extract_tags(line, topK=50, allowPOS=['n']))

    print(data)
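
For reference, a small sketch of the two return shapes of extract_tags used above: a plain list of keywords by default, and (word, weight) tuples when withWeight=True. The alias import and the sample sentence are assumptions for illustration:

import jieba_fast.analyse as jiebaAnalyse  # assumed alias, matching Example #4

sample = "这家商场的餐饮和服装店铺很受欢迎"  # made-up sample text
words = jiebaAnalyse.extract_tags(sample, topK=5, allowPOS=['n'])                   # list of keywords
pairs = jiebaAnalyse.extract_tags(sample, topK=5, allowPOS=['n'], withWeight=True)  # list of (word, weight)
print(words)
print(pairs)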
Example #3
def wordCould():
    from wordcloud import WordCloud
    data = []
    jiebaAnalyse.set_stop_words(stop_words_path=stop_file)
    with open(data_file, 'r', encoding='utf8') as f:
        for no, line in enumerate(f):
            data.append(line.strip())
            # data.extend(jiebaAnalyse.extract_tags(line, topK=50, allowPOS=['n']))
    text = jiebaAnalyse.extract_tags(" ".join(data),
                                     topK=20,
                                     allowPOS=['n'],
                                     withWeight=True)
    # print(text)
    # extract_tags returns (word, weight) tuples when withWeight=True,
    # so join only the words before handing the text to WordCloud
    wordcloud = WordCloud(font_path='../resources/simsun.ttf').generate(
        " ".join(word for word, weight in text))

    import matplotlib.pyplot as plt
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
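
Since extract_tags already returns weights in Example #3, an alternative sketch (not in the original code) is to pass them to WordCloud.generate_from_frequencies() instead of flattening the keywords back into a string:

# Alternative sketch: use the TF-IDF weights directly
from wordcloud import WordCloud
import matplotlib.pyplot as plt

freqs = dict(text)  # text is the [(word, weight), ...] list returned by extract_tags above
wc = WordCloud(font_path='../resources/simsun.ttf').generate_from_frequencies(freqs)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()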
Example #4
from gevent.pywsgi import WSGIServer
from flask import Flask
from operator import itemgetter

# assumed aliases for the snippets above; jieba_fast mirrors the jieba API
import jieba_fast as jieba
import jieba_fast.posseg  # getData() calls jieba.posseg.dt.cut()
import jieba_fast.analyse as jiebaAnalyse
from jieba_fast.analyse.tfidf import IDFLoader, DEFAULT_IDF
from constant.const import const
from db.db_mongo import getMongoData
from db.db_mysql import saveDataToDb, clearDbData, findAllShopId, findAllMallId
from util.logger import logger

from config.config import updateConfig, getConfig
from datetime import datetime

dictionary = '../resources/dict.txt'
stop_file = '../resources/stopWord.txt'
data_file = '../resources/data.txt'
app = Flask(__name__)

jiebaAnalyse.set_stop_words(stop_file)
stopWords = set()


# Read a file into a list of stripped lines
def readFile(filename, mode='r', encoding='utf8'):
    data = []
    with open(filename, mode, encoding=encoding) as f:
        for line in f:
            data.append(line.strip())
        # for no, word in enumerate(f):
        #     data.append(word)
    return data
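
A quick usage sketch for readFile, e.g. loading the stop words and raw data from the paths defined above:

stopWords = set(readFile(stop_file))  # stop words as a set for fast membership checks
data = readFile(data_file)            # one stripped line per element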


# Get the results
Example #5
def getAllCount():
    # Read the last processed index from the checkpoint file written by resetAllCount()
    count = 0
    with open('config/wordsCount.txt') as f:
        for line in f:
            count = int(line)
    return count


# Write back the corrected count after a batch has been processed
def resetAllCount(nums):
    with open('config/wordsCount.txt', 'w') as f:
        f.write(str(nums))
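
The two helpers above form a simple file-based checkpoint: read the last processed index at startup, then write the new index once a batch is done. A minimal sketch of that pattern (the processing step is a placeholder, and ids is the list loaded in the __main__ block below):

beginNum = getAllCount()
batch = ids[beginNum:beginNum + 100]   # hypothetical batch size of 100
# ... process the batch here ...
resetAllCount(beginNum + len(batch))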


if __name__ == '__main__':

    # Load the stop words
    analyse.set_stop_words('config/stopWords.txt')

    # Start index
    beginNum = getAllCount()

    # Fetch the Weibo IDs
    ids = json.loads(getAllIdJson())

    # End index
    endNum = len(ids)

    for index, _id in enumerate(ids[beginNum:endNum]):
        weibo = json.loads(getAllByIdJson(_id))
        if 'nick_name' not in weibo: continue
        ID, name, vector, nums = getKeyWords(weibo)
        weiboNum = getWeiboCount()