import matplotlib.pyplot as plt
import pandas as pd
import streamlit as st
from wordcloud import STOPWORDS, WordCloud


def app():
    conn = create_connection('qna_data.db')  # DB helper assumed to be defined elsewhere
    st.title('GoldenRetriever')
    st.header(
        'This front-end application lets you easily see what employees are talking about!'
    )
    st.markdown(
        'View the source code [here](https://github.com/aimakerspace/goldenretriever)!'
    )
    st.markdown(
        'Visit our [community](https://makerspace.aisingapore.org/community/ai-makerspace/) and ask us a question!'
    )
    df = pd.read_sql_query("SELECT * FROM userinput", conn)
    st.header('Database Preview')
    st.write(df)
    st.markdown('# Word Cloud')
    # Create the stopword list:
    stopwords = set(STOPWORDS)
    word_cloud_data = " ".join(query for query in df.query_str)
    # Generate a word cloud image
    wordcloud = WordCloud(stopwords=stopwords,
                          background_color="white").generate(word_cloud_data)
    # Raw word counts for the tabular view further down
    word_counts = WordCloud().process_text(word_cloud_data)
    word_count_df = pd.DataFrame([[k, v] for k, v in word_counts.items()],
                                 columns=['Unique Word', 'Occurrence'])
    # Display the generated image:
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    st.pyplot()
    st.markdown('# Table Format of WordCloud')
    st.write(word_count_df)
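
The example assumes a create_connection helper defined elsewhere in the project. A plausible minimal implementation (hypothetical, inferred from the SQLite-style filename it receives) might look like the sketch below, with the app itself launched via "streamlit run <script>.py":

import sqlite3

def create_connection(db_file):
    # Hypothetical helper: open (or create) the SQLite file backing userinput.
    return sqlite3.connect(db_file)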
Example #2
import re

from wordcloud import WordCloud


def _word_cloud(self, code):
    tmp = code.decode('u8')
    # Extract the word-cloud text enclosed by the 72F9882144A13C12 markers
    text = re.search("72F9882144A13C12(.*)72F9882144A13C12", tmp,
                     re.S).group(1)
    # words_ holds frequencies normalized so the most common word is 1.0
    freq = WordCloud().generate(text).words_
    tmp_word_count = 1
    # Count the raw occurrences of the most frequent word
    for k, v in freq.items():
        if v in [1, 1.0]:
            tmp_word = k
            tmp_word_count = text.count(tmp_word)
    # Rescale the relative frequencies back to approximate absolute counts
    for k, v in freq.items():
        freq[k] = round(v * tmp_word_count)
    # Strip the marker-delimited word-cloud block from the payload
    data = tmp.replace('72F9882144A13C12' + text + '72F9882144A13C12\n',
                       '')
    return data, freq
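
For context (not part of the original snippet): wordcloud's words_ attribute holds frequencies normalized so the most common word is 1.0, while process_text returns raw counts. The loops above exploit this by counting the occurrences of a word whose relative frequency is 1.0 and using that count to rescale words_ back to approximate absolute counts. A minimal sketch of the two views:

from wordcloud import WordCloud

sample = "spam spam spam eggs eggs ham"
wc = WordCloud(collocations=False)
print(wc.generate(sample).words_)   # relative: {'spam': 1.0, 'eggs': 0.66..., 'ham': 0.33...}
print(wc.process_text(sample))      # absolute: {'spam': 3, 'eggs': 2, 'ham': 1}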
Example #3
from wordcloud import STOPWORDS, WordCloud


def find_most_frequent_word(dataset, top_n_words, filename):
    # process_text returns a dict of word -> raw count; the size and font
    # arguments only matter when an image is actually rendered.
    wc = WordCloud(width=800,
                   height=800,
                   background_color='white',
                   stopwords=set(STOPWORDS),
                   min_font_size=10).process_text(dataset)

    if top_n_words == 0 or top_n_words > len(wc):
        top_n_words = len(wc)
    with open('words/' + filename + '.txt', 'w') as file:
        for k, v in sorted(wc.items(), key=lambda item: item[1],
                           reverse=True)[:top_n_words]:
            file.write(str(k) + "," + str(v) + "\n")
Example #4
from os import getcwd, path

from wordcloud import WordCloud


def image_generation(text, ha):
    stopwords = ("THE", "The")

    # Raw counts, tokenized by a regexp that keeps apostrophes inside words
    freq = WordCloud(stopwords=stopwords,
                     regexp=r"\S[\S']+").process_text(text)
    # Force-regularize: cap every count at 6 so a few very frequent words
    # don't dwarf the rest of the cloud.
    for k, v in freq.items():
        if v >= 6:
            freq[k] = 6
    wc = WordCloud(
        font_path=path.join(getcwd(), ".fonts/ipaexg.ttf"),
        width=1200,
        height=630,
        colormap="cool",
        stopwords=stopwords,
        regexp=r"\S[\S']+",
    ).generate_from_frequencies(freq)
    image = wc.to_image()
    image.save(f"/tmp/{ha}.png", format="png", optimize=True)
Example #5
import re

import mysql.connector
from mysql.connector import Error
from wordcloud import WordCloud

# Paths, DbConfig and BackColors are project-level config/constants assumed
# to be defined elsewhere.


def SearchKeywordsByConfig(sqllist, maxwords):
    cachedstopwords = open(Paths.textPath + 'stopwords.txt').read()
    stopwords = cachedstopwords.split('\n')
    global cursor, cnn
    keyword = []
    for sql in sqllist:
        try:
            cnn = mysql.connector.connect(**DbConfig.newsDataConfig)
            cursor = cnn.cursor(buffered=True)
            cursor.execute(sql)
            keyword.append(cursor.fetchall())
        except Error as e:
            print(BackColors.WARNING + "error" + BackColors.ENDC)
            print(e)
        finally:
            cursor.close()
            cnn.close()
    # Tokenize the first result set, lowercase, and drop stopwords
    tokens = re.findall(r"[\w']+", " ".join(' '.join(elem) for elem in keyword[0]))
    lowerToken = [word.lower() for word in tokens]
    filteredToken = [word for word in lowerToken if word.strip() not in stopwords]

    w = WordCloud().process_text(" ".join(filteredToken))
    w = sorted(w.items(), reverse=True, key=lambda x: x[1])
    # Map the maxwords config code ('0'..'5') to a result-count limit
    mx = {'0': 10, '1': 20, '2': 50, '3': 100, '4': 200, '5': 500}.get(maxwords, 200)
    return w[:mx]
Example #6
import base64
import codecs
import io
import json
import sqlite3
import urllib.parse
import urllib.request

from wordcloud import STOPWORDS, WordCloud


def get_trending(offset):
    conn = sqlite3.connect('data/newsdb.db')
    sql = '''
        SELECT tokenizer_content FROM article;
    '''
    # Each row is a 1-tuple; join all article contents into one string
    text = " ".join(r[0] for r in conn.execute(sql))

    # Extend the default stopword set with a custom list
    with codecs.open("StopWord/stopword.txt", 'r', encoding='utf8') as file_in:
        for line in file_in:
            STOPWORDS.add(line.strip())
    word_cloud = WordCloud(stopwords=STOPWORDS,
                           max_words=700,
                           background_color="white",
                           width=1000,
                           height=400,
                           mode="RGB").generate(text).to_image()
    # Convert the word-cloud image to base64
    img = io.BytesIO()
    word_cloud.save(img, "PNG")
    img.seek(0)
    img_b64 = base64.standard_b64encode(img.getvalue()).decode()
    # Upload the image to Imgur
    client_id = "3bc58602360427f"
    headers = {'Authorization': 'Client-ID ' + client_id}
    data = {
        'image': img_b64,
        'title': 'word cloud image'
    }
    main_data = urllib.parse.urlencode(data).encode('utf-8')
    request = urllib.request.Request(url="https://api.imgur.com/3/upload.json",
                                     data=main_data,
                                     headers=headers)
    response = urllib.request.urlopen(request).read()
    parse = json.loads(response)
    image_url = parse['data']['link']

    # Rank words by raw count and keep the ten most frequent
    t = WordCloud().process_text(text)
    lst_trending_word = sorted(t.items(), key=lambda x: x[1], reverse=True)
    top10 = [word for word, _ in lst_trending_word[:10]]

    sql2 = '''
        SELECT * FROM article WHERE tokenizer_content LIKE ? ORDER BY id DESC LIMIT 20 OFFSET ?'''
    trending_article = []
    seen_ids = set()
    for word in top10:
        data = conn.execute(sql2, ('%' + word + '%', offset)).fetchall()
        for i in data:
            # De-duplicate articles by id (the original compared the id against
            # a list of full rows, which never matched)
            if i[0] not in seen_ids:
                seen_ids.add(i[0])
                trending_article.append(i)
    conn.close()
    return trending_article, image_url