def app():
    """Render the GoldenRetriever Streamlit front end.

    Shows a preview of the ``userinput`` table from ``qna_data.db``, then
    builds a word cloud from the stored query strings and displays it both
    as an image and as a word/occurrence table.
    """
    conn = create_connection('qna_data.db')

    st.title('GoldenRetriever')
    st.header(
        'This front end application allows you to easily know about what the employees are talking about!'
    )
    st.markdown(
        'View the source code [here](https://github.com/aimakerspace/goldenretriever)!'
    )
    st.markdown(
        'Visit our [community](https://makerspace.aisingapore.org/community/ai-makerspace/) and ask us a question!'
    )

    df = pd.read_sql_query("SELECT * FROM userinput", conn)
    st.header('Database Preview')
    st.write(df)

    st.markdown('# Word Cloud')
    # Create stopword list:
    stopwords = set(STOPWORDS)
    word_cloud_data = " ".join(query for query in df.query_str)

    # Generate a word cloud image
    wordcloud = WordCloud(stopwords=stopwords,
                          background_color="white").generate(word_cloud_data)
    word_counts = WordCloud().process_text(word_cloud_data)
    word_count_df = pd.DataFrame(
        [[k, v] for k, v in word_counts.items()],
        columns=['Unique Word', 'Occurrence'])

    # Display the generated image.  (The former plt.show() call was removed:
    # it attempts to open a GUI window on the server and is useless inside a
    # Streamlit app; the stray debug print(word_counts) was removed too.)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    st.pyplot()

    st.markdown('# Table Format of WordCloud')
    st.write(word_count_df)
def _word_cloud(self, code):
    """Split *code* into plain text and an absolute word-frequency map.

    *code* is a bytes payload whose word-cloud section sits between two
    '72F9882144A13C12' sentinel markers.  Returns a tuple ``(data, freq)``
    where ``data`` is the decoded payload with the marked section removed
    and ``freq`` maps each word to an approximate absolute count.
    """
    decoded = code.decode('u8')
    # Pull out the word-cloud text sitting between the sentinel markers.
    text = re.search("72F9882144A13C12(.*)72F9882144A13C12", decoded, re.S).group(1)
    # WordCloud normalises frequencies so the most common word maps to 1.0.
    freq = WordCloud().generate(text).words_
    # Recover an absolute scale by counting occurrences of a word whose
    # relative frequency is exactly 1.0 (defaults to 1 if none is found).
    scale = 1
    for word, rel in freq.items():
        if rel in [1, 1.0]:
            scale = text.count(word)
    for word, rel in freq.items():
        freq[word] = round(rel * scale)
    # Strip the marked word-cloud section from the payload.
    data = decoded.replace('72F9882144A13C12' + text + '72F9882144A13C12\n', '')
    return data, freq
def find_most_fequent_word(dataset, top_n_words, filename):
    """Write the most frequent words of *dataset* to ``words/<filename>.txt``.

    Each output line has the form ``word,count``, ordered by descending
    count.  ``top_n_words == 0`` (or any value larger than the vocabulary)
    means "write every word".
    """
    counts = WordCloud(width=800,
                       height=800,
                       background_color='white',
                       stopwords=set(STOPWORDS),
                       min_font_size=10).process_text(dataset)
    #print(counts)
    # Clamp the requested size to the vocabulary; 0 means "all words".
    limit = top_n_words
    if limit == 0 or limit > len(counts):
        limit = len(counts)
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    with open('words/' + filename + '.txt', 'w') as out:
        for word, count in ranked[:limit]:
            out.write(str(word) + "," + str(count) + "\n")
def image_generation(text, ha):
    """Render *text* as a word-cloud PNG and save it to ``/tmp/<ha>.png``.

    Word counts are capped at 6 before rendering so that no single word
    dominates the image.
    """
    stopwords = ("THE", "The")
    raw_freq = WordCloud(stopwords=stopwords,
                         regexp=r"\S[\S']+").process_text(text)
    # force regularize: cap every word count at 6
    capped = {word: min(count, 6) for word, count in raw_freq.items()}
    cloud = WordCloud(
        font_path=path.join(getcwd(), ".fonts/ipaexg.ttf"),
        width=1200,
        height=630,
        colormap="cool",
        stopwords=stopwords,
        regexp=r"\S[\S']+",
    ).generate_from_frequencies(capped)
    cloud.to_image().save(f"/tmp/{ha}.png", format="png", optimize=True)
def SearchKeywordsByConfig(sqllist, maxwords):
    """Run each query in *sqllist* and return the top keywords found.

    Keyword strings fetched from the news database are tokenised,
    lower-cased, filtered against the stop-word list and counted with
    WordCloud.  Returns a list of ``(word, count)`` pairs sorted by
    descending count, truncated according to *maxwords* (config codes
    '0'..'5' map to 10/20/50/100/200/500; anything else means 200).
    """
    # Read the stop words inside a with-block so the file handle is
    # closed promptly instead of being leaked.
    with open(Paths.textPath + 'stopwords.txt') as stopword_file:
        stopwords = stopword_file.read().split('\n')
    global cursor, cnn
    keyword = []
    for sql in sqllist:
        cursor = cnn = None
        try:
            cnn = mysql.connector.connect(**DbConfig.newsDataConfig)
            cursor = cnn.cursor(buffered=True)
            cursor.execute(sql)
            keyword.append(cursor.fetchall())
        except Error as e:
            print(BackColors.WARNING + "error" + BackColors.ENDC)
            print(e)
        finally:
            # connect() may have raised before cursor/cnn were created;
            # the old code crashed with an unbound name here in that case.
            if cursor is not None:
                cursor.close()
            if cnn is not None:
                cnn.close()
    tokens = re.findall(r"[\w']+",
                        " ".join(' '.join(elem) for elem in keyword[0]))
    lowerToken = [word.lower() for word in tokens]
    filteredToken = [word for word in lowerToken if word.strip() not in stopwords]
    w = WordCloud().process_text(" ".join(filteredToken))
    w = sorted(w.items(), reverse=True, key=lambda x: x[1])
    # Map the maxwords config code to a concrete result-size limit
    # (replaces the former six-branch elif chain).
    limits = {'0': 10, '1': 20, '2': 50, '3': 100, '4': 200, '5': 500}
    mx = limits.get(maxwords, 200)
    return w[:mx]
def get_trending(offset):
    """Return trending articles plus a word-cloud image URL.

    Builds a word cloud over every article's tokenizer_content, uploads
    the rendered PNG to Imgur, then for each of the 10 most frequent
    words fetches up to 20 matching articles (shifted by *offset*).

    Returns a tuple ``(trending_article, image_url)``.
    """
    conn = sqlite3.connect('data/newsdb.db')
    sql = ''' SELECT tokenizer_content FROM article; '''
    rows = conn.execute(sql, )
    sentences = []
    for r in rows:
        sentences.append(r)
    # Flatten the (tokenizer_content,) row tuples into a flat array.
    cloud = np.array(sentences).flatten()
    # NOTE(review): fig is never used afterwards — presumably left over
    # from an earlier matplotlib-based rendering path; confirm.
    fig = plt.figure(figsize=(30, 20))
    # stop_words = stop_words + list(STOPWORDS)
    stop_words = []
    with codecs.open("StopWord/stopword.txt", 'r', encoding='utf8') as file_in:
        for line in file_in:
            stop_words.append(line.strip())
    # NOTE(review): this mutates the shared module-level STOPWORDS set on
    # every call, so additions accumulate across requests.
    for n in stop_words:
        STOPWORDS.add(n)
    word_cloud = wordcloud.WordCloud(stopwords=STOPWORDS,
                                     max_words=700,
                                     background_color="white",
                                     width=1000,
                                     height=400,
                                     mode="RGB").generate(
        str(cloud)).to_image()
    # convert word-cloud image to base64
    img = io.BytesIO()
    word_cloud.save(img, "PNG")
    img.seek(0)
    img_b64 = base64.standard_b64encode(img.getvalue()).decode()
    # Send image to Imgur
    client_id = "3bc58602360427f"
    headers = {'Authorization': 'Client-ID ' + client_id}
    data = {
        'image': img_b64,
        'title': 'word cloud image'
    }  # create a dictionary.
    main_data = urllib.parse.urlencode(data)
    main_data = main_data.encode('utf-8')
    # NOTE(review): urllib2 and urlopen are presumably Python 3 aliases
    # (urllib.request) imported elsewhere in this file — confirm.
    request = urllib2.Request(url="https://api.imgur.com/3/upload.json",
                              data=main_data,
                              headers=headers)
    response = urlopen(request).read()
    parse = json.loads(response)
    image_url = parse['data']['link']
    # Rank all words by frequency to pick the 10 trending terms.
    t = WordCloud().process_text(str(cloud))
    lst_trending_word = sorted(t.items(), key=lambda x: x[1], reverse=True)
    top10 = lst_trending_word[:10]
    tmp = []
    for i in top10:
        tmp.append(i[0])
    sql2 = ''' SELECT * FROM article WHERE tokenizer_content LIKE ? ORDER BY id DESC LIMIT 20 OFFSET ?'''
    trending_article = []
    for word in tmp:
        data = conn.execute(sql2, ('%' + word + '%', offset)).fetchall()
        for i in data:
            # NOTE(review): i[0] (the row id) is compared against a list of
            # full row tuples, so this membership test never matches and
            # duplicate articles are not actually filtered — verify intent.
            if i[0] not in trending_article:
                trending_article.append(i)
    conn.close()
    return trending_article, image_url