# Script fragment: loads articles from the "selected2" collection, splits them
# by language, and begins a sentiment histogram plot (German vs. English press).
# NOTE(review): excerpt is truncated — the actual plotting loop continues
# beyond this fragment.
import re
from textblob import TextBlob
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt
from resources.database import Database
import resources.preprocessing as pre
import pandas as pd
import resources.color_sheme as color_sheme

# Reset any globally modified matplotlib settings to library defaults.
plt.rcParams.update(plt.rcParamsDefault)

""" Get Data """
db = Database()
source_data = db.get_all(collection="selected2")
# Partition articles by language code; only "de" and "en" are expected here
# (presumably guaranteed upstream by the selection step — TODO confirm).
data = {
    "Deutscher Sprachraum": [art for art in source_data if art["language"] == "de"],
    "Anglo-Amerikanischer Sprachraum": [art for art in source_data if art["language"] == "en"],
}

""" Plot Histogram """
# plot en vs de
plt.figure(figsize=(10, 10), dpi=222)
sent = dict()
colors = color_sheme.get_colors_lang()
# Script fragment: counts articles per (source, language) at three pipeline
# stages — raw ("article"), time-filtered ("date"), content-filtered
# ("selected2") — to compare corpus sizes.
# NOTE(review): excerpt is truncated — the `else:` branch of the final loop
# continues beyond this fragment.
import matplotlib.pyplot as plt
import os
import sys
sys.path.append("./")
from resources.database import Database
import resources.color_sheme as color_sheme
import pandas as pd
import numpy as np

db = Database()
# Script name without extension, used elsewhere (presumably as an output
# filename — TODO confirm).
filename = os.path.basename(__file__).split(".")[0]

# One grouped count per pipeline stage; "_id" serves as the count column.
df1 = pd.DataFrame(db.get_all(collection="article"))
dfg1 = df1[["_id", "source", "language"]].groupby(
    ["source", "language"], as_index=False).count()
df2 = pd.DataFrame(db.get_all(collection="date"))
dfg2 = df2[["_id", "source", "language"]].groupby(
    ["source", "language"], as_index=False).count()
df3 = pd.DataFrame(db.get_all(collection="selected2"))
dfg3 = df3[["_id", "source", "language"]].groupby(
    ["source", "language"], as_index=False).count()

# Axis labels: raw / temporal / content filtering, per language (German labels).
xlabel = ["en - roh", "en - zeitlich", "en - inhaltlich",
          "de - roh", "de - zeitlich", "de - inhaltlich"]
store = dict()
for index, row in dfg1.iterrows():
    # NOTE(review): indexing dfg2/dfg3 with dfg1's row index assumes all three
    # grouped frames contain the same (source, language) pairs in the same
    # order — this breaks if any source is missing from a later stage. Verify.
    if row["language"] == "en":
        store[row["source"]] = [row["_id"], dfg2.loc[index]["_id"],
                                dfg3.loc[index]["_id"], 0, 0, 0]
    else:
        # NOTE(review): fragment starts mid-expression — the beginning of the
        # enclosing function (presumably `calc_sent`) and of this weighted-sum
        # statement lie outside this excerpt.
        section_result["mid"]["t"] + section_result["first"]["t"] * weight
        # Weighted average over three sections; denominator matches weights
        # first*weight + mid*1 + last*weight.
        sentiments["textblob_ansatz3"] = section_sent/(weight*2+1)
        # NOTE(review): both weighted terms below use section_result["first"];
        # given the weight*2+1 denominator, the second term likely should be
        # section_result["last"]["v"] — confirm against the full function.
        section_sent = section_result["first"]["v"] * weight + \
            section_result["mid"]["v"] + section_result["first"]["v"] * weight
        sentiments["vader_ansatz3"] = section_sent/(weight*2+1)
    except Exception as e:
        # NOTE(review): broad except only falls back for the vader key; if the
        # failure happens before the textblob assignment, "textblob_ansatz3"
        # is left unset — verify callers tolerate that.
        sentiments["vader_ansatz3"] = sentiments["vader_ansatz1"]
    return sentiments


""" Get relevant Data from database """
db = Database()
source_data = db.get_all(collection="selected2")

""" Calculate Sentiment and store values """
# Compute the section-weighted vader sentiment for every article, persist it
# back to the "selected2" collection, and bucket articles by language region.
data = {"Anglo-Amerikanischer Sprachraum": list(),
        "Deutscher Sprachraum": list()}
for art in source_data:
    art["sentiment"] = calc_sent(art)["vader_ansatz3"]
    db.update_article(collection="selected2", data=art)
    if art["language"] == "en":
        data["Anglo-Amerikanischer Sprachraum"].append(art)
    else:
        data["Deutscher Sprachraum"].append(art)
# Count raw articles per (source, search-term) pair and collect the counts
# into `store`: one list per search term, ordered by `sources`, with 0 for
# sources that have no articles for that term.
import matplotlib.pyplot as plt
import os
import sys
sys.path.append("./")
from resources.database import Database
import pandas as pd
import numpy as np
import resources.color_sheme as color_sheme

# Reset matplotlib to library defaults before plotting.
plt.rcParams.update(plt.rcParamsDefault)

db = Database()
# Script name without extension (e.g. for naming output artifacts).
filename = os.path.basename(__file__).split(".")[0]

all_articles = db.get_all(collection="article")
df = pd.DataFrame(all_articles)
# "_id" acts as the count column after the grouped count().
dfg = df[["_id", "source", "search-term"]].groupby(
    ["source", "search-term"], as_index=False).count()

sources = dfg["source"].unique()
terms = dfg["search-term"].unique()

store = dict()
for term in terms:
    counts = list()
    for source in sources:
        # Rows of the grouped frame matching this exact (source, term) pair.
        pair_rows = dfg.loc[(dfg["source"] == source) & (dfg["search-term"] == term)]
        if pair_rows.empty:
            counts.append(0)
        else:
            counts.append(int(pair_rows["_id"]))
    store[term] = counts
# Rebuild the LDA corpus from the "date" collection, load the tuned topic
# model, and compute per-document topic distributions for the topics chosen
# in the survey.
import sys
sys.path.append("./")
import pyLDAvis.gensim
import gensim
from gensim import corpora
from resources.database import Database
# Fix: was `import preprocessing as pre` — inconsistent with the sibling
# scripts in this project (`resources.preprocessing`) and unresolvable from
# the repo root that sys.path.append("./") adds.
import resources.preprocessing as pre
import numpy as np
import random
from datetime import datetime

""" Get data from database """
db = Database()
source_data = db.get_all(collection="date")

""" Do some preprocessing """
# The "lda" field holds the pre-tokenizable text; split on single spaces to
# reproduce the token lists the dictionary/model were trained on.
text = [i["lda"] for i in source_data]
text_data = [t.split(" ") for t in text]
dictionary = corpora.Dictionary(text_data)
# Renamed the comprehension variable so it no longer shadows `text` above.
corpus = [dictionary.doc2bow(tokens) for tokens in text_data]

# load optimal model
ldamodel = gensim.models.ldamodel.LdaModel.load(
    "./data_selection/lda_models/lda_t22_07072020_103348.model")

# enter all selected topics from survey
selected_topics = [3, 18, 19, 20]
# Topic probability distribution for every document in the corpus.
document_topics = [ldamodel.get_document_topics(item) for item in corpus]
# Script fragment: rebuilds the LDA corpus, loads the tuned model and the
# human-readable topic names, and begins filtering documents with a confident
# (> 0.3) topic assignment.
# NOTE(review): fragment is truncated at both ends — `import sys` must appear
# before this excerpt, and the body of the final `if` continues beyond it.
sys.path.append("./")
import pyLDAvis.gensim
import gensim
from gensim import corpora
from resources.database import Database
import resources.preprocessing as pre
import numpy as np
import pandas as pd
import random
from datetime import datetime
import resources.color_sheme as color_sheme

colors = color_sheme.get_colors_lang()
db = Database()
# Human-readable topic labels produced during topic interpretation
# (presumably one row per topic, in topic-id order — TODO confirm).
topic_names = pd.read_csv("resources/interpretation.csv")["Ergebnis"].to_list()
source_data = db.get_all(collection="date")

# Tokenize the "lda" text field exactly as during training.
text = [i["lda"] for i in source_data]
text_data = [t.split(" ") for t in text]
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

ldamodel = gensim.models.ldamodel.LdaModel.load(
    "data_selection/lda_models/lda_t22_07072020_103348.model")

# Topics chosen from the survey; same ids as in the sibling selection script.
selected_topics = [3, 18, 19, 20]
document_topics = [ldamodel.get_document_topics(item) for item in corpus]

suretopics = list()
for doc in document_topics:
    for topic in doc:
        # topic is a (topic_id, probability) pair; 0.3 is the confidence cut.
        if topic[1] > 0.3: