Example #1
import re
from textblob import TextBlob
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt
from resources.database import Database
import resources.preprocessing as pre
import pandas as pd
import resources.color_sheme as color_sheme
plt.rcParams.update(plt.rcParamsDefault)

"""
Get Data
"""

db = Database()
source_data = db.get_all(collection="selected2")

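# split the articles from "selected2" into the German and the Anglo-American corpus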
data = {
    "Deutscher Sprachraum": [art for art in source_data if art["language"] == "de"],
    "Anglo-Amerikanischer Sprachraum": [art for art in source_data if art["language"]=="en"]
    }

"""
Plot Histogram
"""

# plot en vs de
plt.figure(figsize=(10, 10), dpi=222)
sent = dict()
colors = color_sheme.get_colors_lang()
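# A possible continuation of the histogram plot (hypothetical sketch, not part of
# the original script; it assumes that every article dict carries a "sentiment"
# value, as written back in the sentiment example further below):
# for name, articles in data.items():
#     plt.hist([a["sentiment"] for a in articles], bins=50, alpha=0.6, label=name)
# plt.legend()
# plt.xlabel("Sentiment")
# plt.ylabel("number of articles")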
import matplotlib.pyplot as plt
import os
import sys
sys.path.append("./")
from resources.database import Database
import resources.color_sheme as color_sheme
import pandas as pd
import numpy as np

db = Database()
filename = os.path.basename(__file__).split(".")[0]

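# count articles per source and language at each processing stage:
# "article" = raw crawl, "date" = after the temporal filter, "selected2" = after the
# content filter (matching the x labels "roh", "zeitlich", "inhaltlich" below)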
df1 = pd.DataFrame(db.get_all(collection="article"))
dfg1 = df1[["_id", "source", "language"]].groupby(
    ["source", "language"], as_index=False).count()

df2 = pd.DataFrame(db.get_all(collection="date"))
dfg2 = df2[["_id", "source", "language"]].groupby(
    ["source", "language"], as_index=False).count()

df3 = pd.DataFrame(db.get_all(collection="selected2"))
dfg3 = df3[["_id", "source", "language"]].groupby(
    ["source", "language"], as_index=False).count()

xlabel = ["en - roh","en - zeitlich", "en - inhaltlich","de - roh","de - zeitlich", "de - inhaltlich"]

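# six counts per source: positions 0-2 hold the English counts, positions 3-5 the
# German counts (note: reusing the row index of dfg1 for dfg2/dfg3 assumes all three
# groupby results contain the same (source, language) rows in the same order)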
store = dict()
for index, row in dfg1.iterrows():
    if row["language"] == "en":
        store[row["source"]] = [row["_id"], dfg2.loc[index]["_id"], dfg3.loc[index]["_id"],0,0,0]
    else:
            section_result["mid"]["t"] + section_result["first"]["t"] * weight
        sentiments["textblob_ansatz3"] = section_sent/(weight*2+1)

        section_sent = section_result["first"]["v"] * weight + \
            section_result["mid"]["v"] + section_result["first"]["v"] * weight
        sentiments["vader_ansatz3"] = section_sent/(weight*2+1)
    except Exception as e:
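        # if the section-wise computation fails, fall back to the "ansatz 1" score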
        sentiments["vader_ansatz3"] = sentiments["vader_ansatz1"]

    return sentiments

"""
Get relevant Data from database
"""

db = Database()
source_data = db.get_all(collection="selected2")

"""
Calculate Sentiment and store values
"""

data = {"Anglo-Amerikanischer Sprachraum": list(), "Deutscher Sprachraum": list()}

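# compute the weighted VADER sentiment ("vader_ansatz3") for every article, write it
# back to the "selected2" collection, and bucket the article by language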
for art in source_data:
    art["sentiment"] = calc_sent(art)["vader_ansatz3"]
    db.update_article(collection="selected2", data=art)
    if art["language"] == "en":
        data["Anglo-Amerikanischer Sprachraum"].append(art)
    else:
        data["Deutscher Sprachraum"].append(art)
import matplotlib.pyplot as plt
import os
import sys
sys.path.append("./")
from resources.database import Database
import pandas as pd
import numpy as np
import resources.color_sheme as color_sheme
plt.rcParams.update(plt.rcParamsDefault)
db = Database()
filename = os.path.basename(__file__).split(".")[0]

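# number of articles per source and search term in the raw "article" collection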
all_articles = db.get_all(collection="article")
df = pd.DataFrame(all_articles)
dfg = df[["_id", "source", "search-term"]].groupby(["source", "search-term"],
                                                   as_index=False).count()
sources = dfg["source"].unique()
terms = dfg["search-term"].unique()

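# for every search term, build a count vector aligned with `sources`; sources without
# any article for that term get a count of 0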
store = dict()
for term in terms:
    result = list()
    for source in sources:
        if source in dfg.loc[dfg["search-term"] == term]["source"].to_list():
            count = int(dfg.loc[(dfg["source"] == source)
                                & (dfg["search-term"] == term)]["_id"])
            result.append(count)
        else:
            result.append(0)
    store[term] = result
import sys
sys.path.append("./")
import pyLDAvis.gensim
import gensim
from gensim import corpora
from resources.database import Database
import resources.preprocessing as pre
import numpy as np
import random
from datetime import datetime

"""
Get data from database
"""
db = Database()
source_data = db.get_all(collection="date")

"""
Do some preprocessing
"""
text = [i["lda"] for i in source_data]
text_data = [t.split(" ") for t in text]
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
# load optimal model
ldamodel = gensim.models.ldamodel.LdaModel.load("./data_selection/lda_models/lda_t22_07072020_103348.model")

# enter all selected topics from survey
selected_topics = [3, 18, 19, 20]
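# topic distribution of every document under the loaded model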
document_topics = [ldamodel.get_document_topics(item) for item in corpus]
Example #6
import sys
sys.path.append("./")
import pyLDAvis.gensim
import gensim
from gensim import corpora
from resources.database import Database
import resources.preprocessing as pre
import numpy as np
import pandas as pd
import random
from datetime import datetime
import resources.color_sheme as color_sheme

colors = color_sheme.get_colors_lang()


db = Database()
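# human-readable topic names, taken from the "Ergebnis" column of the interpretation sheet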
topic_names = pd.read_csv("resources/interpretation.csv")["Ergebnis"].to_list()
source_data = db.get_all(collection="date")
text = [i["lda"] for i in source_data]
text_data = [t.split(" ") for t in text]
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
ldamodel = gensim.models.ldamodel.LdaModel.load("data_selection/lda_models/lda_t22_07072020_103348.model")

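# topics selected from the survey (same selection as in the previous example)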
selected_topics = [3, 18, 19, 20]
document_topics = [ldamodel.get_document_topics(item) for item in corpus]

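# collect the topics that are assigned to a document with a probability above 0.3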
suretopics = list()
for doc in document_topics:
    for topic in doc:
        if topic[1] > 0.3: