Example #1
# Assumed imports: matplotlib/pyplot for the plots, col() from PySpark, and
# hist() from the pyspark_dist_explore package (which plots Spark DataFrames
# onto matplotlib axes). load_to_spark and hotspot_detection are project
# modules that are not shown in this excerpt.
import matplotlib.pyplot as plt
from pyspark.sql.functions import col
from pyspark_dist_explore import hist

import hotspot_detection
import load_to_spark


def draw_histogram(df1, df2):
    plt.rcParams.update({'font.size': 28})
    fig, axes = plt.subplots()
    axes.set_yscale("log")
    axes.set_xlabel("Anzahl der Hotspots")
    axes.set_ylabel("Anzahl der Artikel")
    hist(axes, [df1, df2], bins=100, color=["red", "blue"])
    plt.savefig(
        "/scratch/wikipedia-dump/plots/hotspots/hotspot_weekly_vs_monthly.png")


# base_path is assumed to be defined further up in the original script and to
# point at the directory holding the JSON dump slices.
filenames = []
for i in range(1, 27):
    filenames.append(base_path + str(i) + ".json")

#filenames.append(base_path + "1.json")

df = init_df(filenames)
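

# Sketch only: init_df / load_to_spark.main_init_df are project helpers that
# are not included in this excerpt. Conceptually they read the JSON dump
# slices into a Spark DataFrame, roughly like this (the app name is made up):
from pyspark.sql import SparkSession

def init_df_sketch(filenames):
    spark = SparkSession.builder.appName("wikipedia-hotspots").getOrCreate()
    # one JSON file per dump slice; spark.read.json accepts a list of paths
    return spark.read.json(filenames)
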
# window_size is presumably given in weeks, matching the weekly (1) and
# monthly (4) naming below
df_h_weekly = hotspot_detection.sliding_window_hotspots_by_time(
    df, window_size="1")
df_h_monthly = hotspot_detection.sliding_window_hotspots_by_time(
    df, window_size="4")

df_g1 = df_h_weekly.groupBy(col("title")).count()
df_g2 = df_h_monthly.groupBy(col("title")).count()

df_h1 = df_g1.select(col("count"))
df_h2 = df_g2.select(col("count"))

draw_histogram(df_h1, df_h2)
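

# For reference: hotspot_detection.sliding_window_hotspots_by_time is project
# code that is not included in this excerpt. A rough sketch of the idea, as it
# is used above, could bucket each article's edits into time windows that
# slide day by day and keep the (window, title) pairs with unusually many
# edits; the "timestamp" column, the daily slide and the threshold are all
# assumptions made for illustration.
from pyspark.sql import functions as F

def sliding_window_hotspots_sketch(df, window_size="1", threshold=10):
    days = int(window_size) * 7  # window_size given in weeks
    buckets = df.groupBy(
        F.window(F.col("timestamp"), "{} days".format(days), "1 day").alias("window"),
        F.col("title")).count()
    return buckets.where(F.col("count") > threshold).select("window", "title")
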
# ---- second part: author occurrences per article hotspot ----
# bots is assumed to be a list of known bot-account name patterns defined in
# the omitted part of the original script.
df_hot = load_to_spark.init_article_hotspot_df(filenames)
df_hot = df_hot.where(col("author").isNotNull())
df_h_bots = df_hot.where(col("author").rlike("|".join(bots)))
df_hot = df_hot.subtract(df_h_bots)

#load main df
df = load_to_spark.main_init_df(filenames).select(
    "author", "title",
    col("editTime").alias("timestamp"))
df = df.where(col("author").isNotNull())
df_bots = df.where(col("author").rlike("|".join(bots)))
df = df.subtract(df_bots)
df.show()

df_hotspots = hotspot_detection.sliding_window_hotspots_by_time(df_hot).select(
    "window",
    col("title").alias("title1"))
df_hotspots.show()
df_joined = df.join(df_hotspots, (col("title") == col("title1")) & (col("timestamp").between(col("window")["start"], col("window")["end"])))\
    .select("author", "title", "window").distinct()

df_grouped = df_joined.groupBy(col("author")).count()
df_hist = df_grouped.select(col("count"))
df_hist = df_hist.where(col("count") < 10000)
# Note: this call expects a draw_histogram(df, xlabel, ylabel, output_path)
# variant from the original source file, not the two-DataFrame version defined
# at the top of this excerpt. (The labels mean "occurrences in article
# hotspots" and "number of authors".)
draw_histogram(
    df_hist, "Vorkommen in Artikelhotspots", "Anzahl der Autoren",
    "/scratch/wikipedia-dump/plots/hotspots/author_occurrences_per_hotspot.png"
)

different_articles_per_author(df_joined)
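

# Sketch only: different_articles_per_author is project code that is not
# included in this excerpt. Given its input df_joined (author | title |
# window), a plausible version counts the distinct hotspot articles per
# author; everything below is an assumption made for illustration.
from pyspark.sql import functions as F

def different_articles_per_author_sketch(df_joined):
    return (df_joined
            .groupBy("author")
            .agg(F.countDistinct("title").alias("distinct_articles")))
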
Example #3
    # (tail of a truncated helper, presumably the init_df used below; its
    # start and the script's imports are in the omitted part of the file)
    df = df.subtract(df_bots)
    return df


def draw_histogram(df):
    # `type` is a module-level string ("title" or "author") set in the omitted
    # part of the original script; it shadows the Python builtin of that name.
    plt.rcParams.update({'font.size': 28})
    fig, axes = plt.subplots()
    fig.set_size_inches(20, 20)
    axes.set_yscale("log")
    axes.set_xlabel("Anzahl der Hotspots")
    if type == "title":
        axes.set_ylabel("Anzahl der Titel")
    elif type == "author":
        axes.set_ylabel("Anzahl der Autoren")
    hist(axes, [df], bins=20, color=["red"])
    plt.savefig("/scratch/wikipedia-dump/plots/hotspots/" + type +
                "_hotspot_count.png")


print("loading files")
df = init_df()
print("calculating hotspots")
df_hotspots = hotspot_detection.sliding_window_hotspots_by_time(df, type=type)
print("counting hotspots")
df_grouped = df_hotspots.groupBy(col(type)).count()
print("selecting needed columns")
df_hist = df_grouped.select("count")
print("drawing histogram")
draw_histogram(df_hist)


# ---- second script: hotspot counts per title, filtered by a Jaccard
# comparison of the titles' hotspot windows ----
def draw_histogram(df):
    plt.rcParams.update({'font.size': 28})
    fig, axes = plt.subplots()
    fig.set_size_inches(20, 20)
    axes.set_xlabel("Anzahl der Hotspots")
    axes.set_ylabel("Anzahl der Titel")
    hist(axes, [df], bins=20, color=["red"])
    plt.savefig(
        "/scratch/wikipedia-dump/plots/jaccard/hotspots_per_title_correlation_author.png"
    )


df = init_df()
df_category = init_category_df(df)  # not used further in this excerpt
df_hotspots = hotspot_detection.sliding_window_hotspots_by_time(
    df, type="title").select("window", "title")
df_hotspots.cache()
df_hotspots.show()

df_jaccard = jaccard_similarity.jaccard_with_min_hashing(
    df_hotspots, "title", "window")
# df_jaccard: one row per title pair, columns title1 | title2 | jaccard
df_similar = df_jaccard.where(col("jaccard") < 0.3).select("title1", "title2")
df_titles = df_similar.select(col("title1").alias("title")).union(
    df_similar.select(col("title2").alias("title"))).distinct()
df_count = df_hotspots.join(df_titles, "title")
df_grouped = df_count.groupBy(col("title")).count()

df_hist = df_grouped.select(col("count"))
draw_histogram(df_hist)
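

# Sketch only: jaccard_similarity.jaccard_with_min_hashing is project code
# that is not included in this excerpt. One common way to approximate the
# Jaccard distance between the window sets of two titles is Spark ML's
# MinHashLSH; the feature construction, column handling and numHashTables
# below are assumptions made for illustration.
from pyspark.sql import functions as F
from pyspark.ml.feature import CountVectorizer, MinHashLSH

def jaccard_with_min_hashing_sketch(df, key_col, set_col):
    # collect each key's set elements as strings and vectorize them
    sets = df.groupBy(key_col).agg(
        F.collect_set(F.col(set_col).cast("string")).alias("items"))
    vectors = CountVectorizer(inputCol="items", outputCol="features",
                              binary=True).fit(sets).transform(sets)
    model = MinHashLSH(inputCol="features", outputCol="hashes",
                       numHashTables=5).fit(vectors)
    # approxSimilarityJoin yields pairs with their approximate Jaccard distance
    pairs = model.approxSimilarityJoin(vectors, vectors, 1.0, distCol="jaccard")
    return (pairs
            .select(F.col("datasetA." + key_col).alias(key_col + "1"),
                    F.col("datasetB." + key_col).alias(key_col + "2"),
                    "jaccard")
            .where(F.col(key_col + "1") < F.col(key_col + "2")))

# e.g. jaccard_with_min_hashing_sketch(df_hotspots, "title", "window")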