def draw_histogram(df1, df2):
    """Plot overlaid histograms of hotspot counts per article and save the PNG.

    df1, df2 -- single-column ("count") dataframes for the weekly and monthly
    sliding-window runs; rendered via the project-level `hist` helper
    (presumably a pyspark_dist_explore-style axes helper -- TODO confirm).
    """
    # Fix: dropped the original `plt.rcParams.update({})` -- an empty update
    # is a no-op.
    fig, axes = plt.subplots()
    axes.set_yscale("log")  # counts are heavy-tailed; log scale keeps the tail visible
    axes.set_xlabel("Anzahl der Hotspots")
    axes.set_ylabel("Anzahl der Artikel")
    hist(axes, [df1, df2], bins=100, color=["red", "blue"])
    plt.savefig(
        "/scratch/wikipedia-dump/plots/hotspots/hotspot_weekly_vs_monthly.png")


# Build the list of input dump partitions (1.json .. 26.json).
# NOTE(review): `filenames` (a list) and `base_path` must be defined above
# this chunk -- confirm against the full file.
for i in range(1, 27):
    filenames.append(base_path + str(i) + ".json")

df = init_df(filenames)

# Detect hotspots with a 1-unit and a 4-unit sliding window (weekly vs.
# monthly).  NOTE(review): window_size is passed as a *string* -- confirm
# the hotspot_detection API expects str rather than int.
df_h_weekly = hotspot_detection.sliding_window_hotspots_by_time(
    df, window_size="1")
df_h_monthly = hotspot_detection.sliding_window_hotspots_by_time(
    df, window_size="4")

# Number of hotspots per article title, reduced to the bare count column
# for plotting.
df_g1 = df_h_weekly.groupBy(col("title")).count()
df_g2 = df_h_monthly.groupBy(col("title")).count()
df_h1 = df_g1.select(col("count"))
df_h2 = df_g2.select(col("count"))

draw_histogram(df_h1, df_h2)
# How often does each author show up inside article hotspots?
# Pipeline: load hotspot + revision data, drop anonymous/bot authors,
# join revisions onto hotspot windows, count distinct hotspots per author.

def _drop_null_and_bot_authors(frame):
    """Keep rows whose author is non-null and does not match a bot name.

    Uses subtract() exactly like the original code so the de-duplicating
    (EXCEPT DISTINCT) semantics of Spark's subtract are preserved.
    """
    frame = frame.where(col("author").isNotNull())
    bot_rows = frame.where(col("author").rlike("|".join(bots)))
    return frame.subtract(bot_rows)


df_hot = _drop_null_and_bot_authors(
    load_to_spark.init_article_hotspot_df(filenames))

# Load the main revision dataframe: one row per (author, title, timestamp).
df = load_to_spark.main_init_df(filenames).select(
    "author", "title", col("editTime").alias("timestamp"))
df = _drop_null_and_bot_authors(df)
df.show()

# Hotspot windows per article; the title is aliased so the join below
# does not clash with the revision dataframe's own "title" column.
df_hotspots = hotspot_detection.sliding_window_hotspots_by_time(df_hot).select(
    "window", col("title").alias("title1"))
df_hotspots.show()

# An author "occurs" in a hotspot when one of their edits on that article
# falls inside the hotspot window; distinct() counts each
# (author, title, window) combination only once.
same_title = col("title") == col("title1")
in_window = col("timestamp").between(col("window")["start"],
                                     col("window")["end"])
df_joined = df.join(df_hotspots, same_title & in_window)\
    .select("author", "title", "window").distinct()

# Hotspot occurrences per author, capped at < 10000 to cut extreme outliers.
df_hist = df_joined.groupBy(col("author")).count()\
    .select(col("count"))\
    .where(col("count") < 10000)

draw_histogram(
    df_hist, "Vorkommen in Artikelhotspots", "Anzahl der Autoren",
    "/scratch/wikipedia-dump/plots/hotspots/author_occurrences_per_hotspot.png"
)
different_articles_per_author(df_joined)
# --- tail of a function whose `def` line lies above this chunk ---
# Removes the bot-authored rows (subtract also de-duplicates) and returns
# the cleaned dataframe.
    df = df.subtract(df_bots)
    return df


def draw_histogram(df):
    # Plot a log-scale histogram of hotspot counts (per title or per author)
    # and save it under a `type`-dependent filename.
    #
    # NOTE(review): `type` here is a module-level variable expected to be
    # assigned above this chunk ("title" or "author"); it shadows the
    # builtin `type` -- confirm it is actually set before this runs.
    plt.yscale('log')
    plt.rcParams.update({'font.size': 28})
    fig, axes = plt.subplots()
    fig.set_size_inches(20, 20)
    axes.set_yscale("log")
    axes.set_xlabel("Anzahl der Hotspots")
    if type == "title":
        axes.set_ylabel("Anzahl der Titel")
    elif type == "author":
        axes.set_ylabel("Anzahl der Autoren")
    # `hist` is a project-level helper that draws a dataframe histogram onto
    # the axes (presumably pyspark_dist_explore-style -- TODO confirm).
    hist(axes, [df], bins=20, color=["red"])
    plt.savefig("/scratch/wikipedia-dump/plots/hotspots/" + type +
                "_hotspot_count.png")


# Script: count hotspots per `type` (title/author) and plot the distribution.
print("loading files")
df = init_df()
print("calculating hotspots")
df_hotspots = hotspot_detection.sliding_window_hotspots_by_time(df, type=type)
print("counting hotspots")
# Group by the title/author column named by `type`.
df_grouped = df_hotspots.groupBy(col(type)).count()
print("selecting needed columns")
df_hist = df_grouped.select("count")
print("drawing histogram")
draw_histogram(df_hist)
def draw_histogram(df):
    """Plot a histogram (20 bins) of hotspot counts per title and save it.

    df -- single-column ("count") dataframe consumed by the project-level
    `hist` helper (presumably pyspark_dist_explore-style -- TODO confirm).
    """
    plt.rcParams.update({'font.size': 28})
    fig, axes = plt.subplots()
    fig.set_size_inches(20, 20)
    axes.set_xlabel("Anzahl der Hotspots")
    axes.set_ylabel("Anzahl der Titel")
    hist(axes, [df], bins=20, color=["red"])
    plt.savefig(
        "/scratch/wikipedia-dump/plots/jaccard/hotspots_per_title_correlation_author.png"
    )


df = init_df()
# Fix: dropped `df_category = init_category_df(df)` -- the result was never
# used anywhere in this script (dead code).

# Title-level hotspot windows; cached because the dataframe is scanned twice
# (once for the jaccard pairing, once for the per-title count below).
df_hotspots = hotspot_detection.sliding_window_hotspots_by_time(
    df, type="title").select("window", "title")
df_hotspots.cache()
df_hotspots.show()

# Pairwise jaccard over hotspot windows per title; the result is filtered on
# title1/title2 below.  NOTE(review): `< 0.3` reads like a *distance*
# threshold (keeping similar pairs) -- confirm whether
# jaccard_with_min_hashing returns similarity or distance.
df_jaccard = jaccard_similarity.jaccard_with_min_hashing(
    df_hotspots, "title", "window")
df_similar = df_jaccard.where(col("jaccard") < 0.3).select("title1", "title2")

# All distinct titles that occur in at least one such pair.
df_titles = df_similar.select(col("title1").alias("title")).union(
    df_similar.select(col("title2").alias("title"))).distinct()

# Hotspot count per correlated title, reduced to the bare count column.
df_count = df_hotspots.join(df_titles, "title")
df_grouped = df_count.groupBy(col("title")).count()
df_hist = df_grouped.select(col("count"))

draw_histogram(df_hist)