Example #1
0
def init_df():
    """Build the article-hotspot DataFrame with null and bot authors removed.

    Loads revisions for the module-level ``filenames``, keeps only rows
    that have an author, subtracts every row whose author matches one of
    the module-level ``bots`` patterns, prints the remaining row count,
    and returns the cleaned DataFrame.
    """
    bot_pattern = "|".join(bots)
    revisions = load_to_spark.init_article_hotspot_df(filenames)
    revisions = revisions.where(col("author").isNotNull())
    bot_rows = revisions.where(col("author").rlike(bot_pattern))
    cleaned = revisions.subtract(bot_rows)
    print(cleaned.count())
    return cleaned
Example #2
0
def init_dataframes(filenames):
    """Return (bot-free revision DataFrame, title|category DataFrame).

    The revision DataFrame is loaded from ``filenames`` with null-author
    rows dropped and rows whose author matches a module-level ``bots``
    pattern subtracted.  The category DataFrame is joined to the surviving
    revisions on ``id`` and reduced to ``title`` and ``category`` columns.
    """
    revisions = load_to_spark.init_article_hotspot_df(filenames)
    revisions = revisions.where(col("author").isNotNull())
    bot_rows = revisions.where(col("author").rlike("|".join(bots)))
    revisions = revisions.subtract(bot_rows)
    categories = load_to_spark.create_category_df()
    title_ids = revisions.select("title", "id")
    categories = categories.join(title_ids, "id").select("title", "category")
    return revisions, categories
Example #3
0
def draw_histogram(df,
                   output_path="/scratch/wikipedia-dump/plots/hotspots/article_category_hotspot_jaccard.png"):
    """Render a 20-bin red histogram of *df* and save it as a PNG.

    Args:
        df: value handed straight to the plotting ``hist`` helper
            (presumably a Spark DataFrame/column -- TODO confirm against
            the ``hist`` implementation in use).
        output_path: destination file for the figure.  Defaults to the
            previously hard-coded path, so existing callers are unchanged.
    """
    fig, axes = plt.subplots()
    fig.set_size_inches(20, 20)
    hist(axes, [df], bins=20, color=['red'])
    plt.savefig(output_path)
    # Close the figure so repeated calls do not accumulate open figures
    # (matplotlib keeps every figure alive until explicitly closed).
    plt.close(fig)


# NOTE(review): the loop over dump parts 1-5 is disabled; only part 11
# is appended below -- confirm this restriction is intentional.
#for i in range(1, 6):
#    filenames.append(base_path + str(i) + ".json")

filenames.append(base_path + "11.json")

# Revision-level data, columns: revID|author|timestamp|title
df_hot = load_to_spark.init_article_hotspot_df(filenames)
# Per-article hotspot windows, columns: title|window|rev_count
df_hotspots = hotspot_detection.sliding_window_hotspots_by_time(df_hot)

# Category links, columns: id|category
df_categories = load_to_spark.create_category_df(
    "/scratch/wikipedia-dump/categorylinks.json")
# Full revision data, columns: id|title|author|authorID|editTime
df = load_to_spark.main_init_df(filenames)
# Rename "id" to "id1" so the join below can distinguish the two id
# columns; distinct() removes duplicate revision rows.
df = df.select("title", "author",
               col("editTime").alias("timestamp"),
               col("id").alias("id1")).distinct()

# Join revisions to their categories; col("id") now resolves from
# df_categories because df only carries "id1".  Result columns:
# title|author|category|timestamp
df_joined = df.join(df_categories, col("id") == col("id1")).drop("id1", "id")
# Triggers evaluation of the join; the returned count is discarded.
df_joined.count()
Example #4
0
def init_df():
    """Load and return the article-hotspot DataFrame for the
    module-level ``filenames`` list, with no further filtering."""
    return load_to_spark.init_article_hotspot_df(filenames)