import json
import os
import pickle

# normalize_wikipedia_title, QuestionDatabase, WikipediaPage, write_page,
# safe_path, and WIKI_PAGE_PATH are assumed to be defined or imported
# elsewhere in this module.


def compute_question_player_counts(proto_log_path):
    """Map each question id in a Protobowl log to its number of distinct players."""
    from qanta.spark import create_spark_session

    spark = create_spark_session()
    df = spark.read.json(proto_log_path)
    df.createOrReplaceTempView('logs')
    # For each question, count the set of unique user ids that saw it.
    question_player_counts = spark.sql("""
        SELECT object.qid, size(collect_set(object.user.id)) AS n_players
        FROM logs
        GROUP BY object.qid
    """).collect()
    return {r.qid: r.n_players for r in question_player_counts}
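
# A minimal usage sketch, assuming a Protobowl-style JSON log at a hypothetical
# path (one JSON record per line, each carrying object.qid and object.user.id):
#
#     counts = compute_question_player_counts('data/external/protobowl.log')
#     qid, n_players = max(counts.items(), key=lambda kv: kv[1])
#     print(f'{qid} was played by {n_players} distinct players')
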
def create_wikipedia_title_pickle_unfiltered(dump_path, output_path):
    """Pickle the set of all normalized titles in a Wikipedia JSON dump.

    Unlike create_wikipedia_title_pickle below, this variant keeps every page,
    including disambiguation pages.
    """
    from qanta.spark import create_spark_session

    spark = create_spark_session()
    wiki_df = spark.read.json(dump_path)
    raw_titles = wiki_df.select('title').distinct().collect()
    clean_titles = {normalize_wikipedia_title(r.title) for r in raw_titles}
    with open(output_path, 'wb') as f:
        pickle.dump(clean_titles, f)
    spark.stop()
def create_wikipedia_title_pickle(dump_path, disambiguation_pages_path, output_path):
    """Pickle the set of normalized titles, excluding disambiguation pages."""
    from qanta.spark import create_spark_session

    with open(disambiguation_pages_path) as f:
        disambiguation_pages = set(json.load(f))
    spark = create_spark_session()
    wiki_df = spark.read.json(dump_path)
    rows = wiki_df.select('title', 'id').distinct().collect()
    # Keep only content pages: pages whose id is not in the disambiguation set.
    content_pages = [r for r in rows if int(r.id) not in disambiguation_pages]
    clean_titles = {normalize_wikipedia_title(r.title) for r in content_pages}
    with open(output_path, 'wb') as f:
        pickle.dump(clean_titles, f)
    spark.stop()
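
# A minimal sketch of driving both title-pickle variants, assuming hypothetical
# paths for a JSON-formatted Wikipedia dump and a JSON list of disambiguation
# page ids (neither filename comes from this module):
#
#     create_wikipedia_title_pickle_unfiltered(
#         'data/external/wikipedia/parsed-wiki.json',
#         'output/all_titles.pickle')
#     create_wikipedia_title_pickle(
#         'data/external/wikipedia/parsed-wiki.json',
#         'data/external/wikipedia/disambiguation_pages.json',
#         'output/content_titles.pickle')
#
#     with open('output/content_titles.pickle', 'rb') as f:
#         titles = pickle.load(f)  # set of normalized title strings
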
def create_wikipedia_cache(dump_path):
    """Write a page file for every Wikipedia page whose title matches a known answer."""
    from qanta.spark import create_spark_session

    spark = create_spark_session()
    db = QuestionDatabase()
    answers = set(db.all_answers().values())
    # Broadcast the answer set so each executor receives one read-only copy.
    b_answers = spark.sparkContext.broadcast(answers)
    # Paths used in Spark tasks need to be absolute, and the directory must exist.
    page_path = os.path.abspath(safe_path(WIKI_PAGE_PATH))

    def create_page(row):
        title = normalize_wikipedia_title(row.title)
        filter_answers = b_answers.value
        if title in filter_answers:
            page = WikipediaPage(title, row.text, None, None, row.id, row.url)
            write_page(page, page_path=page_path)

    # foreach runs create_page on the executors purely for its side effect
    # of writing page files; nothing is returned to the driver.
    spark.read.json(dump_path).rdd.foreach(create_page)
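
# A minimal sketch of the full cache build, assuming WIKI_PAGE_PATH and the
# QuestionDatabase are configured as elsewhere in qanta; the dump path below
# is hypothetical:
#
#     create_wikipedia_cache('data/external/wikipedia/parsed-wiki.json')
#
# Each matching page lands under WIKI_PAGE_PATH via write_page, so progress is
# best checked by listing that directory rather than by a return value.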