def run_job(self, sc, sqlc):

    if self.args.input_parquet:
        edge_df = sqlc.read.load(os.path.join(self.args.input_parquet, "edges"))
        vertex_df = sqlc.read.load(os.path.join(self.args.input_parquet, "vertices"))
    else:
        raise Exception("No input given!")

    if self.args.output_txt:

        vertices = sql(sqlc, """
            SELECT CONCAT(id, " ", domain) r
            FROM vertices
        """, {"vertices": vertex_df})

        edges = sql(sqlc, """
            SELECT CONCAT(src, " ", dst) r
            FROM edges
        """, {"edges": edge_df})

        vertices.coalesce(self.args.coalesce).write.text(
            os.path.join(self.args.output_txt, "vertices"),
            compression="gzip" if self.args.gzip else "none"
        )

        edges.coalesce(self.args.coalesce).write.text(
            os.path.join(self.args.output_txt, "edges"),
            compression="gzip" if self.args.gzip else "none"
        )
def save_vertex_graph(self, sqlc, df):
    """ Transforms a document metadata DataFrame into a Parquet dump of the vertices
        of the webgraph """

    vertex_graph_schema = SparkTypes.StructType([
        SparkTypes.StructField("id", SparkTypes.LongType(), nullable=False),
        SparkTypes.StructField("domain", SparkTypes.StringType(), nullable=False)
    ])

    # TODO ?!
    if self.args.get("shuffle_partitions"):
        sqlc.setConf("spark.sql.shuffle.partitions", self.args["shuffle_partitions"])

    # We collect all unique domains from the page URLs & destination of all external links
    d1_df = sql(sqlc, """
        SELECT parse_url(url, "HOST") as domain from df
    """, {"df": df}).distinct()

    d2_df = sql(sqlc, """
        SELECT parse_url(link, "HOST") as domain
        FROM (
            SELECT EXPLODE(external_links.href) as link FROM df
        ) as pairs
    """, {"df": df})

    all_domains_df = d1_df.unionAll(d2_df).distinct()

    def iter_domain(record):
        """ Transforms Row(domain=www.example.com) into tuple([int64 ID], "example.com") """

        domain = record["domain"]

        if not domain or not domain.strip():
            return []

        name = URL("http://" + domain).normalized_domain

        try:
            _id = _fast_make_domain_id(name)
        except Exception:  # pylint: disable=broad-except
            return []

        return [(py2_long(_id), str(name))]

    rdd_domains = all_domains_df.rdd.flatMap(iter_domain)

    vertex_df = createDataFrame(sqlc, rdd_domains, vertex_graph_schema).distinct()

    if self.args.get("coalesce_vertices") or self.args.get("coalesce"):
        vertex_df = vertex_df.coalesce(
            int(self.args.get("coalesce_vertices") or self.args.get("coalesce"))
        )

    vertex_df.write.parquet(os.path.join(self.args["path"], "vertices"))
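# Illustration only: a minimal, self-contained sketch of the (id, domain) vertex mapping
# done by iter_domain() above. The project's URL().normalized_domain and
# _fast_make_domain_id() helpers are not reproduced here; the hash-based id below is a
# hypothetical stand-in used purely to show the shape of the emitted rows.

import hashlib


def _example_domain_id(name):
    # Hypothetical: derive a stable signed int64 from the domain name.
    digest = hashlib.md5(name.encode("utf-8")).digest()
    return int.from_bytes(digest[:8], "big", signed=True)


def example_iter_domain(host):
    # Mirrors iter_domain(): skip empty hosts, normalize (here: lowercase, strip "www."),
    # then emit a single (int64 id, domain) tuple matching the vertex schema.
    if not host or not host.strip():
        return []
    name = host.strip().lower()
    if name.startswith("www."):
        name = name[4:]
    return [(_example_domain_id(name), name)]


# example_iter_domain("www.example.com") => [(<int64 id>, "example.com")]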
def graphframes_pagerank(self, sc, sqlc):
    """ GraphFrame's PageRank implementation """

    from graphframes import GraphFrame  # pylint: disable=import-error

    edge_df = sqlc.read.load(os.path.join(self.args.webgraph, "edges"))
    vertex_df = sqlc.read.load(os.path.join(self.args.webgraph, "vertices"))

    graph = GraphFrame(vertex_df, edge_df)

    withPageRank = graph.pageRank(maxIter=self.args.maxiter)

    final_df = sql(sqlc, """
        SELECT CONCAT(ranks.domain, ' ', ranks.pagerank) r
        FROM ranks
        ORDER BY ranks.pagerank DESC
    """, {"ranks": withPageRank.vertices})

    if self.args.dump:
        final_df.coalesce(1).write.text(
            self.args.dump,
            compression="gzip" if self.args.gzip else "none"
        )
    else:
        print(final_df.rdd.collect())
def hook_spark_pipeline_action(self, sc, sqlc, df, indexer):

    # Get all unique (host1 => host2) pairs
    domain_pairs = sql(sqlc, """
        SELECT parse_url(url, "HOST") as d1, parse_url(CONCAT("http://", link), "HOST") as d2
        FROM (
            SELECT url, EXPLODE(external_links.href) as link FROM df
        ) as pairs
    """, {"df": df}).distinct()

    # Format as csv
    lines = sql(sqlc, """
        SELECT CONCAT(d1, " ", d2) as r
        FROM pairs
    """, {"pairs": domain_pairs})

    self.save_dataframe(lines, "text")

    return True
def hook_spark_pipeline_action(self, sc, sqlc, df, indexer):

    lines_df = sql(sqlc, """
        SELECT CONCAT(CONCAT_WS(",", SORT_ARRAY(grep_words)), " ", url) r
        FROM df
        WHERE size(grep_words) > 0
    """, {"df": df})

    self.save_dataframe(lines_df, "text")

    return True
def spark_pipeline_action(self, sc, sqlc, df, indexer):

    domain = self.args["domain"]

    if self.args.get("shuffle_partitions"):
        sqlc.setConf("spark.sql.shuffle.partitions", self.args["shuffle_partitions"])

    lines_df = sql(sqlc, """
        SELECT CONCAT(
            regexp_replace(url_to, "^http(s?)://", ""), " ",
            COUNT(*), " ",
            CONCAT_WS(" ", COLLECT_LIST(url_from))
        ) r
        FROM (
            SELECT url url_from, EXPLODE(external_links.href) url_to
            FROM df
            WHERE size(external_links) > 0
        ) links
        WHERE SUBSTRING(
            PARSE_URL(links.url_to, "HOST"),
            LENGTH(PARSE_URL(links.url_to, "HOST")) - %s,
            %s
        ) == "%s"
        GROUP BY regexp_replace(url_to, "^http(s?)://", "")
        ORDER BY COUNT(*) DESC
    """ % (len(domain), len(domain), domain), {"df": df})

    if self.args.get("limit"):
        lines_df = lines_df.limit(int(self.args["limit"]))

    if self.args.get("partitions"):
        lines_df = lines_df.coalesce(int(self.args["partitions"]))

    lines_df.persist()

    print("Number of destination URLs: %s" % lines_df.count())

    if self.args.get("coalesce"):
        lines_df = lines_df.coalesce(int(self.args["coalesce"]))

    lines_df.write.text(
        self.args["path"],
        compression="gzip" if self.args.get("gzip") else "none"
    )

    return True
def spark_pipeline_action(self, sc, sqlc, df, indexer):

    lines_df = sql(sqlc, """
        SELECT CONCAT(CONCAT_WS(",", SORT_ARRAY(grep_words)), " ", url) r
        FROM df
        WHERE size(grep_words) > 0
    """, {"df": df})

    if self.args.get("coalesce"):
        lines_df = lines_df.coalesce(int(self.args["coalesce"]))

    lines_df.write.text(
        self.args["path"],
        compression="gzip" if self.args.get("gzip") else "none"
    )

    return True
def hook_spark_pipeline_action(self, sc, sqlc, df, indexer):

    domain = self.args["domain"]

    if self.args.get("shuffle_partitions"):
        sqlc.setConf("spark.sql.shuffle.partitions", self.args["shuffle_partitions"])

    lines_df = sql(sqlc, """
        SELECT CONCAT(
            regexp_replace(url_to, "^http(s?)://", ""), " ",
            COUNT(*), " ",
            CONCAT_WS(" ", COLLECT_LIST(url_from))
        ) r
        FROM (
            SELECT url url_from, EXPLODE(external_links.href) url_to
            FROM df
            WHERE size(external_links) > 0
        ) links
        WHERE SUBSTRING(
            PARSE_URL(links.url_to, "HOST"),
            LENGTH(PARSE_URL(links.url_to, "HOST")) - %s,
            %s
        ) == "%s"
        GROUP BY regexp_replace(url_to, "^http(s?)://", "")
        ORDER BY COUNT(*) DESC
    """ % (len(domain) - 1, len(domain), domain), {"df": df})

    if self.args.get("limit"):
        lines_df = lines_df.limit(int(self.args["limit"]))

    if self.args.get("partitions"):
        lines_df = lines_df.coalesce(int(self.args["partitions"]))

    lines_df.persist()

    print("Number of destination URLs: %s" % lines_df.count())

    self.save_dataframe(lines_df, "text")

    return True
def custom_pagerank_2(self, sc, sqlc):
    """ Alternative PageRank implementation, with fixed number of steps """

    sc.setCheckpointDir("/tmp/spark-checkpoints")

    # ranks_schema = SparkTypes.StructType([
    #     SparkTypes.StructField("id", SparkTypes.LongType(), nullable=False),
    #     SparkTypes.StructField("rank", SparkTypes.FloatType(), nullable=False)
    # ])

    edge_df = sqlc.read.load(os.path.join(self.args.webgraph, "edges"))

    if self.args.maxedges:
        edge_df = edge_df.limit(self.args.maxedges)

    vertex_df = sqlc.read.load(os.path.join(self.args.webgraph, "vertices"))

    if self.args.maxvertices:
        vertex_df = vertex_df.limit(self.args.maxvertices)

    sqlc.setConf("spark.sql.shuffle.partitions", str(self.args.shuffle_partitions))

    # TODO: bootstrap with previous pageranks to accelerate convergence?
    ranks_df = sql(sqlc, """
        SELECT id, cast(1.0 as float) rank
        FROM vertices
    """, {"vertices": vertex_df})

    edge_df.persist()
    vertex_df.persist()

    print("Starting iterations. %s edges, %s vertices." % (edge_df.count(), vertex_df.count()))

    iteration_tmpdir = None

    for iteration in range(self.args.maxiter):

        new_ranks_df = sql(sqlc, """
            SELECT
                ranks.id id,
                cast(0.15 + 0.85 * COALESCE(contribs.contrib, 0) as float) rank
            FROM ranks
            LEFT OUTER JOIN (
                SELECT
                    edges.dst id,
                    cast(sum(ranks.rank * COALESCE(edges.weight, 0)) as float) contrib
                FROM edges
                LEFT OUTER JOIN ranks ON edges.src = ranks.id
                GROUP BY edges.dst
            ) contribs ON contribs.id = ranks.id
        """, {"ranks": ranks_df, "edges": edge_df})

        # At this point we need to break the RDD dependency chain
        # Writing & loading Parquet seems to be more efficient than checkpointing the RDD.
        iteration_tmpdir_previous = iteration_tmpdir
        iteration_tmpdir = os.path.join(self.args.tmpdir, "iter_%s" % iteration)

        # Every N iterations, we check if we got below the tolerance level.
        if (self.args.tol >= 0 or self.args.stats > 0) and (iteration % self.args.stats == 0):

            new_ranks_df.persist()
            ranks_df.persist()
            vertex_df.persist()

            stats_df = sql(sqlc, """
                SELECT
                    sum(diff) as sum_diff,
                    count(*) as count_diff,
                    min(diff) as min_diff,
                    max(diff) as max_diff,
                    avg(diff) as avg_diff,
                    stddev(diff) as stddev_diff
                FROM (
                    SELECT ABS(old_ranks.rank - new_ranks.rank) diff
                    FROM old_ranks
                    JOIN new_ranks ON old_ranks.id = new_ranks.id
                    WHERE old_ranks.rank != new_ranks.rank
                ) diffs
            """, {"old_ranks": ranks_df, "new_ranks": new_ranks_df})

            stats = stats_df.collect()[0]

            print("Max diff at iteration %s : %s" % (iteration, stats["max_diff"]))
            print("Other stats: %s" % repr(stats))

            if (stats["count_diff"] == 0) or (stats["max_diff"] <= self.args.tol):
                print("Max diff was below tolerance: stopping iterations!")
                break

            top_diffs_df = sql(sqlc, """
                SELECT
                    (new_ranks.rank - old_ranks.rank) diff,
                    old_ranks.rank old_rank,
                    new_ranks.rank new_rank,
                    names.domain domain
                FROM old_ranks
                JOIN new_ranks ON old_ranks.id = new_ranks.id
                JOIN names ON names.id = old_ranks.id
                WHERE old_ranks.rank != new_ranks.rank
                ORDER BY ABS(diff) DESC
            """, {"old_ranks": ranks_df, "new_ranks": new_ranks_df, "names": vertex_df})

            print("Top 100 diffs")
            print("\n".join([
                "%3.3f %3.3f %3.3f %s " % x
                for x in top_diffs_df.limit(100).collect()
            ]))

        new_ranks_df.write.parquet(iteration_tmpdir)

        # S3 in us-east-1 should support read-after-write consistency since 2015
        # but we still have transient errors
        self.wait_for_tmpdir(iteration_tmpdir)

        new_ranks_df.unpersist()
        ranks_df.unpersist()

        ranks_df = sqlc.read.load(iteration_tmpdir)

        if iteration_tmpdir_previous is not None:
            self.clean_tmpdir(directory=iteration_tmpdir_previous)

    # No more need for the edges after iterations
    edge_df.unpersist()

    final_df = sql(sqlc, """
        SELECT CONCAT(names.domain, ' ', ranks.rank) r
        FROM ranks
        JOIN names ON names.id = ranks.id
        ORDER BY ranks.rank DESC
    """, {"names": vertex_df, "ranks": ranks_df})

    if self.args.dump:
        final_df.coalesce(1).write.text(
            self.args.dump,
            compression="gzip" if self.args.gzip else "none"
        )
    else:
        print(final_df.rdd.collect())
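# Illustration only: a single-machine sketch of the damped update that each iteration of
# custom_pagerank_2() expresses in SQL: rank' = 0.15 + 0.85 * sum(rank[src] * weight) over
# incoming edges, with missing contributions treated as 0. The toy graph below is made up.

def toy_pagerank(edges, vertex_ids, iterations=10):
    # edges: list of (src, dst, weight), where the weights out of each src sum to 1.
    ranks = {v: 1.0 for v in vertex_ids}
    for _ in range(iterations):
        contribs = {v: 0.0 for v in vertex_ids}
        for src, dst, weight in edges:
            contribs[dst] += ranks[src] * weight
        ranks = {v: 0.15 + 0.85 * contribs[v] for v in vertex_ids}
    return ranks


# toy_pagerank([(1, 2, 1.0), (2, 1, 0.5), (2, 3, 0.5)], [1, 2, 3])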
def custom_pagerank(self, sc, sqlc):
    """ Our own PageRank implementation, based on Spark SQL and Pregel-like behaviour """
    # pylint: disable=too-many-statements

    # sc.setCheckpointDir("/tmp/spark-checkpoints")

    edge_df = sqlc.read.load(os.path.join(self.args.webgraph, "edges"))

    if self.args.maxedges:
        edge_df = edge_df.limit(self.args.maxedges)

    vertex_df = sqlc.read.load(os.path.join(self.args.webgraph, "vertices"))

    if self.args.maxvertices:
        vertex_df = vertex_df.limit(self.args.maxvertices)

    sqlc.setConf("spark.sql.shuffle.partitions", str(self.args.shuffle_partitions))

    edge_df.persist(StorageLevel.MEMORY_AND_DISK)
    vertex_df.persist(StorageLevel.MEMORY_AND_DISK)

    print("Starting iterations. %s edges, %s vertices." % (edge_df.count(), vertex_df.count()))

    # TODO: bootstrap with previous pageranks to accelerate convergence?
    ranks_df = sql(sqlc, """
        SELECT vertices.id id, cast(1.0 as float) rank
        FROM vertices
        JOIN edges ON edges.dst = vertices.id
        GROUP BY vertices.id
    """, {"vertices": vertex_df, "edges": edge_df})

    # TODO: optimize further by taking out outDegree=0 vertices and computing their pagerank
    # as a post-filter.
    # LEFT OUTER JOIN edges edges_src on edges_src.src = vertices.id
    # WHERE edges_src.src IS NOT NULL

    iteration_tmpdir = None

    for iteration in range(self.args.maxiter):

        changed_ranks_df = sql(sqlc, """
            SELECT
                edges.dst id,
                cast(
                    0.15 + 0.85 * sum(COALESCE(ranks_src.rank, 0.15) * edges.weight) as float
                ) rank_new,
                first(ranks_dst.rank) rank_old
            FROM edges
            LEFT OUTER JOIN ranks_src ON edges.src = ranks_src.id
            LEFT OUTER JOIN ranks_dst ON edges.dst = ranks_dst.id
            GROUP BY edges.dst
            HAVING ABS(rank_old - rank_new) > %s
        """ % self.args.precision, {"ranks_src": ranks_df, "ranks_dst": ranks_df, "edges": edge_df})

        # Every N iterations, we check if we got below the tolerance level.
        if (self.args.tol >= 0 or self.args.stats > 0) and (iteration % self.args.stats == 0):

            changed_ranks_df.persist(StorageLevel.MEMORY_AND_DISK)

            stats_df = sql(sqlc, """
                SELECT
                    sum(abs(rank_new - rank_old)) as sum_diff,
                    count(*) as count_diff,
                    min(abs(rank_new - rank_old)) as min_diff,
                    max(abs(rank_new - rank_old)) as max_diff,
                    avg(abs(rank_new - rank_old)) as avg_diff,
                    stddev(abs(rank_new - rank_old)) as stddev_diff
                FROM changes
            """, {"changes": changed_ranks_df})

            stats = stats_df.collect()[0]

            print("Iteration %s, %s changed ranks" % (iteration, stats["count_diff"]))
            print("Stats: %s" % repr(stats))

            if (stats["count_diff"] == 0) or (stats["max_diff"] <= self.args.tol):
                print("Max diff was below tolerance: stopping iterations!")
                break

            if self.args.top_diffs > 0:

                top_changes_df = sql(sqlc, """
                    SELECT
                        (rank_new - rank_old) diff,
                        rank_old,
                        rank_new,
                        names.domain domain
                    FROM changes
                    JOIN names ON names.id = changes.id
                    ORDER BY abs(rank_new - rank_old) DESC
                """, {"changes": changed_ranks_df, "names": vertex_df})

                print("Top %s diffs" % self.args.top_diffs)
                print("\n".join([
                    "%3.3f (%3.3f => %3.3f) %s " % x
                    for x in top_changes_df.limit(self.args.top_diffs).collect()
                ]))

                top_changes_df.unpersist()

        new_ranks_df = sql(sqlc, """
            SELECT ranks.id id, COALESCE(changed_ranks.rank_new, ranks.rank) rank
            FROM ranks
            LEFT JOIN changed_ranks ON changed_ranks.id = ranks.id
        """, {"ranks": ranks_df, "changed_ranks": changed_ranks_df})

        if (iteration + 1) % 5 != 0:

            new_ranks_df.persist(StorageLevel.MEMORY_AND_DISK)
            new_ranks_df.count()  # Materialize the RDD

            print("Iteration %s cached" % (iteration, ))

            ranks_df.unpersist()
            changed_ranks_df.unpersist()

            ranks_df = new_ranks_df

        # At this point we need to break the RDD dependency chain
        # Writing & loading Parquet seems to be more efficient than checkpointing the RDD.
        else:

            print("Iteration %s, saving to parquet" % iteration)

            iteration_tmpdir_previous = iteration_tmpdir
            iteration_tmpdir = os.path.join(self.args.tmpdir, "iter_%s" % iteration)

            new_ranks_df.write.parquet(iteration_tmpdir)

            # S3 in us-east-1 should support read-after-write consistency since 2015
            # but we still have transient errors
            self.wait_for_tmpdir(iteration_tmpdir)

            new_ranks_df.unpersist()
            ranks_df.unpersist()
            changed_ranks_df.unpersist()

            ranks_df = sqlc.read.load(iteration_tmpdir)

            if iteration_tmpdir_previous is not None:
                self.clean_tmpdir(directory=iteration_tmpdir_previous)

    if self.args.include_orphans:
        ranks_df = ranks_df.unionAll(sql(sqlc, """
            SELECT vertices.id id, cast(0.15 as float) rank
            FROM vertices
            LEFT OUTER JOIN edges ON edges.dst = vertices.id
            WHERE edges.dst is NULL
        """, {"vertices": vertex_df, "edges": edge_df}))

    # No more need for the edges after iterations
    edge_df.unpersist()

    final_df = sql(sqlc, """
        SELECT CONCAT(names.domain, ' ', ranks.rank) r
        FROM ranks
        JOIN names ON names.id = ranks.id
        ORDER BY ranks.rank DESC
    """, {"names": vertex_df, "ranks": ranks_df})

    if self.args.dump:
        final_df.coalesce(1).write.text(
            self.args.dump,
            compression="gzip" if self.args.gzip else "none"
        )
    else:
        print(final_df.rdd.collect())
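# Illustration only: a single-machine sketch of the "changed ranks" optimization used by
# custom_pagerank() above. Per iteration, only destinations whose damped rank moves by more
# than a precision threshold are recomputed (the HAVING clause); all other ranks are carried
# over unchanged (the COALESCE in the LEFT JOIN). Toy inputs and the threshold are made up.

def toy_pagerank_delta_step(edges, ranks, precision=0.001):
    # edges: list of (src, dst, weight); ranks: dict of current ranks per vertex id.
    contribs = {}
    for src, dst, weight in edges:
        # Vertices without a known rank contribute the damping floor of 0.15.
        contribs[dst] = contribs.get(dst, 0.0) + ranks.get(src, 0.15) * weight
    changed = {}
    for dst, contrib in contribs.items():
        rank_new = 0.15 + 0.85 * contrib
        if abs(rank_new - ranks.get(dst, 0.15)) > precision:
            changed[dst] = rank_new
    # Carry over unchanged ranks, overwrite only the ones that moved.
    new_ranks = dict(ranks)
    new_ranks.update(changed)
    return new_ranks, len(changed)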
def save_edge_graph(self, sqlc, df):
    """ Transforms a document metadata DataFrame into a Parquet dump of the edges
        of the webgraph """

    edge_graph_schema = SparkTypes.StructType([
        SparkTypes.StructField("src", SparkTypes.LongType(), nullable=False),
        SparkTypes.StructField("dst", SparkTypes.LongType(), nullable=False),

        # Sum of weights must be 1
        # This field will automatically be added by the SQL query
        # SparkTypes.StructField("weight", SparkTypes.FloatType(), nullable=True)
    ])

    # TODO?!
    if self.args.get("shuffle_partitions"):
        sqlc.setConf("spark.sql.shuffle.partitions", self.args["shuffle_partitions"])

    # Get all unique (host1 => host2) pairs
    new_df = sql(sqlc, """
        SELECT parse_url(url, "HOST") as d1, parse_url(CONCAT("http://", link), "HOST") as d2
        FROM (
            SELECT url, EXPLODE(external_links.href) as link FROM df
        ) as pairs
    """, {"df": df}).distinct()

    def iter_links_domain(record):
        """ Transforms Row(d1="x.com", d2="y.com") into tuple([int64 ID], [int64 ID]) """

        d1 = record["d1"]
        d2 = record["d2"]

        if not d1 or not d2:
            return []

        try:
            from_domain = _fast_make_domain_id(d1)
            to_domain = _fast_make_domain_id(d2)
        except Exception:  # pylint: disable=broad-except
            return []

        if from_domain == to_domain:
            return []
        else:
            return [(py2_long(from_domain), py2_long(to_domain))]

    rdd_couples = new_df.rdd.flatMap(iter_links_domain)

    edge_df = createDataFrame(sqlc, rdd_couples, edge_graph_schema).distinct()

    # After collecting all the unique (from_id, to_id) pairs, we add the weight of every edge
    # The current algorithm is naive: edge weight is equally split between all the links, with
    # the sum of all weights for a source domain always = 1.
    weights_df = sql(sqlc, """
        SELECT src id, cast(1 / count(*) as float) weight
        FROM edges
        GROUP BY src
    """, {"edges": edge_df})

    weighted_edge_df = sql(sqlc, """
        SELECT cast(src as long) src, cast(dst as long) dst, cast(weights.weight as float) weight
        FROM edges
        JOIN weights on edges.src = weights.id
    """, {"edges": edge_df, "weights": weights_df})

    coalesce = int(self.args.get("coalesce_edges") or self.args.get("coalesce", 1) or 0)
    if coalesce > 0:
        weighted_edge_df = weighted_edge_df.coalesce(coalesce)

    weighted_edge_df.write.parquet(os.path.join(self.args["path"], "edges"))
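# Illustration only: a minimal sketch of the naive edge-weighting scheme computed by the
# GROUP BY src query above. Each source domain splits a total weight of 1.0 equally across
# its distinct outgoing edges (weight = 1 / out-degree). The tiny edge list is made up.

from collections import Counter


def weight_edges(edges):
    # edges: list of distinct (src_id, dst_id) pairs.
    out_degree = Counter(src for src, _ in edges)
    return [(src, dst, 1.0 / out_degree[src]) for src, dst in edges]


# weight_edges([(1, 2), (1, 3), (2, 3)]) => [(1, 2, 0.5), (1, 3, 0.5), (2, 3, 1.0)]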
def custom_pagerank(self, sc, sqlc):
    """ Our own PageRank implementation, based on Spark SQL and Pregel-like behaviour """

    sc.setCheckpointDir("/tmp/spark-checkpoints")

    edge_df = sqlc.read.load(self.args.edges)

    if self.args.maxedges:
        edge_df = edge_df.limit(self.args.maxedges)

    vertex_df = sqlc.read.load(self.args.vertices)

    if self.args.maxvertices:
        vertex_df = vertex_df.limit(self.args.maxvertices)

    sqlc.setConf("spark.sql.shuffle.partitions", str(self.args.shuffle_partitions))

    # TODO: bootstrap with previous pageranks to accelerate convergence?
    ranks_df = sql(sqlc, """
        SELECT id, cast(0.15 as float) rank
        FROM vertices
    """, {"vertices": vertex_df})

    edge_df.persist()
    vertex_df.persist()

    print("Starting iterations. %s edges, %s vertices." % (edge_df.count(), vertex_df.count()))

    iteration_tmpdir = None

    for iteration in range(self.args.maxiter):

        # We cast as strings because of https://issues.apache.org/jira/browse/SPARK-16802
        # TODO: remove them once it's fixed!
        changed_ranks_df = sql(sqlc, """
            SELECT
                cast(edges.dst as string) id,
                cast(0.15 + 0.85 * sum(ranks_src.rank * edges.weight) as float) rank_new,
                first(ranks_dst.rank) rank_old
            FROM edges
            LEFT OUTER JOIN ranks_src ON cast(edges.src as string) = cast(ranks_src.id as string)
            LEFT OUTER JOIN ranks_dst ON cast(edges.dst as string) = cast(ranks_dst.id as string)
            GROUP BY cast(edges.dst as string)
            HAVING ABS(rank_old - rank_new) > %s
        """ % self.args.precision, {"ranks_src": ranks_df, "ranks_dst": ranks_df, "edges": edge_df})

        # Every N iterations, we check if we got below the tolerance level.
        if (self.args.tol >= 0 or self.args.stats > 0) and (iteration % self.args.stats == 0):

            changed_ranks_df.persist()

            stats_df = sql(sqlc, """
                SELECT
                    sum(abs(rank_new - rank_old)) as sum_diff,
                    count(*) as count_diff,
                    min(abs(rank_new - rank_old)) as min_diff,
                    max(abs(rank_new - rank_old)) as max_diff,
                    avg(abs(rank_new - rank_old)) as avg_diff,
                    stddev(abs(rank_new - rank_old)) as stddev_diff
                FROM changes
            """, {"changes": changed_ranks_df})

            stats = stats_df.collect()[0]

            print("Iteration %s, %s changed ranks" % (iteration, stats["count_diff"]))
            print("Stats: %s" % repr(stats))

            if (stats["count_diff"] == 0) or (stats["max_diff"] <= self.args.tol):
                print("Max diff was below tolerance: stopping iterations!")
                break

            top_changes_df = sql(sqlc, """
                SELECT
                    (rank_new - rank_old) diff,
                    rank_old,
                    rank_new,
                    names.domain domain
                FROM changes
                JOIN names ON names.id = changes.id
                ORDER BY abs(rank_new - rank_old) DESC
            """, {"changes": changed_ranks_df, "names": vertex_df})

            print("Top 20 diffs")
            print("\n".join([
                "%3.3f (%3.3f => %3.3f) %s " % x
                for x in top_changes_df.limit(20).collect()
            ]))

        new_ranks_df = sql(sqlc, """
            SELECT ranks.id id, COALESCE(changed_ranks.rank_new, ranks.rank) rank
            FROM ranks
            LEFT JOIN changed_ranks ON cast(changed_ranks.id as string) = cast(ranks.id as string)
        """, {"ranks": ranks_df, "changed_ranks": changed_ranks_df})

        # At this point we need to break the RDD dependency chain
        # Writing & loading Parquet seems to be more efficient than checkpointing the RDD.
        iteration_tmpdir_previous = iteration_tmpdir
        iteration_tmpdir = os.path.join(self.args.tmpdir, "iter_%s" % iteration)

        new_ranks_df.write.parquet(iteration_tmpdir)

        # S3 in us-east-1 should support read-after-write consistency since 2015
        # but we still have transient errors
        self.wait_for_tmpdir(iteration_tmpdir)

        new_ranks_df.unpersist()
        ranks_df.unpersist()
        changed_ranks_df.unpersist()

        ranks_df = sqlc.read.load(iteration_tmpdir)

        if iteration_tmpdir_previous is not None:
            self.clean_tmpdir(directory=iteration_tmpdir_previous)

    # No more need for the edges after iterations
    edge_df.unpersist()

    final_df = sql(sqlc, """
        SELECT CONCAT(names.domain, ' ', ranks.rank) r
        FROM ranks
        JOIN names ON cast(names.id as string) = cast(ranks.id as string)
        ORDER BY ranks.rank DESC
    """, {"names": vertex_df, "ranks": ranks_df})

    if self.args.dump:
        final_df.coalesce(1).write.text(
            self.args.dump,
            compression="gzip" if self.args.gzip else "none"
        )
    else:
        print(final_df.rdd.collect())
def save_edge_graph(self, sqlc, df):
    """ Transforms a document metadata DataFrame into a Parquet dump of the edges
        of the webgraph """

    edge_graph_schema = SparkTypes.StructType([
        SparkTypes.StructField("src", SparkTypes.LongType(), nullable=False),
        SparkTypes.StructField("dst", SparkTypes.LongType(), nullable=False),

        # Sum of weights must be 1
        # This field will automatically be added by the SQL query
        # SparkTypes.StructField("weight", SparkTypes.FloatType(), nullable=True)
    ])

    # TODO?!
    if self.args.get("shuffle_partitions"):
        sqlc.setConf("spark.sql.shuffle.partitions", self.args["shuffle_partitions"])

    # Get all unique (host1 => host2) pairs
    new_df = sql(sqlc, """
        SELECT parse_url(url, "HOST") as d1, parse_url(CONCAT("http://", link), "HOST") as d2
        FROM (
            SELECT url, EXPLODE(external_links.href) as link FROM df
        ) as pairs
    """, {"df": df}).distinct()

    def iter_links_domain(record):
        """ Transforms Row(d1="x.com", d2="y.com") into tuple([int64 ID], [int64 ID]) """

        d1 = record["d1"]
        d2 = record["d2"]

        if not d1 or not d2:
            return []

        try:
            from_domain = _fast_make_domain_id(d1)
            to_domain = _fast_make_domain_id(d2)
        except Exception:  # pylint: disable=broad-except
            return []

        if from_domain == to_domain:
            return []
        else:
            return [(py2_long(from_domain), py2_long(to_domain))]

    rdd_couples = new_df.rdd.flatMap(iter_links_domain)

    edge_df = createDataFrame(sqlc, rdd_couples, edge_graph_schema).distinct()

    # After collecting all the unique (from_id, to_id) pairs, we add the weight of every edge
    # The current algorithm is naive: edge weight is equally split between all the links, with
    # the sum of all weights for a source domain always = 1.
    weights_df = sql(sqlc, """
        SELECT src id, cast(1 / count(*) as float) weight
        FROM edges
        GROUP BY src
    """, {"edges": edge_df})

    weighted_edge_df = sql(sqlc, """
        SELECT cast(src as long) src, cast(dst as long) dst, cast(weights.weight as float) weight
        FROM edges
        JOIN weights on edges.src = weights.id
    """, {"edges": edge_df, "weights": weights_df})

    coalesce = int(self.args.get("coalesce_edges") or self.args.get("coalesce", 1) or 0)
    if coalesce > 0:
        weighted_edge_df = weighted_edge_df.coalesce(coalesce)

    weighted_edge_df.write.parquet(os.path.join(self.args["output"], "edges"))
def save_vertex_graph(self, sqlc, df):
    """ Transforms a document metadata DataFrame into a Parquet dump of the vertices
        of the webgraph """

    vertex_graph_schema = SparkTypes.StructType([
        SparkTypes.StructField("id", SparkTypes.LongType(), nullable=False),
        SparkTypes.StructField("domain", SparkTypes.StringType(), nullable=False)
    ])

    # TODO ?!
    if self.args.get("shuffle_partitions"):
        sqlc.setConf("spark.sql.shuffle.partitions", self.args["shuffle_partitions"])

    # We collect all unique domains from the page URLs & destination of all external links
    d1_df = sql(sqlc, """
        SELECT parse_url(url, "HOST") as domain from df
    """, {"df": df}).distinct()

    d2_df = sql(sqlc, """
        SELECT parse_url(CONCAT("http://", link), "HOST") as domain
        FROM (
            SELECT EXPLODE(external_links.href) as link FROM df
        ) as pairs
    """, {"df": df})

    all_domains_df = d1_df.unionAll(d2_df).distinct()

    def iter_domain(record):
        """ Transforms Row(domain=www.example.com) into tuple([int64 ID], "example.com") """

        domain = record["domain"]

        if not domain or not domain.strip():
            return []

        name = URL("http://" + domain).normalized_domain

        try:
            _id = _fast_make_domain_id(name)
        except Exception:  # pylint: disable=broad-except
            return []

        return [(py2_long(_id), str(name))]

    rdd_domains = all_domains_df.rdd.flatMap(iter_domain)

    vertex_df = createDataFrame(sqlc, rdd_domains, vertex_graph_schema).distinct()

    coalesce = int(self.args.get("coalesce_vertices") or self.args.get("coalesce", 1) or 0)
    if coalesce > 0:
        vertex_df = vertex_df.coalesce(coalesce)

    vertex_df.write.parquet(os.path.join(self.args["output"], "vertices"))