def index_from_source(self, source, partition, _indexer, **kwargs):
    """ Indexes all documents from a source """

    for document in source.iter_documents(partition):

        print("Indexing", document.source_url.url)

        metadata = {}

        exec_hook(self.plugins, "document_pre_index", document, metadata)

        metadata.update(_indexer.index_document(document, **kwargs))

        exec_hook(self.plugins, "document_post_index", document, metadata)

        yield metadata
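
# The document_pre_index / document_post_index hooks above let plugins run
# around the core indexing call. As a rough illustration only: this assumes
# exec_hook(plugins, name, *args) invokes a same-named method on each plugin
# that defines it, a convention not shown in this file. The plugin below is
# hypothetical.
class ExamplePlugin(object):

    def document_pre_index(self, document, metadata):
        # Runs before the indexer; can stash extra state in `metadata`
        metadata["seen_url"] = document.source_url.url

    def document_post_index(self, document, metadata):
        # Runs after the indexer; can enrich the metadata it returned
        metadata.setdefault("rank", 0.0)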
def run_job(self, sc, sqlc):
    """ Execute our indexing pipeline with a Spark Context """

    self.plugins = load_plugins(self.args.plugin)

    self.accumulator_indexed = sc.accumulator(0)

    maxdocs = {}

    # What fields will be sent to Spark
    document_schema_columns = [
        SparkTypes.StructField("id", SparkTypes.LongType(), nullable=False),
        SparkTypes.StructField("url", SparkTypes.StringType(), nullable=False),
        SparkTypes.StructField("rank", SparkTypes.FloatType(), nullable=False)
    ]

    # Some plugins need to add new fields to the schema
    exec_hook(
        self.plugins, "spark_pipeline_init", sc, sqlc, document_schema_columns, indexer
    )
    exec_hook(self.plugins, "document_schema", document_schema_columns)

    document_schema = SparkTypes.StructType(document_schema_columns)

    # Spark DataFrame containing everything we indexed
    all_documents = None

    executed_pipeline = False

    for source_spec in self.args.source:

        source_documents = None
        source_name, source_args = parse_plugin_cli_args(source_spec)
        maxdocs[source_spec] = source_args.get("maxdocs")

        if source_name == "commoncrawl":

            partitions = list_commoncrawl_warc_filenames(
                limit=source_args.get("limit"),
                skip=source_args.get("skip"),
                version=source_args.get("version")
            )

            def index_partition(filename):
                ds = load_source("commoncrawl", {
                    "file": filename,
                    "plugins": self.plugins,
                    "maxdocs": maxdocs[source_spec]  # pylint: disable=cell-var-from-loop
                })
                return self.index_documents(ds)

        elif source_name == "warc":

            # We have been given a .txt file with a list of WARC file paths
            if source_args["file"].endswith(".txt"):
                with open(source_args["file"], "r") as f:
                    partitions = [x.strip() for x in f.readlines()]

            # Single WARC file path
            else:
                partitions = [source_args["file"]]

            def index_partition(filename):
                ds = load_source("webarchive", {
                    "file": filename,
                    "plugins": self.plugins,
                    "maxdocs": maxdocs[source_spec]  # pylint: disable=cell-var-from-loop
                })
                return self.index_documents(ds)

        elif source_name == "wikidata":

            partitions = ["__wikidata_single_dump__"]

            def index_partition(_):
                ds = load_source("wikidata", {
                    "maxdocs": maxdocs[source_spec],  # pylint: disable=cell-var-from-loop
                    "plugins": self.plugins
                })
                return self.index_documents(ds)

        elif source_name == "corpus":

            partitions = source_args.get("docs", ["__from_file__"])
            path = source_args.get("path")

            def index_partition(doc):
                ds = load_source("corpus", {
                    "maxdocs": maxdocs[source_spec],  # pylint: disable=cell-var-from-loop
                    "docs": [doc],
                    "path": path,  # pylint: disable=cell-var-from-loop
                    "plugins": self.plugins
                })
                return self.index_documents(ds)

        elif source_name == "url":

            partitions = source_args.get("urls") or [source_args["url"]]

            def index_partition(url):
                ds = load_source("url", {
                    "urls": [url],
                    "plugins": self.plugins
                })
                return self.index_documents(ds)

        elif source_name == "parquet":

            # Read an intermediate dump of document metadata generated by
            # --plugin plugins.dump.DocumentMetadataParquet
            df = sqlc.read.parquet(source_args["path"])

            if maxdocs[source_spec]:
                df = df.limit(int(maxdocs[source_spec]))

            if source_args.get("fields"):
                df = df.select(source_args["fields"].split("+"))

            source_documents = df

        # Split indexing of each partition in Spark workers
        if source_documents is None:
            executed_pipeline = False
            rdd = sc \
                .parallelize(partitions, len(partitions)) \
                .flatMap(index_partition)
            source_documents = createDataFrame(sqlc, rdd, document_schema)

        if source_args.get("persist") == "1":
            source_documents.persist(StorageLevel.MEMORY_AND_DISK)

        # The count() here will execute the pipeline so far
        # to allow for sources to be done sequentially
        if source_args.get("block") == "1":
            executed_pipeline = True
            print("Source %s done, indexed %s documents (%s total so far)" % (
                source_name, source_documents.rdd.count(), self.accumulator_indexed.value
            ))

        if all_documents is None:
            all_documents = source_documents
        else:
            all_documents = all_documents.unionAll(source_documents)

    done_actions = exec_hook(
        self.plugins, "spark_pipeline_action", sc, sqlc, all_documents, indexer
    )

    # If no action was done, we need to do a count() to actually execute the spark pipeline
    if any(done_actions):
        executed_pipeline = True

    if not executed_pipeline:
        print("Total documents: %s" % all_documents.rdd.count())
def run_job(self, sc, sqlc):
    """ Execute our indexing pipeline with a Spark Context """

    self.plugins = load_plugins(self.args.plugin)

    self.accumulator_indexed = sc.accumulator(0)

    # What fields will be sent to Spark
    document_schema_columns = [
        SparkTypes.StructField("id", SparkTypes.LongType(), nullable=False),
        SparkTypes.StructField("url", SparkTypes.StringType(), nullable=False),
        SparkTypes.StructField("rank", SparkTypes.FloatType(), nullable=False)
    ]

    # Some plugins need to add new fields to the schema
    exec_hook(
        self.plugins, "spark_pipeline_init", sc, sqlc, document_schema_columns, indexer
    )
    exec_hook(self.plugins, "document_schema", document_schema_columns)

    document_schema = SparkTypes.StructType(document_schema_columns)

    # Spark DataFrame containing everything we indexed
    all_documents = None

    executed_pipeline = False

    for source_spec in self.args.source:

        source_name, source_args = parse_plugin_cli_args(source_spec)

        ds = load_source(source_name, source_args, plugins=self.plugins)

        source_documents, needs_execution = self.get_indexed_documents_from_source(
            sc, sqlc, document_schema, ds
        )

        executed_pipeline = executed_pipeline and (not needs_execution)

        #
        # At this point, we have a DataFrame with every document from this source.
        #

        if source_args.get("persist") == "1":
            source_documents.persist(StorageLevel.MEMORY_AND_DISK)

        # The count() here will execute the pipeline so far
        # to allow for sources to be done sequentially
        if source_args.get("block") == "1":
            executed_pipeline = True
            print("Source %s done, indexed %s documents (%s total so far)" % (
                source_name, source_documents.rdd.count(), self.accumulator_indexed.value
            ))

        if all_documents is None:
            all_documents = source_documents
        else:
            all_documents = all_documents.unionAll(source_documents)

    #
    # At this point, we have a DataFrame with all documents from all sources.
    #

    done_actions = exec_hook(
        self.plugins, "spark_pipeline_action", sc, sqlc, all_documents, indexer
    )

    # If no action was done, we need to do a count() to actually
    # execute ("materialize") the spark pipeline
    if any(done_actions):
        executed_pipeline = True

    if not executed_pipeline:
        print("Total documents: %s" % all_documents.rdd.count())
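
# The refactored run_job above delegates partitioning to
# get_indexed_documents_from_source, which is not shown here. Reconstructed
# from the pre-refactor run_job (one Spark partition per WARC file or dump,
# then a DataFrame over the indexed metadata), here is a minimal sketch of
# what it might look like. The signature, source.get_partitions() and the
# needs_execution flag are assumptions, not the project's confirmed API.
def get_indexed_documents_from_source(self, sc, sqlc, document_schema, source):
    """ Hypothetical sketch: turns one source into a DataFrame of indexed docs """

    partitions = source.get_partitions()  # assumed API

    def index_partition(partition):
        # Index every document of this partition on the worker
        return self.index_from_source(source, partition, indexer)

    rdd = sc \
        .parallelize(partitions, len(partitions)) \
        .flatMap(index_partition)

    # needs_execution=True would tell the caller that an action (count, save)
    # is still needed to materialize this part of the pipeline.
    return createDataFrame(sqlc, rdd, document_schema), True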