def run(self, bucket_name, now, input_files, version_id, parent_params=None):
    logging.info('======= Starting Metrics Pipeline')
    mapper_params = {
        'input_reader': {
            GCSInputReader.BUCKET_NAME_PARAM: bucket_name,
            GCSInputReader.OBJECT_NAMES_PARAM: input_files
        }
    }
    if parent_params:
        mapper_params.update(parent_params)
    num_shards = mapper_params[_NUM_SHARDS]

    # Chain together three map reduces; see module comments
    blob_key_1 = (yield mapreduce_pipeline.MapreducePipeline(
        'Process Input CSV',
        mapper_spec='offline.metrics_pipeline.map_csv_to_participant_and_date_metric',
        input_reader_spec='mapreduce.input_readers.GoogleCloudStorageInputReader',
        output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        mapper_params=mapper_params,
        reducer_spec='offline.metrics_pipeline.reduce_participant_data_to_hpo_metric_date_deltas',
        reducer_params={
            'now': now,
            'output_writer': {
                'bucket_name': bucket_name,
                'content_type': 'text/plain'
            }
        },
        shards=num_shards))

    blob_key_2 = (yield mapreduce_pipeline.MapreducePipeline(
        'Calculate Counts',
        mapper_spec='offline.metrics_pipeline.map_hpo_metric_date_deltas_to_hpo_metric_key',
        input_reader_spec='mapreduce.input_readers.GoogleCloudStorageInputReader',
        output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        mapper_params=(yield BlobKeys(bucket_name, blob_key_1, now, version_id)),
        combiner_spec='offline.metrics_pipeline.combine_hpo_metric_date_deltas',
        reducer_spec='offline.metrics_pipeline.reduce_hpo_metric_date_deltas_to_all_date_counts',
        reducer_params={
            'now': now,
            'output_writer': {
                'bucket_name': bucket_name,
                'content_type': 'text/plain',
            }
        },
        shards=num_shards))

    # TODO(danrodney):
    # We need to find a way to delete data written above (DA-167)
    yield mapreduce_pipeline.MapreducePipeline(
        'Write Metrics',
        mapper_spec='offline.metrics_pipeline.map_hpo_metric_date_counts_to_hpo_date_key',
        input_reader_spec='mapreduce.input_readers.GoogleCloudStorageInputReader',
        mapper_params=(yield BlobKeys(bucket_name, blob_key_2, now, version_id)),
        reducer_spec='offline.metrics_pipeline.reduce_hpo_date_metric_counts_to_database_buckets',
        reducer_params={'version_id': version_id},
        shards=num_shards)
def run(self, filekey, blobkey):
    bucket_name = app_identity.get_default_gcs_bucket_name()
    combine_purchase_key = yield mapreduce_pipeline.MapreducePipeline(
        "combine_purchase",
        "main.combine_purchase_map",
        "main.combine_purchase_reduce",
        "mapreduce.input_readers.BlobstoreZipInputReader",
        "mapreduce.output_writers.GoogleCloudStorageOutputWriter",
        mapper_params={
            "blob_key": blobkey,
        },
        reducer_params={
            "output_writer": {
                "bucket_name": bucket_name,
                "content_type": "text/plain",
            }
        },
        shards=16)
    song_pairs = yield mapreduce_pipeline.MapreducePipeline(
        "common_purchase",
        "main.common_purchase_map",
        "main.common_purchase_reduce",
        "mapreduce.input_readers.GoogleCloudStorageInputReader",
        "mapreduce.output_writers.GoogleCloudStorageOutputWriter",
        # Pass output from first job as input to second job
        mapper_params=(yield GCSMapperParams(combine_purchase_key)),
        reducer_params={
            "output_writer": {
                "bucket_name": bucket_name,
                "content_type": "text/plain",
            }
        },
        shards=16)
    most_purchased = yield mapreduce_pipeline.MapreducePipeline(
        "find_most_common",
        "main.find_most_common_map",
        "main.find_most_common_reduce",
        "mapreduce.input_readers.GoogleCloudStorageInputReader",
        "mapreduce.output_writers.GoogleCloudStorageOutputWriter",
        # Pass output from second job as input to third job
        mapper_params=(yield GCSMapperParams(song_pairs)),
        reducer_params={
            "output_writer": {
                "bucket_name": bucket_name,
                "content_type": "text/plain",
            }
        },
        shards=16)
    yield StoreOutput("find_most_common", filekey, most_purchased)
def testFailedMapReduce(self):
    # Add some random data.
    entity_count = 200
    for i in range(entity_count):
        TestEntity(data=str(i)).put()
        TestEntity(data=str(i)).put()

    p = mapreduce_pipeline.MapreducePipeline(
        "test",
        __name__ + ".test_failed_map",
        __name__ + ".test_mapreduce_reduce",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        output_writer_spec=(output_writers.__name__ +
                            ".BlobstoreRecordsOutputWriter"),
        mapper_params={
            "entity_kind": __name__ + "." + TestEntity.__name__,
        },
        shards=16)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
    self.assertEqual(model.MapreduceState.RESULT_FAILED,
                     p.outputs.result_status.value)
    self.assertEqual(0, len(p.outputs.default.value))
def run(self, region, include_conditional_violations):
    """Yields a recidivism calculation MapReduce pipeline to be started.

    :param region: a specific region to calculate recidivism for. Calculates
        for all regions if this is None.
    :param include_conditional_violations: whether or not to include
        violations of conditional release in recidivism calculations
        (split out into separate metrics)
    :return: yields up a MapReduce pipeline to start
    """
    mapper_params = {
        "entity_kind": "models.inmate.Inmate",
        "include_conditional_violations": include_conditional_violations
    }
    if region:
        mapper_params["filters"] = [("region", "=", region)]

    yield mapreduce_pipeline.MapreducePipeline(
        "Calculate recidivism across various dimensions",
        input_reader_spec="mapreduce.input_readers.DatastoreInputReader",
        mapper_spec="calculator.pipeline.map_inmate",
        mapper_params=mapper_params,
        reducer_spec="calculator.pipeline.reduce_recidivism_events",
        shards=64)
def run(self, filename, blobkey, ds_key):
    params = "filename %s \tblobkey %s\tds_key %s" % (filename, blobkey, ds_key)
    logging.info(params)
    dataset = ndb.Key(urlsafe=ds_key).get()
    rows = dataset.rows
    hashes = rows * dataset.bands
    if len(dataset.random_seeds) != hashes:
        dataset.random_seeds = [
            random.getrandbits(max_bits) for _ in xrange(hashes)
        ]
        logging.warning('Recalculated %d random seeds', hashes)
        dataset.put()
    dataset.buckets = []
    dataset.put()

    output = yield mapreduce_pipeline.MapreducePipeline(
        "locality_sensitive_hashing",
        "blobs.lsh_map",
        "blobs.lsh_bucket",
        'mapreduce.input_readers.BlobstoreZipLineInputReader',
        "mapreduce.output_writers.BlobstoreOutputWriter",
        mapper_params={
            "blob_keys": blobkey,
        },
        reducer_params={
            "mime_type": "text/plain",
        },
        shards=16)
    yield StoreLshResults('OpenLSH', blobkey, ds_key, output)
def run(self, job_id, job_class_str, kwargs):
    # Disabling 4 space indentation checker for this docstring because this
    # "Yields:" section yields 2 objects and the Yields/Returns are
    # generally supposed to only yield 1 object which messes up the
    # indentation checking. This is the only case of this happening.
    """Returns a coroutine which runs the job pipeline and stores results.

    Args:
        job_id: str. The ID of the job to run.
        job_class_str: str. Should uniquely identify each type of job.
        kwargs: dict(str : object). Extra arguments used to build the
            MapreducePipeline.

    Yields:
        MapreducePipeline. Ready to start processing. Expects the output of
            that pipeline to be sent back.
        StoreMapReduceResults. Will be constructed with whatever output the
            caller sends back to the coroutine.
    """
    job_class = mapreduce_util.for_name(job_class_str)
    job_class.register_start(job_id, metadata={
        job_class._OUTPUT_KEY_ROOT_PIPELINE_ID: self.root_pipeline_id  # pylint: disable=protected-access
    })

    # TODO(sll): Need try/except/mark-as-canceled here?
    output = yield mapreduce_pipeline.MapreducePipeline(**kwargs)
    yield StoreMapReduceResults(job_id, job_class_str, output)
def mapreduce_scrape_sources_and_process_events(fbl, min_potential_events, queue):
    mapper_params = {
        'entity_kind': 'event_scraper.thing_db.Source',
        'min_potential_events': min_potential_events,
        'handle_batch_size': 20,
    }
    reducer_params = {
        'output_writer': {
            'bucket_name': 'dancedeets-hrd.appspot.com',
            'content_type': 'text/plain',
        }
    }
    fb_params = fb_mapreduce.get_fblookup_params(fbl, randomize_tokens=True)
    mapper_params.update(fb_params)
    reducer_params.update(fb_params)

    # output = yield ...
    pipeline = mapreduce_pipeline.MapreducePipeline(
        'Scrape sources, then load and classify the events',
        'event_scraper.thing_scraper2.scrape_sources_for_events',
        'event_scraper.thing_scraper2.process_events',
        'mapreduce.input_readers.DatastoreInputReader',
        'mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        mapper_params=mapper_params,
        reducer_params=reducer_params,
        shards=16,
    )
    pipeline.start(queue_name=queue)
def testSmoke(self):
    """Test that all handlers still work.

    This test doesn't care about the integrity of the job outputs, just that
    things work under the webapp2 framework.
    """
    # Run Mapreduce
    p = mapreduce_pipeline.MapreducePipeline(
        "test",
        __name__ + ".TestMapreduceMap",
        __name__ + ".TestMapreduceReduce",
        input_reader_spec=input_readers.__name__ + ".RandomStringInputReader",
        output_writer_spec=(output_writers.__name__ +
                            "._GoogleCloudStorageRecordOutputWriter"),
        mapper_params={
            "input_reader": {
                "count": 100
            },
        },
        reducer_params={
            "output_writer": {
                "bucket_name": "test"
            },
        },
        shards=3)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    # Verify output.
    p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
    self.assertEqual(model.MapreduceState.RESULT_SUCCESS,
                     p.outputs.result_status.value)
def launch_job(job_id):
    """Launches a job given its key from the MAPREDUCE_JOBS dict."""
    assert job_id in MAPREDUCE_JOBS, 'Unknown mapreduce job id %s' % job_id
    job_def = MAPREDUCE_JOBS[job_id].copy()
    # 256 helps get things done faster but it is very easy to burn thousands
    # of $ within a few hours. Don't forget to update queue.yaml accordingly.
    job_def.setdefault('shards', 128)
    job_def.setdefault('input_reader_spec',
                       'mapreduce.input_readers.DatastoreInputReader')
    job_def['mapper_params'] = job_def['mapper_params'].copy()
    job_def['mapper_params'].setdefault(
        'bucket_name', app_identity.get_default_gcs_bucket_name())

    if 'reducer_spec' in job_def:
        logging.info('Starting mapreduce job')
        pipeline = mapreduce_pipeline.MapreducePipeline(**job_def)
    else:
        logging.info('Starting mapper-only job')
        job_def['params'] = job_def.pop('mapper_params')
        pipeline = mapreduce_pipeline.MapPipeline(**job_def)

    pipeline.start(base_path=MAPREDUCE_PIPELINE_BASE_PATH,
                   queue_name=MAPREDUCE_TASK_QUEUE)
    logging.info('Pipeline ID: %s', pipeline.pipeline_id)
    return pipeline.pipeline_id
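# A minimal sketch of one MAPREDUCE_JOBS entry that launch_job() above could
# consume. The job id, spec paths, and entity kind are hypothetical; the keys
# simply mirror the MapreducePipeline keyword arguments that launch_job()
# forwards via **job_def, with 'shards' and 'input_reader_spec' left to the
# defaults it fills in.
MAPREDUCE_JOBS = {
    'count_widgets': {  # hypothetical job id
        'job_name': 'Count widgets',
        'mapper_spec': 'mapreduce_jobs.count_widgets_map',      # hypothetical mapper
        'reducer_spec': 'mapreduce_jobs.count_widgets_reduce',  # hypothetical reducer
        'mapper_params': {
            'entity_kind': 'models.Widget',  # hypothetical entity kind
        },
    },
}

# Example call, e.g. from a cron or admin handler:
# pipeline_id = launch_job('count_widgets')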
def run(self, readsetId, sequenceName, sequenceStart, sequenceEnd):
    bucket = get_bucket_name()
    # Environment variables are strings; the pipeline expects an int.
    shards = int(os.environ['MAPREDUCE_SHARDS'])

    # In the first pipeline, generate the raw coverage data.
    raw_coverage_data = yield mapreduce_pipeline.MapreducePipeline(
        "generate_coverage",
        "pipeline.generate_coverage_map",
        "pipeline.generate_coverage_reduce",
        "input_reader.GenomicsAPIInputReader",
        "mapreduce.output_writers._GoogleCloudStorageOutputWriter",
        mapper_params={
            "input_reader": {
                "readsetId": readsetId,
                "sequenceName": sequenceName,
                "sequenceStart": sequenceStart,
                "sequenceEnd": sequenceEnd,
            },
        },
        reducer_params={
            "output_writer": {
                "bucket_name": bucket,
                "content_type": "text/plain",
            },
        },
        shards=shards)

    # Since running the MR to consolidate the output takes a very long time,
    # for now just return the individual results.
    yield PipelineReturnIndividualResults(readsetId, sequenceName,
                                          sequenceStart, sequenceEnd,
                                          raw_coverage_data)
def run(self, readsetId, sequenceName, sequenceStart, sequenceEnd, useMockData):
    logging.debug("Running Pipeline for readsetId %s" % readsetId)
    bucket = os.environ['BUCKET']

    # In the first pipeline, generate the raw coverage data.
    raw_coverage_data = yield mapreduce_pipeline.MapreducePipeline(
        "generate_coverage",
        "pipeline.generate_coverage_map",
        "pipeline.generate_coverage_reduce",
        "input_reader.GenomicsAPIInputReader",
        "mapreduce.output_writers._GoogleCloudStorageOutputWriter",
        mapper_params={
            "input_reader": {
                "readsetId": readsetId,
                "sequenceName": sequenceName,
                "sequenceStart": sequenceStart,
                "sequenceEnd": sequenceEnd,
                "useMockData": useMockData,
            },
        },
        reducer_params={
            "output_writer": {
                "bucket_name": bucket,
                "content_type": "text/plain",
            },
        },
        shards=16)

    # Pass the results on to the output consolidator.
    yield PipelineConsolidateOutput(raw_coverage_data)
def run(self, event_ids):
    # Can't do != comparators in our appengine mapreduce queries
    # filters = [('expired_oauth_token', '!=', True)]
    # Unfortunately, many users have a value equal to None, so can't filter on this
    # filters = [('expired_oauth_token', '=', False)]
    # So for now, let's just process all of them, and skip them inside test_user_on_events
    filters = []
    # output = yield ...
    yield mapreduce_pipeline.MapreducePipeline(
        'Find valid access_tokens for events',
        'events.find_access_tokens.test_user_on_events',
        'events.find_access_tokens.save_valid_users_to_event',
        'mapreduce.input_readers.DatastoreInputReader',
        'mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        mapper_params={
            'entity_kind': 'users.users.User',
            'filters': filters,
            'event_ids': ','.join(event_ids),
        },
        reducer_params={
            'output_writer': {
                'bucket_name': 'dancedeets-hrd.appspot.com',
                'content_type': 'text/plain',
            }
        },
        shards=2,
    )
def run(self, raw_coverage_data):
    bucket = os.environ['BUCKET']
    logging.debug("Got %d raw coverage data output files to consolidate." %
                  len(raw_coverage_data))

    # Remove bucket from filenames. (Would be nice if you didn't have to do
    # this.)
    paths = []
    for file in raw_coverage_data:
        paths.append(str.replace(str(file), "/" + bucket + "/", ""))

    # Create another pipeline to combine the raw coverage data into a single
    # file.
    output = yield mapreduce_pipeline.MapreducePipeline(
        "consolidate_output",
        "pipeline.consolidate_output_map",
        "pipeline.consolidate_output_reduce",
        "mapreduce.input_readers._GoogleCloudStorageInputReader",
        "mapreduce.output_writers._GoogleCloudStorageOutputWriter",
        mapper_params={
            "input_reader": {
                "bucket_name": bucket,
                "objects": paths,
            },
        },
        reducer_params={
            "output_writer": {
                "bucket_name": bucket,
                "content_type": "text/plain",
            },
        },
        shards=1)

    # Return the final output results.
    yield PipelineReturnResults(output)
def testLotsOfValuesForSingleKey(self):
    TestEntity(data=str(1)).put()

    # Run Mapreduce
    p = mapreduce_pipeline.MapreducePipeline(
        "test",
        __name__ + ".map_yield_lots_of_values",
        __name__ + ".reduce_length",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        output_writer_spec=output_writers.__name__ + ".BlobstoreRecordsOutputWriter",
        mapper_params={
            "entity_kind": __name__ + "." + TestEntity.__name__,
        },
        shards=16)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

    # Verify reduce output.
    p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
    output_data = []
    for output_file in p.outputs.default.value:
        with files.open(output_file, "r") as f:
            for record in records.RecordsReader(f):
                output_data.append(record)

    expected_data = ["('1', 50000)"]
    expected_data.sort()
    output_data.sort()
    self.assertEquals(expected_data, output_data)
def run(self, mapper_key, reducer_key, file_name, language):
    """Runs the language-specific mapper and reducer interpreters over Data entities."""
    logging.debug("filename is %s" % file_name)
    bucket_name = app_identity.get_default_gcs_bucket_name()
    mapper_params = {
        "entity_kind": "src.model.Data",
        "mapper": mapper_key,
        "reducer": reducer_key
    }
    output = yield mapreduce_pipeline.MapreducePipeline(
        file_name,
        mapper_spec="src.mapreduce.interpreter." + language + "_mapper_interpreter",
        reducer_spec="src.mapreduce.interpreter." + language + "_reducer_interpreter",
        input_reader_spec="mapreduce.input_readers.DatastoreInputReader",
        output_writer_spec="mapreduce.output_writers.GoogleCloudStorageOutputWriter",
        mapper_params=mapper_params,
        reducer_params={
            "output_writer": {
                "reducer": reducer_key,
                "bucket_name": bucket_name,
                "content_type": "text/plain",
            }
        },
        shards=64)
    # @TODO test and improve store output
    yield StoreOutput(output)
def run(self, records_file_blobkey):
    job_name = "schedulrMapReduce"
    logging.info(
        "***map ***reduce ***library ***cool****** about 2 running: %s" %
        records_file_blobkey)

    # Run Mapreduce
    output = yield mapreduce_pipeline.MapreducePipeline(
        job_name,
        __name__ + ".schedulr_map",
        __name__ + ".schedulr_reduce",
        input_reader_spec=input_readers.__name__ + ".BlobstoreLineInputReader",
        output_writer_spec=(output_writers.__name__ + ".FileOutputWriter"),
        mapper_params={
            "input_reader": {
                "blob_keys": [records_file_blobkey]
            }
        },
        reducer_params={
            "output_writer": {
                "mime_type": "text/plain",
                "output_sharding":
                    output_writers.FileOutputWriterBase.OUTPUT_SHARDING_NONE,
                "filesystem": "blobstore"
            },
        },
        shards=N_SHARDS)
def run(self, readsetId, sequenceName, sequenceStart, sequenceEnd,
        raw_coverage_data):
    bucket = get_bucket_name()

    # Remove bucket from filenames. (Would be nice if you didn't have to do
    # this.)
    paths = []
    for file in raw_coverage_data:
        paths.append(str.replace(str(file), "/" + bucket + "/", ""))

    # Create another pipeline to combine the raw coverage data into a single
    # file.
    output = yield mapreduce_pipeline.MapreducePipeline(
        "consolidate_output",
        "pipeline.consolidate_output_map",
        "pipeline.consolidate_output_reduce",
        "mapreduce.input_readers._GoogleCloudStorageInputReader",
        "mapreduce.output_writers._GoogleCloudStorageOutputWriter",
        mapper_params={
            "input_reader": {
                "bucket_name": bucket,
                "objects": paths,
            },
        },
        reducer_params={
            "output_writer": {
                "bucket_name": bucket,
                "content_type": "text/plain",
            },
        },
        shards=1)

    # Return the final output results.
    yield PipelineReturnConsolidatedResults(readsetId, sequenceName,
                                            sequenceStart, sequenceEnd, output)
def run(self, job_name, sequence_num, kwargs, namespace, complete_fn):
    with Namespace(namespace):
        db.run_in_transaction(
            DurableJobEntity._start_job, job_name, sequence_num,
            MapReduceJob.build_output(self.root_pipeline_id, []))
    output = yield mapreduce_pipeline.MapreducePipeline(**kwargs)
    yield StoreMapReduceResults(job_name, sequence_num, namespace, output,
                                complete_fn, kwargs)
def run(self, job_id, job_class_str, kwargs):
    job_class = mapreduce_util.for_name(job_class_str)
    job_class.register_start(job_id, metadata={
        job_class._OUTPUT_KEY_ROOT_PIPELINE_ID: self.root_pipeline_id
    })

    # TODO(sll): Need try/except/mark-as-canceled here?
    output = yield mapreduce_pipeline.MapreducePipeline(**kwargs)
    yield StoreMapReduceResults(job_id, job_class_str, output)
def run(self, job_name, kwargs, namespace):
    time_started = time.time()
    with Namespace(namespace):
        db.run_in_transaction(
            DurableJobEntity._start_job, job_name,
            MapReduceJob.build_output(self.root_pipeline_id, []))
    output = yield mapreduce_pipeline.MapreducePipeline(**kwargs)
    yield StoreMapReduceResults(job_name, time_started, namespace, output)
def testMapReduce(self):
    # Prepare test data
    bucket_name = "testbucket"
    job_name = "test_job"
    entity_count = 200

    for i in range(entity_count):
        TestEntity(data=str(i)).put()
        TestEntity(data=str(i)).put()

    # Run Mapreduce
    p = mapreduce_pipeline.MapreducePipeline(
        job_name,
        __name__ + ".test_mapreduce_map",
        __name__ + ".test_mapreduce_reduce",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        output_writer_spec=(output_writers.__name__ +
                            "._GoogleCloudStorageRecordOutputWriter"),
        mapper_params={
            "entity_kind": __name__ + "." + TestEntity.__name__,
            "bucket_name": bucket_name
        },
        reducer_params={
            "output_writer": {
                "bucket_name": bucket_name
            },
        },
        shards=16)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

    # Verify reduce output.
    p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
    self.assertEqual(model.MapreduceState.RESULT_SUCCESS,
                     p.outputs.result_status.value)
    output_data = []
    for output_file in p.outputs.default.value:
        with cloudstorage.open(output_file) as f:
            for record in records.RecordsReader(f):
                output_data.append(record)

    expected_data = [str((str(d), ["", ""])) for d in range(entity_count)]
    expected_data.sort()
    output_data.sort()
    self.assertEquals(expected_data, output_data)

    # Verify that mapreduce doesn't leave intermediate files behind.
    temp_file_stats = cloudstorage.listbucket("/" + bucket_name)
    for stat in temp_file_stats:
        if stat.filename:
            self.assertFalse(
                stat.filename.startswith("/%s/%s-shuffle-" %
                                         (bucket_name, job_name)))
def run(self):
    yield mapreduce_pipeline.MapreducePipeline(
        "items_job",
        "dataflow_pipeline.mapper",
        "dataflow_pipeline.reducer",
        "mapreduce.input_readers.DatastoreInputReader",
        mapper_params={
            "input_reader": {
                "entity_kind": "models.Transaction"
            }
        },
        shards=1)
def testMapReduceWithShardRetry(self):
    # Prepare test data
    bucket_name = "testbucket"
    entity_count = 200
    db.delete(RetryCount.all())

    for i in range(entity_count):
        TestEntity(data=str(i)).put()
        TestEntity(data=str(i)).put()

    # Run Mapreduce
    p = mapreduce_pipeline.MapreducePipeline(
        "test",
        __name__ + ".test_mapreduce_map",
        __name__ + ".test_mapreduce_reduce",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        output_writer_spec=(__name__ + ".TestFileRecordsOutputWriter"),
        mapper_params={
            "input_reader": {
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
        },
        reducer_params={
            "output_writer": {
                "bucket_name": bucket_name
            },
        },
        shards=16)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

    # Verify reduce output.
    p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
    self.assertEqual(model.MapreduceState.RESULT_SUCCESS,
                     p.outputs.result_status.value)
    output_data = []
    retries = 0
    for output_file in p.outputs.default.value:
        # Get the number of shard retries by parsing the filename.
        retries += (int(output_file[-1]) - 1)
        with cloudstorage.open(output_file) as f:
            for record in records.RecordsReader(f):
                output_data.append(record)

    # Assert the file names also suggest the right number of retries.
    self.assertEquals(44, retries)
    expected_data = [str((str(d), ["", ""])) for d in range(entity_count)]
    expected_data.sort()
    output_data.sort()
    self.assertEquals(expected_data, output_data)
def run(self, job_name, params, shard_count):
    yield mapreduce_pipeline.MapreducePipeline(
        job_name,
        __name__ + "._extact_domain_map",
        __name__ + "._grouped_domain_reduce",
        "mapreduce.input_readers.DatastoreInputReader",
        "mapreduce.output_writers.BlobstoreOutputWriter",
        mapper_params=params,
        reducer_params={
            "mime_type": "text/plain",
        },
        shards=shard_count)
def CreatePopularPagesPipeline(start_datetime):
    return mapreduce_pipeline.MapreducePipeline(
        'popular-pages',
        FullName(recommendations.PopularPagesMap),
        FullName(recommendations.PopularPagesReduce),
        'mapreduce.input_readers.DatastoreInputReader',
        mapper_params={
            'entity_kind': FullName(models.PageRating),
            'start_datetime': SerializeDatetime(start_datetime)
        },
        reducer_params={'start_datetime': SerializeDatetime(start_datetime)},
        shards=DEFAULT_SHARDS)
def run(self):
    yield mapreduce_pipeline.MapreducePipeline(
        'IndexingMapReduce',
        'index_mapreduce.index.index_map',
        'index_mapreduce.index.index_reduce',
        'mapreduce.input_readers.DatastoreInputReader',
        'mapreduce.output_writers.BlobstoreOutputWriter',
        mapper_params={
            'entity_kind': 'models.Feed',
        },
        reducer_params={'mime_type': 'text/plain'},
        shards=4)
def testCombiner(self):
    """Test running with a low values count but with a combiner."""
    # Prepare test data
    entity_count = 200

    for i in range(entity_count):
        TestEntity(data=str(i)).put()
        TestEntity(data=str(i)).put()

    p = mapreduce_pipeline.MapreducePipeline(
        "test",
        __name__ + ".test_combiner_map",
        __name__ + ".test_combiner_reduce",
        combiner_spec=__name__ + ".TestCombiner",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        output_writer_spec=output_writers.__name__ + ".GoogleCloudStorageOutputWriter",
        mapper_params={
            "entity_kind": __name__ + ".TestEntity",
        },
        reducer_params={
            "output_writer": {
                "bucket_name": "testbucket"
            },
        },
        shards=4)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
    self.assertEquals(4, len(p.outputs.default.value))

    file_content = []
    for input_file in p.outputs.default.value:
        with cloudstorage.open(input_file) as infile:
            for line in infile:
                file_content.append(line.strip())

    file_content = sorted(file_content)
    self.assertEquals(
        ["('0', 9800)", "('1', 9900)", "('2', 10000)", "('3', 10100)"],
        file_content)

    self.assertTrue(TestCombiner.invocations)
    for invocation in TestCombiner.invocations:
        key = invocation[0]
        values = invocation[1]
        self.assertTrue(key)
        self.assertTrue(values)
        self.assertEquals(1, len(values))
        self.assertTrue(int(values[0]) % 4 == int(key))
def run(self, list_id, entity_kind, query_pickle):
    yield mapreduce_pipeline.MapreducePipeline(
        'cache_list_items',
        'soc.mapreduce.cache_list_items.mapProcess',
        'soc.mapreduce.cache_list_items.reduceProcess',
        'mapreduce.input_readers.DatastoreInputReader',
        mapper_params={
            'list_id': list_id,
            'entity_kind': entity_kind,
            'query_pickle': query_pickle
        },
        reducer_params={'list_id': list_id},
        shards=_NO_OF_SHARDS)
def start_count_subscriptions():
    """Kicks off the MapReduce for determining and saving subscription counts."""
    job = mapreduce_pipeline.MapreducePipeline(
        'Count subscriptions',
        'offline_jobs.count_subscriptions_for_topic',
        'offline_jobs.save_subscription_counts_for_topic',
        'mapreduce.input_readers.DatastoreInputReader',
        mapper_params=dict(entity_kind='main.Subscription'),
        shards=4)
    # TODO(bslatkin): Pass through the queue name to run the job on. This is
    # a limitation in the mapper library.
    job.start()
    return job.pipeline_id
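# A minimal sketch of how a caller might poll the pipeline_id returned by
# start_count_subscriptions() above. The function name is hypothetical;
# from_id() and outputs.result_status are the same accessors used by the tests
# earlier in this section, and result_status is only filled once the pipeline
# has finished.
def get_count_subscriptions_status(pipeline_id):
    job = mapreduce_pipeline.MapreducePipeline.from_id(pipeline_id)
    if job is None or not job.has_finalized:
        return 'running'
    return job.outputs.result_status.value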
def DjangoModelMapreduce(
        model,
        mapper,
        reducer,
        keys_only=False,
        output_writer="mapreduce.output_writers.BlobstoreOutputWriter",
        extra_mapper_params=None,
        extra_reducer_params=None,
        shards=None):
    """A simple wrapper function for creating mapreduce jobs over a Django model.

    Args:
        model: A Django model class
        mapper: A top-level function that takes a single argument, and yields
            zero or many two-tuples of strings
        reducer: A top-level function that takes two arguments and yields zero
            or more values
        keys_only: If True, read only datastore keys instead of full entities
        output_writer: An optional OutputWriter subclass name; defaults to
            'mapreduce.output_writers.BlobstoreOutputWriter'
        extra_mapper_params: An optional dictionary of values to pass to the
            Mapper
        extra_reducer_params: An optional dictionary of values to pass to the
            Reducer
        shards: An optional number of shards to use
    """
    if keys_only:
        input_reader_spec = "mapreduce.input_readers.DatastoreKeyInputReader"
        mapper_params = {"entity_kind": model._meta.db_table}
    else:
        input_reader_spec = "djangoappengine.mapreduce.input_readers.DjangoModelInputReader"
        mapper_params = {"entity_kind": _convert_model_to_string(model)}

    if extra_mapper_params:
        mapper_params.update(extra_mapper_params)

    reducer_params = {"mime_type": "text/plain"}
    if extra_reducer_params:
        reducer_params.update(extra_reducer_params)

    mapper_spec = _convert_func_to_string(mapper)
    reducer_spec = _convert_func_to_string(reducer)

    return mapreduce_pipeline.MapreducePipeline(
        "%s-%s-%s-mapreduce" % (model._meta.object_name, mapper_spec, reducer_spec),
        mapper_spec,
        reducer_spec,
        input_reader_spec,
        output_writer,
        mapper_params=mapper_params,
        reducer_params=reducer_params,
        shards=shards)
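# A minimal usage sketch for DjangoModelMapreduce() above. The model, mapper,
# and reducer are hypothetical; the wrapper only requires a mapper that yields
# (key, value) string pairs per entity and a reducer that yields output values
# for each key.
def count_by_category_map(article):       # hypothetical mapper
    yield (article.category, "1")

def count_by_category_reduce(key, values):  # hypothetical reducer
    yield "%s: %d\n" % (key, len(values))

# Example call, assuming a hypothetical Django model named Article:
# pipeline = DjangoModelMapreduce(Article, count_by_category_map,
#                                 count_by_category_reduce, shards=8)
# pipeline.start()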