def testHandlerSerialization(self):
    """Test serializable handler works with MR and shard retry."""
    entity_count = 10

    for _ in range(entity_count):
        TestEntity(int_property=-1).put()

    # Force handler to serialize on every call.
    parameters.config._SLICE_DURATION_SEC = 0

    control.start_map(
        "test_map",
        __name__ + ".SerializableHandler",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
        },
        shard_count=1,
        base_path="/mapreduce_base_path")

    task_run_counts = test_support.execute_until_empty(self.taskqueue)
    self.assertEquals(
        task_run_counts[handlers.MapperWorkerCallbackHandler],
        # Shard retries + one per entity + one to exhaust input reader + one for
        # finalization.
        SerializableHandler.FAILURES_INDUCED_BY_INSTANCE + entity_count + 1 + 1)

    vals = [e.int_property for e in TestEntity.all()]
    vals.sort()
    # SerializableHandler updates int_property to be incremental from 0 to 9.
    self.assertEquals(range(10), vals)

def _run_test(self, num_shards, num_files):
    bucket_name = "testing"
    object_prefix = "file-"
    job_name = "test_map"
    input_class = (input_readers.__name__ + "." +
                   input_readers._GoogleCloudStorageInputReader.__name__)

    expected_content = self.create_test_content(bucket_name,
                                                object_prefix,
                                                num_files)

    control.start_map(
        job_name,
        __name__ + "." + "_input_reader_memory_mapper",
        input_class,
        {
            "input_reader": {
                "bucket_name": bucket_name,
                "objects": [object_prefix + "*"]
            },
        },
        shard_count=num_shards)

    test_support.execute_until_empty(self.taskqueue)

    # list.sort() sorts in place and returns None, so sort first and then
    # compare; comparing the return values of sort() would always pass.
    expected_content.sort()
    _memory_mapper_data.sort()
    self.assertEqual(expected_content, _memory_mapper_data)

def start_map(
    fbl,
    name,
    handler_spec,
    entity_kind,
    filters=None,
    handle_batch_size=None,
    output_writer_spec=None,
    output_writer=None,
    queue='slow-queue',
    extra_mapper_params=None,
    randomize_tokens=True,
):
    filters = filters or []
    output_writer = output_writer or {}
    extra_mapper_params = extra_mapper_params or {}
    mapper_params = {
        'entity_kind': entity_kind,
        'handle_batch_size': handle_batch_size,
        'filters': filters,
        'output_writer': output_writer,
    }
    mapper_params.update(get_fblookup_params(fbl, randomize_tokens=randomize_tokens))
    mapper_params.update(extra_mapper_params)
    control.start_map(
        name=name,
        reader_spec='mapreduce.input_readers.DatastoreInputReader',
        handler_spec=handler_spec,
        output_writer_spec=output_writer_spec,
        # since we want to stick it in the slow-queue, and don't care how fast it executes
        shard_count=16,
        queue_name=queue,
        mapper_parameters=mapper_params,
    )

def get(self):
    mapreduce_params = {
        'entity_kind': 'models.Topic',
    }
    control.start_map("DeleteOldUpdates",
                      "mapjob.keep_thirty_updates",
                      "mapreduce.input_readers.DatastoreInputReader",
                      mapreduce_params,
                      2)
    self.response.out.write("ok")

def testHugeTaskUseDatastore(self):
    """Test map job with huge parameter values."""
    input_data = [str(i) for i in range(100)]

    bucket_name = "testbucket"
    test_filename = "testfile"
    full_filename = "/%s/%s" % (bucket_name, test_filename)

    with cloudstorage.open(full_filename, mode="w") as f:
        with records.RecordsWriter(f) as w:
            for record in input_data:
                w.write(record)

    control.start_map(
        "test_map",
        __name__ + ".TestHandler",
        input_readers.__name__ + ".GoogleCloudStorageRecordInputReader",
        {
            "input_reader": {
                "bucket_name": bucket_name,
                "objects": [test_filename],
                # the parameter can't be compressed and wouldn't fit into
                # taskqueue payload
                "huge_parameter": random_string(900000)
            }
        },
        shard_count=4,
        base_path="/mapreduce_base_path")

    test_support.execute_until_empty(self.taskqueue)
    self.assertEquals(100, len(TestHandler.processed_entites))
    self.assertEquals([], model._HugeTaskPayload.all().fetch(100))

def testRecordsReader(self):
    """End-to-end test for records reader."""
    input_data = [str(i) for i in range(100)]

    bucket_name = "testbucket"
    test_filename = "testfile"
    full_filename = "/%s/%s" % (bucket_name, test_filename)

    with cloudstorage.open(full_filename, mode="w") as f:
        with records.RecordsWriter(f) as w:
            for record in input_data:
                w.write(record)

    control.start_map(
        "test_map",
        __name__ + ".TestHandler",
        input_readers.__name__ + ".GoogleCloudStorageRecordInputReader",
        {
            "input_reader": {
                "bucket_name": bucket_name,
                "objects": [test_filename]
            }
        },
        shard_count=4,
        base_path="/mapreduce_base_path")

    test_support.execute_until_empty(self.taskqueue)
    self.assertEquals(100, len(TestHandler.processed_entites))

def testHandlerSerialization(self):
    """Test serializable handler works with MR and shard retry."""
    entity_count = 10

    for _ in range(entity_count):
        TestEntity(int_property=-1).put()

    # Force handler to serialize on every call.
    parameters.config._SLICE_DURATION_SEC = 0

    control.start_map(
        "test_map",
        __name__ + ".SerializableHandler",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
        },
        shard_count=1,
        base_path="/mapreduce_base_path")

    task_run_counts = test_support.execute_until_empty(self.taskqueue)
    self.assertEquals(
        task_run_counts[handlers.MapperWorkerCallbackHandler],
        # Shard retries + one per entity + one to exhaust input reader
        SerializableHandler.FAILURES_INDUCED_BY_INSTANCE + entity_count + 1)

    vals = [e.int_property for e in TestEntity.all()]
    vals.sort()
    # SerializableHandler updates int_property to be incremental from 0 to 9.
    self.assertEquals(range(10), vals)

def testHandlerSerialization(self):
    """Test serializable handler works with MR and shard retry."""
    entity_count = 10

    for _ in range(entity_count):
        TestEntity(int_property=-1).put()

    # Force handler to serialize on every call.
    handlers._SLICE_DURATION_SEC = 0

    control.start_map(
        "test_map",
        __name__ + ".SerializableHandler",
        "mapreduce.input_readers.DatastoreInputReader",
        {"entity_kind": __name__ + "." + TestEntity.__name__},
        shard_count=1,
        base_path="/mapreduce_base_path",
    )

    task_run_counts = test_support.execute_until_empty(self.taskqueue)
    self.assertEquals(
        task_run_counts[handlers.MapperWorkerCallbackHandler],
        entity_count + 1 + SerializableHandler.TASKS_CONSUMED_BY_RETRY,
    )

    vals = [e.int_property for e in TestEntity.all()]
    vals.sort()
    # SerializableHandler updates int_property to be incremental from 0 to 9.
    self.assertEquals(range(10), vals)

def get(self):
    # TODO(qyearsley): Add test coverage. See catapult:#1346.
    name = "Update test deprecation status."
    handler = "dashboard.mr.DeprecateTestsMapper"
    reader = "mapreduce.input_readers.DatastoreInputReader"
    mapper_parameters = {
        "entity_kind": "dashboard.models.graph_data.TestMetadata",
        "filters": []
    }
    mr_control.start_map(name, handler, reader, mapper_parameters)

def get(self):
    name = 'Update test deprecation status.'
    handler = 'dashboard.mr.DeprecateTestsMapper'
    reader = 'mapreduce.input_readers.DatastoreInputReader'
    mapper_parameters = {
        'entity_kind': 'dashboard.models.graph_data.TestMetadata',
        'filters': [],
    }
    mr_control.start_map(name, handler, reader, mapper_parameters)

def get(self):
    name = 'Update anomalies with units.'
    handler = 'dashboard.mr.StoreUnitsInAnomalyEntity'
    reader = 'mapreduce.input_readers.DatastoreInputReader'
    mapper_parameters = {
        'entity_kind': 'dashboard.models.graph_data.Anomaly',
        'filters': [],
    }
    mr_control.start_map(name, handler, reader, mapper_parameters)

def get(self):
    control.start_map(
        name='Compute User-Event Stats',
        reader_spec='mapreduce.input_readers.DatastoreInputReader',
        handler_spec='users.user_event_tasks.map_compute_user_stats',
        mapper_parameters={'entity_kind': 'users.users.User'},
        queue_name='fast-queue',
        shard_count=5,
    )

def get(self):
    # TODO(qyearsley): Add test coverage. See http://crbug.com/447432
    name = 'Update test deprecation status.'
    handler = 'dashboard.mr.DeprecateTestsMapper'
    reader = 'mapreduce.input_readers.DatastoreInputReader'
    mapper_parameters = {
        'entity_kind': 'dashboard.models.graph_data.Test',
        'filters': [('has_rows', '=', True), ('deprecated', '=', False)],
    }
    mr_control.start_map(name, handler, reader, mapper_parameters)

def delete_all_questions():
    logging.info("Delete all existing questions")
    ctrl.start_map("Delete all Question entities",
                   'locql.delete_entity',
                   'mapreduce.input_readers.DatastoreKeyInputReader',
                   {'entity_kind': 'locql.Question'})
    ctrl.start_map("Delete all TermStat entities",
                   'locql.delete_entity',
                   'mapreduce.input_readers.DatastoreKeyInputReader',
                   {'entity_kind': 'locql.TermStat'})

def get(self):
    # TODO(qyearsley): Add test coverage. See catapult:#1346.
    name = 'Update test deprecation status.'
    handler = 'dashboard.mr.DeprecateTestsMapper'
    reader = 'mapreduce.input_readers.DatastoreInputReader'
    mapper_parameters = {
        'entity_kind': 'dashboard.models.graph_data.TestMetadata',
        'filters': [('has_rows', '=', True), ('deprecated', '=', False)],
    }
    mr_control.start_map(name, handler, reader, mapper_parameters)

def get(self):
    table = 'rankings.cities.City'
    control.start_map(
        name='Delete %s' % table,
        reader_spec='mapreduce.input_readers.DatastoreInputReader',
        handler_spec='dancedeets.servlets.tools.delete_table',
        mapper_parameters={
            'entity_kind': table,
        },
    )

def get(self):
    control.start_map(
        name='Fixup Events',
        reader_spec='mapreduce.input_readers.DatastoreInputReader',
        handler_spec='dancedeets.servlets.tools.resave_object',
        mapper_parameters={
            'entity_kind': 'dancedeets.events.eventdata.DBEvent',
        },
        shard_count=16,
    )

def get(self):
    table = self.request.get('table')  # users.users.User or events.eventdata.DBEvent or ...
    control.start_map(
        name='Resave %s' % table,
        reader_spec='mapreduce.input_readers.DatastoreInputReader',
        handler_spec='servlets.tools.resave_table',
        mapper_parameters={
            'entity_kind': table,
        },
    )

def get(self):
    mr_control.start_map(
        self.request.get("name"),
        self.request.get("reader_spec", "your_mapreduce.map"),
        self.request.get("reader_parameters",
                         "mapreduce.input_readers.DatastoreInputReader"),
        {
            "entity_kind": self.request.get("entity_kind", "models.YourModel"),
            "processing_rate": int(self.request.get("processing_rate", 100))
        },
        mapreduce_parameters={
            "done_callback": self.request.get("done_callback", None)
        }
    )
    self.response.out.write("MapReduce scheduled")

def begin_user_ranking_calculations():
    control.start_map(
        name='Compute City Rankings by Users',
        reader_spec='mapreduce.input_readers.DatastoreInputReader',
        handler_spec='dancedeets.rankings.rankings.count_user_for_city',
        mapper_parameters={'entity_kind': 'dancedeets.users.users.User'},
        queue_name='fast-queue',
        shard_count=16,
        _app=USER_FOR_CITY_RANKING,
    )
    _compute_summary(expiry=5 * 60)  # 5 minutes

def get(self):
    table = self.request.get('table')  # users.users.User or events.eventdata.DBEvent or ...
    control.start_map(
        name='Resave %s' % table,
        reader_spec='mapreduce.input_readers.DatastoreInputReader',
        handler_spec='dancedeets.servlets.tools.resave_table',
        mapper_parameters={
            'entity_kind': table,
        },
    )

def start_map(name, params=None, eta=None, countdown=None):
    for config in status.MapReduceYaml.to_dict(status.get_mapreduce_yaml()):
        if config.get('name') == name:
            break

    # Add the mapreduce specific parameters to the params dictionary
    config['mapper_params'].update(params if params else {})

    control.start_map(config['name'],
                      config['mapper_handler'],
                      config['mapper_input_reader'],
                      config['mapper_params'],
                      eta=eta,
                      countdown=countdown)

def mr_import_fc_file(blob_key):
    logging.info('mr_import_fc_file start bk:%s' % (blob_key))
    mr_ctl.start_map(
        name='Import a fortune file',
        handler_spec='main.mr_import_fc_line',
        reader_spec='mapreduce.input_readers.BlobstoreLineInputReader',
        mapper_parameters={
            # FIXME add blob key to done callback
            'done_callback': '/mr_done',
            'blob_keys': str(blob_key),
        },
    )
    logging.info('mr_import_fc_file end')

def get(self):
    processing_rate = 3
    shard_count = 2
    control.start_map(
        "Iterate over all Dummy objects in the DB",  # this is an arbitrary description string
        "tasks.mapper_function",  # this is the function that will be run for each entity
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "entity_kind": "dummy.Dummy",  # the model that you will iterate over
            "processing_rate": processing_rate,  # how many entities will each shard process
        },
        shard_count=shard_count,  # how many shards will be created by every MR controller
        queue_name="default",  # the name of the queue used for this MR's jobs; default minimizes config
    )

def run(self, job_name, handler_spec, input_reader_spec,
        output_writer_spec=None, params=None, shards=None):
    """Overriding this method allows us to pass the base_path properly.

    It's not pretty, but it's the cleanest way that still gives us a working
    Pipeline that we can chain.
    """
    if shards is None:
        shards = parameters.config.SHARD_COUNT

    mapreduce_id = control.start_map(
        job_name,
        handler_spec,
        input_reader_spec,
        params or {},
        mapreduce_parameters={
            "done_callback": self.get_callback_url(),
            "done_callback_method": "GET",
            "pipeline_id": self.pipeline_id,
            "base_path": BASE_PATH,
        },
        shard_count=shards,
        output_writer_spec=output_writer_spec,
        queue_name=self.queue_name,
    )
    self.fill(self.outputs.job_id, mapreduce_id)
    self.set_status(console_url="%s/detail?mapreduce_id=%s" %
                    (parameters.config.BASE_PATH, mapreduce_id))

def testMultipleShards(self):
    entity_count = 1000

    for _ in range(entity_count):
        TestEntity().put()

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler_yield_key_str",
        DATASTORE_READER_NAME,
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
            "output_sharding": "input",
        },
        shard_count=4,
        base_path="/mapreduce_base_path",
        output_writer_spec=BLOBSTORE_WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)

    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = output_writers.BlobstoreOutputWriter.get_filenames(
        mapreduce_state)
    self.assertEqual(4, len(set(filenames)))

    file_lengths = []
    for filename in filenames:
        self.assertTrue(filename.startswith("/blobstore/"))
        self.assertFalse(filename.startswith("/blobstore/writable:"))

        with files.open(filename, "r") as f:
            data = f.read(10000000)
            file_lengths.append(len(data.strip().split("\n")))

    self.assertEqual(1000, sum(file_lengths))

def _runTest(self, num_shards):
    entity_count = 1000
    bucket_name = "bucket"
    job_name = "test_map"

    for _ in range(entity_count):
        TestEntity().put()

    mapreduce_id = control.start_map(
        job_name,
        __name__ + ".test_handler_yield_key_str",
        DATASTORE_READER_NAME,
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
            "output_writer": {
                "bucket_name": bucket_name,
            },
        },
        shard_count=num_shards,
        output_writer_spec=self.WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)

    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = self.WRITER_CLS.get_filenames(mapreduce_state)

    self.assertEqual(num_shards, len(filenames))
    total_entries = 0
    for shard in range(num_shards):
        self.assertTrue(filenames[shard].startswith("/%s/%s" % (bucket_name, job_name)))
        data = cloudstorage.open(filenames[shard]).read()
        # strip() is used to remove the last newline of each file so that
        # split() does not return extraneous empty entries.
        total_entries += len(data.strip().split("\n"))
    self.assertEqual(entity_count, total_entries)

def begin_event_ranking_calculations(vertical):
    filters = [('verticals', '=', vertical)]
    control.start_map(
        name='Compute City Rankings by %s Events' % vertical,
        reader_spec='mapreduce.input_readers.DatastoreInputReader',
        handler_spec='dancedeets.rankings.rankings.count_event_for_city',
        mapper_parameters={
            'entity_kind': 'dancedeets.events.eventdata.DBEvent',
            'filters': filters,
        },
        queue_name='fast-queue',
        shard_count=16,
        _app=_get_app_id(EVENT_FOR_CITY_RANKING, vertical),
    )
    _compute_summary(expiry=5 * 60)  # 5 minutes

def testHugeTaskUseDatastore(self):
    """Test map job with huge parameter values."""
    input_file = files.blobstore.create()
    input_data = [str(i) for i in range(100)]

    with files.open(input_file, "a") as f:
        with records.RecordsWriter(f) as w:
            for record in input_data:
                w.write(record)
    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".TestHandler",
        "mapreduce.input_readers.RecordsReader",
        {
            "file": input_file,
            # the parameter can't be compressed and wouldn't fit into
            # taskqueue payload
            "huge_parameter": random_string(900000)
        },
        shard_count=4,
        base_path="/mapreduce_base_path")

    test_support.execute_until_empty(self.taskqueue)
    self.assertEquals(100, len(TestHandler.processed_entites))
    self.assertEquals([], util._HugeTaskPayload.all().fetch(100))

def testStartMap_RaisingHooks(self):
    """Tests that MR can be scheduled with a dummy hook class installed.

    The dummy hook class raises NotImplementedError for all method calls so
    the default scheduling logic should be used.

    Most of start_map functionality is already tested by handlers_test.
    Just a smoke test is enough.
    """
    TestEntity().put()

    shard_count = 4
    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
        },
        shard_count,
        mapreduce_parameters={"foo": "bar"},
        base_path="/mapreduce_base_path",
        queue_name="crazy-queue",
        hooks_class_name=hooks.__name__ + "." + hooks.Hooks.__name__)

    self.validate_map_started(mapreduce_id)

def testMultipleShards(self):
    entity_count = 1000

    for _ in range(entity_count):
        TestEntity().put()

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler_yield_key_str",
        DATASTORE_READER_NAME,
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
            "output_sharding": "input",
        },
        shard_count=4,
        base_path="/mapreduce_base_path",
        output_writer_spec=BLOBSTORE_WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)

    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = output_writers.BlobstoreOutputWriter.get_filenames(
        mapreduce_state)
    self.assertEqual(4, len(filenames))

    file_lengths = []
    for filename in filenames:
        self.assertTrue(filename.startswith("/blobstore/"))
        self.assertFalse(filename.startswith("/blobstore/writable:"))

        with files.open(filename, "r") as f:
            data = f.read(10000000)
            file_lengths.append(len(data.strip().split("\n")))

    self.assertEqual(1000, sum(file_lengths))

def testStartMap_Eta(self):
    """Test that MR can be scheduled into the future.

    Most of start_map functionality is already tested by handlers_test.
    Just a smoke test is enough.
    """
    TestEntity().put()

    # MR should be scheduled into the future.
    eta = datetime.datetime.utcnow() + datetime.timedelta(hours=1)

    shard_count = 4
    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
        },
        shard_count,
        mapreduce_parameters={"foo": "bar"},
        base_path="/mapreduce_base_path",
        queue_name=self.QUEUE_NAME,
        eta=eta)

    task_eta = self.validate_map_started(mapreduce_id)
    self.assertEquals(eta.strftime("%Y/%m/%d %H:%M:%S"), task_eta)

def testStartMap_Hooks(self):
    """Tests that MR can be scheduled with a hook class installed.

    Most of start_map functionality is already tested by handlers_test.
    Just a smoke test is enough.
    """
    TestEntity().put()

    shard_count = 4
    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
        },
        shard_count,
        mapreduce_parameters={"foo": "bar"},
        base_path="/mapreduce_base_path",
        queue_name="crazy-queue",
        hooks_class_name=__name__ + "." + TestHooks.__name__)

    self.assertTrue(mapreduce_id)
    task, queue_name = TestHooks.enqueue_kickoff_task_calls[0]
    self.assertEqual(
        "/mapreduce_base_path/kickoffjob_callback/" + mapreduce_id,
        task.url)
    self.assertEqual("crazy-queue", queue_name)

def testStartMap_Countdown(self):
    """Test that MR can be scheduled into the future.

    Most of start_map functionality is already tested by handlers_test.
    Just a smoke test is enough.
    """
    TestEntity().put()

    # MR should be scheduled into the future.
    now_sec = long(time.time())

    shard_count = 4
    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
        },
        shard_count,
        mapreduce_parameters={"foo": "bar"},
        base_path="/mapreduce_base_path",
        queue_name=self.QUEUE_NAME,
        countdown=1000)

    task_eta = self.validate_map_started(mapreduce_id)
    eta_sec = time.mktime(time.strptime(task_eta, "%Y/%m/%d %H:%M:%S"))
    self.assertTrue(now_sec + 1000 <= eta_sec)

def _runTest(self, num_shards):
    entity_count = 1000
    bucket_name = "bucket"
    job_name = "test_map"

    for _ in range(entity_count):
        TestEntity().put()

    mapreduce_id = control.start_map(
        job_name,
        __name__ + ".test_handler_yield_key_str",
        DATASTORE_READER_NAME,
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
            "output_writer": {
                "bucket_name": bucket_name,
            },
        },
        shard_count=num_shards,
        output_writer_spec=self.WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)

    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = self.WRITER_CLS.get_filenames(mapreduce_state)

    self.assertEqual(num_shards, len(set(filenames)))
    total_entries = 0
    for shard in range(num_shards):
        self.assertTrue(filenames[shard].startswith(
            "/%s/%s" % (bucket_name, job_name)))
        data = cloudstorage.open(filenames[shard]).read()
        # strip() is used to remove the last newline of each file so that
        # split() does not return extraneous empty entries.
        total_entries += len(data.strip().split("\n"))
    self.assertEqual(entity_count, total_entries)

def testDedicatedParams(self):
    entity_count = 1000

    for _ in range(entity_count):
        TestEntity().put()

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler_yield_key_str",
        DATASTORE_READER_NAME,
        {
            "input_reader": {
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
            "output_writer": {
                "filesystem": "gs",
                "gs_bucket_name": "bucket",
            },
        },
        shard_count=4,
        base_path="/mapreduce_base_path",
        output_writer_spec=FILE_WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)

    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = output_writers.FileOutputWriter.get_filenames(
        mapreduce_state)
    self.assertEqual(1, len(filenames))
    self.assertTrue(filenames[0].startswith("/gs/bucket/"))

    with files.open(filenames[0], "r") as f:
        data = f.read(10000000)
        self.assertEquals(1000, len(data.strip().split("\n")))

def testHugeTaskUseDatastore(self):
    """Test map job with huge parameter values."""
    input_file = files.blobstore.create()
    input_data = [str(i) for i in range(100)]

    with files.open(input_file, "a") as f:
        with records.RecordsWriter(f) as w:
            for record in input_data:
                w.write(record)
    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".TestHandler",
        "mapreduce.input_readers.RecordsReader",
        {
            "file": input_file,
            # the parameter can't be compressed and wouldn't fit into
            # taskqueue payload
            "huge_parameter": random_string(900000)
        },
        shard_count=4,
        base_path="/mapreduce_base_path")

    test_support.execute_until_empty(self.taskqueue)
    self.assertEquals(100, len(TestHandler.processed_entites))
    self.assertEquals([], model._HugeTaskPayload.all().fetch(100))

def testSingleShard(self):
    entity_count = 1000

    for _ in range(entity_count):
        TestEntity().put()

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler_yield_key_str",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
        },
        shard_count=4,
        base_path="/mapreduce_base_path",
        output_writer_spec=BLOBSTORE_WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)

    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = output_writers.BlobstoreOutputWriter.get_filenames(
        mapreduce_state)
    self.assertEqual(1, len(filenames))
    blob_name = filenames[0]
    self.assertTrue(blob_name.startswith("/blobstore/"))
    self.assertFalse(blob_name.startswith("/blobstore/writable:"))

    with files.open(blob_name, "r") as f:
        data = f.read(10000000)
        self.assertEquals(1000, len(data.strip().split("\n")))

def testRecordsReader(self):
    """End-to-end test for records reader."""
    input_file = files.blobstore.create()
    input_data = [str(i) for i in range(100)]

    with files.open(input_file, "a") as f:
        with records.RecordsWriter(f) as w:
            for record in input_data:
                w.write(record)
    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".TestHandler",
        "mapreduce.input_readers.RecordsReader",
        {
            "file": input_file
        },
        shard_count=4,
        base_path="/mapreduce_base_path")

    test_support.execute_until_empty(self.taskqueue)
    self.assertEquals(100, len(TestHandler.processed_entites))

def post(self):
    """ Generate data sets here """
    if self.request.get("generate"):
        # For SDK only generate 1k and less
        num_entries = int(self.request.get("num_entries"))
        user = self.request.get("user")
        name = self.request.get("name")
        char_per_word = int(self.request.get("char_per_word"))
        entries_pp = int(self.request.get("entries_per_pipe"))
        route = gen_data(num_entries, user, name, char_per_word, entries_pp)
        self.redirect('/data/wc')
        # pipeline seems broken
        #self.redirect(route)
    elif self.request.get("delete"):
        name = self.request.get("name")
        dataset = WCDataSet.get_by_key_name(name)
        num_entries = dataset.num_entries
        mapreduce_id = control.start_map(
            name="Word removal",
            handler_spec="data.wordcount.delete_dataset",
            reader_spec="mapreduce.input_readers.DatastoreInputReader",
            mapper_parameters={
                "entity_kind": "data.wordcount." + get_word_class(num_entries),
                "processing_rate": 200
            },
            shard_count=64,
            mapreduce_parameters={
                model.MapreduceSpec.PARAM_DONE_CALLBACK: '/data/wc/delete_callback'
            },
            queue_name="default",
        )
        dataset.state = "Deleting"
        dataset.mr_id = mapreduce_id
        dataset.put()
        self.redirect('/data/wc')

def testStartMap_Hooks(self):
    """Tests that MR can be scheduled with a hook class installed.

    Most of start_map functionality is already tested by handlers_test.
    Just a smoke test is enough.
    """
    TestEntity().put()

    shard_count = 4
    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
        },
        shard_count,
        mapreduce_parameters={"foo": "bar"},
        base_path="/mapreduce_base_path",
        queue_name="crazy-queue",
        hooks_class_name=__name__ + "." + TestHooks.__name__)

    self.assertTrue(mapreduce_id)
    task, queue_name = TestHooks.enqueue_kickoff_task_calls[0]
    self.assertEquals("/mapreduce_base_path/kickoffjob_callback", task.url)
    self.assertEquals("crazy-queue", queue_name)

def testDedicatedParams(self):
    entity_count = 1000

    for _ in range(entity_count):
        TestEntity().put()

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler_yield_key_str",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "input_reader": {
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
            "output_writer": {
                "filesystem": "gs",
                "gs_bucket_name": "bucket",
            },
        },
        shard_count=4,
        base_path="/mapreduce_base_path",
        output_writer_spec=FILE_WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)

    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = output_writers.FileOutputWriter.get_filenames(mapreduce_state)
    self.assertEqual(1, len(filenames))
    self.assertTrue(filenames[0].startswith("/gs/bucket/"))

    with files.open(filenames[0], "r") as f:
        data = f.read(10000000)
        self.assertEquals(1000, len(data.strip().split("\n")))

def get(self):
    token_nickname = self.request.get('token_nickname', None)
    mapper_params = {
        'entity_kind': 'dancedeets.events.eventdata.DBEvent',
        'handle_batch_size': 20,
        'filters': [('search_time_period', '=', dates.TIME_FUTURE)],
        'token_nickname': token_nickname,
    }
    control.start_map(
        name='Post Future Japan Events',
        reader_spec='mapreduce.input_readers.DatastoreInputReader',
        handler_spec='dancedeets.pubsub.pubsub_tasks.map_post_jp_event',
        # since we want to stick it in the slow-queue, and don't care how fast it executes
        shard_count=8,
        queue_name='fast-queue',
        mapper_parameters=mapper_params,
    )

def mr_delete_bad_sources():
    mapper_params = {
        'entity_kind': 'event_scraper.thing_db.Source',
        'output_writer': {
            'mime_type': 'text/plain',
            'bucket_name': 'dancedeets-hrd.appspot.com',
        },
    }
    control.start_map(
        name='Delete Bad Sources',
        reader_spec='mapreduce.input_readers.DatastoreInputReader',
        handler_spec='event_scraper.thing_scraper.delete_bad_source',
        output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        shard_count=8,
        queue_name='fast-queue',
        mapper_parameters=mapper_params,
    )

def UpdateUserTestBeaconCounts(request):
    """Starts mapreducer.UserTestBeaconCount."""
    mr_id = control.start_map(
        'UserTest beacon_count update',
        'base.mapreducer.UserTestBeaconCount',
        'mapreduce.input_readers.DatastoreInputReader',
        {'entity_kind': 'models.user_test.Test'})
    return http.HttpResponse('Started MR w/ ID:%s' % mr_id)

def testLotsOfNdbEntities(self):
    entity_count = 1000

    for _ in range(entity_count):
        NdbTestEntity().put()

    control.start_map(
        "test_map",
        __name__ + ".TestHandler",
        input_readers.__name__ + ".DatastoreInputReader",
        {
            "entity_kind": __name__ + "." + NdbTestEntity.__name__,
        },
        shard_count=4,
        base_path="/mapreduce_base_path")

    test_support.execute_until_empty(self.taskqueue)
    self.assertEquals(entity_count, len(TestHandler.processed_entites))