def test_run_wms_task(self, mock_chain):
    """Build the task chain for a WMS provider and check the run can still be read back.

    :param mock_chain: Injected mock; its ``apply_async`` result is stubbed below
        (presumably a patch of celery's ``chain`` -- the decorator is not visible here).
    """
    provider = DataProvider.objects.get(slug='wms')
    provider_task_record = DataProviderTask.objects.create(provider=provider)
    self.job.provider_tasks.add(provider_task_record)
    # celery chain mock
    mock_chain.return_value.apply_async.return_value = Mock()
    create_run(job_uid=self.job.uid)
    task_chain_builder = TaskChainBuilder()
    # Even though code using pipes seems to be supported here it is throwing an error.
    # The TypeError is deliberately tolerated so the assertions below still run.
    try:
        task_chain_builder.build_tasks(
            mapproxy_export_task,
            provider_task_uid=provider_task_record.uid,
            run=self.job.runs.first(),
            service_type='wms',
            worker="some_worker")
    except TypeError:
        pass
    run = self.job.runs.first()
    self.assertIsNotNone(run)
    tasks = run.provider_tasks.first().tasks.filter(name='Raster export (.gpkg)')
    # NOTE(review): if `tasks` is a Django QuerySet it is never None, so this
    # assertion cannot fail -- confirm whether an existence check was intended.
    self.assertIsNotNone(tasks)
def test_run_osm_task(self, mock_chain):
    """Build the task chain for the OSM provider and check the run can still be read back.

    :param mock_chain: Injected mock; not used directly in this test
        (presumably a patch of celery's ``chain`` -- the decorator is not visible here).
    """
    osm_provider_task = DataProviderTask.objects.create(
        provider=DataProvider.objects.get(slug="osm")
    )
    osm_provider_task.formats.add(self.shp_task)
    osm_provider_task.save()
    self.job.data_provider_tasks.add(osm_provider_task)

    create_run(job=self.job)
    builder = TaskChainBuilder()

    # Even though code using pipes seems to be supported here it is throwing an error.
    try:
        builder.build_tasks(
            osm_data_collection_task,
            provider_task_uid=osm_provider_task.uid,
            run=self.job.runs.first(),
            worker="some_worker",
        )
    except TypeError:
        pass

    export_run = self.job.runs.first()
    self.assertIsNotNone(export_run)

    first_record = export_run.data_provider_task_records.first()
    osm_tasks = first_record.tasks.filter(name="OSM(.gpkg)")
    self.assertIsNotNone(osm_tasks)
def test_run_wcs_task(self, mock_chain):
    """Build the task chain for a WCS export and check the run can still be read back.

    :param mock_chain: Injected mock; its ``apply_async`` result is stubbed below
        (presumably a patch of celery's ``chain`` -- the decorator is not visible here).
    """
    # NOTE(review): the provider is looked up with slug "wms" although this is
    # the WCS test -- confirm that this is the intended fixture.
    wcs_provider_task = DataProviderTask.objects.create(
        provider=DataProvider.objects.get(slug="wms")
    )
    self.job.data_provider_tasks.add(wcs_provider_task)

    # celery chain mock
    mock_chain.return_value.apply_async.return_value = Mock()

    create_run(job=self.job)
    builder = TaskChainBuilder()

    # Even though code using pipes seems to be supported here it is throwing an error.
    try:
        builder.build_tasks(
            wcs_export_task,
            provider_task_uid=wcs_provider_task.uid,
            run=self.job.runs.first(),
            service_type="wcs",
            worker="some_worker",
        )
    except TypeError:
        pass

    export_run = self.job.runs.first()
    self.assertIsNotNone(export_run)

    first_record = export_run.data_provider_task_records.first()
    geotiff_tasks = first_record.tasks.filter(name="Geotiff Format (.tif)")
    self.assertIsNotNone(geotiff_tasks)
def test_run_osm_task(self, mock_chain):
    """Build the task chain for the OSM provider and check the run can still be read back.

    :param mock_chain: Injected mock; not used directly in this test
        (presumably a patch of celery's ``chain`` -- the decorator is not visible here).
    """
    osm_provider_task = DataProviderTask.objects.create(
        provider=DataProvider.objects.get(slug='osm'))
    osm_provider_task.formats.add(self.shp_task)
    osm_provider_task.save()
    self.job.provider_tasks.add(osm_provider_task)

    create_run(job_uid=self.job.uid)
    builder = TaskChainBuilder()

    # Even though code using pipes seems to be supported here it is throwing an error.
    try:
        builder.build_tasks(
            osm_data_collection_task,
            provider_task_uid=osm_provider_task.uid,
            run=self.job.runs.first(),
            worker="some_worker")
    except TypeError:
        pass

    export_run = self.job.runs.first()
    self.assertIsNotNone(export_run)

    first_record = export_run.provider_tasks.first()
    osm_tasks = first_record.tasks.filter(name='OSM(.gpkg)')
    self.assertIsNotNone(osm_tasks)
def test_run_wcs_task(self, mock_chain):
    """Build the task chain for a WCS export and check the run can still be read back.

    :param mock_chain: Injected mock; its ``apply_async`` result is stubbed below
        (presumably a patch of celery's ``chain`` -- the decorator is not visible here).
    """
    # NOTE(review): the provider is looked up with slug 'wms' although this is
    # the WCS test -- confirm that this is the intended fixture.
    provider = DataProvider.objects.get(slug='wms')
    provider_task_record = DataProviderTask.objects.create(provider=provider)
    self.job.provider_tasks.add(provider_task_record)
    # celery chain mock
    mock_chain.return_value.apply_async.return_value = Mock()
    create_run(job_uid=self.job.uid)
    task_chain_builder = TaskChainBuilder()
    # Even though code using pipes seems to be supported here it is throwing an error.
    # The TypeError is deliberately tolerated so the assertions below still run.
    try:
        task_chain_builder.build_tasks(
            wcs_export_task,
            provider_task_uid=provider_task_record.uid,
            run=self.job.runs.first(),
            service_type='wcs',
            worker="some_worker")
    except TypeError:
        pass
    run = self.job.runs.first()
    self.assertIsNotNone(run)
    tasks = run.provider_tasks.first().tasks.filter(name='Geotiff Format (.tif)')
    # NOTE(review): if `tasks` is a Django QuerySet it is never None, so this
    # assertion cannot fail -- confirm whether an existence check was intended.
    self.assertIsNotNone(tasks)
def parse_tasks(
    self,
    worker=None,
    run_uid=None,
    user_details=None,
    run_zip_file_slug_sets=None,
    session_token=None,
    queue_group=None,
):
    """
    Build and dispatch the celery task chains for every unfinished provider of a run.

    This handles all of the logic for taking the information about the individual celery tasks and grouping
    them under specific providers.

    Each Provider (e.g. OSM) gets a chain:
        OSM_TASK -> FORMAT_TASKS = PROVIDER_SUBTASK_CHAIN
    They need to be finalized (was the task successful?) to update the database state:
        PROVIDER_SUBTASK_CHAIN -> FINALIZE_PROVIDER_TASK

    We also have an optional chain of tasks that get processed after the providers are run:
        AD_HOC_TASK1 -> AD_HOC_TASK2 -> FINALIZE_RUN_TASK = FINALIZE_RUN_TASK_COLLECTION

    If the PROVIDER_SUBTASK_CHAIN fails it needs to be cleaned up. The clean up task also calls the finalize
    provider task. This is because when a task fails the failed task will call an on_error (link_error) task
    and never return.
        PROVIDER_SUBTASK_CHAIN -> FINALIZE_PROVIDER_TASK
                |
                v
        CLEAN_UP_FAILURE_TASK -> FINALIZE_PROVIDER_TASK

    Now there needs to be some way for the finalize tasks to be called. Since we now have several possible
    forked paths, we need each path to check the state of the providers to see if they are all finished before
    moving on. It would be great if celery implicitly handled that, but it doesn't ever merge the forked paths.
    So we add a WAIT_FOR_PROVIDERS task to check state; once the providers are ready they call the final tasks.

        PROVIDER_SUBTASK_CHAIN -> FINALIZE_PROVIDER_TASK -> WAIT_FOR_PROVIDERS --+
                |                                                                +==> FINALIZE_RUN_TASK_COLLECTION
                v                                                                |
        CLEAN_UP_FAILURE_TASK -> FINALIZE_PROVIDER_TASK -> WAIT_FOR_PROVIDERS ---+

    Providers are skipped when their existing record for this run is already in a finished state, or when
    their provider type has no entry in ``self.type_task_map``.

    :param worker: A worker node (hostname) for a celery worker, this should match the node name used when
        starting the celery worker.
    :param run_uid: A uid to reference an ExportRun.
    :param user_details: Passed through to the created tasks; a placeholder dict is used when omitted.
    :param run_zip_file_slug_sets: Passed through to create_finalize_run_task_collection.
    :param session_token: Passed through to TaskChainBuilder.build_tasks.
    :param queue_group: Prefix used to build the "<queue_group>.priority" queue and routing key.
    :return: None. Each provider chain is dispatched with apply_async; the results are not returned.
    :raises Exception: If run_uid is not provided.
    """
    # This is just to make it easier to trace when user_details haven't been sent
    if user_details is None:
        user_details = {"username": "******"}

    if not run_uid:
        raise Exception("Cannot parse_tasks without a run uid.")

    run = ExportRun.objects.prefetch_related(
        "job__projections", "job__data_provider_tasks", "data_provider_task_records"
    ).get(uid=run_uid)
    job = run.job
    run_dir = get_run_staging_dir(run.uid)

    # NOTE(review): queue_group defaults to None, which yields the queue name
    # "None.priority" -- confirm that callers always supply it.
    priority_queue = f"{queue_group}.priority"
    wait_for_providers_settings = {
        "queue": priority_queue,
        "routing_key": priority_queue,
        "priority": TaskPriority.FINALIZE_PROVIDER.value,
    }
    finalize_task_settings = {
        "interval": 4,
        "max_retries": 10,
        "queue": priority_queue,
        "routing_key": priority_queue,
        "priority": TaskPriority.FINALIZE_RUN.value,
    }

    finalized_provider_task_chain_list = []
    # Create a task record which can hold tasks for the run (datapack)
    run_task_record, created = DataProviderTaskRecord.objects.get_or_create(
        run=run, name="run", slug="run", defaults={"status": TaskState.PENDING.value, "display": False}
    )
    if created:
        logger.info("New data provider task record created")
    # A record that already existed keeps whatever status it had, so always reset it to pending.
    run_task_record.status = TaskState.PENDING.value
    run_task_record.save()

    run_zip_task_chain = get_zip_task_chain(
        data_provider_task_record_uid=run_task_record.uid,
        worker=worker,
    )

    data_provider_tasks = job.data_provider_tasks.all()
    # Loop-invariant: per-provider zips are only worthwhile when the datapack has more than one source.
    has_multiple_providers = len(data_provider_tasks) > 1

    for data_provider_task in data_provider_tasks:
        provider = data_provider_task.provider

        data_provider_task_record = run.data_provider_task_records.filter(
            provider__slug=provider.slug
        ).first()
        if (
            data_provider_task_record
            and TaskState[data_provider_task_record.status] in TaskState.get_finished_states()
        ):
            # This provider already finished for this run; nothing to schedule.
            continue

        # Each task builder has a primary task which pulls the source data, grab that task here...
        type_name = provider.export_provider_type.type_name
        primary_export_task = self.type_task_map.get(type_name)
        if not primary_export_task:
            continue

        stage_dir = get_provider_staging_dir(run_dir, provider.slug)
        args = {
            "primary_export_task": primary_export_task,
            "user": job.user,
            "provider_task_uid": data_provider_task.uid,
            "stage_dir": stage_dir,
            "run": run,
            "service_type": type_name,
            "worker": worker,
            "user_details": user_details,
            "session_token": session_token,
        }

        (
            provider_task_record_uid,
            provider_subtask_chain,
        ) = TaskChainBuilder().build_tasks(**args)

        # Every provider chain ends with its own wait-for-providers signature.
        wait_for_providers_signature = wait_for_providers_task.s(
            run_uid=run_uid,
            locking_task_key=run_uid,
            callback_task=create_finalize_run_task_collection(
                run_uid=run_uid,
                run_provider_task_record_uid=run_task_record.uid,
                run_zip_task_chain=run_zip_task_chain,
                run_zip_file_slug_sets=run_zip_file_slug_sets,
                apply_args=finalize_task_settings,
            ),
            apply_args=finalize_task_settings,
        ).set(**wait_for_providers_settings)

        if provider_subtask_chain:
            # The finalize_export_provider_task will check all of the export tasks
            # for this provider and save the export provider's status.
            selection_task = create_task(
                data_provider_task_record_uid=provider_task_record_uid,
                worker=worker,
                stage_dir=stage_dir,
                task=output_selection_geojson_task,
                selection=job.the_geom.geojson,
                user_details=user_details,
            )

            # create signature to close out the provider tasks
            finalize_export_provider_signature = finalize_export_provider_task.s(
                data_provider_task_uid=provider_task_record_uid,
                status=TaskState.COMPLETED.value,
                locking_task_key=run_uid,
            ).set(**finalize_task_settings)

            # add zip if required
            # skip zip if there is only one source in the data pack (they would be redundant files).
            if provider.zip and has_multiple_providers:
                zip_export_provider_sig = get_zip_task_chain(
                    data_provider_task_record_uid=provider_task_record_uid,
                    data_provider_task_record_uids=[provider_task_record_uid],
                    worker=worker,
                )
                provider_subtask_chain = chain(provider_subtask_chain, zip_export_provider_sig)

            finalized_provider_task_chain_list.append(
                chain(
                    selection_task,
                    provider_subtask_chain,
                    finalize_export_provider_signature,
                    wait_for_providers_signature,
                )
            )

    # we kick off all of the sub-tasks at once down here rather than one at a time in the for loop above so
    # that if an error occurs earlier on in the method, all of the tasks will fail rather than an undefined
    # number of them. this simplifies error handling, because we don't have to deduce which tasks were
    # successfully kicked off and which ones failed.
    for item in finalized_provider_task_chain_list:
        item.apply_async(**finalize_task_settings)
def parse_tasks(self, worker=None, run_uid=None, user_details=None):
    """
    Build and dispatch the celery task chains for every provider of a run.

    This handles all of the logic for taking the information about the individual celery tasks and grouping
    them under specific providers.

    Each Provider (e.g. OSM) gets a chain:
        OSM_TASK -> FORMAT_TASKS = PROVIDER_SUBTASK_CHAIN
    They need to be finalized (was the task successful?) to update the database state:
        PROVIDER_SUBTASK_CHAIN -> FINALIZE_PROVIDER_TASK

    We also have an optional chain of tasks that get processed after the providers are run:
        AD_HOC_TASK1 -> AD_HOC_TASK2 -> FINALIZE_RUN_TASK = FINALIZE_RUN_TASK_COLLECTION

    If the PROVIDER_SUBTASK_CHAIN fails it needs to be cleaned up. The clean up task also calls the finalize
    provider task. This is because when a task fails the failed task will call an on_error (link_error) task
    and never return.
        PROVIDER_SUBTASK_CHAIN -> FINALIZE_PROVIDER_TASK
                |
                v
        CLEAN_UP_FAILURE_TASK -> FINALIZE_PROVIDER_TASK

    Now there needs to be some way for the finalize tasks to be called. Since we now have several possible
    forked paths, we need each path to check the state of the providers to see if they are all finished before
    moving on. It would be great if celery implicitly handled that, but it doesn't ever merge the forked paths.
    So we add a WAIT_FOR_PROVIDERS task to check state; once the providers are ready they call the final tasks.

        PROVIDER_SUBTASK_CHAIN -> FINALIZE_PROVIDER_TASK -> WAIT_FOR_PROVIDERS --+
                |                                                                +==> FINALIZE_RUN_TASK_COLLECTION
                v                                                                |
        CLEAN_UP_FAILURE_TASK -> FINALIZE_PROVIDER_TASK -> WAIT_FOR_PROVIDERS ---+

    Providers whose type has no entry in ``self.type_task_map`` are skipped.

    :param worker: A worker node (hostname) for a celery worker, this should match the node name used when
        starting the celery worker. It also names the "<worker>.finalize" queue.
    :param run_uid: A uid to reference an ExportRun. When it is falsy nothing is scheduled.
    :param user_details: Passed through to the created tasks; a placeholder dict is used when omitted.
    :return: None. Each provider chain is dispatched with apply_async; the results are not returned.
    """
    # This is just to make it easier to trace when user_details haven't been sent
    if user_details is None:
        user_details = {'username': '******'}

    if not run_uid:
        # Without a run there is nothing to schedule; this is a silent no-op.
        return

    # Mode for the staging directories. The previous decimal literal 6600 is
    # 0o14710 when read as a mode, so use the same octal mode as the run dir.
    staging_dir_mode = 0o750

    run = ExportRun.objects.get(uid=run_uid)
    job = run.job
    run_dir = get_run_staging_dir(run.uid)
    os.makedirs(run_dir, staging_dir_mode)

    finalize_queue = "{}.finalize".format(worker)
    wait_for_providers_settings = {
        'queue': finalize_queue,
        'routing_key': finalize_queue,
        'priority': TaskPriority.FINALIZE_PROVIDER.value
    }
    finalize_task_settings = {
        'interval': 4,
        'max_retries': 10,
        'queue': finalize_queue,
        'routing_key': finalize_queue,
        'priority': TaskPriority.FINALIZE_RUN.value
    }

    finalized_provider_task_chain_list = []
    # Create a task record which can hold tasks for the run (datapack)
    run_task_record = DataProviderTaskRecord.objects.create(
        run=run, name="run", slug="run", status=TaskStates.PENDING.value, display=False)

    stage_dir = get_provider_staging_dir(run_dir, run_task_record.slug)
    os.makedirs(stage_dir, staging_dir_mode)

    run_zip_task_chain = get_zip_task_chain(
        data_provider_task_uid=run_task_record.uid,
        stage_dir=get_run_staging_dir(run_uid),
        worker=worker)

    for provider_task_record in job.provider_tasks.all():
        provider = provider_task_record.provider

        # Each task builder has a primary task which pulls the source data, grab that task here...
        type_name = provider.export_provider_type.type_name
        primary_export_task = self.type_task_map.get(type_name)
        if not primary_export_task:
            continue

        stage_dir = get_provider_staging_dir(run_dir, provider.slug)
        os.makedirs(stage_dir, staging_dir_mode)

        args = {
            'primary_export_task': primary_export_task,
            'user': job.user,
            'provider_task_uid': provider_task_record.uid,
            'run': run,
            'stage_dir': stage_dir,
            'service_type': type_name,
            'worker': worker,
            'user_details': user_details
        }
        provider_task_uid, provider_subtask_chain = TaskChainBuilder().build_tasks(**args)

        # Every provider chain ends with its own wait-for-providers signature.
        wait_for_providers_signature = wait_for_providers_task.s(
            run_uid=run_uid,
            locking_task_key=run_uid,
            callback_task=create_finalize_run_task_collection(
                run_uid, run_dir, run_zip_task_chain, apply_args=finalize_task_settings),
            apply_args=finalize_task_settings).set(**wait_for_providers_settings)

        if provider_subtask_chain:
            # The finalize_export_provider_task will check all of the export tasks
            # for this provider and save the export provider's status.
            selection_task = create_task(
                data_provider_task_uid=provider_task_uid,
                stage_dir=stage_dir,
                worker=worker,
                task=output_selection_geojson_task,
                selection=job.the_geom.geojson,
                user_details=user_details)

            # create signature to close out the provider tasks
            finalize_export_provider_signature = finalize_export_provider_task.s(
                data_provider_task_uid=provider_task_uid,
                status=TaskStates.COMPLETED.value,
                locking_task_key=run_uid)

            # add zip if required
            if provider.zip:
                zip_export_provider_sig = get_zip_task_chain(
                    data_provider_task_uid=provider_task_uid,
                    stage_dir=stage_dir,
                    worker=worker)
                provider_subtask_chain = chain(provider_subtask_chain, zip_export_provider_sig)

            finalized_provider_task_chain_list.append(
                chain(selection_task,
                      provider_subtask_chain,
                      finalize_export_provider_signature,
                      wait_for_providers_signature))

    # we kick off all of the sub-tasks at once down here rather than one at a time in the for loop above so
    # that if an error occurs earlier on in the method, all of the tasks will fail rather than an undefined
    # number of them. this simplifies error handling, because we don't have to deduce which tasks were
    # successfully kicked off and which ones failed.
    for item in finalized_provider_task_chain_list:
        item.apply_async(**finalize_task_settings)