def test_run_wms_task(self, mock_chain):
        """Build the WMS (mapproxy) export chain for the job and look up the raster task.

        :param mock_chain: Mock standing in for the celery ``chain`` (presumably
            injected by a ``patch`` decorator that is outside this view).
        """
        provider = DataProvider.objects.get(slug='wms')
        provider_task_record = DataProviderTask.objects.create(
            provider=provider)
        self.job.provider_tasks.add(provider_task_record)
        # celery chain mock
        mock_chain.return_value.apply_async.return_value = Mock()
        create_run(job_uid=self.job.uid)
        task_chain_builder = TaskChainBuilder()
        # Even though code using pipes seems to be supported here it is throwing an error.
        try:
            task_chain_builder.build_tasks(
                mapproxy_export_task,
                provider_task_uid=provider_task_record.uid,
                run=self.job.runs.first(),
                service_type='wms',
                worker="some_worker")
        except TypeError:
            # Deliberately tolerated, see the comment above the try block.
            pass
        run = self.job.runs.first()
        self.assertIsNotNone(run)
        tasks = run.provider_tasks.first().tasks.filter(
            name='Raster export (.gpkg)')
        # NOTE(review): filter() presumably returns a (possibly empty) queryset,
        # which is never None, so this assertion cannot fail -- consider
        # asserting on tasks.exists() once the TypeError above is resolved.
        self.assertIsNotNone(tasks)
示例#2
0
    def test_run_osm_task(self, mock_chain):
        """Build the OSM collection chain for the job and look up the OSM task.

        :param mock_chain: Mock standing in for the celery ``chain`` (presumably
            injected by a ``patch`` decorator that is outside this view).
        """
        osm_provider = DataProvider.objects.get(slug="osm")
        osm_provider_task = DataProviderTask.objects.create(provider=osm_provider)
        osm_provider_task.formats.add(self.shp_task)
        osm_provider_task.save()
        self.job.data_provider_tasks.add(osm_provider_task)
        create_run(job=self.job)

        builder = TaskChainBuilder()

        # Even though code using pipes seems to be supported here it is throwing an error.
        try:
            # The keyword arguments are assembled inside the try block so that
            # everything evaluated for the call stays covered by the handler.
            build_kwargs = {
                "provider_task_uid": osm_provider_task.uid,
                "run": self.job.runs.first(),
                "worker": "some_worker",
            }
            builder.build_tasks(osm_data_collection_task, **build_kwargs)
        except TypeError:
            pass

        export_run = self.job.runs.first()
        self.assertIsNotNone(export_run)

        first_record = export_run.data_provider_task_records.first()
        osm_tasks = first_record.tasks.filter(name="OSM(.gpkg)")
        self.assertIsNotNone(osm_tasks)
示例#3
0
 def test_run_wcs_task(self, mock_chain):
     """Build the WCS export chain for the job and look up the GeoTIFF task.

     :param mock_chain: Mock standing in for the celery ``chain`` (presumably
         injected by a ``patch`` decorator that is outside this view).
     """
     # NOTE(review): the provider is fetched with the "wms" slug although the
     # chain is built with service_type "wcs" -- confirm this is intended.
     wcs_provider = DataProvider.objects.get(slug="wms")
     wcs_provider_task = DataProviderTask.objects.create(provider=wcs_provider)
     self.job.data_provider_tasks.add(wcs_provider_task)

     # celery chain mock
     mock_chain.return_value.apply_async.return_value = Mock()
     create_run(job=self.job)

     builder = TaskChainBuilder()
     # Even though code using pipes seems to be supported here it is throwing an error.
     try:
         # The keyword arguments are assembled inside the try block so that
         # everything evaluated for the call stays covered by the handler.
         build_kwargs = {
             "provider_task_uid": wcs_provider_task.uid,
             "run": self.job.runs.first(),
             "service_type": "wcs",
             "worker": "some_worker",
         }
         builder.build_tasks(wcs_export_task, **build_kwargs)
     except TypeError:
         pass

     export_run = self.job.runs.first()
     self.assertIsNotNone(export_run)

     first_record = export_run.data_provider_task_records.first()
     geotiff_tasks = first_record.tasks.filter(name="Geotiff Format (.tif)")
     self.assertIsNotNone(geotiff_tasks)
    def test_run_osm_task(self, mock_chain):
        """Build the OSM collection chain for the job and look up the OSM task.

        :param mock_chain: Mock standing in for the celery ``chain`` (presumably
            injected by a ``patch`` decorator that is outside this view).
        """
        osm_provider = DataProvider.objects.get(slug='osm')
        osm_provider_task = DataProviderTask.objects.create(provider=osm_provider)
        osm_provider_task.formats.add(self.shp_task)
        osm_provider_task.save()
        self.job.provider_tasks.add(osm_provider_task)
        create_run(job_uid=self.job.uid)

        builder = TaskChainBuilder()

        # Even though code using pipes seems to be supported here it is throwing an error.
        try:
            # The keyword arguments are assembled inside the try block so that
            # everything evaluated for the call stays covered by the handler.
            build_kwargs = {
                'provider_task_uid': osm_provider_task.uid,
                'run': self.job.runs.first(),
                'worker': "some_worker",
            }
            builder.build_tasks(osm_data_collection_task, **build_kwargs)
        except TypeError:
            pass

        export_run = self.job.runs.first()
        self.assertIsNotNone(export_run)

        first_record = export_run.provider_tasks.first()
        osm_tasks = first_record.tasks.filter(name='OSM(.gpkg)')
        self.assertIsNotNone(osm_tasks)
    def test_run_wcs_task(self, mock_chain):
        """Build the WCS export chain for the job and look up the GeoTIFF task.

        :param mock_chain: Mock standing in for the celery ``chain`` (presumably
            injected by a ``patch`` decorator that is outside this view).
        """
        # NOTE(review): the provider is fetched with the 'wms' slug although the
        # chain is built with service_type 'wcs' -- confirm this is intended.
        provider = DataProvider.objects.get(slug='wms')
        provider_task_record = DataProviderTask.objects.create(provider=provider)
        self.job.provider_tasks.add(provider_task_record)
        # celery chain mock
        mock_chain.return_value.apply_async.return_value = Mock()
        create_run(job_uid=self.job.uid)
        task_chain_builder = TaskChainBuilder()
        # Even though code using pipes seems to be supported here it is throwing an error.
        try:
            task_chain_builder.build_tasks(wcs_export_task,
                                           provider_task_uid=provider_task_record.uid, run=self.job.runs.first(),
                                           service_type='wcs',
                                           worker="some_worker")
        except TypeError:
            # Deliberately tolerated, see the comment above the try block.
            pass
        run = self.job.runs.first()
        self.assertIsNotNone(run)
        tasks = run.provider_tasks.first().tasks.filter(name='Geotiff Format (.tif)')
        # NOTE(review): filter() presumably returns a (possibly empty) queryset,
        # which is never None, so this assertion cannot fail -- consider
        # asserting on tasks.exists() once the TypeError above is resolved.
        self.assertIsNotNone(tasks)
示例#6
0
    def parse_tasks(
        self,
        worker=None,
        run_uid=None,
        user_details=None,
        run_zip_file_slug_sets=None,
        session_token=None,
        queue_group=None,
    ):
        """
        This handles all of the logic for taking the information about what individual celery tasks and groups
        them under specific providers.

        Each Provider (e.g. OSM) gets a chain:  OSM_TASK -> FORMAT_TASKS = PROVIDER_SUBTASK_CHAIN
        They need to be finalized (was the task successful?) to update the database state:
            PROVIDER_SUBTASK_CHAIN -> FINALIZE_PROVIDER_TASK

        We also have an optional chain of tasks that get processed after the providers are run:
            AD_HOC_TASK1 -> AD_HOC_TASK2 -> FINALIZE_RUN_TASK = FINALIZE_RUN_TASK_COLLECTION

        If the PROVIDER_SUBTASK_CHAIN fails it needs to be cleaned up.  The clean up task also calls the
        finalize provider task. This is because when a task fails the failed task will call an on_error (link_error)
        task and never return.
            PROVIDER_SUBTASK_CHAIN -> FINALIZE_PROVIDER_TASK
                   |
                   v
                CLEAN_UP_FAILURE_TASK -> FINALIZE_PROVIDER_TASK

        Now there needs to be some way for the finalize tasks to be called.  Since we now have several possible
        forked paths, we need each path to check the state of the providers to see if they are all finished before
        moving on.
        It would be great if celery implicitly handled that, but it doesn't ever merge the forked paths.
        So we add a WAIT_FOR_PROVIDERS task to check state; once the providers are ready they call the final tasks.

        PROVIDER_SUBTASK_CHAIN -> FINALIZE_PROVIDER_TASK -> WAIT_FOR_PROVIDERS   \
                   |                                                              ==> FINALIZE_RUN_TASK_COLLECTION
                   v                                                             /
            CLEAN_UP_FAILURE_TASK -> FINALIZE_PROVIDER_TASK -> WAIT_FOR_PROVIDERS


        :param worker: A worker node (hostname) for a celery worker, this should match the node name used when starting,
         the celery worker.
        :param run_uid: A uid to reference an ExportRun (required).
        :param user_details: Details about the requesting user; a placeholder username is substituted when omitted.
        :param run_zip_file_slug_sets: Passed through unchanged to create_finalize_run_task_collection.
        :param session_token: Passed through unchanged to TaskChainBuilder.build_tasks.
        :param queue_group: Prefix of the "<queue_group>.priority" queue and routing key used for the finalize and
         wait-for-providers tasks.
        :return: None. Each provider chain is dispatched with apply_async; nothing is handed back to the caller.
        :raises Exception: If run_uid is not provided.
        """
        # This is just to make it easier to trace when user_details haven't been sent
        if user_details is None:
            user_details = {"username": "******"}

        if not run_uid:
            raise Exception("Cannot parse_tasks without a run uid.")

        run = ExportRun.objects.prefetch_related(
            "job__projections", "job__data_provider_tasks", "data_provider_task_records"
        ).get(uid=run_uid)
        job = run.job
        run_dir = get_run_staging_dir(run.uid)

        # NOTE(review): with the default queue_group=None this evaluates to "None.priority";
        # presumably callers always supply a queue group -- confirm.
        priority_queue = f"{queue_group}.priority"

        wait_for_providers_settings = {
            "queue": priority_queue,
            "routing_key": priority_queue,
            "priority": TaskPriority.FINALIZE_PROVIDER.value,
        }

        finalize_task_settings = {
            "interval": 4,
            "max_retries": 10,
            "queue": priority_queue,
            "routing_key": priority_queue,
            "priority": TaskPriority.FINALIZE_RUN.value,
        }

        finalized_provider_task_chain_list = []
        # Create a task record which can hold tasks for the run (datapack)
        run_task_record, created = DataProviderTaskRecord.objects.get_or_create(
            run=run, name="run", slug="run", defaults={"status": TaskState.PENDING.value, "display": False}
        )
        if created:
            logger.info("New data provider task record created")
            run_task_record.status = TaskState.PENDING.value
            run_task_record.save()

        run_zip_task_chain = get_zip_task_chain(
            data_provider_task_record_uid=run_task_record.uid,
            worker=worker,
        )

        # Loop-invariant: a per-provider zip is only worthwhile when the data pack has more than one source.
        has_multiple_sources = len(job.data_provider_tasks.all()) > 1

        for data_provider_task in job.data_provider_tasks.all():
            provider = data_provider_task.provider

            # Skip providers whose existing record is already in a finished state.
            data_provider_task_record = run.data_provider_task_records.filter(provider__slug=provider.slug).first()
            if (
                data_provider_task_record
                and TaskState[data_provider_task_record.status] in TaskState.get_finished_states()
            ):
                continue

            # Each task builder has a primary task which pulls the source data, grab that task here...
            type_name = provider.export_provider_type.type_name
            primary_export_task = self.type_task_map.get(type_name)
            if not primary_export_task:
                # No export task is registered for this provider type.
                continue

            stage_dir = get_provider_staging_dir(run_dir, provider.slug)
            args = {
                "primary_export_task": primary_export_task,
                "user": job.user,
                "provider_task_uid": data_provider_task.uid,
                "stage_dir": stage_dir,
                "run": run,
                "service_type": type_name,
                "worker": worker,
                "user_details": user_details,
                "session_token": session_token,
            }

            (
                provider_task_record_uid,
                provider_subtask_chain,
            ) = TaskChainBuilder().build_tasks(**args)

            # NOTE(review): this signature is built even when the provider yields no subtask chain;
            # confirm create_finalize_run_task_collection has no side effects before moving it below the guard.
            wait_for_providers_signature = wait_for_providers_task.s(
                run_uid=run_uid,
                locking_task_key=run_uid,
                callback_task=create_finalize_run_task_collection(
                    run_uid=run_uid,
                    run_provider_task_record_uid=run_task_record.uid,
                    run_zip_task_chain=run_zip_task_chain,
                    run_zip_file_slug_sets=run_zip_file_slug_sets,
                    apply_args=finalize_task_settings,
                ),
                apply_args=finalize_task_settings,
            ).set(**wait_for_providers_settings)

            if not provider_subtask_chain:
                continue

            # The finalize_export_provider_task will check all of the export tasks
            # for this provider and save the export provider's status.
            selection_task = create_task(
                data_provider_task_record_uid=provider_task_record_uid,
                worker=worker,
                stage_dir=stage_dir,
                task=output_selection_geojson_task,
                selection=job.the_geom.geojson,
                user_details=user_details,
            )

            # create signature to close out the provider tasks
            finalize_export_provider_signature = finalize_export_provider_task.s(
                data_provider_task_uid=provider_task_record_uid,
                status=TaskState.COMPLETED.value,
                locking_task_key=run_uid,
            ).set(**finalize_task_settings)

            # add zip if required
            # skip zip if there is only one source in the data pack (they would be redundant files).
            if provider.zip and has_multiple_sources:
                zip_export_provider_sig = get_zip_task_chain(
                    data_provider_task_record_uid=provider_task_record_uid,
                    data_provider_task_record_uids=[provider_task_record_uid],
                    worker=worker,
                )
                provider_subtask_chain = chain(provider_subtask_chain, zip_export_provider_sig)

            finalized_provider_task_chain_list.append(
                chain(
                    selection_task,
                    provider_subtask_chain,
                    finalize_export_provider_signature,
                    wait_for_providers_signature,
                )
            )

        # we kick off all of the sub-tasks at once down here rather than one at a time in the for loop above so
        # that if an error occurs earlier on in the method, all of the tasks will fail rather than an undefined
        # number of them. this simplifies error handling, because we don't have to deduce which tasks were
        # successfully kicked off and which ones failed.
        for item in finalized_provider_task_chain_list:
            item.apply_async(**finalize_task_settings)
示例#7
0
    def parse_tasks(self, worker=None, run_uid=None, user_details=None):
        """
        This handles all of the logic for taking the information about what individual celery tasks and groups them under
        specific providers.

        Each Provider (e.g. OSM) gets a chain:  OSM_TASK -> FORMAT_TASKS = PROVIDER_SUBTASK_CHAIN
        They need to be finalized (was the task successful?) to update the database state:
            PROVIDER_SUBTASK_CHAIN -> FINALIZE_PROVIDER_TASK

        We also have an optional chain of tasks that get processed after the providers are run:
            AD_HOC_TASK1 -> AD_HOC_TASK2 -> FINALIZE_RUN_TASK = FINALIZE_RUN_TASK_COLLECTION

        If the PROVIDER_SUBTASK_CHAIN fails it needs to be cleaned up.  The clean up task also calls the finalize provider
        task. This is because when a task fails the failed task will call an on_error (link_error) task and never return.
            PROVIDER_SUBTASK_CHAIN -> FINALIZE_PROVIDER_TASK
                   |
                   v
                CLEAN_UP_FAILURE_TASK -> FINALIZE_PROVIDER_TASK

        Now there needs to be some way for the finalize tasks to be called.  Since we now have several possible forked
        paths, we need each path to check the state of the providers to see if they are all finished before moving on.
        It would be great if celery implicitly handled that, but it doesn't ever merge the forked paths.
        So we add a WAIT_FOR_PROVIDERS task to check state; once the providers are ready they call the final tasks.

        PROVIDER_SUBTASK_CHAIN -> FINALIZE_PROVIDER_TASK -> WAIT_FOR_PROVIDERS   \
                   |                                                              ==> FINALIZE_RUN_TASK_COLLECTION
                   v                                                             /
            CLEAN_UP_FAILURE_TASK -> FINALIZE_PROVIDER_TASK -> WAIT_FOR_PROVIDERS


        :param worker: A worker node (hostname) for a celery worker, this should match the node name used when starting,
         the celery worker.
        :param run_uid: A uid to reference an ExportRun; when it is not given the method does nothing.
        :param user_details: Details about the requesting user; a placeholder username is substituted when omitted.
        :return: None. Each provider chain is dispatched with apply_async; nothing is handed back to the caller.
        """
        # This is just to make it easier to trace when user_details haven't been sent
        if user_details is None:
            user_details = {'username': '******'}

        # Without a run uid there is nothing to schedule.
        if not run_uid:
            return

        # Staging directories are all created with the same mode. The per-provider
        # directories used to be created with the decimal literal 6600 (== 0o14710),
        # which is not a meaningful permission set; they now match the run directory.
        staging_dir_mode = 0o750

        run = ExportRun.objects.get(uid=run_uid)
        job = run.job
        run_dir = get_run_staging_dir(run.uid)
        os.makedirs(run_dir, staging_dir_mode)

        finalize_queue = "{}.finalize".format(worker)

        wait_for_providers_settings = {
            'queue': finalize_queue,
            'routing_key': finalize_queue,
            'priority': TaskPriority.FINALIZE_PROVIDER.value
        }

        finalize_task_settings = {
            'interval': 4,
            'max_retries': 10,
            'queue': finalize_queue,
            'routing_key': finalize_queue,
            'priority': TaskPriority.FINALIZE_RUN.value
        }

        finalized_provider_task_chain_list = []
        # Create a task record which can hold tasks for the run (datapack)
        run_task_record = DataProviderTaskRecord.objects.create(
            run=run,
            name="run",
            slug="run",
            status=TaskStates.PENDING.value,
            display=False)
        stage_dir = get_provider_staging_dir(run_dir, run_task_record.slug)
        os.makedirs(stage_dir, staging_dir_mode)
        run_zip_task_chain = get_zip_task_chain(
            data_provider_task_uid=run_task_record.uid,
            stage_dir=get_run_staging_dir(run_uid),
            worker=worker)

        for provider_task_record in job.provider_tasks.all():
            # Each task builder has a primary task which pulls the source data, grab that task here...
            type_name = provider_task_record.provider.export_provider_type.type_name
            primary_export_task = self.type_task_map.get(type_name)
            if not primary_export_task:
                # No export task is registered for this provider type.
                continue

            stage_dir = get_provider_staging_dir(
                run_dir, provider_task_record.provider.slug)
            os.makedirs(stage_dir, staging_dir_mode)

            args = {
                'primary_export_task': primary_export_task,
                'user': job.user,
                'provider_task_uid': provider_task_record.uid,
                'run': run,
                'stage_dir': stage_dir,
                'service_type': type_name,
                'worker': worker,
                'user_details': user_details
            }

            provider_task_uid, provider_subtask_chain = TaskChainBuilder().build_tasks(**args)

            # NOTE(review): this signature is built even when the provider yields no subtask chain;
            # confirm create_finalize_run_task_collection has no side effects before moving it below the guard.
            wait_for_providers_signature = wait_for_providers_task.s(
                run_uid=run_uid,
                locking_task_key=run_uid,
                callback_task=create_finalize_run_task_collection(
                    run_uid,
                    run_dir,
                    run_zip_task_chain,
                    apply_args=finalize_task_settings),
                apply_args=finalize_task_settings).set(
                    **wait_for_providers_settings)

            if not provider_subtask_chain:
                continue

            # The finalize_export_provider_task will check all of the export tasks
            # for this provider and save the export provider's status.
            selection_task = create_task(
                data_provider_task_uid=provider_task_uid,
                stage_dir=stage_dir,
                worker=worker,
                task=output_selection_geojson_task,
                selection=job.the_geom.geojson,
                user_details=user_details)

            # create signature to close out the provider tasks
            finalize_export_provider_signature = finalize_export_provider_task.s(
                data_provider_task_uid=provider_task_uid,
                status=TaskStates.COMPLETED.value,
                locking_task_key=run_uid)

            # add zip if required
            if provider_task_record.provider.zip:
                zip_export_provider_sig = get_zip_task_chain(
                    data_provider_task_uid=provider_task_uid,
                    stage_dir=stage_dir,
                    worker=worker)
                provider_subtask_chain = chain(
                    provider_subtask_chain,
                    zip_export_provider_sig)

            finalized_provider_task_chain_list.append(
                chain(selection_task, provider_subtask_chain,
                      finalize_export_provider_signature,
                      wait_for_providers_signature))

        # we kick off all of the sub-tasks at once down here rather than one at a time in the for loop above so
        # that if an error occurs earlier on in the method, all of the tasks will fail rather than an undefined
        # number of them. this simplifies error handling, because we don't have to deduce which tasks were
        # successfully kicked off and which ones failed.
        for item in finalized_provider_task_chain_list:
            item.apply_async(**finalize_task_settings)