Python parse_raw_log_lines 예제들, dagster_graphql.client.util.parse_raw_log_lines Python 예제들

예제 #1

0

파일 보기

    def execute(self, context):
        if "run_id" in self.params:
            self._run_id = self.params["run_id"]
        elif "dag_run" in context and context["dag_run"] is not None:
            self._run_id = context["dag_run"].run_id

        try:
            if self.instance:
                tags = {AIRFLOW_EXECUTION_DATE_STR: context.get("ts")} if "ts" in context else {}

                run = self.instance.register_managed_run(
                    pipeline_name=self.pipeline_name,
                    run_id=self.run_id,
                    run_config=self.run_config,
                    mode=self.mode,
                    solids_to_execute=None,
                    step_keys_to_execute=None,
                    tags=tags,
                    root_run_id=None,
                    parent_run_id=None,
                    pipeline_snapshot=self.pipeline_snapshot,
                    execution_plan_snapshot=self.execution_plan_snapshot,
                    parent_pipeline_snapshot=self.parent_pipeline_snapshot,
                )

            raw_res = self.execute_raw(context)
            self.log.info("Finished executing container.")

            res = parse_raw_log_lines(raw_res)

            try:
                handle_execution_errors(res, "executePlan")
            except DagsterGraphQLClientError as err:
                if self.instance:
                    self.instance.report_engine_event(
                        str(err),
                        run,
                        EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(sys.exc_info())
                        ),
                        self.__class__,
                    )
                raise

            events = handle_execute_plan_result_raw(res)

            if self.instance:
                for event in events:
                    self.instance.handle_new_event(event)

            events = [e.dagster_event for e in events]
            check_events_for_failures(events)
            check_events_for_skips(events)

            return events

        finally:
            self._run_id = None

예제 #2

0

파일 보기

파일: test_util.py 프로젝트: spencer-zepelin/dagster

def test_parse_raw_log_lines():
    raw_log_lines = [
        'Some extraneous log line',
        'Another extraneous log line, JSON data is on the next line',
        '{"data": {"executePlan": {"__typename": "ExecutePlanSuccess"}}}',
    ]

    result = {'data': {'executePlan': {'__typename': 'ExecutePlanSuccess'}}}
    assert parse_raw_log_lines(raw_log_lines) == result

예제 #3

0

파일 보기

파일: test_util.py 프로젝트: sd2k/dagster

def test_parse_raw_log_lines():
    raw_log_lines = [
        "Some extraneous log line",
        "Another extraneous log line, JSON data is on the next line",
        '{"data": {"executePlan": {"__typename": "ExecutePlanSuccess"}}}',
    ]

    result = {"data": {"executePlan": {"__typename": "ExecutePlanSuccess"}}}
    assert parse_raw_log_lines(raw_log_lines) == result

예제 #4

0

파일 보기

파일: test.py 프로젝트: varokas/dagster-1

def wait_for_job_and_get_logs(job_name, namespace):
    '''Wait for a dagster-k8s job to complete, ensure it launched only one pod,
    and then grab the logs from the pod it launched.
    '''
    check.str_param(job_name, 'job_name')
    check.str_param(namespace, 'namespace')

    wait_for_job_success(job_name, namespace=namespace)

    pod_names = get_pod_names_in_job(job_name, namespace)

    assert len(pod_names) == 1

    pod_name = pod_names[0]

    raw_logs = retrieve_pod_logs(pod_name, namespace=namespace)
    return parse_raw_log_lines(raw_logs.split('\n'))

예제 #5

0

파일 보기

파일: tasks.py 프로젝트: aaronsql2019/dagster

    def _execute_step_k8s_job(
        _self,
        instance_ref_dict,
        step_keys,
        environment_dict,
        mode,
        pipeline_name,
        run_id,
        job_config_dict,
        job_namespace,
        load_incluster_config,
        kubeconfig_file=None,
    ):
        '''Run step execution in a K8s job pod.
        '''
        from dagster_k8s.job import DagsterK8sJobConfig, construct_dagster_graphql_k8s_job
        from dagster_k8s.utils import get_pod_names_in_job, retrieve_pod_logs, wait_for_job_success

        import kubernetes

        check.dict_param(instance_ref_dict, 'instance_ref_dict')
        check.list_param(step_keys, 'step_keys', of_type=str)
        check.invariant(
            len(step_keys) == 1, 'Celery K8s task executor can only execute 1 step at a time'
        )
        check.dict_param(environment_dict, 'environment_dict')
        check.str_param(mode, 'mode')
        check.str_param(pipeline_name, 'pipeline_name')
        check.str_param(run_id, 'run_id')

        # Celery will serialize this as a list
        job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
        check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
        check.str_param(job_namespace, 'job_namespace')
        check.bool_param(load_incluster_config, 'load_incluster_config')
        check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

        # For when launched via DinD or running the cluster
        if load_incluster_config:
            kubernetes.config.load_incluster_config()
        else:
            kubernetes.config.load_kube_config(kubeconfig_file)

        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        pipeline_run = instance.get_run_by_id(run_id)
        check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

        step_keys_str = ", ".join(step_keys)

        # Ensure we stay below k8s name length limits
        k8s_name_key = _get_k8s_name_key(run_id, step_keys)
        job_name = 'dagster-stepjob-%s' % k8s_name_key
        pod_name = 'dagster-stepjob-%s' % k8s_name_key

        variables = construct_variables(mode, environment_dict, pipeline_name, run_id, step_keys)
        args = ['-p', 'executePlan', '-v', seven.json.dumps(variables)]

        job = construct_dagster_graphql_k8s_job(job_config, args, job_name, pod_name)

        # Running list of events generated from this task execution
        events = []

        # Post event for starting execution
        engine_event = instance.report_engine_event(
            'Executing steps {} in Kubernetes job {}'.format(step_keys_str, job.metadata.name),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_keys_str, 'Step keys'),
                    EventMetadataEntry.text(job.metadata.name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                    EventMetadataEntry.text(job_config.job_image, 'Job image'),
                    EventMetadataEntry.text(job_config.image_pull_policy, 'Image pull policy'),
                    EventMetadataEntry.text(
                        str(job_config.image_pull_secrets), 'Image pull secrets'
                    ),
                    EventMetadataEntry.text(
                        str(job_config.service_account_name), 'Service account name'
                    ),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryK8sJobEngine,
            # validated above that step_keys is length 1, and it is not possible to use ETH or
            # execution plan in this function (Celery K8s workers should not access to user code)
            step_key=step_keys[0],
        )
        events.append(engine_event)

        kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)

        wait_for_job_success(job.metadata.name, namespace=job_namespace)
        pod_names = get_pod_names_in_job(job.metadata.name, namespace=job_namespace)

        # Post engine event for log retrieval
        engine_event = instance.report_engine_event(
            'Retrieving logs from Kubernetes Job pods',
            pipeline_run,
            EngineEventData([EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
            CeleryK8sJobEngine,
            step_key=step_keys[0],
        )
        events.append(engine_event)

        logs = []
        for pod_name in pod_names:
            raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
            logs += raw_logs.split('\n')

        res = parse_raw_log_lines(logs)

        handle_execution_errors(res, 'executePlan')
        step_events = handle_execute_plan_result(res)

        events += step_events

        serialized_events = [serialize_dagster_namedtuple(event) for event in events]
        return serialized_events

예제 #6

0

파일 보기

    def execute(self, context):
        try:
            from dagster_graphql.client.mutations import (
                DagsterGraphQLClientError,
                handle_execution_errors,
                handle_execute_plan_result_raw,
            )

        except ImportError:
            raise AirflowException(
                'To use the DagsterDockerOperator, dagster and dagster_graphql must be installed '
                'in your Airflow environment.')

        if 'run_id' in self.params:
            self._run_id = self.params['run_id']
        elif 'dag_run' in context and context['dag_run'] is not None:
            self._run_id = context['dag_run'].run_id

        try:
            if self.instance:
                run = self.instance.register_managed_run(
                    pipeline_name=self.pipeline_name,
                    run_id=self.run_id,
                    environment_dict=self.environment_dict,
                    mode=self.mode,
                    solids_to_execute=None,
                    step_keys_to_execute=None,
                    tags=None,
                    root_run_id=None,
                    parent_run_id=None,
                    pipeline_snapshot=self.pipeline_snapshot,
                    execution_plan_snapshot=self.execution_plan_snapshot,
                    parent_pipeline_snapshot=self.parent_pipeline_snapshot,
                )

            raw_res = super(DagsterDockerOperator, self).execute(context)
            self.log.info('Finished executing container.')

            res = parse_raw_log_lines(raw_res)

            try:
                handle_execution_errors(res, 'executePlan')
            except DagsterGraphQLClientError as err:
                if self.instance:
                    self.instance.report_engine_event(
                        str(err),
                        run,
                        EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(
                                sys.exc_info())),
                        self.__class__,
                    )
                raise

            events = handle_execute_plan_result_raw(res)

            if self.instance:
                for event in events:
                    self.instance.handle_new_event(event)

            events = [e.dagster_event for e in events]
            check_events_for_failures(events)
            check_events_for_skips(events)

            return events

        finally:
            self._run_id = None

예제 #7

0

파일 보기

파일: kubernetes_operator.py 프로젝트: wingyplus/dagster

    def execute(self, context):
        try:
            from dagster_graphql.client.mutations import (
                DagsterGraphQLClientError,
                handle_execution_errors,
                handle_execute_plan_result_raw,
            )

        except ImportError:
            raise AirflowException(
                'To use the DagsterKubernetesPodOperator, dagster and dagster_graphql must be'
                ' installed in your Airflow environment.'
            )

        if 'run_id' in self.params:
            self._run_id = self.params['run_id']
        elif 'dag_run' in context and context['dag_run'] is not None:
            self._run_id = context['dag_run'].run_id

        # return to original execute code:
        try:
            client = kube_client.get_kube_client(
                in_cluster=self.in_cluster,
                cluster_context=self.cluster_context,
                config_file=self.config_file,
            )
            gen = pod_generator.PodGenerator()

            for mount in self.volume_mounts:
                gen.add_mount(mount)
            for volume in self.volumes:
                gen.add_volume(volume)

            pod = gen.make_pod(
                namespace=self.namespace,
                image=self.image,
                pod_id=self.name,
                cmds=self.cmds,
                arguments=self.query(context.get('ts')),
                labels=self.labels,
            )

            pod.service_account_name = self.service_account_name
            pod.secrets = self.secrets
            pod.envs = self.env_vars
            pod.image_pull_policy = self.image_pull_policy
            pod.image_pull_secrets = self.image_pull_secrets
            pod.annotations = self.annotations
            pod.resources = self.resources
            pod.affinity = self.affinity
            pod.node_selectors = self.node_selectors
            pod.hostnetwork = self.hostnetwork
            pod.tolerations = self.tolerations
            pod.configmaps = self.configmaps
            pod.security_context = self.security_context

            launcher = pod_launcher.PodLauncher(kube_client=client, extract_xcom=self.xcom_push)
            try:
                if self.instance:
                    tags = (
                        {AIRFLOW_EXECUTION_DATE_STR: context.get('ts')} if 'ts' in context else {}
                    )

                    run = self.instance.register_managed_run(
                        pipeline_name=self.pipeline_name,
                        run_id=self.run_id,
                        run_config=self.run_config,
                        mode=self.mode,
                        solids_to_execute=None,
                        step_keys_to_execute=None,
                        tags=tags,
                        root_run_id=None,
                        parent_run_id=None,
                        pipeline_snapshot=self.pipeline_snapshot,
                        execution_plan_snapshot=self.execution_plan_snapshot,
                        parent_pipeline_snapshot=self.parent_pipeline_snapshot,
                    )

                # we won't use the "result", which is the pod's xcom json file
                (final_state, _) = launcher.run_pod(
                    pod, startup_timeout=self.startup_timeout_seconds, get_logs=self.get_logs
                )

                # fetch the last line independently of whether logs were read
                # unbelievably, if you set tail_lines=1, the returned json has its double quotes
                # turned into unparseable single quotes
                res = None
                num_attempts = 0
                while not res and num_attempts < LOG_RETRIEVAL_MAX_ATTEMPTS:
                    raw_res = client.read_namespaced_pod_log(
                        name=pod.name, namespace=pod.namespace, container='base'
                    )
                    res = parse_raw_log_lines(raw_res.split('\n'))
                    time.sleep(LOG_RETRIEVAL_WAITS_BETWEEN_ATTEMPTS_SEC)
                    num_attempts += 1

                try:
                    handle_execution_errors(res, 'executePlan')
                except DagsterGraphQLClientError as err:
                    self.instance.report_engine_event(
                        str(err),
                        run,
                        EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(sys.exc_info())
                        ),
                        self.__class__,
                    )
                    raise

                events = handle_execute_plan_result_raw(res)

                if self.instance:
                    for event in events:
                        self.instance.handle_new_event(event)

                events = [e.dagster_event for e in events]
                check_events_for_failures(events)
                check_events_for_skips(events)
                return events

            finally:
                self._run_id = None

                if self.is_delete_operator_pod:
                    launcher.delete_pod(pod)

            if final_state != State.SUCCESS:
                raise AirflowException('Pod returned a failure: {state}'.format(state=final_state))
            # note the lack of returning the default xcom
        except AirflowException as ex:
            raise AirflowException('Pod Launching failed: {error}'.format(error=ex))

예제 #8

0

파일 보기

    def _execute_step_k8s_job(
        _self,
        instance_ref_dict,
        step_keys,
        run_config,
        mode,
        repo_name,
        repo_location_name,
        run_id,
        job_config_dict,
        job_namespace,
        load_incluster_config,
        retries_dict,
        resources=None,
        kubeconfig_file=None,
    ):
        '''Run step execution in a K8s job pod.
        '''

        check.dict_param(instance_ref_dict, 'instance_ref_dict')
        check.list_param(step_keys, 'step_keys', of_type=str)
        check.invariant(
            len(step_keys) == 1,
            'Celery K8s task executor can only execute 1 step at a time')
        check.dict_param(run_config, 'run_config')
        check.str_param(mode, 'mode')
        check.str_param(repo_name, 'repo_name')
        check.str_param(repo_location_name, 'repo_location_name')
        check.str_param(run_id, 'run_id')

        # Celery will serialize this as a list
        job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
        check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
        check.str_param(job_namespace, 'job_namespace')

        check.bool_param(load_incluster_config, 'load_incluster_config')
        check.dict_param(retries_dict, 'retries_dict')

        check.opt_dict_param(resources,
                             'resources',
                             key_type=str,
                             value_type=dict)
        check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

        # For when launched via DinD or running the cluster
        if load_incluster_config:
            kubernetes.config.load_incluster_config()
        else:
            kubernetes.config.load_kube_config(kubeconfig_file)

        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        pipeline_run = instance.get_run_by_id(run_id)
        check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

        step_key = step_keys[0]
        if pipeline_run.status != PipelineRunStatus.STARTED:
            instance.report_engine_event(
                'Not scheduling step because pipeline run status is not STARTED',
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, 'Step keys'),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return

        # Ensure we stay below k8s name length limits
        k8s_name_key = _get_k8s_name_key(run_id, step_keys)

        retries = Retries.from_config(retries_dict)

        if retries.get_attempt_count(step_key):
            attempt_number = retries.get_attempt_count(step_key)
            job_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
            pod_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
        else:
            job_name = 'dagster-job-%s' % (k8s_name_key)
            pod_name = 'dagster-job-%s' % (k8s_name_key)

        variables = {
            'executionParams': {
                'runConfigData': run_config,
                'mode': mode,
                'selector': {
                    'repositoryLocationName':
                    repo_location_name,
                    'repositoryName':
                    repo_name,
                    'pipelineName':
                    pipeline_run.pipeline_name,
                    'solidSelection':
                    list(pipeline_run.solids_to_execute)
                    if pipeline_run.solids_to_execute else None,
                },
                'executionMetadata': {
                    'runId': run_id
                },
                'stepKeys': step_keys,
            },
            'retries': retries.to_graphql_input(),
        }
        args = [
            '-p', 'executePlan', '-v',
            seven.json.dumps(variables), '--remap-sigterm'
        ]

        job = construct_dagster_graphql_k8s_job(job_config, args, job_name,
                                                resources, pod_name)

        # Running list of events generated from this task execution
        events = []

        # Post event for starting execution
        job_name = job.metadata.name
        engine_event = instance.report_engine_event(
            'Executing step {} in Kubernetes job {}'.format(
                step_key, job_name),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, 'Step keys'),
                    EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                    EventMetadataEntry.text(job_config.job_image, 'Job image'),
                    EventMetadataEntry.text(job_config.image_pull_policy,
                                            'Image pull policy'),
                    EventMetadataEntry.text(str(job_config.image_pull_secrets),
                                            'Image pull secrets'),
                    EventMetadataEntry.text(
                        str(job_config.service_account_name),
                        'Service account name'),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryK8sJobExecutor,
            # validated above that step_keys is length 1, and it is not possible to use ETH or
            # execution plan in this function (Celery K8s workers should not access to user code)
            step_key=step_key,
        )
        events.append(engine_event)

        kubernetes.client.BatchV1Api().create_namespaced_job(
            body=job, namespace=job_namespace)

        try:
            wait_for_job_success(
                job_name=job_name,
                namespace=job_namespace,
                instance=instance,
                run_id=run_id,
            )
        except DagsterK8sPipelineStatusException:
            instance.report_engine_event(
                'Terminating Kubernetes Job because pipeline run status is not STARTED',
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, 'Step keys'),
                    EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(job_namespace,
                                            'Kubernetes Job namespace'),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            delete_job(job_name=job_name, namespace=job_namespace)
            return

        pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

        # Post engine event for log retrieval
        engine_event = instance.report_engine_event(
            'Retrieving logs from Kubernetes Job pods',
            pipeline_run,
            EngineEventData(
                [EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        events.append(engine_event)

        logs = []
        for pod_name in pod_names:
            raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
            logs += raw_logs.split('\n')

        res = parse_raw_log_lines(logs)
        handle_execution_errors(res, 'executePlan')
        step_events = handle_execute_plan_result(res)

        events += step_events

        serialized_events = [
            serialize_dagster_namedtuple(event) for event in events
        ]
        return serialized_events