Exemplo n.º 1
0
 def get_step_dict(step_id, step_state):
     """Build a dict shaped like a describe-step response for one step.

     ``step_name``, ``step_cmd`` and ``_boto3_now`` are not parameters; they
     are read from the surrounding scope.

     Args:
         step_id: Value stored under ``Step.Id``.
         step_state: Value stored under ``Step.Status.State``.

     Returns:
         dict: ``{"Step": {...}}`` with a fixed jar, failure action and
         state-change message, and a start time taken from ``_boto3_now()``.
     """
     name = step_name
     config = {"Jar": "command-runner.jar", "Properties": {}, "Args": step_cmd}
     status = {
         "State": step_state,
         "StateChangeReason": {"Message": "everything is hosed"},
         "Timeline": {"StartDateTime": _boto3_now()},
     }
     step = {
         "Id": step_id,
         "Name": name,
         "Config": config,
         "ActionOnFailure": "CONTINUE",
         "Status": status,
     }
     return {"Step": step}
Exemplo n.º 2
0
 def get_step_dict(step_id, step_state):
     '''Build a dict shaped like a describe-step response for one step.

     ``step_name``, ``step_cmd`` and ``_boto3_now`` are not parameters; they
     are read from the surrounding scope.

     Args:
         step_id: Value stored under ``Step.Id``.
         step_state: Value stored under ``Step.Status.State``.

     Returns:
         dict: ``{'Step': {...}}`` with a fixed jar, failure action and
         state-change message, and a start time taken from ``_boto3_now()``.
     '''
     name = step_name
     config = {'Jar': 'command-runner.jar', 'Properties': {}, 'Args': step_cmd}
     status = {
         'State': step_state,
         'StateChangeReason': {'Message': 'everything is hosed'},
         'Timeline': {'StartDateTime': _boto3_now()},
     }
     step = {
         'Id': step_id,
         'Name': name,
         'Config': config,
         'ActionOnFailure': 'CONTINUE',
         'Status': status,
     }
     return {'Step': step}
Exemplo n.º 3
0
def test_boto3_now():
    """Check that ``_boto3_now()`` is within ``EPS`` of the current UTC time.

    Both timestamps are converted with ``time.mktime`` on their time tuples,
    so they are compared as whole-second floats.
    """
    boto3_seconds = time.mktime(_boto3_now().timetuple())
    reference_seconds = time.mktime(datetime.now(tzutc()).timetuple())

    # Compare the absolute difference: a signed difference would also pass
    # when _boto3_now() is arbitrarily far *behind* the reference clock.
    assert abs(boto3_seconds - reference_seconds) < EPS
Exemplo n.º 4
0
Arquivo: emr.py Projeto: sd2k/dagster
    def is_emr_step_complete(self, log, cluster_id, emr_step_id):
        """Look up one EMR step once, log its state, and report whether it is done.

        Args:
            log: Logger used for progress messages (only ``info`` is called).
            cluster_id: Cluster ID passed to ``describe_step`` / ``describe_cluster``.
            emr_step_id: ID of the step to look up.

        Returns:
            bool: ``False`` while the step is Pending or Running, ``True`` once
            it is Completed.

        Raises:
            EmrError: If the step is in any other state.
        """
        step = self.describe_step(cluster_id, emr_step_id)["Step"]
        state = EmrStepState(step["Status"]["State"])

        if state == EmrStepState.Pending:
            # Not started yet: report what the cluster itself is doing.
            cluster = self.describe_cluster(cluster_id)["Cluster"]
            cluster_reason = _get_reason(cluster)
            cluster_suffix = (": %s" % cluster_reason) if cluster_reason else ""
            log.info("PENDING (cluster is %s%s)" % (cluster["Status"]["State"], cluster_suffix))
            return False

        if state == EmrStepState.Running:
            # Include the elapsed time only when a start time is present.
            started_at = step["Status"]["Timeline"].get("StartDateTime")
            if started_at:
                elapsed = " for %s" % strip_microseconds(_boto3_now() - started_at)
            else:
                elapsed = ""
            log.info("RUNNING%s" % elapsed)
            return False

        if state == EmrStepState.Completed:
            log.info("COMPLETED")
            return True

        # Any other state means the step did not succeed. *reason* seems to
        # only be set when the job is cancelled (e.g. 'Job terminated').
        step_reason = _get_reason(step)
        step_suffix = (" (%s)" % step_reason) if step_reason else ""
        log.info("%s%s" % (state.value, step_suffix))

        # The cluster status might give more context on why the step
        # didn't succeed, so log that too.
        cluster = self.describe_cluster(cluster_id)["Cluster"]
        cluster_reason = _get_reason(cluster)
        cluster_suffix = (": %s" % cluster_reason) if cluster_reason else ""
        reported_id = cluster["Id"]
        verb = "was" if "ED" in cluster["Status"]["State"] else "is"
        cluster_state = cluster["Status"]["State"]
        log.info("Cluster %s %s %s%s" % (reported_id, verb, cluster_state, cluster_suffix))

        if EmrClusterState(cluster_state) in EMR_CLUSTER_TERMINATED_STATES:
            # Was the termination caused by IAM roles?
            self._check_for_missing_default_iam_roles(log, cluster)

            # TODO: extract logs here to surface failure reason
            # See: https://github.com/dagster-io/dagster/issues/1954

        if state == EmrStepState.Failed:
            log.info("EMR step %s failed" % emr_step_id)

        raise EmrError("EMR step failed")
Exemplo n.º 5
0
    def _wait_for_step_to_complete(self, context, cluster_id, step_id):
        '''Helper for _wait_for_steps_to_complete(). Poll the step with the
        given ID until it completes.

        Each iteration sleeps for ``self.check_cluster_every`` seconds, looks
        the step up through a client from ``make_emr_client()``, and logs its
        state via ``context.log``.

        Args:
            context: Object whose ``log`` attribute is used for messages.
            cluster_id (str): The ID of the cluster.
            step_id (str): The ID of the step to wait for.

        Raises:
            Exception: If the step ends up in any state other than Pending,
                Running or Completed.
        '''
        check.str_param(cluster_id, 'cluster_id')
        check.str_param(step_id, 'step_id')

        client = self.make_emr_client()

        while True:
            # don't antagonize EMR's throttling
            context.log.debug('Waiting %.1f seconds...' % self.check_cluster_every)
            time.sleep(self.check_cluster_every)

            step = client.describe_step(ClusterId=cluster_id, StepId=step_id)['Step']
            state = EmrStepState(step['Status']['State'])

            if state == EmrStepState.Pending:
                # Not started yet: report what the cluster itself is doing.
                cluster = self.describe_cluster(cluster_id)
                cluster_reason = _get_reason(cluster)
                cluster_suffix = (': %s' % cluster_reason) if cluster_reason else ''
                context.log.info(
                    'PENDING (cluster is %s%s)' % (cluster['Status']['State'], cluster_suffix)
                )
                continue

            if state == EmrStepState.Running:
                # Include the elapsed time only when a start time is present.
                started_at = step['Status']['Timeline'].get('StartDateTime')
                if started_at:
                    elapsed = ' for %s' % strip_microseconds(_boto3_now() - started_at)
                else:
                    elapsed = ''
                context.log.info('RUNNING%s' % elapsed)
                continue

            if state == EmrStepState.Completed:
                context.log.info('COMPLETED')
                return

            # Any other state means the step did not succeed. *reason* seems
            # to only be set when the job is cancelled (e.g. 'Job terminated').
            step_reason = _get_reason(step)
            step_suffix = (' (%s)' % step_reason) if step_reason else ''
            context.log.info('%s%s' % (state.value, step_suffix))

            # The cluster status might give more context on why the step
            # didn't succeed, so log that too.
            cluster = self.describe_cluster(cluster_id)
            cluster_reason = _get_reason(cluster)
            cluster_suffix = (': %s' % cluster_reason) if cluster_reason else ''
            reported_id = cluster['Id']
            verb = 'was' if 'ED' in cluster['Status']['State'] else 'is'
            cluster_state = cluster['Status']['State']
            context.log.info(
                'Cluster %s %s %s%s' % (reported_id, verb, cluster_state, cluster_suffix)
            )

            if EmrClusterState(cluster_state) in EMR_CLUSTER_TERMINATED_STATES:
                # Was the termination caused by IAM roles?
                self._check_for_missing_default_iam_roles(context, cluster)

                # TODO: extract logs here to surface failure reason
                # See: https://github.com/dagster-io/dagster/issues/1954

            if state == EmrStepState.Failed:
                context.log.info('Step %s failed' % step_id)

            raise Exception('step failed')
Exemplo n.º 6
0
    def is_emr_step_complete(self, log, cluster_id, emr_step_id):
        '''Look up one EMR step once, log its state, and report whether it is done.

        Args:
            log: Logger used for progress messages (only ``info`` is called).
            cluster_id: Cluster ID passed to ``describe_step`` / ``describe_cluster``.
            emr_step_id: ID of the step to look up.

        Returns:
            bool: ``False`` while the step is Pending or Running, ``True`` once
            it is Completed.

        Raises:
            EmrError: If the step is in any other state.
        '''
        step = self.describe_step(cluster_id, emr_step_id)['Step']
        state = EmrStepState(step['Status']['State'])

        if state == EmrStepState.Pending:
            # Not started yet: report what the cluster itself is doing.
            cluster = self.describe_cluster(cluster_id)['Cluster']
            cluster_reason = _get_reason(cluster)
            cluster_suffix = (': %s' % cluster_reason) if cluster_reason else ''
            log.info('PENDING (cluster is %s%s)' % (cluster['Status']['State'], cluster_suffix))
            return False

        if state == EmrStepState.Running:
            # Include the elapsed time only when a start time is present.
            started_at = step['Status']['Timeline'].get('StartDateTime')
            if started_at:
                elapsed = ' for %s' % strip_microseconds(_boto3_now() - started_at)
            else:
                elapsed = ''
            log.info('RUNNING%s' % elapsed)
            return False

        if state == EmrStepState.Completed:
            log.info('COMPLETED')
            return True

        # Any other state means the step did not succeed. *reason* seems to
        # only be set when the job is cancelled (e.g. 'Job terminated').
        step_reason = _get_reason(step)
        step_suffix = (' (%s)' % step_reason) if step_reason else ''
        log.info('%s%s' % (state.value, step_suffix))

        # The cluster status might give more context on why the step
        # didn't succeed, so log that too.
        cluster = self.describe_cluster(cluster_id)['Cluster']
        cluster_reason = _get_reason(cluster)
        cluster_suffix = (': %s' % cluster_reason) if cluster_reason else ''
        reported_id = cluster['Id']
        verb = 'was' if 'ED' in cluster['Status']['State'] else 'is'
        cluster_state = cluster['Status']['State']
        log.info('Cluster %s %s %s%s' % (reported_id, verb, cluster_state, cluster_suffix))

        if EmrClusterState(cluster_state) in EMR_CLUSTER_TERMINATED_STATES:
            # Was the termination caused by IAM roles?
            self._check_for_missing_default_iam_roles(log, cluster)

            # TODO: extract logs here to surface failure reason
            # See: https://github.com/dagster-io/dagster/issues/1954

        if state == EmrStepState.Failed:
            log.info('EMR step %s failed' % emr_step_id)

        raise EmrError('EMR step failed')
Exemplo n.º 7
0
    def _wait_for_emr_step_to_complete(self, log, cluster_id, emr_step_id):
        '''Helper for wait_for_steps_to_complete(). Poll the step with the given ID
        until it completes.

        Each iteration sleeps for ``self.check_cluster_every`` seconds, looks the
        step up with ``describe_step``, and logs its state.

        Args:
            log: Logger used for progress messages (``debug`` and ``info`` are called).
            cluster_id (str): The ID of the cluster
            emr_step_id (str): EMR Step ID to wait for

        Raises:
            EmrError: Raised when the step ends up in any state other than Pending,
                Running or Completed.
        '''
        check.str_param(cluster_id, 'cluster_id')
        check.str_param(emr_step_id, 'emr_step_id')

        while True:
            # don't antagonize EMR's throttling
            log.debug('Waiting %.1f seconds...' % self.check_cluster_every)
            time.sleep(self.check_cluster_every)

            step = self.describe_step(cluster_id, emr_step_id)['Step']
            state = EmrStepState(step['Status']['State'])

            if state == EmrStepState.Pending:
                # Not started yet: report what the cluster itself is doing.
                cluster = self.describe_cluster(cluster_id)['Cluster']
                cluster_reason = _get_reason(cluster)
                cluster_suffix = (': %s' % cluster_reason) if cluster_reason else ''
                log.info('PENDING (cluster is %s%s)' % (cluster['Status']['State'], cluster_suffix))
                continue

            if state == EmrStepState.Running:
                # Include the elapsed time only when a start time is present.
                started_at = step['Status']['Timeline'].get('StartDateTime')
                if started_at:
                    elapsed = ' for %s' % strip_microseconds(_boto3_now() - started_at)
                else:
                    elapsed = ''
                log.info('RUNNING%s' % elapsed)
                continue

            if state == EmrStepState.Completed:
                log.info('COMPLETED')
                return

            # Any other state means the step did not succeed. *reason* seems
            # to only be set when the job is cancelled (e.g. 'Job terminated').
            step_reason = _get_reason(step)
            step_suffix = (' (%s)' % step_reason) if step_reason else ''
            log.info('%s%s' % (state.value, step_suffix))

            # The cluster status might give more context on why the step
            # didn't succeed, so log that too.
            cluster = self.describe_cluster(cluster_id)['Cluster']
            cluster_reason = _get_reason(cluster)
            cluster_suffix = (': %s' % cluster_reason) if cluster_reason else ''
            reported_id = cluster['Id']
            verb = 'was' if 'ED' in cluster['Status']['State'] else 'is'
            cluster_state = cluster['Status']['State']
            log.info('Cluster %s %s %s%s' % (reported_id, verb, cluster_state, cluster_suffix))

            if EmrClusterState(cluster_state) in EMR_CLUSTER_TERMINATED_STATES:
                # Was the termination caused by IAM roles?
                self._check_for_missing_default_iam_roles(log, cluster)

                # TODO: extract logs here to surface failure reason
                # See: https://github.com/dagster-io/dagster/issues/1954

            if state == EmrStepState.Failed:
                log.info('EMR step %s failed' % emr_step_id)

            raise EmrError('EMR step failed')