Example #1
    def execute(self, context: 'Context') -> int:
        emr_hook = EmrHook(aws_conn_id=self.aws_conn_id)
        emr = emr_hook.get_conn()

        if self.do_xcom_push:
            context['ti'].xcom_push(key='cluster_id', value=self.cluster_id)

        EmrClusterLink.persist(
            context=context,
            operator=self,
            region_name=emr_hook.conn_region_name,
            aws_partition=emr_hook.conn_partition,
            job_flow_id=self.cluster_id,
        )

        self.log.info('Modifying cluster %s', self.cluster_id)
        response = emr.modify_cluster(
            ClusterId=self.cluster_id,
            StepConcurrencyLevel=self.step_concurrency_level)

        if response['ResponseMetadata']['HTTPStatusCode'] != 200:
            raise AirflowException(f'Modify cluster failed: {response}')
        else:
            self.log.info('Steps concurrency level %d',
                          response['StepConcurrencyLevel'])
            return response['StepConcurrencyLevel']
Example #2
    def get_emr_response(self):
        emr = EmrHook(aws_conn_id=self.aws_conn_id).get_conn()

        self.log.info('Poking step %s on cluster %s', self.step_id,
                      self.job_flow_id)
        return emr.describe_step(ClusterId=self.job_flow_id,
                                 StepId=self.step_id)
Example #3
    def execute(self, context: 'Context') -> str:
        emr = EmrHook(aws_conn_id=self.aws_conn_id,
                      emr_conn_id=self.emr_conn_id,
                      region_name=self.region_name)

        self.log.info(
            'Creating JobFlow using aws-conn-id: %s, emr-conn-id: %s',
            self.aws_conn_id, self.emr_conn_id)

        if isinstance(self.job_flow_overrides, str):
            job_flow_overrides: Dict[str, Any] = ast.literal_eval(
                self.job_flow_overrides)
            self.job_flow_overrides = job_flow_overrides
        else:
            job_flow_overrides = self.job_flow_overrides
        response = emr.create_job_flow(job_flow_overrides)

        if not response['ResponseMetadata']['HTTPStatusCode'] == 200:
            raise AirflowException(f'JobFlow creation failed: {response}')
        else:
            job_flow_id = response['JobFlowId']
            self.log.info('JobFlow with id %s created', job_flow_id)
            EmrClusterLink.persist(
                context=context,
                operator=self,
                region_name=emr.conn_region_name,
                aws_partition=emr.conn_partition,
                job_flow_id=job_flow_id,
            )
            return job_flow_id
Example #4
    def execute(self, context: 'Context') -> List[str]:
        emr_hook = EmrHook(aws_conn_id=self.aws_conn_id)

        emr = emr_hook.get_conn()

        job_flow_id = self.job_flow_id or emr_hook.get_cluster_id_by_name(
            str(self.job_flow_name), self.cluster_states)

        if not job_flow_id:
            raise AirflowException(
                f'No cluster found for name: {self.job_flow_name}')

        if self.do_xcom_push:
            context['ti'].xcom_push(key='job_flow_id', value=job_flow_id)

        self.log.info('Adding steps to %s', job_flow_id)

        # steps may arrive as a string representing a list
        # e.g. if we used XCom or a file then: steps="[{ step1 }, { step2 }]"
        steps = self.steps
        if isinstance(steps, str):
            steps = ast.literal_eval(steps)

        response = emr.add_job_flow_steps(JobFlowId=job_flow_id, Steps=steps)

        if not response['ResponseMetadata']['HTTPStatusCode'] == 200:
            raise AirflowException(f'Adding steps failed: {response}')
        else:
            self.log.info('Steps %s added to JobFlow', response['StepIds'])
            return response['StepIds']
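
The ast.literal_eval branch above is what allows steps to arrive as a string,
e.g. when rendered through XCom or a template. A minimal standalone sketch of
that parsing step (the step dicts are hypothetical placeholders):

    import ast

    # Steps serialized as a string, as they would arrive via XCom/templating.
    steps = "[{'Name': 'step1'}, {'Name': 'step2'}]"
    if isinstance(steps, str):
        # literal_eval parses Python literals only; it executes no code.
        steps = ast.literal_eval(steps)
    assert steps[0]['Name'] == 'step1'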
Example #5
    def test_create_job_flow_uses_the_emr_config_to_create_a_cluster(self):
        client = boto3.client('emr', region_name='us-east-1')

        hook = EmrHook(aws_conn_id='aws_default', emr_conn_id='emr_default')
        cluster = hook.create_job_flow({'Name': 'test_cluster'})

        self.assertEqual(client.list_clusters()['Clusters'][0]['Id'],
                         cluster['JobFlowId'])
Example #6
    def execute(self, context: 'Context') -> None:
        emr = EmrHook(aws_conn_id=self.aws_conn_id).get_conn()

        self.log.info('Terminating JobFlow %s', self.job_flow_id)
        response = emr.terminate_job_flows(JobFlowIds=[self.job_flow_id])

        if not response['ResponseMetadata']['HTTPStatusCode'] == 200:
            raise AirflowException(f'JobFlow termination failed: {response}')
        else:
            self.log.info('JobFlow with id %s terminated', self.job_flow_id)
Example #7
    def get_emr_response(self) -> Dict[str, Any]:
        """
        Make an API call with boto3 and get cluster-level details.

        .. seealso::
            https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/emr.html#EMR.Client.describe_cluster

        :return: response
        :rtype: dict[str, Any]
        """
        emr = EmrHook(aws_conn_id=self.aws_conn_id).get_conn()

        self.log.info('Poking cluster %s', self.job_flow_id)
        return emr.describe_cluster(ClusterId=self.job_flow_id)
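
A hedged aside on the response shape: boto3's describe_cluster returns the
cluster state nested under Cluster -> Status -> State, which is what a sensor
built on get_emr_response() would typically inspect. A minimal sketch (the
helper name is ours, not Airflow's):

    def state_from_response(response):
        # Assumed response shape per the boto3 EMR docs linked above:
        # {'Cluster': {'Status': {'State': 'STARTING' | 'WAITING' | ...}}}
        return response['Cluster']['Status']['State']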
Example #8
    def execute(self, context):
        emr = EmrHook(aws_conn_id=self.aws_conn_id,
                      emr_conn_id=self.emr_conn_id,
                      region_name=self.region_name)

        self.log.info(
            'Creating JobFlow using aws-conn-id: %s, emr-conn-id: %s',
            self.aws_conn_id, self.emr_conn_id)
        response = emr.create_job_flow(self.job_flow_overrides)

        if not response['ResponseMetadata']['HTTPStatusCode'] == 200:
            raise AirflowException('JobFlow creation failed: %s' % response)
        else:
            self.log.info('JobFlow with id %s created', response['JobFlowId'])
            return response['JobFlowId']
Example #9
    def get_hook(self) -> EmrHook:
        """Get EmrHook"""
        if self.hook:
            return self.hook

        self.hook = EmrHook(aws_conn_id=self.aws_conn_id)
        return self.hook
Example #10
    def execute(self, context: Dict[str, Any]) -> int:
        emr_hook = EmrHook(aws_conn_id=self.aws_conn_id)

        emr = emr_hook.get_conn()

        if self.do_xcom_push:
            context['ti'].xcom_push(key='cluster_id', value=self.cluster_id)

        self.log.info('Modifying cluster %s', self.cluster_id)
        response = emr.modify_cluster(
            ClusterId=self.cluster_id, StepConcurrencyLevel=self.step_concurrency_level
        )

        if response['ResponseMetadata']['HTTPStatusCode'] != 200:
            raise AirflowException(f'Modify cluster failed: {response}')
        else:
            self.log.info('Steps concurrency level %d', response['StepConcurrencyLevel'])
            return response['StepConcurrencyLevel']
Example #11
    def test_get_cluster_id_by_name(self):
        """
        Test that we can resolve cluster id by cluster name.
        """
        hook = EmrHook(aws_conn_id='aws_default', emr_conn_id='emr_default')

        job_flow = hook.create_job_flow({'Name': 'test_cluster',
                                         'Instances': {'KeepJobFlowAliveWhenNoSteps': True}})

        job_flow_id = job_flow['JobFlowId']

        matching_cluster = hook.get_cluster_id_by_name('test_cluster', ['RUNNING', 'WAITING'])

        self.assertEqual(matching_cluster, job_flow_id)

        no_match = hook.get_cluster_id_by_name('foo', ['RUNNING', 'WAITING', 'BOOTSTRAPPING'])

        self.assertIsNone(no_match)
Example #12
    def execute(self, context: 'Context') -> None:
        emr_hook = EmrHook(aws_conn_id=self.aws_conn_id)
        emr = emr_hook.get_conn()

        EmrClusterLink.persist(
            context=context,
            operator=self,
            region_name=emr_hook.conn_region_name,
            aws_partition=emr_hook.conn_partition,
            job_flow_id=self.job_flow_id,
        )

        self.log.info('Terminating JobFlow %s', self.job_flow_id)
        response = emr.terminate_job_flows(JobFlowIds=[self.job_flow_id])

        if not response['ResponseMetadata']['HTTPStatusCode'] == 200:
            raise AirflowException(f'JobFlow termination failed: {response}')
        else:
            self.log.info('JobFlow with id %s terminated', self.job_flow_id)
Example #13
    def test_create_job_flow_extra_args(self):
        """
        Test that we can add extra arguments to the launch call.

        This is useful when AWS adds new options, such as
        "SecurityConfiguration", so that we don't have to change our code.
        """
        client = boto3.client('emr', region_name='us-east-1')

        hook = EmrHook(aws_conn_id='aws_default', emr_conn_id='emr_default')
        # AmiVersion is really old and almost no one uses it anymore, but
        # it's one of the "optional" request params that moto supports; its
        # coverage of EMR isn't 100%, it turns out.
        cluster = hook.create_job_flow({'Name': 'test_cluster',
                                        'ReleaseLabel': '',
                                        'AmiVersion': '3.2'})

        cluster = client.describe_cluster(ClusterId=cluster['JobFlowId'])['Cluster']

        # The AmiVersion comes back as {Requested,Running}AmiVersion fields.
        self.assertEqual(cluster['RequestedAmiVersion'], '3.2')
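
As the docstring notes, any key in the overrides dict that the hook does not
recognize is passed straight through to the underlying boto3 run_job_flow
call, so newly added API options need no hook changes. A hedged usage sketch
(the security configuration value is an assumed placeholder):

    from airflow.providers.amazon.aws.hooks.emr import EmrHook

    hook = EmrHook(aws_conn_id='aws_default', emr_conn_id='emr_default')
    cluster = hook.create_job_flow({
        'Name': 'test_cluster',
        # 'SecurityConfiguration' is a real run_job_flow parameter; the
        # value here is a hypothetical configuration name.
        'SecurityConfiguration': 'my-security-config',
    })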
Example #14
    def execute(self, context: Dict[str, Any]) -> str:
        emr = EmrHook(
            aws_conn_id=self.aws_conn_id, emr_conn_id=self.emr_conn_id, region_name=self.region_name
        )

        self.log.info(
            'Creating JobFlow using aws-conn-id: %s, emr-conn-id: %s', self.aws_conn_id, self.emr_conn_id
        )

        if isinstance(self.job_flow_overrides, str):
            job_flow_overrides: Dict[str, Any] = ast.literal_eval(self.job_flow_overrides)
            self.job_flow_overrides = job_flow_overrides
        else:
            job_flow_overrides = self.job_flow_overrides
        response = emr.create_job_flow(job_flow_overrides)

        if not response['ResponseMetadata']['HTTPStatusCode'] == 200:
            raise AirflowException('JobFlow creation failed: %s' % response)
        else:
            self.log.info('JobFlow with id %s created', response['JobFlowId'])
            return response['JobFlowId']
Example #15
    def execute(self, context):
        emr_hook = EmrHook(aws_conn_id=self.aws_conn_id)

        emr = emr_hook.get_conn()

        job_flow_id = self.job_flow_id or emr_hook.get_cluster_id_by_name(
            self.job_flow_name, self.cluster_states)
        if not job_flow_id:
            raise AirflowException(
                f'No cluster found for name: {self.job_flow_name}')

        if self.do_xcom_push:
            context['ti'].xcom_push(key='job_flow_id', value=job_flow_id)

        self.log.info('Adding steps to %s', job_flow_id)
        response = emr.add_job_flow_steps(JobFlowId=job_flow_id,
                                          Steps=self.steps)

        if not response['ResponseMetadata']['HTTPStatusCode'] == 200:
            raise AirflowException('Adding steps failed: %s' % response)
        else:
            self.log.info('Steps %s added to JobFlow', response['StepIds'])
            return response['StepIds']
Example #16
    def test_get_conn_returns_a_boto3_connection(self):
        hook = EmrHook(aws_conn_id='aws_default', region_name='ap-southeast-2')
        self.assertIsNotNone(hook.get_conn().list_clusters())
Example #17
    def get_hook(self):
        """Get EmrHook"""
        if not self.hook:
            self.hook = EmrHook(aws_conn_id=self.aws_conn_id)
        return self.hook
Example #18
    def execute(self, context):
        # define hooks
        s3_hook = S3Hook(aws_conn_id=self.aws_conn_id)
        emr = EmrHook(aws_conn_id=self.aws_conn_id,
                      emr_conn_id=self.emr_conn_id,
                      region_name='us-west-2')

        self.log.info('Defining JobFlow...')

        SPARK_STEPS = [{
            'Name': "copy-files-" + time.strftime("%Y%m%d-%H:%M"),
            'ActionOnFailure': 'CANCEL_AND_WAIT',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    "aws", "s3", "cp",
                    f"s3://{Variable.get('S3_CODE_BUCKET_NAME')}/",
                    "/home/hadoop/", "--recursive"
                ]
            }
        }, {
            'Name': 'run-etl-' + time.strftime("%Y%m%d-%H:%M"),
            'ActionOnFailure': 'CANCEL_AND_WAIT',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    'spark-submit', '/home/hadoop/spark_etl.py', '-q',
                    Variable.get('S3_RAW_QUOTES_BUCKET_NAME'), '-op',
                    Variable.get('S3_RAW_OPTIONS_BUCKET_NAME'), '-d',
                    context['ds_nodash'], '-o',
                    Variable.get('S3_DATA_BUCKET_NAME')
                ]
            }
        }]

        s_s32log = 's3n://aws-logs-345196100842-us-west-2/elasticmapreduce/'
        self.job_flow_overrides = {
            'Name': 'etl-process',
            'LogUri': s_s32log,
            'ReleaseLabel': 'emr-5.20.0',
            'Instances': {
                'InstanceGroups': [{
                    "InstanceCount": 2,
                    "EbsConfiguration": {
                        "EbsBlockDeviceConfigs": [{
                            "VolumeSpecification": {
                                "SizeInGB": 32,
                                "VolumeType": "gp2"
                            },
                            "VolumesPerInstance": 1
                        }]
                    },
                    'Market': 'SPOT',
                    "InstanceRole": "CORE",
                    "InstanceType": "m5.xlarge",
                    "Name": "Core Instance Group"
                }, {
                    "InstanceCount": 1,
                    "EbsConfiguration": {
                        "EbsBlockDeviceConfigs": [{
                            "VolumeSpecification": {
                                "SizeInGB": 32,
                                "VolumeType": "gp2"
                            },
                            "VolumesPerInstance": 1
                        }]
                    },
                    'Market': 'SPOT',
                    "InstanceRole": "MASTER",
                    "InstanceType": "m5.xlarge",
                    "Name": "Master Instance Group"
                }],
                'KeepJobFlowAliveWhenNoSteps': False,
                'TerminationProtected': False,
            },
            'Steps': SPARK_STEPS,
            'JobFlowRole': 'EMR_EC2_DefaultRole',
            'ServiceRole': 'EMR_DefaultRole',
        }

        self.log.info('Creating JobFlow...')

        if isinstance(self.job_flow_overrides, str):
            self.job_flow_overrides = ast.literal_eval(self.job_flow_overrides)

        response = emr.create_job_flow(self.job_flow_overrides)

        if not response['ResponseMetadata']['HTTPStatusCode'] == 200:
            raise AirflowException('JobFlow creation failed: %s' % response)
        else:
            self.log.info('JobFlow with id %s created', response['JobFlowId'])
            return response['JobFlowId']