Example No. 1
class DataprocOperationBaseOperator(BaseOperator):
    """The base class for operators that poll on a Dataproc Operation."""
    @apply_defaults
    def __init__(self,
                 project_id,
                 region='global',
                 gcp_conn_id='google_cloud_default',
                 delegate_to=None,
                 *args,
                 **kwargs):
        super(DataprocOperationBaseOperator, self).__init__(*args, **kwargs)
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.project_id = project_id
        self.region = region
        self.hook = DataProcHook(gcp_conn_id=self.gcp_conn_id,
                                 delegate_to=self.delegate_to,
                                 api_version='v1beta2')

    def execute(self, context):
        # pylint: disable=no-value-for-parameter
        self.hook.wait(self.start())

    def start(self, context):
        raise AirflowException('Please submit an operation')
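
A minimal sketch of how this base class is typically extended (the subclass name and its cluster_name attribute are illustrative, not from the source): start() kicks off a long-running Dataproc operation and returns it, and the inherited execute() hands that operation to hook.wait() to poll until completion.

class ExampleClusterDeleteOperator(DataprocOperationBaseOperator):
    """Hypothetical subclass: deletes a cluster and waits on the returned operation."""

    @apply_defaults
    def __init__(self, cluster_name, *args, **kwargs):
        super(ExampleClusterDeleteOperator, self).__init__(*args, **kwargs)
        self.cluster_name = cluster_name

    def start(self):
        # The base execute() calls start() with no arguments (hence the pylint
        # disable above) and passes the returned operation to self.hook.wait().
        return self.hook.get_conn().projects().regions().clusters().delete(
            projectId=self.project_id,
            region=self.region,
            clusterName=self.cluster_name).execute()
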
    def execute(self, context):
        # add s3 credentials to the job without exposing them in the airflow UI
        properties = {
            "fs.s3a." + key: value
            for key, value in zip(
                ("access.key", "secret.key", "session.token"),
                AwsHook(aws_conn_id=self.aws_conn_id).get_credentials(),
            ) if value is not None
        }
        if self.dataproc_properties is not None:
            properties.update(self.dataproc_properties)

        # Fork of super().execute() so the job uses the combined properties. Forked from:
        # https://github.com/apache/airflow/blob/1.10.2/airflow/contrib/operators/dataproc_operator.py#L1181-L1197
        hook = DataProcHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to)
        job = hook.create_job_template(self.task_id, self.cluster_name,
                                       "hadoopJob", properties)

        job.set_main(self.main_jar, self.main_class)
        job.add_args(self.arguments)
        job.add_jar_file_uris(self.dataproc_jars)
        job.add_archive_uris(self.archives)
        job.add_file_uris(self.files)
        job.set_job_name(self.job_name)

        job_to_submit = job.build()
        self.dataproc_job_id = job_to_submit["job"]["reference"]["jobId"]

        hook.submit(hook.project_id, job_to_submit, self.region,
                    self.job_error_states)
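
For reference, the Airflow 1.10-era AwsHook.get_credentials() returns a credentials tuple whose fields iterate as (access_key, secret_key, token), so the comprehension above maps each field onto an fs.s3a.* property and drops a missing session token. A standalone sketch of that behaviour with placeholder values:

# Illustrative only: `Credentials` stands in for the tuple returned by AwsHook.get_credentials().
from collections import namedtuple

Credentials = namedtuple('Credentials', ['access_key', 'secret_key', 'token'])
creds = Credentials('ACCESS_KEY_PLACEHOLDER', 'SECRET_KEY_PLACEHOLDER', None)

properties = {
    "fs.s3a." + key: value
    for key, value in zip(("access.key", "secret.key", "session.token"), creds)
    if value is not None
}
# properties == {'fs.s3a.access.key': 'ACCESS_KEY_PLACEHOLDER',
#                'fs.s3a.secret.key': 'SECRET_KEY_PLACEHOLDER'}
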
Example No. 3
    def execute(self, context):
        hook = DataProcHook(
            gcp_conn_id=self.google_cloud_conn_id,
            delegate_to=self.delegate_to
        )
        service = hook.get_conn()

        if self._get_cluster(service):
            logging.info('Cluster {} already exists... Checking status...'.format(
                            self.cluster_name
                        ))
            self._wait_for_done(service)
            return True

        cluster_data = self._build_cluster_data()
        try:
            service.projects().regions().clusters().create(
                projectId=self.project_id,
                region=self.region,
                body=cluster_data
            ).execute()
        except HttpError as e:
            # probably two cluster start commands at the same time
            time.sleep(10)
            if self._get_cluster(service):
                logging.info('Cluster {} already exists... Checking status...'.format(
                             self.cluster_name
                             ))
                self._wait_for_done(service)
                return True
            else:
                raise e

        self._wait_for_done(service)
    def execute(self, context):
        self.log.info('Creating cluster: %s', self.cluster_name)
        hook = DataProcHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to)
        service = hook.get_conn()

        if self._get_cluster(service):
            self.log.info('Cluster %s already exists... Checking status...',
                          self.cluster_name)
            self._wait_for_done(service)
            return True

        cluster_data = self._build_cluster_data()
        try:
            service.projects().regions().clusters().create(
                projectId=self.project_id,
                region=self.region,
                body=cluster_data).execute()
        except HttpError as e:
            # probably two cluster start commands at the same time
            time.sleep(10)
            if self._get_cluster(service):
                self.log.info(
                    'Cluster %s already exists... Checking status...',
                    self.cluster_name)
                self._wait_for_done(service)
                return True
            else:
                raise e

        self._wait_for_done(service)
Example No. 5
    def __init__(self,
                 job_name='{{task.task_id}}_{{ds_nodash}}',
                 cluster_name="cluster-1",
                 dataproc_properties=None,
                 dataproc_jars=None,
                 gcp_conn_id='google_cloud_default',
                 delegate_to=None,
                 labels=None,
                 region='global',
                 job_error_states=None,
                 *args,
                 **kwargs):
        super(DataProcJobBaseOperator, self).__init__(*args, **kwargs)
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.labels = labels
        self.job_name = job_name
        self.cluster_name = cluster_name
        self.dataproc_properties = dataproc_properties
        self.dataproc_jars = dataproc_jars
        self.region = region
        self.job_error_states = job_error_states if job_error_states is not None else {
            'ERROR'
        }

        self.hook = DataProcHook(gcp_conn_id=gcp_conn_id,
                                 delegate_to=delegate_to)
        self.job_template = None
        self.job = None
        self.dataproc_job_id = None
Example No. 6
    def __init__(self,
                 job_name: str = '{{task.task_id}}_{{ds_nodash}}',
                 cluster_name: str = "cluster-1",
                 dataproc_properties: Optional[Dict] = None,
                 dataproc_jars: Optional[List[str]] = None,
                 gcp_conn_id: str = 'google_cloud_default',
                 delegate_to: Optional[str] = None,
                 labels: Optional[Dict] = None,
                 region: str = 'global',
                 job_error_states: Optional[Set[str]] = None,
                 *args,
                 **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.labels = labels
        self.job_name = job_name
        self.cluster_name = cluster_name
        self.dataproc_properties = dataproc_properties
        self.dataproc_jars = dataproc_jars
        self.region = region
        self.job_error_states = job_error_states if job_error_states is not None else {
            'ERROR'
        }

        self.hook = DataProcHook(gcp_conn_id=gcp_conn_id,
                                 delegate_to=delegate_to)
        self.job_template = None
        self.job = None
        self.dataproc_job_id = None
    def execute(self, context):
        hook = DataProcHook(gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to)
        job = hook.create_job_template(self.task_id, self.dataproc_cluster, "pigJob", self.dataproc_properties)

        job.add_query(self.query)
        job.add_variables(self.variables)
        job.add_jar_file_uris(self.dataproc_jars)

        hook.submit(hook.project_id, job.build())
    def execute(self, context):
        hook = DataProcHook(gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to)
        job = hook.create_job_template(self.task_id, self.dataproc_cluster, "pysparkJob", self.dataproc_properties)

        job.set_python_main(self.main)
        job.add_args(self.arguments)
        job.add_jar_file_uris(self.dataproc_jars)
        job.add_archive_uris(self.archives)

        hook.submit(hook.project_id, job.build())
    def execute(self, context):
        hook = DataProcHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to)
        job = hook.create_job_template(self.task_id, self.dataproc_cluster,
                                       "pigJob", self.dataproc_properties)

        job.add_query(self.query)
        job.add_variables(self.variables)
        job.add_jar_file_uris(self.dataproc_jars)

        hook.submit(hook.project_id, job.build())
class DataProcHookTest(unittest.TestCase):
    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataproc_hook = DataProcHook()

    @mock.patch(DATAPROC_STRING.format('_DataProcJob'))
    def test_submit(self, job_mock):
        with mock.patch(DATAPROC_STRING.format('DataProcHook.get_conn'),
                        return_value=None):
            self.dataproc_hook.submit(PROJECT_ID, JOB)
            job_mock.assert_called_once_with(mock.ANY, PROJECT_ID, JOB, REGION)
    def execute(self, context):
        hook = DataProcHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to)
        job = hook.create_job_template(self.task_id, self.dataproc_cluster, "pysparkJob",
                                       self.dataproc_properties)

        job.set_python_main(self.main)
        job.add_args(self.arguments)
        job.add_jar_file_uris(self.dataproc_jars)
        job.add_archive_uris(self.archives)

        hook.submit(hook.project_id, job.build())
    def execute(self, context):
        self.log.info('Deleting cluster: %s', self.cluster_name)
        hook = DataProcHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to)
        service = hook.get_conn()

        response = service.projects().regions().clusters().delete(
            projectId=self.project_id,
            region=self.region,
            clusterName=self.cluster_name).execute()
        operation_name = response['name']
        self.log.info("Cluster delete operation name: %s", operation_name)
        self._wait_for_done(service, operation_name)
class DataProcHookTest(unittest.TestCase):
    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataproc_hook = DataProcHook()

    @mock.patch(DATAPROC_STRING.format('_DataProcJob'))
    def test_submit(self, job_mock):
        with mock.patch(DATAPROC_STRING.format('DataProcHook.get_conn'),
                        return_value=None):
            self.dataproc_hook.submit(GCP_PROJECT_ID_HOOK_UNIT_TEST, JOB)
            job_mock.assert_called_once_with(mock.ANY, GCP_PROJECT_ID_HOOK_UNIT_TEST, JOB, GCP_REGION,
                                             job_error_states=mock.ANY)
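
For these tests to patch the right targets, BASE_STRING, DATAPROC_STRING and mock_init are presumably module-level test fixtures along the following lines (assumed values, mirroring the layout of the upstream Airflow contrib test suite):

# Assumed test fixtures -- not shown in the examples above.
BASE_STRING = 'airflow.contrib.hooks.gcp_api_base_hook.{}'
DATAPROC_STRING = 'airflow.contrib.hooks.gcp_dataproc_hook.{}'


def mock_init(self, gcp_conn_id, delegate_to=None):
    # Replacement __init__ that skips real GCP connection setup during tests.
    pass
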
Example No. 14
    def execute(self, context):
        hook = DataProcHook(gcp_conn_id=self.google_cloud_conn_id,
                            delegate_to=self.delegate_to)
        service = hook.get_conn()

        response = service.projects().regions().clusters().delete(
            projectId=self.project_id,
            region=self.region,
            clusterName=self.cluster_name).execute()
        operation_name = response['name']
        logging.info(
            "Cluster delete operation name: {}".format(operation_name))
        self._wait_for_done(service, operation_name)
    def execute(self, context):
        hook = DataProcHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to)
        job = hook.create_job_template(self.task_id, self.cluster_name,
                                       "sparkJob",
                                       self.dataproc_properties)
        job.set_main(self.main_jar, self.main_class)
        job.add_args(self.arguments)
        job.add_jar_file_uris(self.dataproc_jars)
        job.add_archive_uris(self.archives)
        job.add_file_uris(self.files)
        job.set_job_name(self.job_name)
        hook.submit(self.project_id, job.build(), self.region)
Example No. 16
    def execute(self, context):
        hook = DataProcHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to)
        job = hook.create_job_template(self.task_id, self.cluster_name, "hadoopJob",
                                       self.dataproc_properties)

        job.set_main(self.main_jar, self.main_class)
        job.add_args(self.arguments)
        job.add_jar_file_uris(self.dataproc_jars)
        job.add_archive_uris(self.archives)
        job.add_file_uris(self.files)
        job.set_job_name(self.job_name)

        hook.submit(hook.project_id, job.build(), self.region)
Example No. 17
    def execute(self, context):
        hook = DataProcHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to)
        job = hook.create_job_template(self.task_id, self.cluster_name, "pigJob",
                                       self.dataproc_properties)

        if self.query is None:
            job.add_query_uri(self.query_uri)
        else:
            job.add_query(self.query)
        job.add_variables(self.variables)
        job.add_jar_file_uris(self.dataproc_jars)
        job.set_job_name(self.job_name)

        hook.submit(hook.project_id, job.build(), self.region)
Example No. 18
    def __init__(self,
                 project_id,
                 region='global',
                 gcp_conn_id='google_cloud_default',
                 delegate_to=None,
                 *args,
                 **kwargs):
        super(DataprocOperationBaseOperator, self).__init__(*args, **kwargs)
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.project_id = project_id
        self.region = region
        self.hook = DataProcHook(gcp_conn_id=self.gcp_conn_id,
                                 delegate_to=self.delegate_to,
                                 api_version='v1beta2')
Example No. 19
    def execute(self, context):
        hook = DataProcHook(gcp_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to)
        job = hook.create_job_template(self.task_id, self.cluster_name, "pigJob",
                                       self.dataproc_properties)

        if self.query is None:
            job.add_query_uri(self.query_uri)
        else:
            job.add_query(self.query)
        job.add_variables(self.variables)
        job.add_jar_file_uris(self.dataproc_jars)
        job.set_job_name(self.job_name)

        hook.submit(hook.project_id, job.build(), self.region)
Example No. 20
    def execute(self, context):
        hook = DataProcHook(
            gcp_conn_id=self.google_cloud_conn_id,
            delegate_to=self.delegate_to
        )
        service = hook.get_conn()

        response = service.projects().regions().clusters().delete(
            projectId=self.project_id,
            region=self.region,
            clusterName=self.cluster_name
        ).execute()
        operation_name = response['name']
        logging.info("Cluster delete operation name: {}".format(operation_name))
        self._wait_for_done(service, operation_name)
Example No. 21
    def __init__(self,
                 project_id: str,
                 region: str = 'global',
                 gcp_conn_id: str = 'google_cloud_default',
                 delegate_to: Optional[str] = None,
                 *args,
                 **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.project_id = project_id
        self.region = region
        self.hook = DataProcHook(gcp_conn_id=self.gcp_conn_id,
                                 delegate_to=self.delegate_to,
                                 api_version='v1beta2')
Example No. 22
    def execute(self, context):
        self.log.info('Deleting cluster: %s', self.cluster_name)
        hook = DataProcHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to
        )
        service = hook.get_conn()

        response = service.projects().regions().clusters().delete(
            projectId=self.project_id,
            region=self.region,
            clusterName=self.cluster_name
        ).execute()
        operation_name = response['name']
        self.log.info("Cluster delete operation name: %s", operation_name)
        self._wait_for_done(service, operation_name)
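
The _wait_for_done helper is not included in these examples. A plausible sketch (an assumption, not the original implementation) polls the delete operation until it reports done and raises if it finished with an error:

    def _wait_for_done(self, service, operation_name):
        # Hypothetical polling loop for the long-running operation returned above.
        while True:
            operation = service.projects().regions().operations().get(
                name=operation_name).execute()
            if operation.get('done'):
                if 'error' in operation:
                    raise Exception('Operation failed: {}'.format(operation['error']))
                return
            time.sleep(15)
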
Example No. 23
    def __init__(self, task_run):
        super(DataProcCtrl, self).__init__(task_run=task_run)

        self.dataproc = self.task.spark_engine

        gcp_conn_id = self.task_env.conn_id
        self.cluster_hook = DataProcHook(gcp_conn_id=gcp_conn_id)
        self.cluster_info = self.cluster_hook.get_cluster(
            project_id=self.cluster_hook.project_id,
            region=self.dataproc.region,
            cluster_name=self.dataproc.cluster,
        )

        cluster_temp = self.cluster_info.get("config", {}).get("configBucket")
        if cluster_temp:
            self.remote_sync_root = target("gs://%s/dbnd/sync" % cluster_temp)
    def get_dataproc_vars():
        gcp_conn_id = 'google_cloud_default'
        delegate_to = None
        cluster_name = models.Variable.get('dataproc_cluster_name')
        project_id = models.Variable.get('gcp_project')
        region = models.Variable.get('gce_region')

        hook = DataProcHook(gcp_conn_id=gcp_conn_id, delegate_to=delegate_to)

        service = hook.get_conn()

        cluster = _get_cluster(service, cluster_name, region)

        if 'status' in cluster:
            logging.info(cluster['config']['configBucket'])
            logging.info(cluster['config']['workerConfig']['instanceNames'])
        else:
            logging.info('not ready')
    def ensure_cluster_exists():
        cluster = DataProcHook().get_conn().projects().regions().clusters().get(
            projectId=Variable.get('project'),
            region=Variable.get('region'),
            clusterName=CLUSTER_NAME).execute(num_retries=5)
        if cluster is None or len(cluster) == 0 or 'clusterName' not in cluster:
            return 'create_cluster'
        else:
            return 'run_job'
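
The callable above returns a downstream task_id, so it is presumably meant to drive a BranchPythonOperator. A minimal wiring sketch (the DAG id, start date and downstream task ids are assumptions; it also assumes ensure_cluster_exists is in scope where the DAG is defined):

from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import BranchPythonOperator

with DAG(dag_id='example_dataproc_branch',
         start_date=datetime(2020, 1, 1),
         schedule_interval=None) as dag:
    check_cluster = BranchPythonOperator(
        task_id='check_cluster',
        python_callable=ensure_cluster_exists,
    )
    # Downstream: check_cluster >> [create_cluster, run_job], matching the ids returned above.
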
    def execute(self, context):
        hook = DataProcHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to
        )
        job = hook.create_job_template(
            self.task_id, self.cluster_name, "pysparkJob", self.dataproc_properties)

        # If the main file is local, upload it to the cluster's config bucket first
        if os.path.isfile(self.main):
            cluster_info = hook.get_cluster(
                project_id=hook.project_id,
                region=self.region,
                cluster_name=self.cluster_name
            )
            bucket = cluster_info['config']['configBucket']
            self.main = self._upload_file_temp(bucket, self.main)
        job.set_python_main(self.main)

        job.add_args(self.arguments)
        job.add_jar_file_uris(self.dataproc_jars)
        job.add_archive_uris(self.archives)
        job.add_file_uris(self.files)
        job.add_python_file_uris(self.pyfiles)
        job.set_job_name(self.job_name)

        hook.submit(hook.project_id, job.build(), self.region)
Example No. 27
    def terminate_engine(cls):
        from airflow.contrib.hooks.gcp_dataproc_hook import DataProcHook
        from airflow.contrib.operators import dataproc_operator

        dataproc_config = DataprocConfig()

        gcp_conn_id = get_settings().get_env_config(CloudType.gcp).conn_id

        cluster_hook = DataProcHook(gcp_conn_id=gcp_conn_id)
        delete_cluster = dataproc_operator.DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            cluster_name=dataproc_config.cluster,
            project_id=cluster_hook.project_id,
            gcp_conn_id=gcp_conn_id,
            region=dataproc_config.region,
        )

        return delete_cluster
Example No. 28
    def create_engine(cls):
        from airflow.contrib.hooks.gcp_dataproc_hook import DataProcHook
        from airflow.contrib.operators import dataproc_operator

        from dbnd._core.current import get_settings

        cloud = get_settings().get_env_config(CloudType.gcp)

        gcp_conn_id = cloud.conn_id

        dataproc_config = DataprocConfig()
        cluster_hook = DataProcHook(gcp_conn_id=gcp_conn_id)

        return dataproc_operator.DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            project_id=cluster_hook.project_id,
            cluster_name=dataproc_config.cluster,
            gcp_conn_id=gcp_conn_id,
            num_workers=dataproc_config.num_workers,
            zone=dataproc_config.zone,
            network_uri=dataproc_config.network_uri,
            subnetwork_uri=dataproc_config.subnetwork_uri,
            tags=dataproc_config.tags,
            storage_bucket=dataproc_config.storage_bucket,
            init_actions_uris=dataproc_config.init_actions_uris,
            init_action_timeout=dataproc_config.init_action_timeout,
            metadata=dataproc_config.metadata,
            image_version=dataproc_config.image_version,
            properties=dataproc_config.properties,
            master_machine_type=dataproc_config.master_machine_type,
            master_disk_size=dataproc_config.master_disk_size,
            worker_machine_type=dataproc_config.worker_machine_type,
            worker_disk_size=dataproc_config.worker_disk_size,
            num_preemptible_workers=dataproc_config.num_preemptible_workers,
            labels=dataproc_config.labels,
            delegate_to=dataproc_config.delegate_to,
            service_account=dataproc_config.service_account,
            service_account_scopes=dataproc_config.service_account_scopes,
            idle_delete_ttl=dataproc_config.idle_delete_ttl,
            auto_delete_time=dataproc_config.auto_delete_time,
            auto_delete_ttl=dataproc_config.auto_delete_ttl,
        )
    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataproc_hook = DataProcHook()
    def execute(self, context):
        hook = DataProcHook(gcp_conn_id=self.google_cloud_conn_id,
                            delegate_to=self.delegate_to)
        service = hook.get_conn()

        if self._get_cluster(service):
            logging.info(
                'Cluster {} already exists... Checking status...'.format(
                    self.cluster_name))
            self._wait_for_done(service)
            return True

        zone_uri = \
            'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format(
                self.project_id, self.zone
            )
        master_type_uri = \
            "https://www.googleapis.com/compute/v1/projects/{}/zones/{}/machineTypes/{}".format(
                self.project_id, self.zone, self.master_machine_type
            )
        worker_type_uri = \
            "https://www.googleapis.com/compute/v1/projects/{}/zones/{}/machineTypes/{}".format(
                self.project_id, self.zone, self.worker_machine_type
            )
        cluster_data = {
            'projectId': self.project_id,
            'clusterName': self.cluster_name,
            'config': {
                'gceClusterConfig': {
                    'zoneUri': zone_uri
                },
                'masterConfig': {
                    'numInstances': 1,
                    'machineTypeUri': master_type_uri,
                    'diskConfig': {
                        'bootDiskSizeGb': self.master_disk_size
                    }
                },
                'workerConfig': {
                    'numInstances': self.num_workers,
                    'machineTypeUri': worker_type_uri,
                    'diskConfig': {
                        'bootDiskSizeGb': self.worker_disk_size
                    }
                },
                'secondaryWorkerConfig': {},
                'softwareConfig': {}
            }
        }
        if self.num_preemptible_workers > 0:
            cluster_data['config']['secondaryWorkerConfig'] = {
                'numInstances': self.num_preemptible_workers,
                'machineTypeUri': worker_type_uri,
                'diskConfig': {
                    'bootDiskSizeGb': self.worker_disk_size
                },
                'isPreemptible': True
            }
        if self.labels:
            cluster_data['labels'] = self.labels
        if self.storage_bucket:
            cluster_data['config']['configBucket'] = self.storage_bucket
        if self.metadata:
            cluster_data['config']['gceClusterConfig'][
                'metadata'] = self.metadata
        if self.properties:
            cluster_data['config']['softwareConfig'][
                'properties'] = self.properties
        if self.init_actions_uris:
            init_actions_dict = [{
                'executableFile': uri
            } for uri in self.init_actions_uris]
            cluster_data['config']['initializationActions'] = init_actions_dict

        try:
            service.projects().regions().clusters().create(
                projectId=self.project_id,
                region=self.region,
                body=cluster_data).execute()
        except HttpError as e:
            # probably two cluster start commands at the same time
            time.sleep(10)
            if self._get_cluster(service):
                logging.info(
                    'Cluster {} already exists... Checking status...'.format(
                        self.cluster_name))
                self._wait_for_done(service)
                return True
            else:
                raise e

        self._wait_for_done(service)
Example No. 31
    def execute(self, context):
        hook = DataProcHook(
            gcp_conn_id=self.google_cloud_conn_id,
            delegate_to=self.delegate_to
        )
        service = hook.get_conn()

        if self._get_cluster(service):
            logging.info('Cluster {} already exists... Checking status...'.format(
                            self.cluster_name
                        ))
            self._wait_for_done(service)
            return True

        zone_uri = \
            'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format(
                self.project_id, self.zone
            )
        master_type_uri = \
            "https://www.googleapis.com/compute/v1/projects/{}/zones/{}/machineTypes/{}".format(
                self.project_id, self.zone, self.master_machine_type
            )
        worker_type_uri = \
            "https://www.googleapis.com/compute/v1/projects/{}/zones/{}/machineTypes/{}".format(
                self.project_id, self.zone, self.worker_machine_type
            )
        cluster_data = {
            'projectId': self.project_id,
            'clusterName': self.cluster_name,
            'config': {
                'gceClusterConfig': {
                    'zoneUri': zone_uri
                },
                'masterConfig': {
                    'numInstances': 1,
                    'machineTypeUri': master_type_uri,
                    'diskConfig': {
                        'bootDiskSizeGb': self.master_disk_size
                    }
                },
                'workerConfig': {
                    'numInstances': self.num_workers,
                    'machineTypeUri': worker_type_uri,
                    'diskConfig': {
                        'bootDiskSizeGb': self.worker_disk_size
                    }
                },
                'secondaryWorkerConfig': {},
                'softwareConfig': {}
            }
        }
        if self.num_preemptible_workers > 0:
            cluster_data['config']['secondaryWorkerConfig'] = {
                'numInstances': self.num_preemptible_workers,
                'machineTypeUri': worker_type_uri,
                'diskConfig': {
                    'bootDiskSizeGb': self.worker_disk_size
                },
                'isPreemptible': True
            }
        if self.labels:
            cluster_data['labels'] = self.labels
        if self.storage_bucket:
            cluster_data['config']['configBucket'] = self.storage_bucket
        if self.metadata:
            cluster_data['config']['gceClusterConfig']['metadata'] = self.metadata
        if self.properties:
            cluster_data['config']['softwareConfig']['properties'] = self.properties
        if self.init_actions_uris:
            init_actions_dict = [
                {'executableFile': uri} for uri in self.init_actions_uris
            ]
            cluster_data['config']['initializationActions'] = init_actions_dict

        try:
            service.projects().regions().clusters().create(
                projectId=self.project_id,
                region=self.region,
                body=cluster_data
            ).execute()
        except HttpError as e:
            # probably two cluster start commands at the same time
            time.sleep(10)
            if self._get_cluster(service):
                logging.info('Cluster {} already exists... Checking status...'.format(
                             self.cluster_name
                             ))
                self._wait_for_done(service)
                return True
            else:
                raise e

        self._wait_for_done(service)
Example No. 32
class DataProcCtrl(SparkCtrl):
    def __init__(self, task_run):
        super(DataProcCtrl, self).__init__(task_run=task_run)

        self.dataproc = self.task.dataproc

        gcp_conn_id = self.task_env.conn_id
        self.cluster_hook = DataProcHook(gcp_conn_id=gcp_conn_id)
        self.cluster_info = self.cluster_hook.get_cluster(
            project_id=self.cluster_hook.project_id,
            region=self.dataproc.region,
            cluster_name=self.dataproc.cluster,
        )
        self.storage = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=gcp_conn_id)

        cluster_temp = self.cluster_info.get("config", {}).get("configBucket")
        if cluster_temp:
            self.remote_sync_root = target("gs://%s/dbnd/sync" % cluster_temp)

    def _get_job_builder(self, job_type):
        job_builder = self.cluster_hook.create_job_template(
            self.task.task_id,
            self.dataproc.cluster,
            job_type=job_type,
            properties=self.config.conf,
        )
        # set_job_name gives the job a "unique" name
        job_builder.set_job_name(self.job.job_name)
        job_builder.add_args(list_of_strings(self.task.application_args()))
        job_builder.add_file_uris(self.deploy.sync_files(self.config.files))
        return job_builder

    def _run_job_builder(self, job_builder):
        self.cluster_hook.submit(self.cluster_hook.project_id,
                                 job_builder.build(), self.dataproc.region)

    def run_spark(self, main_class):
        job_builder = self._get_job_builder(job_type="sparkJob")
        jars = list(self.config.jars)
        # We expect SparkTask to behave like the spark_submit API, i.e. main_jar is the jar to
        # run and main_class is needed only if the jar has no default main class. Dataproc
        # expects main_jar to have a default main class, so when both main_jar and main_class
        # are set we need to move main_jar into jars.
        if self.task.main_class:
            jars.append(self.config.main_jar)
            job_builder.set_main(None, self.task.main_class)
        else:
            job_builder.set_main(self.deploy.sync(self.config.main_jar), None)

        job_builder.add_jar_file_uris(self.deploy.sync_files(jars))

        return self._run_job_builder(job_builder)

    def run_pyspark(self, pyspark_script):
        job_builder = self._get_job_builder(job_type="pysparkJob")
        jars = list(self.config.jars)

        if self.config.main_jar:
            jars.append(self.config.main_jar)

        job_builder.add_jar_file_uris(self.deploy.sync_files(jars))
        job_builder.set_python_main(self.deploy.sync(pyspark_script))

        return self._run_job_builder(job_builder)

    @classmethod
    def create_engine(cls):
        from airflow.contrib.hooks.gcp_dataproc_hook import DataProcHook
        from airflow.contrib.operators import dataproc_operator

        from dbnd._core.current import get_settings

        cloud = get_settings().get_env_config(CloudType.gcp)

        gcp_conn_id = cloud.conn_id

        dataproc_config = DataprocConfig()
        cluster_hook = DataProcHook(gcp_conn_id=gcp_conn_id)

        return dataproc_operator.DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            project_id=cluster_hook.project_id,
            cluster_name=dataproc_config.cluster,
            gcp_conn_id=gcp_conn_id,
            num_workers=dataproc_config.num_workers,
            zone=dataproc_config.zone,
            network_uri=dataproc_config.network_uri,
            subnetwork_uri=dataproc_config.subnetwork_uri,
            tags=dataproc_config.tags,
            storage_bucket=dataproc_config.storage_bucket,
            init_actions_uris=dataproc_config.init_actions_uris,
            init_action_timeout=dataproc_config.init_action_timeout,
            metadata=dataproc_config.metadata,
            image_version=dataproc_config.image_version,
            properties=dataproc_config.properties,
            master_machine_type=dataproc_config.master_machine_type,
            master_disk_size=dataproc_config.master_disk_size,
            worker_machine_type=dataproc_config.worker_machine_type,
            worker_disk_size=dataproc_config.worker_disk_size,
            num_preemptible_workers=dataproc_config.num_preemptible_workers,
            labels=dataproc_config.labels,
            delegate_to=dataproc_config.delegate_to,
            service_account=dataproc_config.service_account,
            service_account_scopes=dataproc_config.service_account_scopes,
            idle_delete_ttl=dataproc_config.idle_delete_ttl,
            auto_delete_time=dataproc_config.auto_delete_time,
            auto_delete_ttl=dataproc_config.auto_delete_ttl,
        )

    @classmethod
    def terminate_engine(cls):
        from airflow.contrib.hooks.gcp_dataproc_hook import DataProcHook
        from airflow.contrib.operators import dataproc_operator

        dataproc_config = DataprocConfig()

        gcp_conn_id = get_settings().get_env_config(CloudType.gcp).conn_id

        cluster_hook = DataProcHook(gcp_conn_id=gcp_conn_id)
        delete_cluster = dataproc_operator.DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            cluster_name=dataproc_config.cluster,
            project_id=cluster_hook.project_id,
            gcp_conn_id=gcp_conn_id,
            region=dataproc_config.region,
        )

        return delete_cluster

    @classmethod
    def get_engine_policy(cls):
        return DataprocConfig().policy
Example No. 33
class DataProcJobBaseOperator(BaseOperator):
    """
    The base class for operators that launch jobs on DataProc.

    :param job_name: The job name used in the DataProc cluster. This name by default
        is the task_id appended with the execution date, but can be templated. The
        name will always be appended with a random number to avoid name clashes.
    :type job_name: str
    :param cluster_name: The name of the DataProc cluster.
    :type cluster_name: str
    :param dataproc_properties: Map for the Hive properties. Ideal to put in
        default arguments (templated)
    :type dataproc_properties: dict
    :param dataproc_jars: HCFS URIs of jar files to add to the CLASSPATH of the Hive server and Hadoop
        MapReduce (MR) tasks. Can contain Hive SerDes and UDFs. (templated)
    :type dataproc_jars: list
    :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform.
    :type gcp_conn_id: str
    :param delegate_to: The account to impersonate, if any.
        For this to work, the service account making the request must have domain-wide
        delegation enabled.
    :type delegate_to: str
    :param labels: The labels to associate with this job. Label keys must contain 1 to 63 characters,
        and must conform to RFC 1035. Label values may be empty, but, if present, must contain 1 to 63
        characters, and must conform to RFC 1035. No more than 32 labels can be associated with a job.
    :type labels: dict
    :param region: The specified region where the dataproc cluster is created.
    :type region: str
    :param job_error_states: Job states that should be considered error states.
        Any states in this set will result in an error being raised and failure of the
        task. E.g., if the ``CANCELLED`` state should also be considered a task failure,
        pass in ``{'ERROR', 'CANCELLED'}``. Possible values are currently only
        ``'ERROR'`` and ``'CANCELLED'``, but could change in the future. Defaults to
        ``{'ERROR'}``.
    :type job_error_states: set
    :var dataproc_job_id: The actual "jobId" as submitted to the Dataproc API.
        This is useful for identifying or linking to the job in the Google Cloud Console
        Dataproc UI, as the actual "jobId" submitted to the Dataproc API is appended with
        an 8 character random string.
    :vartype dataproc_job_id: str
    """
    job_type = ""

    @apply_defaults
    def __init__(self,
                 job_name='{{task.task_id}}_{{ds_nodash}}',
                 cluster_name="cluster-1",
                 dataproc_properties=None,
                 dataproc_jars=None,
                 gcp_conn_id='google_cloud_default',
                 delegate_to=None,
                 labels=None,
                 region='global',
                 job_error_states=None,
                 *args,
                 **kwargs):
        super(DataProcJobBaseOperator, self).__init__(*args, **kwargs)
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.labels = labels
        self.job_name = job_name
        self.cluster_name = cluster_name
        self.dataproc_properties = dataproc_properties
        self.dataproc_jars = dataproc_jars
        self.region = region
        self.job_error_states = job_error_states if job_error_states is not None else {
            'ERROR'
        }

        self.hook = DataProcHook(gcp_conn_id=gcp_conn_id,
                                 delegate_to=delegate_to)
        self.job_template = None
        self.job = None
        self.dataproc_job_id = None

    def create_job_template(self):
        """
        Initialize `self.job_template` with default values
        """
        self.job_template = self.hook.create_job_template(
            self.task_id, self.cluster_name, self.job_type,
            self.dataproc_properties)
        self.job_template.set_job_name(self.job_name)
        self.job_template.add_jar_file_uris(self.dataproc_jars)
        self.job_template.add_labels(self.labels)

    def execute(self, context):
        """
        Build `self.job` based on the job template, and submit it.

        :raises AirflowException: if no template has been initialized (see create_job_template)
        """
        if self.job_template:
            self.job = self.job_template.build()
            self.dataproc_job_id = self.job["job"]["reference"]["jobId"]
            self.hook.submit(self.hook.project_id, self.job, self.region,
                             self.job_error_states)
        else:
            raise AirflowException("Create a job template before")

    def on_kill(self):
        """
        Callback called when the operator is killed.
        Cancel any running job.
        """
        if self.dataproc_job_id:
            self.hook.cancel(self.hook.project_id, self.dataproc_job_id,
                             self.region)
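
A minimal sketch of the subclass pattern this base class expects (the class name and query handling are illustrative, not from the source): set job_type, build the template in execute() via create_job_template(), add the job-specific fields, then delegate to the base execute() to submit and track the job.

class ExampleQueryJobOperator(DataProcJobBaseOperator):
    """Hypothetical subclass that submits a single query-based Dataproc job."""
    job_type = 'hiveJob'  # assumed job type for this sketch

    @apply_defaults
    def __init__(self, query, *args, **kwargs):
        super(ExampleQueryJobOperator, self).__init__(*args, **kwargs)
        self.query = query

    def execute(self, context):
        self.create_job_template()
        self.job_template.add_query(self.query)  # job-specific field
        super(ExampleQueryJobOperator, self).execute(context)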