Example #1
def should_run():
    storage = CloudStorage.factory(project_id)
    bucket = storage.get_bucket(bucket_name)
    blobs = storage.list_blobs(bucket)
    files = [b.name for b in blobs]
    if any('.csv' in file for file in files):
        return 'schedule_df_wrench_to_lake'
    else:
        return 'finish'
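
In the surrounding DAG (not shown in this snippet), a branch callable such as should_run is normally wired in through a BranchPythonOperator whose downstream tasks carry the task ids it returns. A minimal sketch, assuming Airflow 1.10-style imports, an existing dag object, and tasks named schedule_df_wrench_to_lake and finish:

from airflow.operators.python_operator import BranchPythonOperator

continue_if_csv_task = BranchPythonOperator(
    task_id='continue_if_csv',   # hypothetical task id
    python_callable=should_run,
    dag=dag)                     # `dag` is assumed to be defined elsewhere in the module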
Example #2
    def process(self, unused_el):
        # Resolve any runtime ValueProvider arguments to their concrete values.
        for attr in ('_files_startwith', '_files_ext', '_sort_key',
                     '_env', '_bucket'):
            value = getattr(self, attr)
            if isinstance(value, ValueProvider):
                setattr(self, attr, value.get())

        project_id = GCLOUD.project(self._env)
        blobs = CloudStorage.factory(project_id).list_blobs(
            self._bucket, self._files_startwith)
        # Keep only files at the bucket root with the expected extension
        paths = [
            f'gs://{b.bucket.name}/{b.name}' for b in blobs
            if '/' not in b.name and self._files_ext in b.name
        ]
        # The sort key may arrive as a hex-encoded, dill-serialized callable.
        if isinstance(self._sort_key, str):
            self._sort_key = dill.loads(bytes.fromhex(self._sort_key))
        if len(paths) > 1:
            paths.sort(key=self._sort_key)
        yield from paths
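
The process method above belongs to an Apache Beam DoFn whose class definition is not part of the snippet. A minimal pipeline sketch, assuming a hypothetical DoFn subclass named ListGcsFiles whose constructor stores the option values used above:

import apache_beam as beam

with beam.Pipeline(options=pipeline_options) as p:  # pipeline_options assumed to supply the ValueProviders
    files = (p
             | 'seed' >> beam.Create([None])  # a single dummy element so process() runs once
             | 'list gcs files' >> beam.ParDo(
                 ListGcsFiles(files_startwith, files_ext, sort_key, env, bucket)))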
Example #3
def create_dag():
    dag = DAG(
        DAG_ID,
        default_args=default_args,
        # Be sure to stagger the dags so they don't run all at once,
        # possibly causing max memory usage and pod failure. - Stu M.
        schedule_interval='0 * * * *',
        catchup=False)
    with dag:
        start_task = DummyOperator(task_id='start')
        finish_task = DummyOperator(task_id='finish')
        storage = CloudStorage.factory(project_id)
        cdc_imports_bucket = storage.get_bucket(bucket)
        cdc_imports_processed_bucket = storage.get_bucket(processed_bucket)

        for files_startwith, table in table_map.items():
            pusher_task_id = f'schedule_df_gcs_to_lake_{table}'
            continue_if_file_task = BranchPythonOperator(
                task_id=f'continue_if_file_{files_startwith}',
                python_callable=should_continue,
                op_args=[files_startwith, cdc_imports_bucket, table])
            schedule_df_task = ScheduleDataflowJobOperator(
                task_id=pusher_task_id,
                project=project_id,
                template_name='load_cdc_from_gcs_to_lake',
                job_name=f'gcs-to-lake-{table}',
                job_parameters={
                    'files_startwith': files_startwith,
                    'dest': f'{project_id}:lake.{table}'
                },
                provide_context=True)
            monitor_df_job_task = DataflowJobStateSensor(
                task_id=f'monitor_df_job_{table}',
                pusher_task_id=pusher_task_id,
                poke_interval=airflow_vars['dags']['cdc_from_gcs_to_lake']['poke_interval'],
                timeout=airflow_vars['dags']['cdc_from_gcs_to_lake']['poke_timeout'],
                dag=dag)
            move_files_task = PythonOperator(
                task_id=f'move_processed_files_{files_startwith}',
                python_callable=storage.move_files,
                op_args=[
                    files_startwith, cdc_imports_bucket,
                    cdc_imports_processed_bucket
                ],
            )
            (start_task >> continue_if_file_task >> schedule_df_task >>
             monitor_df_job_task >> move_files_task >> finish_task)
    return dag
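
For the Airflow scheduler to discover the DAG, create_dag() still has to be bound to a module-level name. The registration line is not part of the snippet, but a common pattern looks like this:

globals()[DAG_ID] = create_dag()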
Example #4
def mv_to_s3(gcp_bucket, table, aws_access_key_id, aws_secret_access_key,
             aws_s3_bucket):
    def _is_cloud_storage_dir(object_name):
        return object_name.endswith('/')

    aws_s3_client = boto3.client('s3',
                                 aws_access_key_id=aws_access_key_id,
                                 aws_secret_access_key=aws_secret_access_key)

    # This directory should be removed once the operation is complete because of GDPR. Stu M. 11/29/19
    with tempdir() as tmp:
        bucket = CloudStorage.factory(project).get_bucket(gcp_bucket)
        blobs = bucket.list_blobs(prefix=table)
        bucket_dirs_marked_for_deletion = []
        for blob in blobs:
            key = blob.name
            file_or_dir = '{}/{}'.format(tmp, key)
            if _is_cloud_storage_dir(key):
                os.mkdir(file_or_dir)
                bucket_dirs_marked_for_deletion.append(key)
            else:
                splits = key.split('/')
                # Prepend a UUID to the file name.
                splits[-1] = f'{uuid.uuid4()}-{splits[-1]}'
                key = '/'.join(splits)
                dirname = os.path.dirname(file_or_dir)
                # makedirs creates any missing intermediate directories; a bare
                # mkdir would fail for nested prefixes.
                os.makedirs(dirname, exist_ok=True)
                blob.download_to_filename(file_or_dir)
                aws_s3_client.upload_file(file_or_dir, aws_s3_bucket, key)
                blob.delete()

        # Clean up here because folders in GCS are not deleted. Stu M. 2/29/20
        for key in bucket_dirs_marked_for_deletion:
            blob = bucket.blob(key)
            blob.delete()
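
Inside a DAG, mv_to_s3 would typically be invoked through a PythonOperator. A hedged usage sketch; the task id and the variables passed via op_args are assumptions rather than part of the original module:

move_to_s3_task = PythonOperator(
    task_id=f'mv_{table}_to_s3',
    python_callable=mv_to_s3,
    op_args=[gcp_bucket, table, aws_access_key_id,
             aws_secret_access_key, aws_s3_bucket])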
Example #5
def should_continue(prefix=None, bucket=None, table=None):
    if CloudStorage.factory(project_id).has_file(bucket=bucket, prefix=prefix):
        return f'schedule_df_gcs_to_lake_{table}'
    else:
        return 'finish'
Example #6
def list_blobs(bucket, files_startswith):
    return CloudStorage.factory(project_id).list_blobs(bucket,
                                                       files_startswith)
Example #7
def delete_db_import_file():
    CloudStorage.factory(project_id).delete_blob(bucket, import_file_name)
Example #8
def should_run():
    exists = CloudStorage.factory(project_id).blob_exists(bucket, import_file_name)
    if exists:
        return 'start_sql_instance'
    else:
        return 'finish'
Example #9
def move_files():
    storage = CloudStorage.factory(project_id)
    bucket = storage.get_bucket(bucket_name)
    blobs = storage.list_blobs(bucket)
    for b in blobs:
        b.bucket.rename_blob(b, f'{processed_file_dir}/{b.name}')
Example #10
    def execute(self, context):
        state = CloudStorage.factory(self.project_id).blob_exists(
            self.bucket, self.file_name)
        return state
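
The execute method above appears to come from a custom Airflow operator, but the enclosing class is not shown. A minimal sketch of a plausible wrapper, assuming BaseOperator and hypothetical constructor arguments:

from airflow.models import BaseOperator

class GcsBlobExistsOperator(BaseOperator):  # hypothetical class name
    def __init__(self, project_id, bucket, file_name, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.project_id = project_id
        self.bucket = bucket
        self.file_name = file_name

    def execute(self, context):
        # Whatever execute() returns is pushed to XCom by Airflow.
        state = CloudStorage.factory(self.project_id).blob_exists(
            self.bucket, self.file_name)
        return state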
Example #11
def clear_gcs_bucket_by_table(env, table):
    bucket = CloudStorage.factory(project).get_bucket(gcs_bucket)
    blobs = bucket.list_blobs(prefix=table)
    for blob in blobs:
        blob.delete()