Example #1
    def _move_files(self, from_uri: str):
        curr_gcsfs_file_path = GcsfsFilePath.from_absolute_path(from_uri)
        # Re-parse the ISO date from the normalized file name into a
        # year/month/day subdirectory (e.g. '2020-07-01' -> '2020/07/01/').
        previous_date_format = filename_parts_from_path(
            curr_gcsfs_file_path).date_str
        new_date_format = date.fromisoformat(previous_date_format).strftime(
            "%Y/%m/%d/")

        # Re-normalize the file name as a RAW_DATA file, preserving whether the
        # original file was already marked processed.
        path_with_new_file_name = GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path_from_normalized_path(
                from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        if DirectIngestGCSFileSystem.is_processed_file(curr_gcsfs_file_path):
            path_with_new_file_name = GcsfsFilePath.from_absolute_path(
                to_normalized_processed_file_path_from_normalized_path(
                    from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        raw_dir_with_date = GcsfsDirectoryPath.from_dir_and_subdir(
            self.region_storage_raw_dir_path, new_date_format)

        to_uri = GcsfsFilePath.from_directory_and_file_name(
            raw_dir_with_date, path_with_new_file_name.file_name).uri()

        if not self.dry_run:
            gsutil_mv(from_path=from_uri, to_path=to_uri)

        with self.mutex:
            self.move_list.append((from_uri, to_uri))
            if self.move_progress:
                self.move_progress.next()
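
A minimal sketch (the date value is illustrative, not from the source) of the date re-formatting that _move_files uses to build the dated storage subdirectory:

from datetime import date

previous_date_format = '2020-07-01'  # as filename_parts_from_path(...).date_str might return
new_date_format = date.fromisoformat(previous_date_format).strftime("%Y/%m/%d/")
assert new_date_format == '2020/07/01/'  # becomes the subdirectory under the raw storage dir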
Example #2
def run_export(project_id: str, dry_run: bool, state_code: str,
               target_bucket_suffix: str):
    """Performs the export operation: exports rows for the given state code from the tables in the state dataset
    in the given project to CSV files, named after the tables, in the given GCS bucket."""
    today = datetime.date.today()

    big_query_client = BigQueryClientImpl()
    dataset_ref = big_query_client.dataset_ref_for_id(STATE_BASE_DATASET)
    if not big_query_client.dataset_exists(dataset_ref):
        raise ValueError(f'Dataset {dataset_ref.dataset_id} does not exist')

    tables = big_query_client.list_tables(dataset_ref.dataset_id)

    export_configs = []
    for table in tables:
        logging.info("******************************")
        export_query = state_table_export_query_str(table, [state_code])
        logging.info(export_query)

        if not export_query:
            continue

        target_bucket_name = f'{project_id}-{target_bucket_suffix}'
        export_dir = gcs_export_directory(target_bucket_name, today,
                                          state_code)
        export_file_name = f'{table.table_id}_{today.isoformat()}_export.csv'
        file = GcsfsFilePath.from_directory_and_file_name(
            export_dir, export_file_name)
        output_uri = file.uri()

        export_config = ExportQueryConfig(
            query=export_query,
            query_parameters=[],
            intermediate_dataset_id='export_temporary_tables',
            intermediate_table_name=f'{dataset_ref.dataset_id}_{table.table_id}',
            output_uri=output_uri,
            output_format=bigquery.DestinationFormat.CSV,
        )
        export_configs.append(export_config)
        if dry_run:
            logging.info(
                "[DRY RUN] Created export configuration to export table to GCS: %s",
                export_config)
        else:
            logging.info(
                "Created export configuration to export table to GCS: %s",
                export_config)

    if dry_run:
        logging.info("[DRY RUN] Exporting [%d] tables to GCS",
                     len(export_configs))
    else:
        logging.info("Exporting [%d] tables to GCS", len(export_configs))
        big_query_client.export_query_results_to_cloud_storage(export_configs)
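
A hypothetical invocation of run_export (all argument values below are illustrative placeholders, not taken from the source):

run_export(project_id='my-gcp-project',               # hypothetical project id
           dry_run=True,                               # log configs without exporting
           state_code='US_XX',                         # placeholder state code
           target_bucket_suffix='state-table-export')  # hypothetical bucket suffix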
Example #3
    def _copy_to_ingest_bucket(self, path: str,
                               normalized_file_name: str) -> None:
        full_file_upload_path_uri = GcsfsFilePath.from_directory_and_file_name(
            self.ingest_bucket, normalized_file_name).uri()

        if not self.dry_run:
            gsutil_cp(path, full_file_upload_path_uri)

        with self.mutex:
            self.copies_list.append((path, full_file_upload_path_uri))
            if self.move_progress:
                self.move_progress.next()
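
A hypothetical call from inside the uploading controller (both the source path and the normalized file name are illustrative assumptions, not taken from the source):

self._copy_to_ingest_bucket(
    path='gs://my-upload-bucket/elite_offenders.csv',                   # hypothetical source file
    normalized_file_name='unprocessed_2021-01-15_elite_offenders.csv')  # assumed normalized name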
Example #4
    def _generate_output_path(self,
                              ingest_view_export_args: GcsfsIngestViewExportArgs,
                              metadata: DirectIngestIngestFileMetadata) -> GcsfsFilePath:
        ingest_view = self.ingest_views_by_tag[ingest_view_export_args.ingest_view_name]
        if not metadata.normalized_file_name:
            output_file_name = to_normalized_unprocessed_file_name(
                f'{ingest_view.file_tag}.csv',
                GcsfsDirectIngestFileType.INGEST_VIEW,
                dt=ingest_view_export_args.upper_bound_datetime_to_export
            )
        else:
            output_file_name = metadata.normalized_file_name

        return GcsfsFilePath.from_directory_and_file_name(self.ingest_directory_path, output_file_name)
Example #5
    def copy(self,
             src_path: GcsfsFilePath,
             dst_path: GcsfsPath) -> None:

        if isinstance(dst_path, GcsfsFilePath):
            path = dst_path
        elif isinstance(dst_path, GcsfsDirectoryPath):
            path = GcsfsFilePath.from_directory_and_file_name(
                dst_path, src_path.file_name)
        else:
            raise ValueError(f'Unexpected path type [{type(dst_path)}]')

        self._add_path(path)
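
Two hypothetical calls showing the dispatch above (bucket and object names are illustrative; fs stands for any filesystem instance exposing this copy method, and GcsfsDirectoryPath.from_absolute_path is assumed to exist analogously to GcsfsFilePath.from_absolute_path):

src = GcsfsFilePath.from_absolute_path('gs://my-bucket/a/report.csv')

# Destination is an explicit file path: it is used as-is.
fs.copy(src, GcsfsFilePath.from_absolute_path('gs://my-bucket/b/report_copy.csv'))

# Destination is a directory: the source file name is appended to it.
fs.copy(src, GcsfsDirectoryPath.from_absolute_path('gs://my-bucket/archive/'))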
    def _create_split_file_path(self, original_file_path: GcsfsFilePath,
                                output_dir: GcsfsDirectoryPath,
                                split_num: int) -> GcsfsFilePath:
        parts = filename_parts_from_path(original_file_path)

        rank_str = str(split_num + 1).zfill(5)
        existing_suffix = (
            f'_{parts.filename_suffix}' if parts.filename_suffix else '')
        updated_file_name = (
            f'{parts.file_tag}{existing_suffix}_{rank_str}'
            f'_{SPLIT_FILE_SUFFIX}_size{self.file_split_line_limit}'
            f'.{parts.extension}')
        return GcsfsFilePath.from_directory_and_file_name(
            output_dir,
            to_normalized_unprocessed_file_path(updated_file_name,
                                                dt=parts.utc_upload_datetime))
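
An illustrative trace of the name construction above (the file tag, suffix, split number, and line limit are assumed values; SPLIT_FILE_SUFFIX is the same constant the example refers to):

file_tag = 'persons'               # hypothetical parts.file_tag
existing_suffix = ''               # assuming no filename_suffix
rank_str = str(0 + 1).zfill(5)     # -> '00001' for the first split
updated_file_name = (
    f'{file_tag}{existing_suffix}_{rank_str}'
    f'_{SPLIT_FILE_SUFFIX}_size2500'
    f'.csv')
# The result is then normalized with to_normalized_unprocessed_file_path, keeping
# the original upload datetime (parts.utc_upload_datetime).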
    def copy(self,
             src_path: GcsfsFilePath,
             dst_path: GcsfsPath) -> None:

        if isinstance(dst_path, GcsfsFilePath):
            path = dst_path
        elif isinstance(dst_path, GcsfsDirectoryPath):
            path = GcsfsFilePath.from_directory_and_file_name(
                dst_path, src_path.file_name)
        else:
            raise ValueError(f'Unexpected path type [{type(dst_path)}]')

        if src_path.abs_path() in self.uploaded_test_path_to_actual:
            self.uploaded_test_path_to_actual[dst_path.abs_path()] = \
                self.uploaded_test_path_to_actual[src_path.abs_path()]

        self._add_path(path)
    def copy(self, src_path: GcsfsFilePath, dst_path: GcsfsPath) -> None:
        src_bucket = self.storage_client.get_bucket(src_path.bucket_name)
        src_blob = src_bucket.get_blob(src_path.blob_name)
        if not src_blob:
            raise ValueError(
                f'Blob at path [{src_path.abs_path()}] does not exist')
        dst_bucket = self.storage_client.get_bucket(dst_path.bucket_name)

        if isinstance(dst_path, GcsfsFilePath):
            dst_blob_name = dst_path.blob_name
        elif isinstance(dst_path, GcsfsDirectoryPath):
            dst_blob_name = \
                GcsfsFilePath.from_directory_and_file_name(
                    dst_path, src_path.file_name).blob_name
        else:
            raise ValueError(f'Unexpected path type [{type(dst_path)}]')

        src_bucket.copy_blob(src_blob, dst_bucket, dst_blob_name)
    def _create_split_file_path(self, original_file_path: GcsfsFilePath,
                                output_dir: GcsfsDirectoryPath,
                                split_num: int) -> GcsfsFilePath:
        parts = filename_parts_from_path(original_file_path)

        rank_str = str(split_num + 1).zfill(5)
        updated_file_name = (
            f'{parts.stripped_file_name}_{rank_str}'
            f'_{SPLIT_FILE_SUFFIX}_size{self.ingest_file_split_line_limit}'
            f'.{parts.extension}')

        file_type = (GcsfsDirectIngestFileType.INGEST_VIEW
                     if self.region.is_raw_vs_ingest_file_name_detection_enabled()
                     else GcsfsDirectIngestFileType.UNSPECIFIED)

        return GcsfsFilePath.from_directory_and_file_name(
            output_dir,
            to_normalized_unprocessed_file_path(updated_file_name,
                                                file_type=file_type,
                                                dt=parts.utc_upload_datetime))
    def get_output_path(self, chunk_num: int) -> GcsfsFilePath:
        name, _extension = os.path.splitext(self.path.file_name)

        return GcsfsFilePath.from_directory_and_file_name(
            self.temp_output_directory_path, f'temp_{name}_{chunk_num}.csv')
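
Illustrative only (the source file name and chunk number are assumptions): how the temporary output name above is formed.

import os

name, _extension = os.path.splitext('elite_offenders.csv')  # hypothetical self.path.file_name
chunk_num = 3
assert f'temp_{name}_{chunk_num}.csv' == 'temp_elite_offenders_3.csv'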