Example #1
    def _move_files(self, from_uri: str):
        curr_gcsfs_file_path = GcsfsFilePath.from_absolute_path(from_uri)
        # Re-parse the ISO date from the normalized file name into a
        # year/month/day subdirectory (e.g. '2020-07-01' -> '2020/07/01/').
        previous_date_format = filename_parts_from_path(
            curr_gcsfs_file_path).date_str
        new_date_format = date.fromisoformat(previous_date_format).strftime(
            "%Y/%m/%d/")

        # Re-normalize the file name as a RAW_DATA file, preserving whether the
        # original file was already marked processed.
        path_with_new_file_name = GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path_from_normalized_path(
                from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        if DirectIngestGCSFileSystem.is_processed_file(curr_gcsfs_file_path):
            path_with_new_file_name = GcsfsFilePath.from_absolute_path(
                to_normalized_processed_file_path_from_normalized_path(
                    from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        raw_dir_with_date = GcsfsDirectoryPath.from_dir_and_subdir(
            self.region_storage_raw_dir_path, new_date_format)

        to_uri = GcsfsFilePath.from_directory_and_file_name(
            raw_dir_with_date, path_with_new_file_name.file_name).uri()

        if not self.dry_run:
            gsutil_mv(from_path=from_uri, to_path=to_uri)

        with self.mutex:
            self.move_list.append((from_uri, to_uri))
            if self.move_progress:
                self.move_progress.next()
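
A minimal sketch (the date value is illustrative, not from the source) of the date re-formatting that _move_files uses to build the dated storage subdirectory:

from datetime import date

previous_date_format = '2020-07-01'  # as filename_parts_from_path(...).date_str might return
new_date_format = date.fromisoformat(previous_date_format).strftime("%Y/%m/%d/")
assert new_date_format == '2020/07/01/'  # becomes the subdirectory under the raw storage dir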
Example #2
def run_export(project_id: str, dry_run: bool, state_code: str,
               target_bucket_suffix: str):
    """Performs the export operation: exports rows for the given state code from the tables in the state dataset
    in the given project to CSV files, named after the tables, in the given GCS bucket."""
    today = datetime.date.today()

    big_query_client = BigQueryClientImpl()
    dataset_ref = big_query_client.dataset_ref_for_id(STATE_BASE_DATASET)
    if not big_query_client.dataset_exists(dataset_ref):
        raise ValueError(f'Dataset {dataset_ref.dataset_id} does not exist')

    tables = big_query_client.list_tables(dataset_ref.dataset_id)

    export_configs = []
    for table in tables:
        logging.info("******************************")
        export_query = state_table_export_query_str(table, [state_code])
        logging.info(export_query)

        if not export_query:
            continue

        target_bucket_name = f'{project_id}-{target_bucket_suffix}'
        export_dir = gcs_export_directory(target_bucket_name, today,
                                          state_code)
        export_file_name = f'{table.table_id}_{today.isoformat()}_export.csv'
        file = GcsfsFilePath.from_directory_and_file_name(
            export_dir, export_file_name)
        output_uri = file.uri()

        export_config = ExportQueryConfig(
            query=export_query,
            query_parameters=[],
            intermediate_dataset_id='export_temporary_tables',
            intermediate_table_name=f'{dataset_ref.dataset_id}_{table.table_id}',
            output_uri=output_uri,
            output_format=bigquery.DestinationFormat.CSV,
        )
        export_configs.append(export_config)
        if dry_run:
            logging.info(
                "[DRY RUN] Created export configuration to export table to GCS: %s",
                export_config)
        else:
            logging.info(
                "Created export configuration to export table to GCS: %s",
                export_config)

    if dry_run:
        logging.info("[DRY RUN] Exporting [%d] tables to GCS",
                     len(export_configs))
    else:
        logging.info("Exporting [%d] tables to GCS", len(export_configs))
        big_query_client.export_query_results_to_cloud_storage(export_configs)
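
A hypothetical invocation of run_export (all argument values below are illustrative placeholders, not taken from the source):

run_export(project_id='my-gcp-project',               # hypothetical project id
           dry_run=True,                               # log configs without exporting
           state_code='US_XX',                         # placeholder state code
           target_bucket_suffix='state-table-export')  # hypothetical bucket suffix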
Example #3
    def _copy_to_ingest_bucket(self, path: str,
                               normalized_file_name: str) -> None:
        full_file_upload_path_uri = GcsfsFilePath.from_directory_and_file_name(
            self.ingest_bucket, normalized_file_name).uri()

        if not self.dry_run:
            gsutil_cp(path, full_file_upload_path_uri)

        with self.mutex:
            self.copies_list.append((path, full_file_upload_path_uri))
            if self.move_progress:
                self.move_progress.next()
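
A hypothetical call from inside the uploading controller (both the source path and the normalized file name are illustrative assumptions, not taken from the source):

self._copy_to_ingest_bucket(
    path='gs://my-upload-bucket/elite_offenders.csv',                   # hypothetical source file
    normalized_file_name='unprocessed_2021-01-15_elite_offenders.csv')  # assumed normalized name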
Example #4
    def _generate_output_path(self,
                              ingest_view_export_args: GcsfsIngestViewExportArgs,
                              metadata: DirectIngestIngestFileMetadata) -> GcsfsFilePath:
        ingest_view = self.ingest_views_by_tag[ingest_view_export_args.ingest_view_name]
        if not metadata.normalized_file_name:
            output_file_name = to_normalized_unprocessed_file_name(
                f'{ingest_view.file_tag}.csv',
                GcsfsDirectIngestFileType.INGEST_VIEW,
                dt=ingest_view_export_args.upper_bound_datetime_to_export
            )
        else:
            output_file_name = metadata.normalized_file_name

        return GcsfsFilePath.from_directory_and_file_name(self.ingest_directory_path, output_file_name)
Example #5
    def copy(self,
             src_path: GcsfsFilePath,
             dst_path: GcsfsPath) -> None:

        if isinstance(dst_path, GcsfsFilePath):
            path = dst_path
        elif isinstance(dst_path, GcsfsDirectoryPath):
            path = GcsfsFilePath.from_directory_and_file_name(
                dst_path, src_path.file_name)
        else:
            raise ValueError(f'Unexpected path type [{type(dst_path)}]')

        self._add_path(path)
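
Two hypothetical calls showing the dispatch above (bucket and object names are illustrative; fs stands for any filesystem instance exposing this copy method, and GcsfsDirectoryPath.from_absolute_path is assumed to exist analogously to GcsfsFilePath.from_absolute_path):

src = GcsfsFilePath.from_absolute_path('gs://my-bucket/a/report.csv')

# Destination is an explicit file path: it is used as-is.
fs.copy(src, GcsfsFilePath.from_absolute_path('gs://my-bucket/b/report_copy.csv'))

# Destination is a directory: the source file name is appended to it.
fs.copy(src, GcsfsDirectoryPath.from_absolute_path('gs://my-bucket/archive/'))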
    def _create_split_file_path(self, original_file_path: GcsfsFilePath,
                                output_dir: GcsfsDirectoryPath,
                                split_num: int) -> GcsfsFilePath:
        parts = filename_parts_from_path(original_file_path)

        rank_str = str(split_num + 1).zfill(5)
        existing_suffix = (
            f'_{parts.filename_suffix}' if parts.filename_suffix else '')
        updated_file_name = (
            f'{parts.file_tag}{existing_suffix}_{rank_str}'
            f'_{SPLIT_FILE_SUFFIX}_size{self.file_split_line_limit}'
            f'.{parts.extension}')
        return GcsfsFilePath.from_directory_and_file_name(
            output_dir,
            to_normalized_unprocessed_file_path(updated_file_name,
                                                dt=parts.utc_upload_datetime))
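
An illustrative trace of the name construction above (the file tag, suffix, split number, and line limit are assumed values; SPLIT_FILE_SUFFIX is the same constant the example refers to):

file_tag = 'persons'               # hypothetical parts.file_tag
existing_suffix = ''               # assuming no filename_suffix
rank_str = str(0 + 1).zfill(5)     # -> '00001' for the first split
updated_file_name = (
    f'{file_tag}{existing_suffix}_{rank_str}'
    f'_{SPLIT_FILE_SUFFIX}_size2500'
    f'.csv')
# The result is then normalized with to_normalized_unprocessed_file_path, keeping
# the original upload datetime (parts.utc_upload_datetime).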
    def copy(self,
             src_path: GcsfsFilePath,
             dst_path: GcsfsPath) -> None:

        if isinstance(dst_path, GcsfsFilePath):
            path = dst_path
        elif isinstance(dst_path, GcsfsDirectoryPath):
            path = GcsfsFilePath.from_directory_and_file_name(
                dst_path, src_path.file_name)
        else:
            raise ValueError(f'Unexpected path type [{type(dst_path)}]')

        if src_path.abs_path() in self.uploaded_test_path_to_actual:
            self.uploaded_test_path_to_actual[dst_path.abs_path()] = \
                self.uploaded_test_path_to_actual[src_path.abs_path()]

        self._add_path(path)
    def copy(self, src_path: GcsfsFilePath, dst_path: GcsfsPath) -> None:
        src_bucket = self.storage_client.get_bucket(src_path.bucket_name)
        src_blob = src_bucket.get_blob(src_path.blob_name)
        if not src_blob:
            raise ValueError(
                f'Blob at path [{src_path.abs_path()}] does not exist')
        dst_bucket = self.storage_client.get_bucket(dst_path.bucket_name)

        if isinstance(dst_path, GcsfsFilePath):
            dst_blob_name = dst_path.blob_name
        elif isinstance(dst_path, GcsfsDirectoryPath):
            dst_blob_name = \
                GcsfsFilePath.from_directory_and_file_name(
                    dst_path, src_path.file_name).blob_name
        else:
            raise ValueError(f'Unexpected path type [{type(dst_path)}]')

        src_bucket.copy_blob(src_blob, dst_bucket, dst_blob_name)
    def _create_split_file_path(self, original_file_path: GcsfsFilePath,
                                output_dir: GcsfsDirectoryPath,
                                split_num: int) -> GcsfsFilePath:
        parts = filename_parts_from_path(original_file_path)

        rank_str = str(split_num + 1).zfill(5)
        updated_file_name = (
            f'{parts.stripped_file_name}_{rank_str}'
            f'_{SPLIT_FILE_SUFFIX}_size{self.ingest_file_split_line_limit}'
            f'.{parts.extension}')

        file_type = (GcsfsDirectIngestFileType.INGEST_VIEW
                     if self.region.is_raw_vs_ingest_file_name_detection_enabled()
                     else GcsfsDirectIngestFileType.UNSPECIFIED)

        return GcsfsFilePath.from_directory_and_file_name(
            output_dir,
            to_normalized_unprocessed_file_path(updated_file_name,
                                                file_type=file_type,
                                                dt=parts.utc_upload_datetime))
    def get_output_path(self, chunk_num: int) -> GcsfsFilePath:
        name, _extension = os.path.splitext(self.path.file_name)

        return GcsfsFilePath.from_directory_and_file_name(
            self.temp_output_directory_path, f'temp_{name}_{chunk_num}.csv')
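
Illustrative only (the source file name and chunk number are assumptions): how the temporary output name above is formed.

import os

name, _extension = os.path.splitext('elite_offenders.csv')  # hypothetical self.path.file_name
chunk_num = 3
assert f'temp_{name}_{chunk_num}.csv' == 'temp_elite_offenders_3.csv'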