def _copy_to_ingest_bucket(
    self,
    path: str,
    full_file_upload_path: GcsfsFilePath,
) -> None:
    """Moves a file within GCS to the appropriate bucket if it has not already
    been deemed processed or discovered by the file metadata manager.

    We check discovered because a file may have been discovered and be awaiting
    ingest, in which case we should not re-upload it. We check processed because
    a file may have already been ingested and then deleted from the bucket.
    """
    if not self.postgres_direct_ingest_file_metadata_manager.has_raw_file_been_discovered(
        full_file_upload_path
    ) and not self.postgres_direct_ingest_file_metadata_manager.has_raw_file_been_processed(
        full_file_upload_path
    ):
        try:
            # Preserve the original content type (e.g. text/csv) on the moved file.
            mimetype, _ = guess_type(os.path.basename(path))
            self.gcsfs.mv(
                src_path=GcsfsFilePath.from_absolute_path(path),
                dst_path=full_file_upload_path,
            )
            self.gcsfs.set_content_type(
                full_file_upload_path, mimetype if mimetype else "text/plain"
            )
            logging.info("Copied %s -> %s", path, full_file_upload_path.uri())
            self.uploaded_files.append(path)
        except BaseException as e:
            logging.warning(
                "Could not copy %s -> %s due to error %s",
                path,
                full_file_upload_path.uri(),
                e.args,
            )
            self.unable_to_upload_files.append(path)
    else:
        logging.info(
            "Skipping %s -> %s, file already discovered or processed",
            path,
            full_file_upload_path.uri(),
        )
        self.skipped_files.append(path)
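# A standalone sketch of the content-type fallback used above: mimetypes.guess_type
# returns (None, None) for unrecognized extensions, which is why the code falls
# back to "text/plain". The file name here is illustrative.
import os
from mimetypes import guess_type

mimetype, _ = guess_type(os.path.basename("raw/us_xx_report.csv"))
content_type = mimetype if mimetype else "text/plain"  # -> "text/csv"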
def _copy_to_ingest_bucket(
    self,
    path: str,
    full_file_upload_path: GcsfsFilePath,
) -> None:
    if not self.dry_run:
        try:
            gsutil_cp(path, full_file_upload_path.uri())
            self.uploaded_files.append(path)
            self.copies_list.append((path, full_file_upload_path.uri()))
        except ValueError:
            self.unable_to_upload_files.append(path)
    else:
        self.copies_list.append((path, full_file_upload_path.uri()))
    with self.mutex:
        if self.move_progress:
            # pylint: disable=not-callable
            self.move_progress.next()
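# A minimal sketch of the mutex-guarded progress update above, assuming the
# progress bar is the `progress` package's Bar (the .next() call and the pylint
# hint suggest this, but it is an assumption). All names here are illustrative.
import threading

from progress.bar import Bar

mutex = threading.Lock()
move_progress = Bar("Copying files", max=100)

def record_one_copy() -> None:
    # Called from worker threads; the lock keeps concurrent .next() calls from
    # interleaving their terminal output.
    with mutex:
        move_progress.next()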
def _copy_to_ingest_bucket(
    self, path: str, full_file_upload_path: GcsfsFilePath
) -> None:
    try:
        mimetype, _ = guess_type(os.path.basename(path))
        self.gcsfs.mv(
            src_path=GcsfsFilePath.from_absolute_path(path),
            dst_path=full_file_upload_path,
        )
        self.gcsfs.set_content_type(
            full_file_upload_path, mimetype if mimetype else "text/plain"
        )
        logging.info("Copied %s -> %s", path, full_file_upload_path.uri())
        self.uploaded_files.append(path)
    except BaseException as e:
        logging.warning(
            "Could not copy %s -> %s due to error %s",
            path,
            full_file_upload_path.uri(),
            e.args,
        )
        self.unable_to_upload_files.append(path)
def _file_pointer_for_path(self, path: GcsfsFilePath, encoding: str):
    """Returns a file pointer for the given path."""
    # From the GCSFileSystem docs
    # (https://gcsfs.readthedocs.io/en/latest/api.html#gcsfs.core.GCSFileSystem),
    # 'google_default' means we should look for local credentials set up via
    # `gcloud auth login`. The project being read from may have to match the
    # project default set locally (check via `gcloud info`, set via
    # `gcloud config set project [PROJECT_ID]`). If we are running in the GAE
    # environment, we should be able to get credentials from the internal
    # metadata service.
    token = 'google_default' if not environment.in_gae() else 'cloud'
    return self.gcs_file_system.open(path.uri(), encoding=encoding, token=token)
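# A minimal sketch of the credential selection described in the comment above,
# using gcsfs directly; the helper name and the in_gae flag are illustrative.
import gcsfs

def make_gcs_file_system(in_gae: bool) -> gcsfs.GCSFileSystem:
    # 'google_default' -> credentials from a local `gcloud auth login`;
    # 'cloud' -> credentials from the GCP instance metadata service.
    token = "cloud" if in_gae else "google_default"
    return gcsfs.GCSFileSystem(token=token)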
def open(
    self,
    path: GcsfsFilePath,
    chunk_size: Optional[int] = None,
    encoding: Optional[str] = None,
) -> Iterator[TextIO]:
    blob = self._get_blob(path)
    with blob.open("rb", chunk_size=chunk_size) as f:
        verifiable_reader = VerifiableBytesReader(f, name=path.uri())
        try:
            yield TextIOWrapper(buffer=verifiable_reader, encoding=encoding)
        finally:
            # Confirm the bytes actually read match the checksum GCS reports for the blob.
            verifiable_reader.verify_crc32c(blob.crc32c)
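# The byte-to-text wrapping above, shown in isolation: TextIOWrapper turns a
# binary stream (here a local BytesIO standing in for blob.open("rb")) into a
# text stream with the requested encoding. The sample data is illustrative.
from io import BytesIO, TextIOWrapper

raw = BytesIO(b"col_a,col_b\n1,2\n")
with TextIOWrapper(buffer=raw, encoding="utf-8") as text_stream:
    header = text_stream.readline()  # "col_a,col_b\n"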
def _get_blob(self, path: GcsfsFilePath) -> storage.Blob:
    try:
        bucket = self.storage_client.bucket(path.bucket_name)
        blob = bucket.get_blob(path.blob_name)
    except NotFound as error:
        logging.warning(
            "Blob at [%s] does not exist - might have already been deleted",
            path.uri(),
        )
        raise GCSBlobDoesNotExistError(
            f"Blob at [{path.uri()}] does not exist"
        ) from error
    else:
        if not blob:
            logging.warning(
                "Blob at [%s] does not exist - might have already been deleted",
                path.uri(),
            )
            raise GCSBlobDoesNotExistError(f"Blob at [{path.uri()}] does not exist")
        return blob
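# A sketch of the lookup behavior the branches above guard against, using
# google-cloud-storage directly: Bucket.get_blob() returns None when the object
# is missing rather than raising, so the `if not blob` check is needed in
# addition to the NotFound handler. Names are illustrative and this requires
# real GCS credentials to run.
from google.cloud import storage

client = storage.Client()
blob = client.bucket("some-bucket").get_blob("some/object.csv")
if blob is None:
    print("object not found")
else:
    print(f"object found, crc32c={blob.crc32c}")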
def cache_ingest_file(
    self, path: GcsfsFilePath, csv_text: str, separator: str = ","
) -> None:
    self.fs.upload_from_string(path, csv_text, content_type="text/csv")
    response = self.test_client.post(
        "/data_discovery/cache_ingest_file_as_parquet_task",
        json={
            "gcs_file_uri": path.uri(),
            "file_encoding": "UTF-8",
            "file_separator": separator,
            "file_quoting": csv.QUOTE_MINIMAL,
        },
    )
    self.assertEqual(HTTPStatus.CREATED, response.status_code)
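# A small aside on the file_quoting field sent above: the csv.QUOTE_* constants
# are plain ints, so they serialize directly into the JSON request body and can
# be passed back into csv readers on the receiving side.
import csv

assert isinstance(csv.QUOTE_MINIMAL, int)
print(csv.QUOTE_MINIMAL, csv.QUOTE_ALL, csv.QUOTE_NONNUMERIC, csv.QUOTE_NONE)  # 0 1 2 3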
def build_cache_ingest_file_as_parquet_task(
    gcs_file: GcsfsFilePath,
    separator: str,
    encoding: str,
    quoting: int,
    custom_line_terminator: Optional[str],
) -> Dict[str, Any]:
    body = {
        "gcs_file_uri": gcs_file.uri(),
        "file_separator": separator,
        "file_encoding": encoding,
        "file_quoting": quoting,
        "file_custom_line_terminator": custom_line_terminator,
    }
    return {
        "relative_uri": "/admin/data_discovery/cache_ingest_file_as_parquet_task",
        "body": body,
    }
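# An illustrative call to the builder above, reusing GcsfsFilePath.from_absolute_path
# as seen in the other snippets (its import path is omitted here); the bucket and
# file name are made up.
import csv

task = build_cache_ingest_file_as_parquet_task(
    gcs_file=GcsfsFilePath.from_absolute_path("gs://my-ingest-bucket/raw/us_xx_file.csv"),
    separator=",",
    encoding="UTF-8",
    quoting=csv.QUOTE_MINIMAL,
    custom_line_terminator=None,
)
# task["relative_uri"] == "/admin/data_discovery/cache_ingest_file_as_parquet_task"
# task["body"]["gcs_file_uri"] == "gs://my-ingest-bucket/raw/us_xx_file.csv"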