def download_to_temp_file(self, path: GcsfsFilePath) -> Optional[GcsfsFileContentsHandle]:
    """Returns a contents handle for the file at |path|, if it exists.

    When self.exists(path) is true, the handle wraps the local path returned
    by self.real_absolute_path_for_path(path). Otherwise (the path is not
    present in this file system) None is returned.
    """
    if self.exists(path):
        local_path = self.real_absolute_path_for_path(path)
        return GcsfsFileContentsHandle(local_path)

    return None
def upload_raw_file_to_gcs(fs: GCSFileSystem, local_filepath: str,
                           bucket_name: str) -> None:
    """Upload raw Sendgrid CSV to GCS.

    The local file is uploaded with content type "text/csv" to a blob in
    |bucket_name| whose name is today's date formatted with DATE_FORMAT.
    The contents handle is created with cleanup_file=False.

    Args:
        fs: File system object used to perform the upload.
        local_filepath: Path of the local CSV file to upload.
        bucket_name: Name of the destination bucket.
    """
    # Compute the blob name exactly once so the name that gets logged is
    # always the name that was uploaded, even if the call straddles midnight.
    blob_name = date.today().strftime(DATE_FORMAT)

    fs.upload_from_contents_handle_stream(
        path=GcsfsFilePath(
            bucket_name=bucket_name,
            blob_name=blob_name,
        ),
        contents_handle=GcsfsFileContentsHandle(local_file_path=local_filepath,
                                                cleanup_file=False),
        content_type="text/csv",
    )
    logging.info(
        "Uploaded file [%s] to Google Cloud Storage bucket name=[%s] blob name=[%s]",
        local_filepath,
        bucket_name,
        blob_name,
    )
示例#3
0
    def _parse(self, args: GcsfsIngestArgs,
               contents_handle: GcsfsFileContentsHandle) -> IngestInfo:
        """Parses the file contents into an IngestInfo via a CsvDataExtractor.

        The file tag is derived from args.file_path. The per-file hooks
        (row pre/post processors, file post processors, primary key override
        and ancestor chain overrides callbacks) are looked up for that tag and
        handed to a CsvDataExtractor, which is then run over the iterator
        returned by contents_handle.get_contents_iterator().

        Raises:
            DirectIngestError: with error type INPUT_ERROR if the file tag is
                not in self.get_file_tag_rank_list().
        """
        file_tag = self.file_tag(args.file_path)
        gating_context = IngestGatingContext(
            file_tag=file_tag, ingest_instance=self.ingest_instance)

        known_tags = self.get_file_tag_rank_list()
        if file_tag not in known_tags:
            raise DirectIngestError(
                msg=f"No mapping found for tag [{file_tag}]",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        mapping_filepath = self._yaml_filepath(file_tag)

        # Per-file hooks, looked up in the same order the extractor takes them.
        pre_row_hooks = self._get_row_pre_processors_for_file(file_tag)
        post_row_hooks = self._get_row_post_processors_for_file(file_tag)
        post_file_hooks = self._get_file_post_processors_for_file(file_tag)
        # pylint: disable=assignment-from-none
        primary_key_override = self._get_primary_key_override_for_file(
            file_tag)
        # pylint: disable=assignment-from-none
        ancestor_chain_overrides = (
            self._get_ancestor_chain_overrides_callback_for_file(file_tag))

        empty_value_tags = self._get_files_to_set_with_empty_values()
        set_with_empty_values = gating_context.file_tag in empty_value_tags

        extractor = CsvDataExtractor(
            mapping_filepath,
            gating_context,
            pre_row_hooks,
            post_row_hooks,
            post_file_hooks,
            ancestor_chain_overrides,
            primary_key_override,
            self.system_level,
            set_with_empty_values,
        )

        contents_iterator = contents_handle.get_contents_iterator()
        return extractor.extract_and_populate_data(contents_iterator)
def _get_file_to_upload(
    path: GcsfsFilePath,
    fs: GCSFileSystem,
    url: str,
    pdf_name: str,
    always_download: bool,
    post_data: Dict,
    verify_ssl: bool,
) -> Optional[GcsfsFileContentsHandle]:
    """This function checks first whether it needs to download, and then
    returns the locally downloaded pdf"""
    # If it already exists in GCS, return.
    if fs.exists(path) and not always_download:
        return None

    if post_data:
        response = requests.post(url, data=post_data, verify=verify_ssl)
    else:
        response = requests.get(url, verify=verify_ssl)
    if response.status_code == 200:
        # This is a PDF so use content to get the bytes directly.
        return GcsfsFileContentsHandle.from_bytes(response.content)

    raise ScrapeAggregateError("Could not download file {}".format(pdf_name))