def download_to_temp_file(self, path: GcsfsFilePath) -> Optional[GcsfsFileContentsHandle]:
    """Returns a contents handle for the file at |path|, or None if the
    path no longer exists in this file system.

    The handle wraps the local path that real_absolute_path_for_path()
    reports for |path|.
    """
    if self.exists(path):
        local_path = self.real_absolute_path_for_path(path)
        return GcsfsFileContentsHandle(local_path)
    return None
def upload_raw_file_to_gcs(fs: GCSFileSystem, local_filepath: str, bucket_name: str) -> None:
    """Upload raw Sendgrid CSV to GCS.

    The file is written to |bucket_name| under a blob named after today's
    date formatted with DATE_FORMAT. The local file is not cleaned up by the
    contents handle (cleanup_file=False).

    Args:
        fs: File system used to perform the upload.
        local_filepath: Path of the local CSV file to upload.
        bucket_name: Name of the destination GCS bucket.
    """
    # Compute the blob name exactly once so the name that is logged is always
    # the name that was uploaded, even if the call straddles midnight.
    blob_name = date.today().strftime(DATE_FORMAT)

    fs.upload_from_contents_handle_stream(
        path=GcsfsFilePath(bucket_name=bucket_name, blob_name=blob_name),
        contents_handle=GcsfsFileContentsHandle(
            local_file_path=local_filepath, cleanup_file=False
        ),
        content_type="text/csv",
    )
    logging.info(
        "Uploaded file [%s] to Google Cloud Storage bucket name=[%s] blob name=[%s]",
        local_filepath,
        bucket_name,
        blob_name,
    )
def _parse(self, args: GcsfsIngestArgs, contents_handle: GcsfsFileContentsHandle) -> IngestInfo:
    """Builds a CsvDataExtractor configured for the file tag of
    |args.file_path| and runs it over the contents of |contents_handle|.

    Raises:
        DirectIngestError: with error type INPUT_ERROR if the file tag is
            not present in get_file_tag_rank_list().
    """
    tag = self.file_tag(args.file_path)
    context = IngestGatingContext(
        file_tag=tag, ingest_instance=self.ingest_instance
    )

    known_tags = self.get_file_tag_rank_list()
    if tag not in known_tags:
        raise DirectIngestError(
            msg=f"No mapping found for tag [{tag}]",
            error_type=DirectIngestErrorType.INPUT_ERROR,
        )

    # Collect the per-file configuration and hooks for this tag.
    mapping_path = self._yaml_filepath(tag)
    pre_row_hooks = self._get_row_pre_processors_for_file(tag)
    post_row_hooks = self._get_row_post_processors_for_file(tag)
    post_file_hooks = self._get_file_post_processors_for_file(tag)
    # pylint: disable=assignment-from-none
    primary_key_override = self._get_primary_key_override_for_file(tag)
    # pylint: disable=assignment-from-none
    ancestor_chain_overrides = self._get_ancestor_chain_overrides_callback_for_file(tag)
    set_with_empty_values = (
        context.file_tag in self._get_files_to_set_with_empty_values()
    )

    # Arguments are passed positionally, in the order the extractor expects.
    extractor = CsvDataExtractor(
        mapping_path,
        context,
        pre_row_hooks,
        post_row_hooks,
        post_file_hooks,
        ancestor_chain_overrides,
        primary_key_override,
        self.system_level,
        set_with_empty_values,
    )

    contents = contents_handle.get_contents_iterator()
    return extractor.extract_and_populate_data(contents)
def _get_file_to_upload(
    path: GcsfsFilePath,
    fs: GCSFileSystem,
    url: str,
    pdf_name: str,
    always_download: bool,
    post_data: Dict,
    verify_ssl: bool,
) -> Optional[GcsfsFileContentsHandle]:
    """Fetches the PDF at |url| unless it is already present in GCS.

    Returns None when |path| already exists in |fs| and |always_download| is
    False. Otherwise the URL is requested (POST when |post_data| is
    non-empty, GET otherwise) and the response bytes are returned wrapped in
    a contents handle.

    Raises:
        ScrapeAggregateError: if the response status code is not 200.
    """
    already_uploaded = fs.exists(path)
    if already_uploaded and not always_download:
        # Nothing to do: the file is in GCS and a re-download was not forced.
        return None

    # NOTE(review): no timeout is passed to requests here, so a stalled
    # server could block this call indefinitely - confirm that is acceptable.
    response = (
        requests.post(url, data=post_data, verify=verify_ssl)
        if post_data
        else requests.get(url, verify=verify_ssl)
    )

    if response.status_code != 200:
        raise ScrapeAggregateError(f"Could not download file {pdf_name}")

    # This is a PDF, so take the raw bytes from the response directly.
    return GcsfsFileContentsHandle.from_bytes(response.content)