def _process_pgp_file(self, item):
    """Decrypt a downloaded PGP file and upload the plaintext to GCS.

    Reads the raw file from ``<working_dir>/raw/<item>``, decrypts it into
    ``<working_dir>/processed/``, pushes the result to the dated
    ``infutor/processed/`` prefix in GCS, then removes the local copy.
    """
    src = f"{self.working_dir}/raw/{item}"
    out_name = get_processed_name(item)
    dst = f"{self.working_dir}/processed/{out_name}"
    self.pgp.decrypt_file(src, dst)
    # Upload under today's date so reruns land in a fresh prefix.
    self.gcs.put(dst, f"infutor/processed/{self.today}/{out_name}")
    rm(dst)
def _process_zip_file(self, item):
    """Extract a downloaded zip archive and upload its contents to GCS.

    Unpacks ``<working_dir>/raw/<item>`` into a processed directory,
    uploads that directory to the dated ``infutor/processed/`` prefix,
    then deletes the local extraction.
    """
    archive_path = f"{self.working_dir}/raw/{item}"
    out_dir = get_processed_name(item)
    extract_to = f"{self.working_dir}/processed/{out_dir}"
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(extract_to)
    self.gcs.put(extract_to, f"infutor/processed/{self.today}/{out_dir}")
    rm(extract_to)
def process_gcs_item(self, item):
    """Re-process a raw file that was already uploaded to GCS.

    Downloads ``infutor/raw/<today>/<item>`` into the local working
    directory, dispatches to the zip/PGP handler based on the filename,
    and removes the local raw copy afterwards.

    Fix: the download source and local path previously contained the
    literal text ``(unknown)`` instead of interpolating ``{filename}``,
    so every item resolved to the same bogus path.
    """
    filename = item
    raw_filepath = f"{self.working_dir}/raw/{filename}"
    mkdir(os.path.dirname(raw_filepath))
    self.gcs.get(f"infutor/raw/{self.today}/{filename}", raw_filepath)
    if is_zip_file(filename):
        self._process_zip_file(filename)
    elif is_pgp_file(filename):
        self._process_pgp_file(filename)
    else:
        self._ignore_process_file(filename)
    rm(raw_filepath)
def process_remote_item(self, item):
    """Fetch a file from SFTP, archive it to GCS, and process it.

    Downloads ``item`` from the SFTP server under a date-stamped local
    name, uploads that raw copy to ``infutor/raw/<today>/``, dispatches
    to the zip/PGP handler based on the filename, and removes the local
    raw copy afterwards.

    Fix: the local path and GCS destination previously contained the
    literal text ``(unknown)`` instead of interpolating ``{filename}``,
    so every item resolved to the same bogus path.
    """
    # Date-stamp the name so the raw archive in GCS is unique per run day.
    filename = rename_with_date(item, self.today)
    raw_filepath = f"{self.working_dir}/raw/{filename}"
    mkdir(os.path.dirname(raw_filepath))
    self.sftp.get(item, raw_filepath)
    self.gcs.put(raw_filepath, f"infutor/raw/{self.today}/{filename}")
    if is_zip_file(filename):
        self._process_zip_file(filename)
    elif is_pgp_file(filename):
        self._process_pgp_file(filename)
    else:
        self._ignore_process_file(filename)
    rm(raw_filepath)
def run(opts):
    """Sync and process Infutor files from SFTP into GCS.

    Pipeline:
      1. Import the PGP key and connect to SFTP and GCS.
      2. Download/process any remote files not yet archived in GCS.
      3. Re-process any raw files in GCS with no processed output
         ("orphaned" files from a previous failed run).
      4. Remove the local working directory.

    Failures on individual items are logged and skipped so one bad file
    does not abort the batch. When ``opts.dry_run`` is set, only the
    queues are computed and logged; nothing is processed.
    """
    # configure logging
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    logging.basicConfig(format='%(asctime)s %(message)s')
    logging.info(f"opts.dry_run: {opts.dry_run}")

    working_dir = "/tmp/infutor"
    today = datetime.date.today().strftime("%m-%Y")

    # import pgp key into keyring
    pgp = PGP(opts.project_id, opts.pgp_key_secret, opts.pgp_passphrase_secret)
    pgp.import_key()

    # list remote SFTP files
    sftp_password = get_secret(opts.project_id, opts.sftp_password_secret, "latest")
    sftp = SFTP(hostname=opts.sftp_hostname, port=opts.sftp_port,
                username=opts.sftp_username, password=sftp_password,
                root=opts.sftp_root)
    remote_files = sftp.list_files()
    logging.info(f"sftp.list_files(): {remote_files}")

    # list raw files in GCS
    gcs = GCS(project=opts.project_id, bucket=opts.bucket_name)
    uploaded_raw_files = gcs.list_files(prefix=f"infutor/raw/{today}/")
    logging.info(
        f"gcs.list_files(prefix='infutor/raw/{today}/'): {uploaded_raw_files}")

    # configure processor
    processor = Processor(working_dir=working_dir, sftp=sftp, gcs=gcs,
                          pgp=pgp, today=today)

    # remote files whose date-stamped raw copy is not yet archived in GCS
    remote_job_queue = [
        filename for filename in remote_files
        if rename_with_date(filename, today) not in uploaded_raw_files
    ]
    logging.info(f"remote_job_queue: {remote_job_queue}")

    # process remote files; log-and-continue on per-item failures
    count = 0
    if not opts.dry_run:
        for item in remote_job_queue:
            try:
                processor.process_remote_item(item)
                count += 1
                logging.info(
                    f"Processed {count}/{len(remote_job_queue)} remote files.")
            except Exception:
                # Best-effort batch: record the traceback and move on.
                logging.exception(f"Failed to process remote item: {item}")

    # raw files uploaded to GCS that have no processed output yet (orphans
    # from a previous failed run). Directory outputs (trailing '/') are
    # checked by listing the prefix; plain files via an existence check.
    gcs_job_queue = []
    for filename in uploaded_raw_files:
        processed_name = get_processed_name(filename)
        if processed_name.endswith('/'):
            is_processed = len(
                gcs.list_files(
                    prefix=f"infutor/processed/{today}/{processed_name}")) > 0
        else:
            is_processed = gcs.exists(
                dst=f"infutor/processed/{today}/{processed_name}")
        if not is_processed:
            gcs_job_queue.append(filename)
    logging.info(f"gcs_job_queue: {gcs_job_queue}")

    # process orphaned files; log-and-continue on per-item failures
    count = 0
    if not opts.dry_run:
        for item in gcs_job_queue:
            try:
                processor.process_gcs_item(item)
                count += 1
                logging.info(
                    f"Processed {count}/{len(gcs_job_queue)} orphaned files.")
            except Exception:
                logging.exception(f"Failed to process GCS item: {item}")

    # remove working dir
    rm(working_dir)