def _refresh_credentials(self, project_credentials: Dict[str, str],
                         user_token: Dict[str, str]) -> google.oauth2.credentials.Credentials:
  # Remove top-level element
  secrets = project_credentials['web'] \
      if 'web' in project_credentials else project_credentials['installed']

  # Init credentials
  creds = google.oauth2.credentials.Credentials(
      None,
      refresh_token=user_token['refresh_token'],
      token_uri="https://accounts.google.com/o/oauth2/token",
      client_id=secrets['client_id'],
      client_secret=secrets['client_secret'])

  # Force refresh of the access token
  creds.refresh(google.auth.transport.requests.Request())

  refresh_token_details = {
      'access_token': creds.token,
      'refresh_token': creds.refresh_token
  }
  Cloud_Storage.write_file(
      bucket=self.bucket,
      file=self.client_token,
      data=json.dumps(refresh_token_details).encode('utf-8'))

  return creds
def _read_json(self, config: ManagerConfiguration) -> List[Dict[str, Any]]:
  """Read the contents of a file as a json object.

  Args:
      config (ManagerConfiguration): the manager configuration

  Returns:
      List[Dict[str, Any]]: the file contents as json
  """
  objects = []

  if config.type == ManagerType.BIG_QUERY:
    query = ManagerInput(config)
    job = query.execute()
    objects = [dict(row) for row in job]

  else:
    if config.file:
      if config.gcs_stored:
        content = \
            Cloud_Storage(project=config.project,
                          email=config.email).fetch_file(bucket=self.bucket,
                                                         file=config.file)
        objects = json.loads(content)

      else:
        with open(config.file) as rpt:
          objects = json.loads(''.join(rpt.readlines()))

    else:
      objects = self.firestore.list_documents(self.report_type)

  return objects
def token_details(self) -> Dict[str, Any]:
  """The user's refresh and access token."""
  # TODO: Remove the GCS check when fully migrated to Firestore.
  return self.datastore.get_document(type=Type._ADMIN, id='auth',
                                     key=self.encode_key(self._email)) or \
      json.loads(Cloud_Storage.fetch_file(bucket=self.bucket,
                                          file=self.client_token))
def storage(self) -> Cloud_Storage:
  """Fetch the GCS storage client on demand.

  Returns:
      Cloud_Storage: storage client
  """
  return Cloud_Storage()
def __init__(self, email: str, project: str, adh_customer: str,
             adh_query: str, api_key: str, days: int,
             dest_project: str = None, dest_dataset: str = None):
  """Constructor

  Sets up the ADH helper.

  Arguments:
      email {str} -- authenticated user email (for the token)
      project {str} -- GCP project
      adh_customer {str} -- ADH customer id, 9-digit number, NO DASHES
      adh_query {str} -- ADH query id
      api_key {str} -- API key (has to be set up in APIs and Libraries in GCP)
      days {int} -- lookback window (default: 60)
      dest_project {str} -- target GCP project for results
      dest_dataset {str} -- target BQ dataset for results
  """
  self.email = email
  self.project = project
  self.adh_customer = adh_customer
  self.adh_query = adh_query
  self.api_key = api_key
  self.days = days
  self.dest_project = dest_project
  self.dest_dataset = dest_dataset

  self.credentials = Credentials(email=email, project=project)
  self.storage = Cloud_Storage(email=email, project=project)
  self.firestore = Firestore(email=email, project=project)
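# A minimal construction sketch for the ADH helper above. The class name `ADH`
# and every argument value here are illustrative assumptions only, not values
# taken from this codebase or any real project.
#
#   adh = ADH(email='analyst@example.com',
#             project='my-gcp-project',
#             adh_customer='123456789',
#             adh_query='1234567890',
#             api_key='AIza-example-key',
#             days=60,
#             dest_project='my-gcp-project',
#             dest_dataset='adh_results')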
def project_credentials(self) -> Dict[str, Any]:
  """The project credentials.

  TODO: Remove the GCS check when fully migrated to Firestore.
  """
  return self.datastore.get_document(type=Type._ADMIN, id='auth',
                                     key='client_secret') or \
      json.loads(Cloud_Storage.fetch_file(bucket=self.bucket,
                                          file='client_secrets.json'))
def upload_report(self, bucket: str, report_details: Dict[str, Any],
                  input_buffer: BytesIO = None):
  output_buffer = StringIO()

  try:
    if not input_buffer:
      input_buffer = BytesIO()
      request = requests.Download(report_details['url'], stream=input_buffer)
      request.consume(transport=self.transport)
      logging.info('Report data size: %s bytes',
                   input_buffer.getbuffer().nbytes)

    input_buffer.seek(0)
    soup = self._soupify(input_buffer)

    headers = soup.find('thead').find_all('th')
    fieldnames = []
    for header in headers:
      fieldnames.append(CSVHelpers.sanitize_string(header.string))

    rows = soup.find('tbody').find_all('tr')
    report_data = []
    for row in rows:
      data = []
      for col in row.contents:
        data.append(col.string)
      report_data.append(dict(zip(fieldnames, data)))

    writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
    writer.writeheader()

    for row in report_data:
      writer.writerow(row)

    output_buffer.seek(0)
    Cloud_Storage.write_file(bucket=bucket,
                             file=f"{report_details['id']}.csv",
                             data=output_buffer.getvalue())
    report_details['schema'] = CSVHelpers.create_table_schema(fieldnames)

  except Exception as e:
    logging.error(e)
def __init__(self, in_cloud: bool = True, email: str = None,
             project: str = None):
  """Initialize Credential Class."""
  self.project = project
  self.email = email

  self.bucket = f'{project}-report2bq-tokens'
  self.client_token = f'{email}_user_token.json'
  self.project_credentials = json.loads(
      Cloud_Storage.fetch_file(bucket=self.bucket,
                               file='client_secrets.json'))
  self.token_details = json.loads(
      Cloud_Storage.fetch_file(bucket=self.bucket,
                               file=self.client_token))
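# Worked illustration of the naming convention derived above, assuming the
# hypothetical values project='my-project' and email='analyst@example.com':
#
#   self.bucket        -> 'my-project-report2bq-tokens'
#   self.client_token  -> 'analyst@example.com_user_token.json'
#
# i.e. the user token is read from
#   gs://my-project-report2bq-tokens/analyst@example.com_user_token.json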
def validate(self, config: ManagerConfiguration, **unused) -> None:
  sa360_report_definitions = \
      self.firestore.get_document(self.report_type, '_reports')
  validation_results = []

  sa360_objects = self._read_json(config)

  for sa360_object in sa360_objects:
    if sa360_object == '_reports':
      continue

    creds = Credentials(project=config.project,
                        email=sa360_object['email'])
    sa360_service = \
        discovery.get_service(service=Service.SA360, credentials=creds)

    (valid, validation) = \
        self._report_validation(sa360_report_definitions,
                                sa360_object, sa360_service)
    validation_results.append(validation)

  if validation_results:
    if config.type == ManagerType.BIG_QUERY:
      results = [json.loads(r.to_json()) for r in validation_results]

      # write to BQ
      client = bigquery.Client(project=config.project)
      table = client.dataset(config.dataset).table('sa360_validation')
      job_config = bigquery.LoadJobConfig(
          write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
          source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON)

      client.load_table_from_json(results, table, job_config=job_config)

    else:
      csv_output = f'{config.email}-<now>-validation.csv'
      if config.gcs_stored:
        csv_bytes = io.StringIO()
        writer = csv.DictWriter(csv_bytes,
                                fieldnames=Validation.keys(),
                                quoting=csv.QUOTE_ALL)
        writer.writeheader()
        writer.writerows([r.to_dict() for r in validation_results])

        Cloud_Storage(project=config.project,
                      email=config.email).write_file(
                          bucket=self.bucket,
                          file=csv_output,
                          data=csv_bytes.getvalue())

      else:
        with open(csv_output, 'w') as csv_file:
          writer = csv.DictWriter(csv_file,
                                  fieldnames=Validation.keys(),
                                  quoting=csv.QUOTE_ALL)
          writer.writeheader()
          writer.writerows([r.to_dict() for r in validation_results])
def exec_module(self, module: types.ModuleType):
  try:
    # Fetch the code here as a string:
    # GCS? BQ? Firestore? All good options
    filename = module.__name__.split('.')[-1]
    code = Cloud_Storage.fetch_file(
        bucket=f'{os.environ.get("GCP_PROJECT")}-report2bq-postprocessor',
        file=f'{filename}.py')
    exec(code, vars(module))

  except Exception as e:
    raise ModuleNotFoundError(module.__name__) from e
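# A hedged sketch of how a loader exposing exec_module(), like the one above,
# can be wired into the import machinery. The finder class, loader class and
# the 'example_postprocessor' package prefix are assumptions for illustration;
# they are not names from this codebase.
import importlib.abc
import importlib.util
import sys


class _ExamplePostProcessorLoader(importlib.abc.Loader):  # hypothetical
  def exec_module(self, module):
    # The real loader above fetches the module source from GCS; this stub
    # just executes an empty body into the module namespace.
    exec('', vars(module))


class _ExamplePostProcessorFinder(importlib.abc.MetaPathFinder):  # hypothetical
  def find_spec(self, fullname, path, target=None):
    if fullname == 'example_postprocessor' or \
        fullname.startswith('example_postprocessor.'):
      return importlib.util.spec_from_loader(
          fullname, _ExamplePostProcessorLoader(),
          is_package=(fullname == 'example_postprocessor'))
    return None


# Registering the finder routes `import example_postprocessor.my_module`
# through the custom loader.
sys.meta_path.append(_ExamplePostProcessorFinder())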
def store_credentials(self, creds: credentials.Credentials) -> None:
  """Stores the credentials.

  This function uses the datastore to store the user credentials for later.

  Args:
      creds (credentials.Credentials): the user credentials.
  """
  # TODO: Remove the GCS write when fully migrated to Firestore.
  if self._email:
    key = self.encode_key(self._email)
    refresh_token_details = {
        'access_token': creds.token,
        'refresh_token': creds.refresh_token,
        '_key': key
    }
    self.datastore.update_document(type=Type._ADMIN, id='auth',
                                   new_data={key: json.loads(creds.to_json())})
    Cloud_Storage.write_file(
        bucket=self.bucket,
        file=self.client_token,
        data=json.dumps(refresh_token_details).encode('utf-8'))
def report_manager(event: Dict[str, Any], context=None) -> None:
  """Processes files added to the *_report_manager bucket.

  Arguments:
      event (Dict[str, Any]): data sent from the PubSub message
      context (Dict[str, Any]): context data. unused
  """
  logging.info(event)
  project = os.environ.get('GCP_PROJECT')

  bucket_name = event['bucket']
  file_name = event['name']
  *n, e = file_name.split('/')[-1].split('.')
  (name, extension) = ('.'.join(n).lower(), e.lower())

  if f := {
      f'{project}-report2bq-ga360-manager': GA360ReportManager,
      f'{project}-report2bq-sa360-manager': SA360Manager,
  }.get(bucket_name):
    logging.info('Processing file %s', file_name)
    try:
      args = {
          'report': name,
          'project': project,
          'file': file_name,
          'gcs_stored': True,
          'action': extension,
      }
      f().manage(**args)
      Cloud_Storage.rename(bucket=bucket_name,
                           source=file_name,
                           destination=f'{file_name}.processed')

    except NotImplementedError:
      logging.error(
          'Extension command %s is not a valid action. Ignoring.', extension)
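# A minimal local-invocation sketch for report_manager() above. A real event is
# delivered by the Cloud Storage "object finalize" trigger and carries at least
# the 'bucket' and 'name' keys; the bucket, file name and action extension
# below are hypothetical.
if __name__ == '__main__':
  _sample_event = {
      'bucket': 'my-project-report2bq-sa360-manager',
      'name': 'reports/example_report.validate',
  }
  report_manager(_sample_event, context=None)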
def oauth_complete(self, request: Request):
  logging.info(request.args)

  state = request.args.get('state', type=str)
  firestore = Firestore()
  email, project = firestore.get_oauth_state(state)

  project_credentials = json.loads(
      Files.fetch_file('{project}-report2bq-tokens'.format(project=project),
                       'client_secrets.json'))

  _flow = flow.Flow.from_client_config(client_config=project_credentials,
                                       scopes=self.SCOPES)
  _flow.redirect_uri = f"https://{os.environ.get('FUNCTION_REGION')}-{os.environ.get('GCP_PROJECT')}.cloudfunctions.net/OAuthComplete"

  r = urlparse(request.url)
  auth_response = urlunparse(
      ['https', r.netloc, r.path, r.params, r.query, r.fragment])
  _flow.fetch_token(authorization_response=auth_response)

  logging.info(_flow.credentials)

  token_details = {
      'access_token': _flow.credentials.token,
      'refresh_token': _flow.credentials.refresh_token
  }

  Cloud_Storage.write_file(
      '{project}-report2bq-tokens'.format(project=project),
      '{email}_user_token.json'.format(email=email),
      json.dumps(token_details).encode('utf-8'))

  firestore.delete_oauth_state(state=state)

  return 'Ok'
def stream_to_gcs(self, report_details: Dict[str, Any],
                  run_config: Dict[str, Any]) -> None:
  """Multi-threaded stream to GCS

  Arguments:
      report_details (dict): Report definition
      run_config (dict): Run configuration, including the report id
  """
  queue = Queue()

  report_id = run_config['report_id']

  # chunk_multiplier is set in the environment, but defaults to 64 - this
  # leads to a 64M chunk size we can throw around. Given the memory
  # constraints of a cloud function this seems like a good, safe number.
  chunk_size = self.chunk_multiplier * 1024 * 1024
  out_file = BytesIO()

  streamer = \
      ThreadedGCSObjectStreamUpload(
          client=Cloud_Storage.client(),
          creds=credentials.Credentials(
              email=self.email, project=self.project).credentials,
          bucket_name=self.bucket,
          blob_name=f'{report_id}.csv',
          chunk_size=chunk_size,
          streamer_queue=queue)
  streamer.start()

  r = urllib.request.Request(report_details['files'][0]['url'])
  for header in self.creds.auth_headers:
    r.add_header(header, self.creds.auth_headers[header])

  with closing(urlopen(r)) as _report:
    _downloaded = 0
    chunk_id = 1
    _report_size = int(_report.headers['content-length'])
    while _downloaded < _report_size:
      chunk = _report.read(chunk_size)
      _downloaded += len(chunk)
      queue.put(chunk)
      chunk_id += 1

  queue.join()
  streamer.stop()
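# A sketch of how chunk_multiplier (used above) might be initialised; the
# CHUNK_MULTIPLIER environment variable name is an assumption here, not
# confirmed by this excerpt.
import os

chunk_multiplier = int(os.environ.get('CHUNK_MULTIPLIER', 64))  # 64 -> 64MB chunks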
def _read_email(self, file: str, gcs_stored: bool) -> str:
  """Read an email address from a file.

  Args:
      file (str): the file to process.
      gcs_stored (bool): is the file local or GCS?

  Returns:
      str: the email address
  """
  if gcs_stored:
    email = str(Cloud_Storage().fetch_file(bucket=self.bucket, file=file),
                encoding='utf-8').strip()
  else:
    with open(file, 'r') as _command_file:
      email = _command_file.readline().strip()

  return email
def _stream_report_to_gcs(self, report_details: Dict[str, Any],
                          run_config: Dict[str, Any]) -> None:
  """Multi-threaded stream to GCS

  Arguments:
      report_details {dict} -- Report definition
      run_config {dict} -- Run configuration, including the report id
  """
  queue = Queue()
  report_id = run_config['report_id']
  chunk_size = self.chunk_multiplier * 1024 * 1024
  out_file = BytesIO()

  streamer = ThreadedGCSObjectStreamUpload(client=Cloud_Storage.client(),
                                           bucket_name=self.bucket,
                                           blob_name=f'{report_id}.csv',
                                           chunk_size=chunk_size,
                                           queue=queue)
  streamer.start()

  r = urllib.request.Request(report_details['files'][0]['url'])
  for header in self.creds.get_auth_headers():
    r.add_header(header, self.creds.get_auth_headers()[header])

  with closing(urlopen(r)) as _report:
    _downloaded = 0
    chunk_id = 1
    _report_size = int(_report.headers['content-length'])
    while _downloaded < _report_size:
      chunk = _report.read(chunk_size)
      _downloaded += len(chunk)
      queue.put((chunk_id, chunk))
      chunk_id += 1

  queue.join()
  streamer.stop()
def stream_to_gcs(self, bucket: str, report_data: ReportConfig):
  """Streams the report CSV to Cloud Storage.

  Arguments:
      bucket (str): GCS Bucket
      report_data (ReportConfig): Report definition
  """
  if not report_data.report_file:
    return

  queue = Queue()

  report_id = report_data.id
  file_id = report_data.report_file.id

  chunk_size = self.chunk_multiplier * 1024 * 1024
  out_file = io.BytesIO()

  download_request = self.service.files().get_media(reportId=report_id,
                                                    fileId=file_id)
  downloader = http.MediaIoBaseDownload(out_file, download_request,
                                        chunksize=chunk_size)

  # Execute the get request and download the file.
  streamer = ThreadedGCSObjectStreamUpload(
      creds=credentials.Credentials(email=self.email,
                                    project=self.project).credentials,
      client=Cloud_Storage.client(),
      bucket_name=bucket,
      blob_name='{id}.csv'.format(id=report_id),
      chunk_size=chunk_size,
      streamer_queue=queue)
  streamer.start()

  download_finished = False
  first = True
  while download_finished is False:
    status, download_finished = downloader.next_chunk()

    # Last chunk, drop the "Grand Total"
    if download_finished:
      total_pos = out_file.getvalue().rfind(b'Grand Total')
      if total_pos != -1:
        out_file.truncate(total_pos)

    # First chunk, skip the pre-header
    if first:
      csv_start = self._find_first_data_byte(out_file.getvalue())
      out_file.seek(0 if csv_start == -1 else csv_start)
      first = False
    else:
      out_file.seek(0)

    logging.info(
        'Downloader status %s, %s of %s',
        f'{(status.resumable_progress/status.total_size):3.2%}',
        f'{status.resumable_progress:,}',
        f'{status.total_size:,}')

    chunk = out_file.read(chunk_size)
    queue.put(chunk)

    out_file.seek(0)
    out_file.truncate(0)

  queue.join()
  streamer.stop()
def stream_to_gcs(self, bucket: str, report_details: Dict[str, Any]) -> None:
  """Multi-threaded stream to GCS

  Arguments:
      bucket {str} -- GCS Bucket
      report_details {dict} -- Report definition
  """
  if 'current_path' not in report_details:
    return

  queue = Queue()

  report_id = report_details['id']
  chunk_size = self.chunk_multiplier * 1024 * 1024
  out_file = io.BytesIO()

  streamer = ThreadedGCSObjectStreamUpload(
      client=Cloud_Storage.client(),
      bucket_name=bucket,
      blob_name='{id}.csv'.format(id=report_id),
      chunk_size=chunk_size,
      queue=queue)
  streamer.start()

  with closing(urlopen(report_details['current_path'])) as _report:
    _downloaded = 0
    chunk_id = 1
    _report_size = int(_report.headers['content-length'])
    while _downloaded < _report_size:
      chunk = _report.read(chunk_size)
      _downloaded += len(chunk)
      if _downloaded >= _report_size:
        # last chunk... trim to footer if there is one, or last blank line
        # if not.
        # NOTE: if no blank line (partial file?) NO TRIMMING WILL HAPPEN
        # THIS SHOULD NEVER BE THE CASE
        last = io.BytesIO(chunk)

        # find the footer
        blank_line_pos = chunk.rfind(b'\n\n')

        # if we don't find it, there's no footer.
        if blank_line_pos == -1:
          logging.error(
              'No footer delimiter found. Writing entire final chunk as is.')
          queue.put((chunk_id, chunk))

        else:
          # read the footer
          last.seek(blank_line_pos)
          footer = last.readlines()
          group_count = sum(g.startswith(b'Group By:') for g in footer)
          total_block_start = chunk.rfind(b'\n' + b',' * group_count)

          if total_block_start == -1:
            last.truncate(blank_line_pos)
          else:
            last.truncate(total_block_start)

          queue.put((chunk_id, last.getvalue()))
          # break

      else:
        queue.put((chunk_id, chunk))

      chunk_id += 1

  queue.join()
  streamer.stop()
def stream_to_gcs(self, bucket: str, report_details: ReportConfig) -> None:
  """Streams the report CSV to Cloud Storage.

  Arguments:
      bucket (str): GCS Bucket
      report_details (ReportConfig): Report definition
  """
  if not report_details.current_path:
    return

  queue = Queue()

  report_id = report_details.id
  chunk_size = self.chunk_multiplier * 1024 * 1024
  out_file = io.BytesIO()

  streamer = \
      ThreadedGCSObjectStreamUpload(
          client=Cloud_Storage.client(),
          creds=credentials.Credentials(
              email=self.email, project=self.project).credentials,
          bucket_name=bucket,
          blob_name=f'{report_id}.csv',
          chunk_size=chunk_size,
          streamer_queue=queue)
  streamer.start()

  with closing(urlopen(report_details.current_path)) as _report:
    _downloaded = 0
    chunk_id = 1
    _report_size = int(_report.headers['content-length'])
    logging.info('Report is %s bytes', f'{_report_size:,}')
    while _downloaded < _report_size:
      chunk = _report.read(chunk_size)
      _downloaded += len(chunk)
      if _downloaded >= _report_size:
        # last chunk... trim to footer if there is one, or last blank line
        # if not.
        # NOTE: if no blank line (partial file?) NO TRIMMING WILL HAPPEN
        # THIS SHOULD NEVER BE THE CASE
        last = io.BytesIO(chunk)

        # find the footer
        blank_line_pos = chunk.rfind(b'\n\n')

        # if we don't find it, there's no footer.
        if blank_line_pos == -1:
          logging.info(
              'No footer delimiter found. Writing entire final chunk as is.')
          queue.put(chunk)

        else:
          # read the footer
          last.seek(blank_line_pos)
          footer = last.readlines()
          group_count = sum(g.startswith(b'Group By:') for g in footer)
          total_block_start = chunk.rfind(b'\n' + b',' * group_count)

          if total_block_start == -1:
            last.truncate(blank_line_pos)
          else:
            last.truncate(total_block_start)

          queue.put(last.getvalue())
          # break

      else:
        queue.put(chunk)

      chunk_id += 1

  queue.join()
  streamer.stop()
def process(self, data: Dict[str, Any], context):
  """Check all the running jobs

  Arguments:
      data {Dict[str, Any]} -- data sent from the PubSub message
      context {Dict[str, Any]} -- context data. unused
  """
  firestore = Firestore(in_cloud=True, email=None, project=None)
  documents = firestore.get_all_jobs()

  for document in documents:
    for T in [t for t in Type if not t.name.startswith('_')]:
      config = firestore.get_report_config(T, document.id)

      if config:
        if config.get('dest_project'):
          # authenticate against supplied project with supplied key
          project = config.get('dest_project') or os.environ.get('GCP_PROJECT')
          client_key = json.loads(
              Cloud_Storage.fetch_file(
                  bucket=f"{os.environ.get('GCP_PROJECT') or 'galvanic-card-234919'}-report2bq-tokens",
                  file=f"{config['email']}_user_token.json"))
          server_key = json.loads(
              Cloud_Storage.fetch_file(
                  bucket=f"{os.environ.get('GCP_PROJECT') or 'galvanic-card-234919'}-report2bq-tokens",
                  file='client_secrets.json'))
          client_key['client_id'] = (
              server_key.get('web') or
              server_key.get('installed')).get('client_id')
          client_key['client_secret'] = (
              server_key.get('web') or
              server_key.get('installed')).get('client_secret')
          logging.info(client_key)
          creds = Credentials.from_authorized_user_info(client_key)
          bq = bigquery.Client(project=project, credentials=creds)

        else:
          bq = bigquery.Client()

        api_repr = document.get().to_dict()
        if api_repr:
          try:
            job = LoadJob.from_api_repr(api_repr, bq)
            job.reload()

            if job.state == 'DONE':
              if job.error_result:
                logging.error(job.errors)

              self._handle_finished(job=job, id=document.id, config=config,
                                    report_type=T)
              firestore.mark_import_job_complete(document.id, job)

          except Exception as e:
            logging.error(
                f'Error loading job {document.id} for monitoring.')

        break
def stream_to_gcs(self, bucket: str, report_data: dict):
  """Multi-threaded stream to GCS

  Arguments:
      bucket {str} -- GCS Bucket
      report_data {dict} -- Report definition
  """
  if 'report_file' not in report_data:
    return

  queue = Queue()

  report_id = report_data['id']
  file_id = report_data['report_file']['id']

  chunk_size = 16 * 1024 * 1024
  out_file = io.BytesIO()

  download_request = self.service().files().get_media(reportId=report_id,
                                                      fileId=file_id)
  downloader = http.MediaIoBaseDownload(out_file, download_request,
                                        chunksize=chunk_size)

  # Execute the get request and download the file.
  streamer = ThreadedGCSObjectStreamUpload(
      client=Cloud_Storage.client(),
      bucket_name=bucket,
      blob_name='{id}.csv'.format(id=report_id),
      chunk_size=chunk_size,
      queue=queue)
  streamer.start()

  download_finished = False
  chunk_id = 0
  while download_finished is False:
    status, download_finished = downloader.next_chunk()

    # Last chunk, drop the "Grand Total" row
    if download_finished:
      total_pos = out_file.getvalue().rfind(b'Grand Total')
      if total_pos != -1:
        out_file.truncate(total_pos)

    # First chunk, skip the pre-header rows
    if chunk_id == 0:
      csv_start = self.find_first_data_byte(out_file.getvalue())
      if csv_start == -1:
        out_file.seek(0)
      else:
        out_file.seek(csv_start)
    else:
      out_file.seek(0)

    logging.info(
        'Downloader status {percent:3.2%}, chunk {chunk} ({progress} of {size})'.format(
            percent=(status.resumable_progress / status.total_size),
            progress=status.resumable_progress,
            size=status.total_size,
            chunk=chunk_id))

    chunk = out_file.read(chunk_size)
    queue.put((chunk_id, chunk))
    chunk_id += 1

    out_file.seek(0)
    out_file.truncate(0)

  queue.join()
  streamer.stop()
def _stream_processor(self, bucket: str, report_details: Dict[str, Any],
                      repeatable: bool = False) -> BytesIO:
  repeater = BytesIO()
  report_url = report_details['url']
  remainder = b''
  queue = Queue()
  output_buffer = StringIO()
  html_chunk_size = 2048 * 1024
  chunk_size = 1024 * 1024
  streamer = ThreadedGCSObjectStreamUpload(
      client=Cloud_Storage.client(credentials=self.creds),
      bucket_name=bucket,
      blob_name='{id}.csv'.format(id=report_details['id']),
      chunk_size=chunk_size,
      queue=queue)
  streamer.daemon = True
  streamer.start()

  try:
    chunk_id = 0
    conn = self._get_connection(report_url)
    _stream = conn.iter_content(chunk_size=html_chunk_size)
    source_size = 0

    done = False
    fieldnames = None

    while not done:
      # logging.info(f'Processing chunk {chunk_id}')
      # logging.info(f'Processing chunk {chunk_id}, remainder {remainder.decode("utf-8")}')
      chunk = BytesIO()
      chunk.write(remainder)
      remainder = b''

      block, done = self._next_chunk(_stream, html_chunk_size)
      source_size += len(block)
      # logging.info(f'{len(block):,}, begins {block[0:80]} : ends {block[-80:].decode("utf-8")}')
      if repeatable:
        repeater.write(block)

      chunk.write(block)
      if len(chunk.getvalue()) < html_chunk_size and not done:
        continue

      # logging.info(f'Chunk size {len(chunk.getvalue()):,} bytes')
      chunk.seek(0)

      if chunk_id == 0:
        fieldnames, chunk = self._find_fieldnames(buffer=chunk)

      # find last </tr> on any section but the last, chop off the last
      # portion and store
      last_tr_pos = chunk.getvalue().rfind(b'</tr>')
      if last_tr_pos == -1:
        # logging.debug(f'HALP! {chunk.getvalue()}')
        remainder = chunk.getvalue()
        continue

      else:
        last_tr_pos += 5
        chunk.seek(last_tr_pos)
        remainder = chunk.read()
        # logging.debug(f'Remainder: {remainder}')
        chunk.truncate(last_tr_pos)

      rows = []
      while True:
        tr, chunk = self._extract_keys(chunk, 'tr')
        if chunk:
          rows.append([
              unescape(field) for field in
              re.findall(r'\<td[^>]*\>([^<]*)\<\/td\>', tr)
          ])
        else:
          break

      # queue for upload
      report_data = []
      for row in rows:
        report_data.append(dict(zip(fieldnames, row)))

      writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
      if chunk_id == 0:
        writer.writeheader()

      [writer.writerow(row) for row in report_data]

      output_buffer.seek(0)
      # logging.info(f'Sending chunk {chunk_id} size {len(output_buffer.getvalue())}')
      queue.put((chunk_id, output_buffer.getvalue().encode('utf-8')))
      chunk_id += 1
      chunk = BytesIO()
      output_buffer.seek(0)
      output_buffer.truncate(0)

    logging.info(f'SA360 report length: {source_size:,} bytes')
    queue.join()
    streamer.stop()
    report_details['schema'] = CSVHelpers.create_table_schema(fieldnames)

    return repeater

  except Exception as e:
    logging.error(e)
def stream_to_gcs(self, bucket: str, report_details: ReportConfig) \
    -> Tuple[List[str], List[str]]:
  """Streams the data to Google Cloud Storage.

  This is to allow us to process much larger files than can be easily
  handled in toto in memory. Now we're limited to length of execution (900s)
  rather than size of 'stuff' (<2Gb).

  The response from SA360 is a _nasty_ piece of Microsoft Office format XML
  which has to be parsed and converted to a digestible CSV.

  Raises:
      SA360Exception: A custom SA360 exception because there can be a server
        error returned when requesting, but the error is in text and the HTTP
        code returned is _always_ a 200. This is why the function is allowed
        to retry, as the error is usually transient and caused by a failure
        to connect to SA360's reporting back end.

  Returns:
      (fieldnames: List[str], fieldtypes: List[str]):
        the field names and types in the report.
  """
  report_url = report_details.url
  remainder = b''
  queue = Queue()
  output_buffer = StringIO()

  # size of pieces of xml we can safely download from the web report.
  html_chunk_size = 2048 * 1024
  chunk_size = self.chunk_multiplier * 1024 * 1024

  streamer = ThreadedGCSObjectStreamUpload(
      client=Cloud_Storage.client(credentials=self.creds),
      creds=credentials.Credentials(email=self.email,
                                    project=self.project).credentials,
      bucket_name=bucket,
      blob_name=f'{report_details.id}.csv',
      chunk_size=chunk_size,
      streamer_queue=queue)
  streamer.daemon = True
  streamer.start()

  chunk_id = 0
  conn = self.get_connection(report_url)
  _stream = conn.iter_content(chunk_size=html_chunk_size)
  source_size = 0

  first = True
  done = False
  fieldnames = None
  fieldtypes = None

  while not done:
    chunk = BytesIO()
    chunk.write(remainder)
    remainder = b''

    block, done = self.next_chunk(_stream, html_chunk_size)
    source_size += len(block)
    chunk.write(block)
    if len(chunk.getvalue()) < html_chunk_size and not done:
      continue

    chunk.seek(0)

    if first:
      fieldnames, chunk = self.find_fieldnames(buffer=chunk)
      if len(fieldnames) == 1 and fieldnames[0] == 'Error':
        error = \
            unescape(re.sub(r'<[^.]+>', '',
                            chunk.getvalue().decode('utf-8')))
        # logging.error('SA360 Error: %s', error)
        streamer.stop()
        raise SA360Exception(error)

    # find last </tr> on any section but the last, chop off the last
    # portion and store
    last_tr_pos = chunk.getvalue().rfind(b'</tr>')
    if last_tr_pos == -1:
      remainder = chunk.getvalue()
      continue

    else:
      last_tr_pos += 5
      chunk.seek(last_tr_pos)
      remainder = chunk.read()
      chunk.truncate(last_tr_pos)

    rows = []
    while True:
      tr, chunk = self.extract_keys(chunk, 'tr')
      if chunk:
        rows.append([
            unescape(field) for field in
            re.findall(r'\<td[^>]*\>([^<]*)\<\/td\>', tr)
        ])
      else:
        break

    # queue for upload
    report_data = []
    for row in rows:
      report_data.append(dict(zip(fieldnames, row)))

    writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
    if first:
      writer.writeheader()

    [writer.writerow(row) for row in report_data]

    output_buffer.seek(0)

    if first:
      _, fieldtypes = \
          csv_helpers.get_column_types(
              BytesIO(output_buffer.getvalue().encode('utf-8')))

    queue.put(output_buffer.getvalue().encode('utf-8'))
    chunk_id += 1
    first = False
    chunk = BytesIO()
    output_buffer.seek(0)
    output_buffer.truncate(0)

  logging.info(f'SA360 report length: {source_size:,} bytes')
  queue.join()
  streamer.stop()
  report_details.schema = \
      csv_helpers.create_table_schema(fieldnames, fieldtypes)

  return fieldnames, fieldtypes
def _import_report(self, bucket_name: str, file_name: str,
                   config: dict) -> bigquery.LoadJob:
  """Begin CSV import

  Create and start the Big Query import job.

  Arguments:
      bucket_name {str} -- GCS bucket name
      file_name {str} -- CSV file name
      config {Dict[str, Any]} -- report config

  Returns:
      bigquery.LoadJob
  """
  if config.get('dest_project'):
    # authenticate against supplied project with supplied key
    project = config.get('dest_project') or os.environ.get('GCP_PROJECT')
    client_key = json.loads(Cloud_Storage.fetch_file(
        bucket=f"{os.environ.get('GCP_PROJECT')}-report2bq-tokens",
        file=f"{config['email']}_user_token.json"
    ))
    server_key = json.loads(Cloud_Storage.fetch_file(
        bucket=f"{os.environ.get('GCP_PROJECT')}-report2bq-tokens",
        file='client_secrets.json'
    ))
    client_key['client_id'] = (server_key.get('web') or
                               server_key.get('installed')).get('client_id')
    client_key['client_secret'] = (server_key.get('web') or
                                   server_key.get('installed')).get('client_secret')
    logging.info(client_key)
    creds = Credentials.from_authorized_user_info(client_key)
    bq = bigquery.Client(project=project, credentials=creds)

  else:
    project = os.environ.get('GCP_PROJECT')
    bq = bigquery.Client()

  dataset = config.get('dest_dataset') or os.environ.get('BQ_DATASET') or 'report2bq'

  table_name = config.get('table_name', CSVHelpers.sanitize_string(file_name))
  logging.info(f'bucket {bucket_name}, table {table_name}, file_name {file_name}')

  json_schema = config['schema']
  schema = []
  _json_schema = []
  # Build the json format schema that the BQ LoadJob requires from the
  # text-based ones in the config
  for field in json_schema:
    f = bigquery.schema.SchemaField(name=field['name'],
                                    field_type=field['type'],
                                    mode=field['mode'])
    schema.append(f)
    _json_schema.append(f'{field["name"]}: {field["type"]}')

  table_ref = bq.dataset(dataset).table(table_name)

  # Default action is to completely replace the table each time. If requested,
  # however, then we can do an append for (say) huge jobs where you would see
  # the table with 60 days once and then append 'yesterday' each day.
  if config.get('append', False):
    if self._table_exists(bq, table_ref) and \
        not self._validate_schema(bq, table_ref, schema):
      config_schema = '\n'.join(
          [f'{field.name}, {field.field_type}' for field in schema])
      target_schema = '\n'.join(
          [f'{field.name}, {field.field_type}'
           for field in bq.get_table(table_ref).schema])
      self._email_error(
          email=config['email'],
          message=f'''
Mismatched schema for {project}.{dataset}.{table_name}, trying anyway

Report has schema:
{config_schema}

Table has schema:
{target_schema}
''')
      logging.error(
          f'Mismatched schema for {project}.{dataset}.{table_name}, '
          'trying anyway')

    import_type = bigquery.WriteDisposition.WRITE_APPEND

  else:
    import_type = bigquery.WriteDisposition.WRITE_TRUNCATE

  job_config = bigquery.LoadJobConfig()
  job_config.write_disposition = import_type
  # Assume a CSV header is the first line unless otherwise specified in the
  # report's own config
  job_config.skip_leading_rows = config.get('csv_header_length', 1)
  job_config.source_format = bigquery.SourceFormat.CSV
  job_config.schema = schema
  # Allow a few errors, just in case
  job_config.max_bad_records = 10
  # Allow for DV360/CM (SA360 won't) to pass jagged rows, which they do
  job_config.allow_jagged_rows = True

  uri = f'gs://{bucket_name}/{file_name}'
  load_job = bq.load_table_from_uri(uri, table_ref,
                                    job_config=job_config)  # API request
  logging.info(f'Starting CSV import job {load_job.job_id}')

  return load_job
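# An illustrative report config for _import_report() above, showing the keys
# the method reads. All values are hypothetical; 'schema' holds the text-based
# field definitions that are converted into bigquery SchemaField objects.
_example_config = {
    'email': 'analyst@example.com',
    'dest_project': 'my-gcp-project',
    'dest_dataset': 'report2bq',
    'table_name': 'my_report',
    'append': False,
    'csv_header_length': 1,
    'schema': [
        {'name': 'campaign', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'impressions', 'type': 'INTEGER', 'mode': 'NULLABLE'},
    ],
}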