def report_clean(rows, datastudio=False, nulls=False):
  """ Helper to fix DBM report issues for BigQuery and ensure schema compliance.

  Memory efficiently cleans each row by fixing:
  * Strips header and footer to preserve only data rows.
  * Changes 'Date' to 'Report_Day' to avoid using reserved name in BigQuery.
  * Changes date format to match Data Studio if datastudio=True.
  * Changes cell string Unknown to blank ( None ) if nulls=True.

  Usage example:

  ```
  filename, report = report_file(...)
  rows = report_to_rows(report)
  rows = report_clean(rows, project.task.get('datastudio', False))
  ```

  Args:
    * rows: (iterator) Rows to clean.
    * datastudio: (boolean) If True, sanitize headers and strip '/' from dates.
    * nulls: (boolean) If True, replace 'Unknown' cells with empty string.

  Returns:
    * Iterator of cleaned rows.
  """

  if project.verbose:
    print('DBM Report Clean')

  first = True
  date = None

  for row in rows:

    # stop if no data returned
    if row == ['No data returned by the reporting service.']:
      break

    # stop at blank row ( including sum row )
    if not row or row[0] is None or row[0] == '':
      break

    # find 'Date' column if it exists
    if first:
      try:
        date = row.index('Date')
      except ValueError:
        pass
      if datastudio:
        row = [column_header_sanitize(cell) for cell in row]

    # check if data studio formatting is applied
    if datastudio and date is not None:
      row[date] = 'Report_Day' if first else row[date].replace('/', '')

    # remove unknown columns ( which throw off schema on import types )
    if nulls:
      row = ['' if cell.strip() == 'Unknown' else cell for cell in row]

    # return the row
    yield row

    # not first row anymore
    first = False
def report_clean(rows):
  """ Helper to fix DCM report issues for BigQuery and ensure schema compliance.

  Memory efficiently cleans each row by fixing:
  * Strips header and footer to preserve only data rows.
  * Changes 'Date' to 'Report_Day' to avoid using reserved name in BigQuery.
  * Sanitizes all header names for BigQuery compatibility.
  * Replaces '(not set)' and '-' cell values with blank strings.

  Usage example:

  ```
  filename, report = report_file(...)
  rows = report_to_rows(report)
  rows = report_clean(rows)
  ```

  Args:
    * rows: (iterator) Rows to clean.

  Returns:
    * Iterator of cleaned rows.
  """

  if project.verbose:
    print('DCM REPORT CLEAN')

  first = True

  # find start of report: data rows begin after the 'Report Fields' marker
  for row in rows:
    if row and row[0] == 'Report Fields':
      break

  # process the report
  for row in rows:

    # quit if empty report
    if 'No data returned by the reporting service.' in row:
      break

    # stop parsing if end of data
    if not row or row[0] == 'Grand Total:':
      break

    # find 'Date' column if it exists and sanitize all headers
    if first:
      try:
        date_column = row.index('Date')
        row[date_column] = 'Report_Day'
      except ValueError:
        pass
      row = [column_header_sanitize(cell) for cell in row]

    # remove not set columns ( which throw off schema on import types )
    row = ['' if cell.strip() in ('(not set)', '-') else cell for cell in row]

    # return the row
    yield row

    # not first row anymore
    first = False
def dt_schema(header):
  """ Build a BigQuery schema from a DT file header row.

  Each column name is sanitized, then matched against DT_Field_Lookup to
  determine its BigQuery type, defaulting to STRING when unknown.

  Args:
    * header: (list) Column names from the first row of a DT file.

  Returns:
    * List of BigQuery field definitions ( name / type / mode ).
  """

  # sanitize once so both name and type lookup use the same string
  sanitized_names = [column_header_sanitize(raw) for raw in header]

  return [
    {
      'name': clean,
      'type': DT_Field_Lookup.get(clean, 'STRING'),
      'mode': 'NULLABLE'
    }
    for clean in sanitized_names
  ]
def email_read():
  """ Pull the most recent matching email and route each attachment to storage.

  Reads matching messages via get_email_messages using the project task's
  'read' parameters ( from / to / subject / link / attachment ). Only the
  first ( most recent ) message is processed; if none match, logs and exits.

  For each attached file:
  * Decompresses .gz payloads in memory.
  * Writes each sheet of an .xlsx file as a separate output.
  * Writes a .csv file directly.
  * Logs and skips any other file type.

  Side effects: writes rows via put_rows and may terminate the process with
  exit() when no email is found. Returns nothing.
  """

  # process only most recent message
  email = get_email_messages(
    project.task['auth'],
    project.task['read']['from'],
    project.task['read']['to'],
    project.task['read'].get('subject', None),
    project.task['read'].get('link', None),
    project.task['read'].get('attachment', None),
    download=True
  )

  # only take the most recent email; StopIteration means nothing matched
  # ( a bare except here would also swallow real errors - let those propagate )
  try:
    email = next(email)
  except StopIteration:
    traceback.print_exc()
    if project.verbose:
      print('NO EMAILS FOUND')
    exit()

  if project.verbose:
    print('EMAIL:', email['subject'])

  # loop all attached files
  for filename, data in email_files(email):

    if project.verbose:
      print('EMAIL FILENAME:', filename)

    # decompress if necessary
    if filename.endswith('.gz'):
      data = gzip.GzipFile(fileobj=data, mode='rb')
      filename = filename[:-3]

    # if excel file, save each sheet individually
    if filename.endswith('.xlsx'):
      for sheet, rows in excel_to_rows(data):
        rows = rows_trim(rows)
        rows = rows_header_sanitize(rows)
        if project.verbose:
          print('EMAIL WRITE', filename)
        put_rows(project.task['auth'], project.task['read']['out'], rows, column_header_sanitize(sheet))

    # if csv, save directly
    elif filename.endswith('.csv'):
      rows = csv_to_rows(data.read().decode())
      rows = rows_header_sanitize(rows)
      if project.verbose:
        print('EMAIL WRITE', filename)
      put_rows(project.task['auth'], project.task['read']['out'], rows)

    else:
      if project.verbose:
        print('UNSUPPORTED FILE:', filename)
def report_schema(headers):
  """ Helper to determine the schema of a given set of report headers.

  Using a match table generated from the DCM proto, each report header is
  matched to its type and a schema is assembled. If not found defaults to
  STRING.

  Usage example:

  ```
  filename, report = report_file(...)
  rows = report_to_rows(report)
  rows = report_clean(rows)
  schema = report_schema(next(rows))
  ```

  Args:
    * headers: (list) First row of a report.

  Returns:
    * JSON schema definition.
  """

  def _field_type(sanitized):
    # first try exact match against the DCM proto lookup table
    exact = DCM_Field_Lookup.get(sanitized)
    if exact is not None:
      return exact

    # second try to match end for custom field names ( activity reports )
    for known_name, known_type in DCM_Field_Lookup.items():
      if sanitized.endswith('_' + known_name):
        return known_type

    # finally default it to STRING
    return 'STRING'

  fields = []
  for raw_header in headers:
    sanitized = column_header_sanitize(raw_header)
    fields.append({
      'name': sanitized,
      'type': _field_type(sanitized),
      'mode': 'NULLABLE'
    })

  return fields
def get_schema(sheet):
  """ Translate SmartSheet column definitions into a BigQuery schema.

  Each column title is sanitized for BigQuery and its SmartSheet type is
  mapped through SMARTSHEET_TYPES, defaulting to STRING when unmapped.

  Args:
    * sheet: SmartSheet sheet object exposing a columns iterable.

  Returns:
    * List of BigQuery field definitions ( name / type / mode ).
  """

  fields = []
  for column in sheet.columns:
    fields.append({
      "name": column_header_sanitize(column.title),
      "type": SMARTSHEET_TYPES.get(str(column.type), "STRING"),
      "mode": "NULLABLE"
    })
  return fields
def get_schema(sheet_or_report):
  """ Translate SmartSheet sheet or report columns into a BigQuery schema.

  Each column title is sanitized for BigQuery and its SmartSheet type is
  mapped through SMARTSHEET_TYPES, defaulting to STRING when unmapped.

  Args:
    * sheet_or_report: SmartSheet sheet or report object with a columns iterable.

  Returns:
    * List of BigQuery field definitions ( name / type / mode ).
  """

  fields = []
  for col in sheet_or_report.columns:
    fields.append({
      'name': column_header_sanitize(col.title),
      'type': SMARTSHEET_TYPES.get(str(col.type), 'STRING'),
      'mode': 'NULLABLE'
    })
  return fields
def report_clean(rows, datastudio=False, nulls=False):
  """ Helper to fix DBM report issues for BigQuery and ensure schema compliance.

  Memory efficiently cleans each row by fixing:
  * Strips header and footer to preserve only data rows.
  * Changes 'Date' to 'Report_Day' to avoid using reserved name in BigQuery.
  * Changes date values to use '-' instead of '/' for BigQuery compatibility.
  * Changes cell string Unknown to blank ( None ) if nulls=True.

  Usage example:

  ```
  filename, report = report_file(...)
  rows = report_to_rows(report)
  rows = report_clean(rows, project.task.get('datastudio', False))
  ```

  Args:
    * rows: (iterator) Rows to clean.
    * datastudio: (boolean) If True, sanitize headers and reformat date cells.
    * nulls: (boolean) If True, replace 'Unknown' cells with empty string.

  Returns:
    * Iterator of cleaned rows.
  """

  if project.verbose:
    print('DBM Report Clean')

  first = True

  for row in rows:

    # stop if no data returned
    if row == ['No data returned by the reporting service.']:
      break

    # stop at blank row ( including sum row )
    if not row or row[0] is None or row[0] == '':
      break

    # sanitize header row
    if first:
      try:
        date_column = row.index('Date')
        row[date_column] = 'Report_Day'
      except ValueError:
        pass
      if datastudio:
        row = [column_header_sanitize(cell) for cell in row]

    # for all data rows clean up cells
    else:
      # check if data studio formatting is applied reformat the dates
      if datastudio:
        row = [
          cell.replace('/', '-')
          if isinstance(cell, str)
             and len(cell) == 4 + 1 + 2 + 1 + 2
             and cell[4] == '/'
             and cell[7] == '/'
          else cell
          for cell in row
        ]  # 5x faster than regexp

    # remove unknown columns ( which throw off schema on import types )
    if nulls:
      row = ['' if cell.strip() == 'Unknown' else cell for cell in row]

    # return the row
    yield row

    # not first row anymore
    first = False