def report_to_rows(report):
  """ Memory efficient helper to convert DCM report files into an iterator of rows.

  Usage example:

  ```
  filename, report = report_file(...)
  rows = report_to_rows(report)
  ```

  Args:
    * report: (iterator or file) Either an iterator or file that will be converted to rows.

  Returns:
    * Iterator of lists representing each row.
  """

  # if reading from stream
  if type(report) is GeneratorType:
    leftovers = ''
    for chunk in report:
      data, extra = chunk.rsplit('\n', 1)
      for row in csv_to_rows(leftovers + data):
        yield row
      leftovers = extra

  # if reading from buffer
  else:
    for row in csv_to_rows(report):
      yield row

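# A minimal sketch exercising report_to_rows above with an in-memory generator
# shaped like the chunked output of report_file; the chunk contents here are
# made up for illustration.
def example_report_to_rows():
  def fake_report():
    yield 'Impressions,Clicks\n100,3\n'
    yield '250,7\n'
  for row in report_to_rows(fake_report()):
    print(row)
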
def sdf_to_bigquery(config, auth, sdf_zip_file, project_id, dataset, time_partitioned_table, create_single_day_table, table_suffix=''):
  with zipfile.ZipFile(sdf_zip_file, 'r', zipfile.ZIP_DEFLATED) as d:
    file_names = d.namelist()
    for file_name in file_names:
      if config.verbose:
        print('SDF: Loading: ' + file_name)

      with d.open(file_name) as sdf_file:
        rows = csv_to_rows(sdf_file.read().decode('utf-8'))

        if not rows:
          if config.verbose:
            print('SDF: Empty file ' + file_name)
          continue

        table_name = file_name.split('.')[0].replace('-', '_') + table_suffix
        schema = sdf_schema(next(rows))

        # Check if each SDF should have a dated table
        if create_single_day_table:
          table_name_dated = table_name + date.today().strftime('%Y_%m_%d')

          # Create table and upload data
          table_create(config, auth, project_id, dataset, table_name_dated)
          rows_to_table(config, auth, project_id, dataset, table_name_dated, rows, schema=schema, skip_rows=1, disposition='WRITE_TRUNCATE')

        # Create end result table if it doesn't already exist
        if not table_exists(config, auth, project_id, dataset, table_name):
          table_create(config, auth, project_id, dataset, table_name, is_time_partition=time_partitioned_table)

        rows_to_table(config, auth, project_id, dataset, table_name, rows, schema=schema, skip_rows=1, disposition='WRITE_APPEND' if time_partitioned_table else 'WRITE_TRUNCATE')

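# A hedged usage sketch for sdf_to_bigquery above; the project id, dataset
# name, and suffix are hypothetical placeholders, and sdf_zip_file is assumed
# to be the zip archive produced by sdf_download elsewhere in this codebase.
def example_sdf_to_bigquery(config, sdf_zip_file):
  sdf_to_bigquery(
    config,
    'service',
    sdf_zip_file,
    project_id='my-project',        # placeholder
    dataset='sdf_dataset',          # placeholder
    time_partitioned_table=False,
    create_single_day_table=True,
    table_suffix='_example'         # placeholder
  )
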
def conversions_download():
  if project.verbose:
    print('CONVERSION DOWNLOAD')

  # pull from bigquery if specified
  if 'bigquery' in project.task:
    if project.verbose:
      print('READING BIGQUERY')
    rows = query_to_rows(
      project.task['auth'],
      project.id,
      project.task['bigquery']['dataset'],
      'SELECT * FROM %s' % project.task['bigquery']['table'],
      legacy=project.task['bigquery'].get('legacy', True))
    for row in rows:
      yield row

  # pull from sheets if specified
  if 'sheets' in project.task:
    if project.verbose:
      print('READING SHEET')
    rows = sheets_read(
      project.task['auth'],
      project.task['sheets']['url'],
      project.task['sheets']['tab'],
      project.task['sheets']['range'])
    for row in rows:
      yield row

  # pull from csv if specified
  if 'csv' in project.task:
    if project.verbose:
      print('READING CSV FILE')
    with io.open(project.task['csv']['file']) as f:
      for row in csv_to_rows(f):
        if row[0] not in CSV_HEADERS:
          yield row

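# The shape of project.task that conversions_download above expects, inferred
# directly from the keys it reads; all values are hypothetical placeholders.
# Any combination of the three sources may be present, and each one present is
# read in turn.
EXAMPLE_CONVERSIONS_TASK = {
  'auth': 'user',
  'bigquery': {
    'dataset': 'my_dataset',   # placeholder
    'table': 'conversions',    # placeholder
    'legacy': True
  },
  'sheets': {
    'url': 'https://docs.google.com/spreadsheets/d/...',  # placeholder
    'tab': 'Conversions',                                 # placeholder
    'range': 'A1:C'                                       # placeholder
  },
  'csv': {
    'file': 'conversions.csv'  # placeholder
  }
}
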
def email_read():
  # process only most recent message
  email = get_email_messages(
    project.task['auth'],
    project.task['read']['from'],
    project.task['read']['to'],
    project.task['read'].get('subject', None),
    project.task['read'].get('link', None),
    project.task['read'].get('attachment', None),
    download=True
  )

  # only take the most recent email
  try:
    email = next(email)
  except StopIteration:
    if project.verbose:
      print('NO EMAILS FOUND')
    exit()

  if project.verbose:
    print('EMAIL:', email['subject'])

  # loop all attached files
  for filename, data in email_files(email):
    if project.verbose:
      print('EMAIL FILENAME:', filename)

    # decompress if necessary
    if filename.endswith('.gz'):
      data = gzip.GzipFile(fileobj=data, mode='rb')
      filename = filename[:-3]

    # if excel file, save each sheet individually
    if filename.endswith('.xlsx'):
      for sheet, rows in excel_to_rows(data):
        rows = rows_trim(rows)
        rows = rows_header_sanitize(rows)
        if project.verbose:
          print('EMAIL WRITE', filename)
        put_rows(project.task['auth'], project.task['read']['out'], rows, column_header_sanitize(sheet))

    # if csv, save directly
    elif filename.endswith('.csv'):
      rows = csv_to_rows(data.read().decode())
      rows = rows_header_sanitize(rows)
      if project.verbose:
        print('EMAIL WRITE', filename)
      put_rows(project.task['auth'], project.task['read']['out'], rows)

    else:
      if project.verbose:
        print('UNSUPPORTED FILE:', filename)

def lineitem_read(auth, advertisers=[], insertion_orders=[], lineitems=[]):
  """ Reads line item configurations from DBM.

  Bulletproofing: https://developers.google.com/bid-manager/v1/lineitems/downloadlineitems

  Args:
    * auth: (string) Either user or service.
    * advertisers: (list) List of advertiser ids ( exclusive with insertion_orders and lineitems ).
    * insertion_orders: (list) List of insertion_order ids ( exclusive with advertisers and lineitems ).
    * lineitems: (list) List of lineitem ids ( exclusive with insertion_orders and advertisers ).

  Returns:
    * Iterator of lists: https://developers.google.com/bid-manager/guides/entity-write/format
  """

  service = get_service('doubleclickbidmanager', API_VERSION, auth)

  body = {
    'format': 'CSV',
    'fileSpec': 'EWF'
  }

  if advertisers:
    body['filterType'] = 'ADVERTISER_ID'
    body['filterIds'] = list(advertisers)  # in case it's a generator
  elif insertion_orders:
    body['filterType'] = 'INSERTION_ORDER_ID'
    body['filterIds'] = list(insertion_orders)  # in case it's a generator
  elif lineitems:
    body['filterType'] = 'LINE_ITEM_ID'
    body['filterIds'] = list(lineitems)  # in case it's a generator

  result = API_Retry(service.lineitems().downloadlineitems(body=body))

  for count, row in enumerate(csv_to_rows(result.get('lineItems', ''))):
    if count == 0:
      continue  # skip header
    row[0] = int(row[0] or 0)  # LineItem ID
    row[2] = int(row[2] or 0)  # Partner ID
    row[11] = float(row[11] or 0)  # IO Budget Amount
    row[18] = float(row[18] or 0)  # LineItem Budget Amount
    row[21] = float(row[21] or 0)  # LineItem Pacing Amount
    row[23] = int(row[23] or 0)  # LineItem Frequency Exposures
    row[25] = int(row[25] or 0)  # LineItem Frequency Amount
    row[26] = float(row[26] or 0)  # Bid Price (CPM)
    row[28] = float(row[28] or 0)  # Partner Revenue Amount
    yield row

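# A minimal sketch of calling lineitem_read above; the advertiser id is a
# hypothetical placeholder. Pass only one of the three filter lists, since the
# function treats them as mutually exclusive.
def example_lineitem_read():
  for row in lineitem_read('user', advertisers=[1234567]):  # placeholder id
    print(row[0], row[18])  # LineItem ID and LineItem Budget Amount
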
def get_single_sdf_rows(auth, version, partner_id, file_types, filter_type, filter_ids_obj, desired_file_type):
  sdf_zip_file = sdf_download(auth, version, partner_id, file_types, filter_type, filter_ids_obj)

  with zipfile.ZipFile(sdf_zip_file, 'r', zipfile.ZIP_DEFLATED) as d:
    file_names = d.namelist()

    for file_name in file_names:

      # make sure to only get the one file
      if desired_file_type != file_name.split('-')[1].split('.')[0] or 'Skipped' in file_name:
        continue

      if project.verbose:
        print('SDF: Loading: ' + file_name)

      with d.open(file_name) as sdf_file:
        rows = csv_to_rows(sdf_file.read().decode('utf-8'))
        return rows

def sdf_read(auth, file_types, filter_type, filter_ids, version='3.1'):
  """ Reads sdf files from the DV360 api.

  https://developers.google.com/bid-manager/v1/sdf/download

  Args:
    * auth: (string) Either user or service.
    * file_types: (list[string]) List of the requested file types.
    * filter_type: (string) Value of the type of filter ids to be expected.
    * filter_ids: (list[int]) Filter ids for the api call.

  Returns:
    * Rows of the sdf files requested.
  """

  service = get_service('doubleclickbidmanager', API_VERSION, auth)

  body = {
    'fileTypes': file_types,
    'filterType': filter_type,
    'filterIds': filter_ids,
    'version': version
  }

  result = API_Retry(service.sdf().download(body=body))

  # Clean the date field in each row
  for key in result.keys():
    date_idx_list = []

    for count, row in enumerate(csv_to_rows(result.get(key, ''))):

      # Get date fields from header
      if count == 0:
        date_idx_list = _get_idx_sdf_date_fields(row)
        yield row

      # Clean and yield data rows
      else:
        yield _clean_sdf_row(row, date_idx_list)

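# A hedged usage sketch for sdf_read above; the file type, filter type, and
# advertiser id follow the docstring's parameter shapes, but the concrete
# values are hypothetical placeholders.
def example_sdf_read():
  rows = sdf_read(
    'user',
    file_types=['LINE_ITEM'],      # placeholder file type
    filter_type='ADVERTISER_ID',   # placeholder filter type
    filter_ids=[1234567]           # placeholder id
  )
  for row in rows:
    print(row)
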
def email_read():
  # process only most recent message
  try:
    message = next(
      get_email_messages(
        project.task['auth'],
        project.task['read']['from'],
        project.task['read']['to'],
        project.task['read'].get('subject', None)))
  except StopIteration:
    if project.verbose:
      print('NO EMAILS FOUND')
    exit()

  if project.verbose:
    print('EMAIL:', get_subject(message))

  files = []

  if project.task['read'].get('attachment'):
    files.extend(
      get_email_attachments(project.task['auth'], message, project.task['read']['attachment']))

  if project.task['read'].get('link'):
    files.extend(
      get_email_links(project.task['auth'], message, project.task['read']['link'], download=True))

  for filename, data in files:

    if project.verbose:
      print('EMAIL FILENAME:', filename)

    # decompress if necessary
    if filename.endswith('.gz'):
      data = gzip.GzipFile(fileobj=data, mode='rb')
      filename = filename[:-3]

    # if excel file, save each sheet individually
    if filename.endswith('.xlsx'):
      for sheet, rows in excel_to_rows(data):
        rows = rows_trim(rows)
        rows = rows_header_sanitize(rows)
        put_rows(project.task['auth'], project.task['read']['out'], rows, sheet)

    # if CM report
    elif project.task['read']['from'] == '*****@*****.**':
      rows = cm_report_to_rows(data.read().decode())
      rows = cm_report_clean(rows)

      # if bigquery, remove header and determine schema
      schema = None
      if 'bigquery' in project.task['read']['out']:
        schema = cm_report_schema(next(rows))
        project.task['read']['out']['bigquery']['schema'] = schema
        project.task['read']['out']['bigquery']['skip_rows'] = 1

      put_rows(project.task['auth'], project.task['read']['out'], rows)

    # if dv360 report
    elif project.task['read']['from'] == '*****@*****.**':
      rows = dv360_report_to_rows(data.getvalue().decode())
      rows = dv360_report_clean(rows)
      put_rows(project.task['auth'], project.task['read']['out'], rows)

    # if csv
    elif filename.endswith('.csv'):
      rows = csv_to_rows(data.read().decode())
      rows = rows_header_sanitize(rows)
      put_rows(project.task['auth'], project.task['read']['out'], rows)

    else:
      if project.verbose:
        print('UNSUPPORTED FILE:', filename)

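# The shape of project.task that email_read above expects, inferred from the
# keys it reads; the addresses, patterns, and output target are hypothetical
# placeholders (attachment and link are assumed to be match patterns), and the
# redacted sender addresses in the source are left as-is there.
EXAMPLE_EMAIL_TASK = {
  'auth': 'user',
  'read': {
    'from': 'sender@example.com',   # placeholder
    'to': 'me@example.com',         # placeholder
    'subject': 'Scheduled Report',  # optional, placeholder
    'attachment': '.*\\.csv',       # optional, assumed pattern for attachments
    'link': '.*storage.*',          # optional, assumed pattern for links
    'out': {
      'bigquery': {
        'dataset': 'my_dataset',    # placeholder
        'table': 'email_report'     # placeholder
      }
    }
  }
}
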
def main():

  # get parameters
  parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=textwrap.dedent("""\
      Command line to get table schema from BigQuery.

      Helps developers upload data to BigQuery and pull schemas. These are the
      most common BigQuery tasks when developing solutions.

      Examples:
        Display table schema: `python helper.py --project [id] --dataset [name] --table [name] -s [credentials]`
        Upload csv table: `python helper.py --project [id] --dataset [name] --table [name] --csv [file] --schema [json] -s [credentials]`
        Upload excel sheet: `python helper.py --project [id] --dataset [name] --table [name] --excel_workbook [file] --excel_sheet [name] --schema [json] -s [credentials]`
  """))

  parser.add_argument('--dataset', help='name of BigQuery dataset', default=None)
  parser.add_argument('--table', help='name of BigQuery table', default=None)
  parser.add_argument('--csv', help='CSV file path', default=None)
  parser.add_argument('--schema', help='table SCHEMA as a JSON string', default=None)
  parser.add_argument('--excel_workbook', help='Excel file path', default=None)
  parser.add_argument('--excel_sheet', help='Excel sheet name', default=None)

  # initialize project
  parser = commandline_parser(parser, arguments=('-u', '-c', '-s', '-v', '-p'))
  args = parser.parse_args()
  config = Configuration(
    user=args.user,
    client=args.client,
    service=args.service,
    verbose=args.verbose,
    project=args.project)

  auth = 'service' if args.service else 'user'

  schema = json.loads(args.schema) if args.schema else None

  if args.csv:
    with open(args.csv, 'r') as csv_file:
      rows = csv_to_rows(csv_file.read())

    if not schema:
      rows, schema = get_schema(rows)
      print('DETECTED SCHEMA', json.dumps(schema))
      print('Please run again with the above schema provided.')
      exit()

    rows_to_table(config, auth, config.project, args.dataset, args.table, rows, schema)

  elif args.excel_workbook and args.excel_sheet:
    with open(args.excel_workbook, 'rb') as excel_file:  # xlsx files are binary
      rows = excel_to_rows(excel_file, args.excel_sheet)

    if not schema:
      rows, schema = get_schema(rows)
      print('DETECTED SCHEMA', json.dumps(schema))
      print('Please run again with the above schema provided.')
      exit()

    rows_to_table(config, auth, config.project, args.dataset, args.table, rows, schema)

  else:
    # print schema
    print(json.dumps(table_to_schema(config, auth, config.project, args.dataset, args.table), indent=2))

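# main() above parses --schema with json.loads, so the flag takes BigQuery
# schema JSON directly on the command line; the field names and paths in this
# sketch are hypothetical placeholders.
#
# Example invocation (assumed ids and files):
#   python helper.py --project my-project --dataset my_dataset --table my_table \
#     --csv data.csv \
#     --schema '[{"name": "Column", "type": "STRING", "mode": "NULLABLE"}]' \
#     -s service.json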