import argparse
import textwrap


def main():
  # get parameters
  parser = argparse.ArgumentParser(
      formatter_class=argparse.RawDescriptionHelpFormatter,
      description=textwrap.dedent("""\
      Command line to transform excel sheets into csv files.

      Prints to STDOUT, user is expected to pipe output into a file.
      Typically used for BigQuery data imports.

      Examples:
        List sheets in workbook: python helper.py [EXCEL FILE] --list
        Convert excel to CSV: python helper.py [EXCEL FILE] --sheet [SHEET NAME] > results.csv
      """))

  parser.add_argument('workbook', help='name of file to pull the rows.')
  parser.add_argument('--sheet', help='Sheet to pull the rows.', default=None)
  parser.add_argument('--list', help='List sheets.', action='store_true')

  # initialize project (arguments must be a tuple, '-v' alone is just a string)
  parser = commandline_parser(parser, arguments=('-v',))
  args = parser.parse_args()

  config = Configuration(verbose=args.verbose)

  with open(args.workbook, 'rb') as excel_file:
    if args.list:
      for sheet in excel_to_sheets(excel_file):
        print(sheet)

    elif args.sheet:
      for sheet, rows in excel_to_rows(excel_file, args.sheet):
        print(rows_to_csv(rows).read())
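# The helpers used above (excel_to_sheets, excel_to_rows, rows_to_csv) come
# from the surrounding project and are not shown here. As a rough sketch only,
# not the project's actual implementation, they could be built on openpyxl and
# io.StringIO; the _sketch names below are hypothetical stand-ins.

import csv
import io

import openpyxl  # assumption: openpyxl is used as the Excel reader


def excel_to_sheets_sketch(excel_file):
  """Yield the sheet names in a workbook (illustrative stand-in)."""
  workbook = openpyxl.load_workbook(excel_file, read_only=True)
  for sheet in workbook.sheetnames:
    yield sheet


def excel_to_rows_sketch(excel_file, sheet=None):
  """Yield (sheet_name, row_iterator) pairs, optionally for one sheet."""
  workbook = openpyxl.load_workbook(excel_file, read_only=True, data_only=True)
  for worksheet in workbook.worksheets:
    if sheet is None or worksheet.title == sheet:
      yield worksheet.title, worksheet.iter_rows(values_only=True)


def rows_to_csv_sketch(rows):
  """Write rows to an in-memory CSV buffer readable via .read()."""
  buffer = io.StringIO()
  writer = csv.writer(buffer)
  writer.writerows(rows)
  buffer.seek(0)  # rewind so .read() returns the CSV text from the start
  return buffer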
import gzip


def email_read():
  # process only the most recent message
  emails = get_email_messages(
      project.task['auth'],
      project.task['read']['from'],
      project.task['read']['to'],
      project.task['read'].get('subject', None),
      project.task['read'].get('link', None),
      project.task['read'].get('attachment', None),
      download=True
  )

  # only take the most recent email; an empty generator means no matches
  try:
    email = next(emails)
  except StopIteration:
    if project.verbose:
      print('NO EMAILS FOUND')
    exit()

  if project.verbose:
    print('EMAIL:', email['subject'])

  # loop all attached files
  for filename, data in email_files(email):
    if project.verbose:
      print('EMAIL FILENAME:', filename)

    # decompress if necessary
    if filename.endswith('.gz'):
      data = gzip.GzipFile(fileobj=data, mode='rb')
      filename = filename[:-3]

    # if excel file, save each sheet individually
    if filename.endswith('.xlsx'):
      for sheet, rows in excel_to_rows(data):
        rows = rows_trim(rows)
        rows = rows_header_sanitize(rows)
        if project.verbose:
          print('EMAIL WRITE', filename)
        put_rows(project.task['auth'], project.task['read']['out'], rows,
                 column_header_sanitize(sheet))

    # if csv, save directly
    elif filename.endswith('.csv'):
      rows = csv_to_rows(data.read().decode())
      rows = rows_header_sanitize(rows)
      if project.verbose:
        print('EMAIL WRITE', filename)
      put_rows(project.task['auth'], project.task['read']['out'], rows)

    else:
      if project.verbose:
        print('UNSUPPORTED FILE:', filename)
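# rows_header_sanitize and column_header_sanitize are project helpers that make
# headers legal BigQuery column names before put_rows loads them. A minimal
# sketch of the idea, assuming BigQuery's rule that column names may contain
# only letters, digits, and underscores and must not start with a digit; the
# _sketch names are hypothetical, not the project's real functions.

import re


def column_header_sanitize_sketch(name):
  """Reduce one header value to a legal BigQuery column name."""
  column = re.sub(r'[^0-9a-zA-Z]+', '_', str(name)).strip('_')
  if column and column[0].isdigit():
    column = '_' + column  # column names cannot start with a digit
  return column


def rows_header_sanitize_sketch(rows):
  """Sanitize the first row (the header) and pass the rest through."""
  rows = iter(rows)
  header = next(rows, None)
  if header is not None:
    yield [column_header_sanitize_sketch(cell) for cell in header]
  yield from rows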
import gzip


def email_read():
  # process only the most recent message
  try:
    message = next(
        get_email_messages(project.task['auth'], project.task['read']['from'],
                           project.task['read']['to'],
                           project.task['read'].get('subject', None)))
  except StopIteration:
    if project.verbose:
      print('NO EMAILS FOUND')
    exit()

  if project.verbose:
    print('EMAIL:', get_subject(message))

  files = []

  if project.task['read'].get('attachment'):
    files.extend(
        get_email_attachments(project.task['auth'], message,
                              project.task['read']['attachment']))

  if project.task['read'].get('link'):
    files.extend(
        get_email_links(project.task['auth'], message,
                        project.task['read']['link'], download=True))

  for filename, data in files:
    if project.verbose:
      print('EMAIL FILENAME:', filename)

    # decompress if necessary
    if filename.endswith('.gz'):
      data = gzip.GzipFile(fileobj=data, mode='rb')
      filename = filename[:-3]

    # if excel file, save each sheet individually
    if filename.endswith('.xlsx'):
      for sheet, rows in excel_to_rows(data):
        rows = rows_trim(rows)
        rows = rows_header_sanitize(rows)
        put_rows(project.task['auth'], project.task['read']['out'], rows, sheet)

    # if CM report
    elif project.task['read']['from'] == '*****@*****.**':
      rows = cm_report_to_rows(data.read().decode())
      rows = cm_report_clean(rows)

      # if bigquery, remove header and determine schema
      schema = None
      if 'bigquery' in project.task['read']['out']:
        schema = cm_report_schema(next(rows))
        project.task['read']['out']['bigquery']['schema'] = schema
        project.task['read']['out']['bigquery']['skip_rows'] = 1

      put_rows(project.task['auth'], project.task['read']['out'], rows)

    # if DV360 report
    elif project.task['read']['from'] == '*****@*****.**':
      rows = dv360_report_to_rows(data.getvalue().decode())
      rows = dv360_report_clean(rows)
      put_rows(project.task['auth'], project.task['read']['out'], rows)

    # if csv
    elif filename.endswith('.csv'):
      rows = csv_to_rows(data.read().decode())
      rows = rows_header_sanitize(rows)
      put_rows(project.task['auth'], project.task['read']['out'], rows)

    else:
      if project.verbose:
        print('UNSUPPORTED FILE:', filename)
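# put_rows routes the same row iterator to whichever destination appears in
# the task's 'out' block; that is why the CM branch above checks for a
# 'bigquery' key before attaching 'schema' and 'skip_rows'. A schematic of
# that dispatch pattern with a made-up name; the real project helper supports
# more destinations and parameters than this sketch.

import csv


def put_rows_sketch(auth, destination, rows, sheet_suffix=''):
  """Route rows by destination config key (illustrative stand-in)."""
  if 'file' in destination:
    # local CSV destination, suffixed per sheet when one is given
    path = destination['file']
    if sheet_suffix:
      path = path.replace('.csv', '_%s.csv' % sheet_suffix)
    with open(path, 'w', newline='') as out_file:
      csv.writer(out_file).writerows(rows)
  elif 'bigquery' in destination:
    # the real helper runs a BigQuery load here, honoring the 'schema' and
    # 'skip_rows' keys attached by the caller; a load client is out of scope
    # for this sketch
    raise NotImplementedError('BigQuery writer not sketched here')
  else:
    raise NotImplementedError('Unknown destination: %s' % list(destination))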
import argparse
import json
import textwrap


def main():
  # get parameters
  parser = argparse.ArgumentParser(
      formatter_class=argparse.RawDescriptionHelpFormatter,
      description=textwrap.dedent("""\
      Command line to get table schema from BigQuery.

      Helps developers upload data to BigQuery and pull schemas. These are
      the most common BigQuery tasks when developing solutions.

      Examples:
        Display table schema: `python helper.py --project [id] --dataset [name] --table [name] -s [credentials]`
        Upload csv table: `python helper.py --project [id] --dataset [name] --table [name] --csv [file] --schema [json] -s [credentials]`
        Upload excel sheet: `python helper.py --project [id] --dataset [name] --table [name] --excel_workbook [file] --excel_sheet [name] --schema [json] -s [credentials]`
      """))

  parser.add_argument('--dataset', help='name of BigQuery dataset', default=None)
  parser.add_argument('--table', help='name of BigQuery table', default=None)
  parser.add_argument('--csv', help='CSV file path', default=None)
  parser.add_argument('--schema', help='BigQuery schema as a JSON string', default=None)
  parser.add_argument('--excel_workbook', help='Excel file path', default=None)
  parser.add_argument('--excel_sheet', help='Excel sheet name', default=None)

  # initialize project
  parser = commandline_parser(parser, arguments=('-u', '-c', '-s', '-v', '-p'))
  args = parser.parse_args()
  config = Configuration(
      user=args.user,
      client=args.client,
      service=args.service,
      verbose=args.verbose,
      project=args.project
  )

  auth = 'service' if args.service else 'user'

  schema = json.loads(args.schema) if args.schema else None

  if args.csv:
    with open(args.csv, 'r') as csv_file:
      rows = csv_to_rows(csv_file.read())

      if not schema:
        rows, schema = get_schema(rows)
        print('DETECTED SCHEMA', json.dumps(schema))
        print('Please run again with the above schema provided.')
        exit()

      rows_to_table(config, auth, config.project, args.dataset, args.table,
                    rows, schema)

  elif args.excel_workbook and args.excel_sheet:
    # excel files are binary, so open in 'rb' mode
    with open(args.excel_workbook, 'rb') as excel_file:
      rows = excel_to_rows(excel_file, args.excel_sheet)

      if not schema:
        rows, schema = get_schema(rows)
        print('DETECTED SCHEMA', json.dumps(schema))
        print('Please run again with the above schema provided.')
        exit()

      rows_to_table(config, auth, config.project, args.dataset, args.table,
                    rows, schema)

  else:
    # print schema
    print(
        json.dumps(
            table_to_schema(config, auth, config.project, args.dataset,
                            args.table),
            indent=2))
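# get_schema above materializes the rows and guesses a BigQuery schema from
# the values, which is then printed for the user to pass back in on the next
# run. A simplified sketch of that kind of inference, assuming the first row
# is the header and typing each column from a single data row; the _sketch
# name is hypothetical and the real helper is more thorough.


def get_schema_sketch(rows):
  """Return (rows, schema) where schema is a BigQuery field list."""
  rows = list(rows)  # materialize so rows can still be loaded after inspection
  header, sample = rows[0], rows[1] if len(rows) > 1 else []

  def field_type(value):
    # map Python types to BigQuery types, defaulting to STRING
    if isinstance(value, bool):
      return 'BOOLEAN'
    if isinstance(value, int):
      return 'INTEGER'
    if isinstance(value, float):
      return 'FLOAT'
    return 'STRING'

  schema = [{
      'name': str(name),
      'type': field_type(sample[index]) if index < len(sample) else 'STRING',
      'mode': 'NULLABLE',
  } for index, name in enumerate(header)]

  return rows, schema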