import zipfile
from datetime import date, datetime, timedelta

# StarThinker helpers assumed importable in this repo, for example:
# from starthinker.util.csv import csv_to_rows
# from starthinker.util.bigquery import (table_create, table_exists,
#                                        rows_to_table, query_to_rows)


def sdf_to_bigquery(config, auth, sdf_zip_file, project_id, dataset,
                    time_partitioned_table, create_single_day_table,
                    table_suffix=''):
  with zipfile.ZipFile(sdf_zip_file, 'r', zipfile.ZIP_DEFLATED) as d:
    file_names = d.namelist()
    for file_name in file_names:
      if config.verbose:
        print('SDF: Loading: ' + file_name)

      with d.open(file_name) as sdf_file:
        # Materialize rows into a list: csv_to_rows yields a generator, so
        # 'if not rows' would never trigger and the dated-table upload below
        # would exhaust it before the second upload.
        rows = list(csv_to_rows(sdf_file.read().decode('utf-8')))

        if not rows:
          if config.verbose:
            print('SDF: Empty file ' + file_name)
          continue

        table_name = file_name.split('.')[0].replace('-', '_') + table_suffix
        schema = sdf_schema(rows[0])

        # Check if each SDF should have a dated table
        if create_single_day_table:
          table_name_dated = table_name + date.today().strftime('%Y_%m_%d')

          # Create table and upload data
          table_create(config, auth, project_id, dataset, table_name_dated)
          rows_to_table(config, auth, project_id, dataset, table_name_dated,
                        rows, schema=schema, skip_rows=1,
                        disposition='WRITE_TRUNCATE')

        # Create end result table if it doesn't already exist
        if not table_exists(config, auth, project_id, dataset, table_name):
          table_create(config, auth, project_id, dataset, table_name,
                       is_time_partition=time_partitioned_table)

        rows_to_table(
            config, auth, project_id, dataset, table_name, rows,
            schema=schema, skip_rows=1,
            disposition='WRITE_APPEND' if time_partitioned_table
            else 'WRITE_TRUNCATE')
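# A minimal usage sketch for sdf_to_bigquery, assuming the StarThinker
# Configuration helper; the project, dataset, and file names below are
# hypothetical placeholders, not values from the original source.
if __name__ == '__main__':
  from starthinker.util.configuration import Configuration  # assumed path

  sdf_to_bigquery(
      config=Configuration(user='user.json', verbose=True),
      auth='user',
      sdf_zip_file='sdf_export.zip',   # zip archive as downloaded from DV360
      project_id='my-cloud-project',   # hypothetical
      dataset='SDF_Dataset',           # hypothetical
      time_partitioned_table=False,    # truncate a flat table instead of appending
      create_single_day_table=True,    # also keep a dated snapshot table
      table_suffix='')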
def dt():
  jobs = []

  if project.verbose:
    print('DT To BigQuery')

  # legacy deprecated ( do not use )
  if 'path' in project.task:
    project.task['paths'] = [project.task['path']]

  # loop all dt files to match pattern or match any pattern
  print('PATHS', project.task['paths'])
  for path in (project.task['paths'] or ['']):
    print(path)
    for dt_object in object_list(project.task['auth'],
                                 '%s:%s' % (project.task['bucket'], path),
                                 raw=True):
      dt_size = dt_object['size']
      dt_file = dt_object['name']
      dt_time = dt_timestamp(dt_file)
      dt_partition = dt_file.split('.', 1)[0]

      # move only files inside the days / hours window ( default 60 days )
      if ((project.task.get('days') is None and
           project.task.get('hours') is None) or
          (dt_time > project.now - timedelta(
              days=project.task.get('days', 60),
              hours=project.task.get('hours', 0)))):

        if not table_exists(project.task['to']['auth'], project.id,
                            project.task['to']['dataset'], dt_partition):
          dt_move(dt_object, dt_partition, jobs)
        elif project.verbose:
          print('DT Partition Exists:', dt_partition)

  for count, job in enumerate(jobs):
    print('Waiting For Job: %d of %d' % (count + 1, len(jobs)))
    job_wait(project.task['to']['auth'], job)
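# A sketch of the task fields dt() reads from project.task, reconstructed
# from the accesses above; the bucket, path, and dataset values are
# hypothetical placeholders.
EXAMPLE_DT_TASK = {
    'auth': 'user',
    'bucket': 'dcm-long-term-storage',   # hypothetical GCS bucket of DT files
    'paths': ['dcm_account_123/'],       # prefixes to match; [] or [''] matches all
    'days': 60,                          # optional freshness window
    'hours': 0,
    'to': {
        'auth': 'service',
        'dataset': 'DT_Dataset'          # hypothetical BigQuery dataset
    }
}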
def dcm_log(config, task):
  if config.verbose:
    print('DCM LOG')

  accounts = list(get_rows(config, 'user', task['accounts']))

  # determine start log date
  if table_exists(config, task['out']['auth'], task['out']['project'],
                  task['out']['dataset'], CHANGELOGS_TABLE):
    start = next(
        query_to_rows(
            config, task['out']['auth'], task['out']['project'],
            task['out']['dataset'],
            'SELECT FORMAT_TIMESTAMP("%%Y-%%m-%%dT%%H:%%M:%%S-00:00", MAX(changeTime), "UTC") FROM `%s`'
            % CHANGELOGS_TABLE, 1, False))[0]
    disposition = 'WRITE_APPEND'
  else:
    start = (datetime.utcnow() - timedelta(days=int(task['days']))
            ).strftime('%Y-%m-%dT%H:%M:%S-00:00')
    disposition = 'WRITE_TRUNCATE'

  # load new logs
  rows = get_changelogs(config, task, accounts, start)
  if rows:
    rows_to_table(config, task['out']['auth'], task['out']['project'],
                  task['out']['dataset'], CHANGELOGS_TABLE, rows,
                  CHANGELOGS_SCHEMA, 0, disposition)
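# A sketch of the task dict dcm_log expects, reconstructed from the reads
# above ('accounts', 'days', and the 'out' destination); all values are
# hypothetical placeholders.
EXAMPLE_DCM_LOG_TASK = {
    'auth': 'user',                      # assumed read by get_changelogs for API calls
    'accounts': [1234567],               # CM360 account ids, or any source get_rows accepts
    'days': 7,                           # backfill window used only on the first run
    'out': {
        'auth': 'service',
        'project': 'my-cloud-project',   # hypothetical
        'dataset': 'CM_Changelogs'       # hypothetical
    }
}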
def handle(self, *args, **kwargs):
  # { 'day': DATE, 'deployment': INT, 'account': INT, 'product': STRING, 'script': STRING, 'user': STRING }
  impact = []
  missing = {}
  id_max = 0

  project.initialize(_service=settings.UI_SERVICE, _verbose=True)

  if table_exists('service', 'google.com:starthinker', 'dashboard',
                  'ST_Scripts'):
    id_max = next(
        query_to_rows('service', 'google.com:starthinker', 'dashboard',
                      'SELECT MAX(Deployment) FROM ST_Scripts',
                      legacy=False))[0]

  for recipe in Recipe.objects.filter(
      id__gt=id_max).order_by('id')[:kwargs['recipes']]:
    project.initialize(_user=recipe.account.get_credentials_path(),
                       _service=settings.UI_SERVICE, _verbose=True)

    values = recipe.get_values()
    for v in values:
      if v['tag'] in ('dcm_to_bigquery', 'dcm_to_sheets', 'dcm_to_storage',
                      'dcm_run', 'conversion_upload_from_bigquery',
                      'conversion_upload_from_sheets'):
        impact.append({
            'day': recipe.birthday,
            'deployment': recipe.id,
            'account': v['values'].get('account'),
            'script': v['tag'],
            'product': 'dcm',
            'user': recipe.account.email.replace('@google.com', '')
        })

      elif v['tag'] in ('dbm_to_bigquery', 'dbm_to_sheets', 'dbm_to_storage'):
        for partner in account_from_dbm_report(
            v['values'].get('dbm_report_id'),
            v['values'].get('dbm_report_name')):
          impact.append({
              'day': recipe.birthday,
              'deployment': recipe.id,
              'account': partner,
              'script': v['tag'],
              'product': 'dbm',
              'user': recipe.account.email.replace('@google.com', '')
          })

      elif v['tag'] in ('dt',):
        impact.append({
            'day': recipe.birthday,
            'deployment': recipe.id,
            'account': account_from_dt(v['values']),
            'script': v['tag'],
            'product': 'dcm',
            'user': recipe.account.email.replace('@google.com', '')
        })

      elif v['tag'] == 'barnacle':
        for account in v['values']['accounts']:
          impact.append({
              'day': recipe.birthday,
              'deployment': recipe.id,
              'account': account,
              'script': v['tag'],
              'product': 'dcm',
              'user': recipe.account.email.replace('@google.com', '')
          })

      elif v['tag'] in ('entity',):
        for partner in v['values']['partners']:
          impact.append({
              'day': recipe.birthday,
              'deployment': recipe.id,
              'account': partner,
              'script': v['tag'],
              'product': 'dbm',
              'user': recipe.account.email.replace('@google.com', '')
          })

      elif v['tag'] == 'itp':
        impact.append({
            'day': recipe.birthday,
            'deployment': recipe.id,
            'account': v['values']['dcm_account'],
            'script': v['tag'],
            'product': 'dcm',
            'user': recipe.account.email.replace('@google.com', '')
        })
        impact.append({
            'day': recipe.birthday,
            'deployment': recipe.id,
            'account': v['values']['dbm_partner'],
            'script': v['tag'],
            'product': 'dbm',
            'user': recipe.account.email.replace('@google.com', '')
        })

      elif v['tag'] == 'itp_audit':
        impact.append({
            'day': recipe.birthday,
            'deployment': recipe.id,
            'account': v['values']['cm_account_id'],
            'script': v['tag'],
            'product': 'dcm',
            'user': recipe.account.email.replace('@google.com', '')
        })
        for partner in account_from_dbm_report(
            None, v['values'].get('dv360_report_name')):
          impact.append({
              'day': recipe.birthday,
              'deployment': recipe.id,
              'account': partner,
              'script': v['tag'],
              'product': 'dbm',
              'user': recipe.account.email.replace('@google.com', '')
          })

      else:
        impact.append({
            'day': recipe.birthday,
            'deployment': recipe.id,
            'account': None,
            'script': v['tag'],
            'product': None,
            'user': recipe.account.email.replace('@google.com', '')
        })
        missing.setdefault(v['tag'], 0)
        missing[v['tag']] += 1

  if impact:
    if kwargs['test']:
      print(impact)
    else:
      print('WRITING TO ST_Scripts')
      rows_to_table(
          'service', 'google.com:starthinker', 'dashboard', 'ST_Scripts',
          [(i['day'], i['deployment'], i['user'], i['product'], i['script'],
            i['account']) for i in impact],
          schema=[
              {'mode': 'REQUIRED', 'name': 'Day', 'type': 'DATE'},
              {'mode': 'REQUIRED', 'name': 'Deployment', 'type': 'INTEGER'},
              {'mode': 'REQUIRED', 'name': 'User', 'type': 'STRING'},
              {'mode': 'NULLABLE', 'name': 'Product', 'type': 'STRING'},
              {'mode': 'NULLABLE', 'name': 'Recipe', 'type': 'STRING'},
              {'mode': 'NULLABLE', 'name': 'Account', 'type': 'INTEGER'},
          ],
          skip_rows=0,
          disposition='WRITE_TRUNCATE' if id_max == 0 else 'WRITE_APPEND',
          wait=True)

    print('MISSING', missing)
    print('Coverage:', (len(impact) * 100) / (len(missing) + len(impact)))

  else:
    print('No recipes newer than:', id_max)
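# handle() consumes kwargs['recipes'] and kwargs['test']; a minimal sketch of
# the Django add_arguments method that would supply them on the same Command
# class. The flag names and default are assumptions, not taken from the
# original command.
def add_arguments(self, parser):
  parser.add_argument(
      '--recipes',
      type=int,
      default=100,  # hypothetical batch size per run
      help='Maximum number of recipes newer than the last deployment to scan.')
  parser.add_argument(
      '--test',
      action='store_true',
      help='Print the impact rows instead of writing them to ST_Scripts.')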