  input_file.seek(0)

  schema = field_list_to_schema(header)

  output_file_name = '/tmp/%s.csv' % str(uuid.uuid1())

  processor.clean_csv(input_file, output_file_name, len(header), header=True)
  input_file.close()

  output_file = open(output_file_name, 'rb')

  io_to_table(
      project.task['auth'],
      project.id,
      project.task['to'].get('dataset'),
      project.task['to'].get('table'),
      output_file,
      'CSV',
      schema,
      skip_rows=0,
      disposition=project.task['to'].get('write_disposition', 'WRITE_TRUNCATE'))

  output_file.close()

  os.remove(input_file_name)
  os.remove(output_file_name)


if __name__ == "__main__":
  project.load('sftp')
  sftp()
def conversion_upload():
  rows = conversions_download()

  if project.verbose: print 'CONVERSION UPLOAD'

  statuses = conversions_upload(
      project.task['auth'],
      project.task['account_id'],
      project.task['activity_id'],
      project.task['conversion_type'],
      rows,
      project.task['encryptionInfo']
  )

  has_rows = False
  for status in statuses:
    has_rows = True
    if 'errors' in status:
      if project.verbose: print 'ERROR:', status['conversion']['ordinal'], '\n'.join([e['message'] for e in status['errors']])
    else:
      if project.verbose: print 'OK:', status['conversion']['ordinal']

  if not has_rows:
    if project.verbose: print 'NO ROWS'


if __name__ == "__main__":
  project.load('conversion_upload')
  conversion_upload()
#
#  https://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
###########################################################################

from util.project import project
from util.google_api import API
from util.data import put_rows


def google_api():
  if project.verbose: print 'GOOGLE_API', project.task['api'], project.task['version']

  results = API(project.task).execute()

  put_rows(
      project.task['auth'],
      project.task['out'],
      '%s_%s.json' % (project.task['function'].replace('.', '_'), project.date),
      results)


if __name__ == "__main__":
  project.load('google_api')
  google_api()
"endDate": str(inputs['End Date']), } else: date_range = { "kind": "dfareporting#dateRange", "relativeDateRange": str(inputs['Relative Date Range']) } combos_table = report_combos(unique_name, date_range, inputs['Main Advertiser ID'], inputs['Main Campaign ID'], inputs['Dynamic Profile ID']) main_table = report_main(unique_name, date_range, inputs['Main Advertiser ID'], inputs['Main Campaign ID'], shadow) if shadow: shadow_table = report_shadow(unique_name, date_range, inputs['Shadow Advertiser ID'], inputs['Shadow Campaign ID']) else: shadow_table = None view_combine(unique_name, combos_table, main_table, shadow_table) if __name__ == "__main__": project.load("dynamic_costs") dynamic_costs()
    rows = query_to_rows(
        project.task['auth'],
        project.id,
        project.task['bigquery']['dataset'],
        project.task['bigquery']['query'])
    object_compare(sorted(rows), sorted(project.task['bigquery']['values']))

  # simple table check ( unless query given )
  elif 'values' in project.task['bigquery']:
    rows = table_to_rows(
        project.task['auth'],
        project.id,
        project.task['bigquery']['dataset'],
        project.task['bigquery']['table'])
    object_compare(sorted(rows), sorted(project.task['bigquery']['values']))


# decide which test to run
def test():
  if 'sheets' in project.task:
    sheets()
  elif 'bigquery' in project.task:
    bigquery()


# test should be run like any other task
# one test per task ( otherwise it gets confusing )
# calling script already indicates which test is being run
# print only PASS or FAIL
if __name__ == "__main__":
  project.load('test')
  test()
        count += 1

    # if offers for this account and solution exist, keep only the top ones ( largest impact to lowest )
    if offers:
      offers.sort(key=itemgetter('Impact'), reverse=True)
      offers = offers[:project.task['offers']]
      owner['Solutions'].append({'Solution': solution, 'Offers': offers})

  if project.verbose: print 'ASSEMBLED OFFERS', count

  if project.verbose: print 'SENDING OFFERS'

  # send emails
  count = 0
  for owner in owners:
    if owner['Solutions']:
      compose_email_solution_centric(owner)
      count += 1
      if count == 10: exit()

  if project.verbose: print 'SENT OFFERS', count


def marketing():
  assemble_offers_solution_centric()


if __name__ == "__main__":
  project.load('marketing')
  marketing()
  if project.verbose and len(triggers) == 0:
    print "FLOODLIGHT MONITOR: No floodlight ids specified in sheet."

  alerts = {}
  day = None

  for trigger in triggers:

    # get report data for each floodlight
    report = floodlight_report(trigger[0])
    rows = report_to_rows(report)
    rows = report_clean(rows)
    rows = rows_header_trim(rows)
    rows = rows_to_type(rows, column=6)

    # calculate outliers
    last_day, rows = floodlight_analysis(rows)

    # find last day report ran
    day = last_day if day is None else max(day, last_day)

    # group alerts by email
    alerts.setdefault(trigger[1], [])
    alerts[trigger[1]].extend(rows)

  floodlight_email(day, alerts)


if __name__ == "__main__":
  project.load('floodlight_monitor')
  floodlight_monitor()
  # moving a report
  if 'out' in project.task:
    filename, report = report_file(
        project.task['auth'],
        project.task['report'].get('report_id', None),
        project.task['report'].get('name', None),
        project.task['report'].get('timeout', 10),
        DBM_CHUNKSIZE)

    # if a report exists
    if report:
      if project.verbose: print 'DBM FILE', filename

      # clean up the report
      rows = report_to_rows(report)
      rows = report_clean(rows, datastudio=project.task.get('datastudio', False), nulls=True)

      # write rows using standard out block in json ( allows customization across all scripts )
      if rows: put_rows(project.task['auth'], project.task['out'], filename, rows)


if __name__ == "__main__":
  project.load('dbm')
  dbm()
print "The task name must match a directory with a run.py inside it." print "For example, 'hello' is a task which will executed by 'hello/run.py'." print '' print 'PROJECT JSON:' pprint.PrettyPrinter(depth=20).pprint(project.configuration) print '' print '' print '-' * 80 print "Each task is passed a nested subset of json." print "Different tasks should NOT share json. Security and readability reasons." print "Each task can execute as a service or a user independently." print "Access structure data within a task as..." print '' print 'PROJECT TASK:', project.task print 'PROJECT TASK AUTH:', project.task['auth'] print 'PROJECT TASK SAY:', project.task['say'] print '' print '' print '-' * 80 print "Take a look inside 'hello/run.py'." print "Its a great skeleton for your first project." print '' print '' if __name__ == "__main__": project.load('hello') hello()
  rows = report_clean(rows, project.task.get('datastudio', False))
  rows = rows_column_add(rows, 'Account_Id', account_id)
  rows = rows_column_add(rows, 'Account_Name', get_account_name(project.task['auth'], account_id))

  # if BigQuery set to append ( storage will automatically file namespace )
  if project.task.get('out', {}).get('bigquery', {}).get('table'):
    project.task['out']['bigquery']['disposition'] = disposition

  # write rows using standard out block in json ( allows customization across all scripts )
  if rows: put_rows(project.task['auth'], project.task['out'], filename, rows)


def dcm_bulk():
  if project.verbose: print 'DCM BULK'

  disposition = 'WRITE_TRUNCATE'
  for count, account in enumerate(project.task['accounts']):
    if project.verbose: print 'DCM BULK %d of %d' % (count, len(project.task['accounts']))
    dcm(account, disposition)
    disposition = 'WRITE_APPEND'
    sleep(3)


if __name__ == "__main__":
  project.load('dcm_bulk')
  dcm_bulk()
    # NOT RECOMMENDED: determine schema if missing
    else:
      if project.verbose: print 'SHEETS SCHEMA DETECT ( Not Recommended - Define Schema In JSON )'
      # cast rows to types ( for schema detection )
      rows = rows_to_type(rows)
      rows, schema = get_schema(
          rows,
          project.task.get('header', False),
          infer_type=project.task.get('infer_type', True))

    # write to table ( not using put_rows because there are no use cases for other destinations )
    rows_to_table(
        auth=project.task['auth'],
        project_id=project.id,
        dataset_id=project.task['out']['bigquery']['dataset'],
        table_id=project.task['out']['bigquery']['table'],
        rows=rows,
        schema=schema,
        skip_rows=1 if project.task.get('header', False) else 0,
        disposition=project.task['out']['bigquery'].get('disposition', 'WRITE_TRUNCATE'))

  else:
    print 'SHEET EMPTY'


if __name__ == "__main__":
  project.load('sheets')
  sheets()
      project.task['auth'],
      project.task['report']['account'],
      project.task['report'].get('report_id', None),
      project.task['report'].get('name', None) or project.task['report'].get('body', {}).get('name', None),
      project.task['report'].get('timeout', 10),
  )

  if report:
    if project.verbose: print 'DCM FILE', filename

    # clean up the report
    rows = report_to_rows(report)
    rows = report_clean(rows, project.task.get('datastudio', False))

    # if bigquery, remove header and determine schema
    if 'bigquery' in project.task['out']:
      schema = report_schema(rows.next())
      project.task['out']['bigquery']['schema'] = schema
      project.task['out']['bigquery']['skip_rows'] = 0

    # write rows using standard out block in json ( allows customization across all scripts )
    if rows: put_rows(project.task['auth'], project.task['out'], filename, rows)


if __name__ == "__main__":
  project.load('dcm')
  dcm()
#
#  https://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
###########################################################################

from util.project import project
from util.storage import bucket_create, bucket_access


def bucket():
  if project.verbose: print "BUCKET", project.task['bucket']

  # create bucket
  bucket_create(project.task['auth'], project.id, project.task['bucket'])
  bucket_access(
      project.task['auth'],
      project.id,
      project.task['bucket'],
      emails=project.task.get('emails', []),
      groups=project.task.get('groups', []))


if __name__ == "__main__":
  project.load('bucket')
  bucket()
  schema = []
  for h in header:
    h = column_header_sanitize(h)
    schema.append({
        'name': h,
        'type': DT_Field_Lookup.get(h, 'STRING'),
        'mode': 'NULLABLE'
    })
  return schema


def dt():
  if project.verbose: print "DT TO TABLE", project.task['to']['table']

  storage_to_table(
      project.task['auth'],
      project.id,
      project.task['to']['dataset'],
      project.task['to']['table'],
      '%s:%s*' % (project.task['from']['bucket'], project.task['from']['path']),  # append * to match all files with prefix
      dt_schema(dt_header()),  # fetch schema from first dt file
      1,
      'CSV',
      'WRITE_TRUNCATE'
  )


if __name__ == "__main__":
  project.load('dt')
  dt()
  if project.verbose: print 'ENTITY:', entity

  # write public files only once
  if entity in PUBLIC_FILES:
    path = 'gdbm-public:entity/%s.0.%s.json' % (project.date.strftime('%Y%m%d'), entity)
    schema = Entity_Schema_Lookup[entity]
    move_entity(project, path, entity, schema, 'WRITE_TRUNCATE')

  # supports multiple partners, first one resets table, others append
  else:
    disposition = 'WRITE_TRUNCATE'
    for account in get_rows('user', project.task['partners']):
    #for account in project.task['accounts']:

      # if advertiser given do not run it ( SAFETY )
      if ':' in str(account):
        print 'WARNING: Skipping advertiser: ', account
        continue

      if project.verbose: print 'PARTNER:', account

      path = 'gdbm-%s:entity/%s.0.%s.json' % (account, project.date.strftime('%Y%m%d'), entity)
      schema = Entity_Schema_Lookup[entity]
      move_entity(project, path, entity, schema, disposition)
      disposition = 'WRITE_APPEND'


if __name__ == '__main__':
  project.load('entity')
  entity()
    logger.flush()
    store.save_id_map()
  except Exception as error:
    stack = traceback.format_exc()
    print stack

    logger.log(str(error))
    logger.flush()


def test():
  """For development purposes when debugging a specific entity, this function
  is handy to run just that entity.
  """
  setup()
  init_daos()
  creatives()


if __name__ == '__main__':
  """Main entry point of Bulkdozer."""
  timer.start_timer('bulkdozer job')
  project.load('traffic')
  traffic()
  timer.check_timer('bulkdozer job')
  #test()
    if filename.endswith('.gz'):
      data = gzip.GzipFile(fileobj=data, mode='rb')
      filename = filename[:-3]

    # if excel file, save each sheet individually
    if filename.endswith('.xlsx'):
      for sheet, rows in excel_to_rows(data):
        rows = rows_trim(rows)
        rows = rows_header_sanitize(rows)

        if project.verbose: print 'EMAIL WRITE', filename
        put_rows(project.task['auth'], project.task['out'], filename, rows, column_header_sanitize(sheet))

    # if csv, save directly
    elif filename.endswith('.csv'):
      rows = csv_to_rows(data)
      rows = rows_header_sanitize(rows)

      if project.verbose: print 'EMAIL WRITE', filename
      put_rows(project.task['auth'], project.task['out'], filename, rows)

    else:
      if project.verbose: print 'UNSUPPORTED FILE:', filename


if __name__ == "__main__":
  project.load('email')
  email()
of role granting and you need it to grant the role to your user. So there is
NO SECURITY benefit, just a convenience.

### UI Security

In a UI environment such as a web application, where users DO NOT have access
to the service credentials but the server does have access to user credentials,
this handler allows the service to securely grant additional roles to users.

### Good Practice

Using roles is a better practice than assigning permissions to user accounts
individually because it allows for better tracking and quicker revocation.
Changing a role's permissions changes all users at once without having to
track down individuals. We highly recommend using roles.

"""

from util.project import project
from util.auth import set_iam


def iam():
  set_iam(project.task['auth'], project.id, project.task['role'], project.task['email'])


if __name__ == "__main__":
  project.load('iam')
  iam()
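# A minimal sketch of the task block iam() above might receive; the role and
# email values are placeholders and the exact field layout is an assumption.
EXAMPLE_IAM_TASK = {
  "iam": {
    "auth": "service",                                  # service credentials perform the grant
    "role": "projects/[PROJECT-ID]/roles/[ROLE-NAME]",  # role to grant
    "email": "user@example.com"                         # user receiving the role
  }
}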
    elif 'line_items' in project.task['read']:
      line_items = get_rows(project.task['auth'], project.task['read']['line_items'])

    rows = lineitem_read(
        project.task['auth'],
        advertisers,
        insertion_orders,
        line_items
    )

    if rows:
      filename = 'lineitems_%s.csv' % project.date

      if 'bigquery' in project.task['read']['out']:
        project.task['read']['out']['bigquery']['schema'] = LineItem_Read_Schema
        project.task['read']['out']['bigquery']['skip_rows'] = 0

      put_rows(project.task['auth'], project.task['read']['out'], filename, rows)

  elif 'write' in project.task:
    rows = get_rows(project.task['auth'], project.task['write'])
    lineitem_write(
        project.task['auth'],
        rows,
        project.task['write'].get('dry_run', True)
    )


if __name__ == "__main__":
  project.load('lineitem')
  lineitem()
  # construct query
  query = 'SELECT\n *,\n'
  for dimension, tags in dimensions.items():
    query += ' CASE\n'
    for tag, columns in tags.items():
      query += '  WHEN '
      for column, keywords in columns.items():
        for count, keyword in enumerate(keywords):
          if count != 0: query += 'OR '
          query += '%s CONTAINS "%s" ' % (column, keyword)
      query += 'THEN "%s"\n' % tag
    query += '  ELSE "%s"\n END AS %s,\n' % (defaults.get(dimension, ''), dimension)

  query += 'FROM [%s.%s]' % (project.task['in']['dataset'], project.task['in']['table'])

  if project.verbose: print 'QUERY: ', query

  # write to view
  query_to_view(
      project.task['out']['auth'],
      project.id,
      project.task['out']['dataset'],
      project.task['out']['view'],
      query,
      replace=True)


if __name__ == "__main__":
  project.load('mapping')
  mapping()
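# A minimal sketch of the nested dimensions structure the query loop above
# expects; dimension, tag, column, and keyword values are placeholders. With
# one column per tag it produces roughly:
#   CASE WHEN Placement CONTAINS "video" OR Placement CONTAINS "vid" THEN "Video"
#        ... ELSE "" END AS Category,
EXAMPLE_DIMENSIONS = {
  "Category": {                        # dimension: becomes END AS Category
    "Video": {                         # tag: the CASE result value
      "Placement": ["video", "vid"]    # column and keywords matched with CONTAINS
    },
    "Display": {
      "Placement": ["banner"]
    }
  }
}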
if project.verbose: print "QUERY TO SFTP" put_rows(project.task['auth'], project.task['to'], '', rows) else: if project.verbose: print "QUERY TO VIEW", project.task['to']['view'] query_to_view( project.task['auth'], project.id, project.task['to']['dataset'], project.task['to']['view'], query_parameters(project.task['from']['query'], project.task['from'].get('parameters')), project.task['from'].get('legacy', project.task['from'].get( 'useLegacySql', True)) # DEPRECATED: useLegacySql ) else: if project.verbose: print "STORAGE TO TABLE", project.task['to']['table'] storage_to_table( project.task['auth'], project.id, project.task['to']['dataset'], project.task['to']['table'], project.task['from']['bucket'] + ':' + project.task['from']['path'], project.task.get('schema', []), project.task.get('skip_rows', 1), project.task.get('structure', 'CSV'), project.task.get('disposition', 'WRITE_TRUNCATE')) if __name__ == "__main__": project.load('bigquery') bigquery()
from util.project import project
from util.regexp import parse_yyyymmdd
from util.storage import object_list, object_move, object_delete


def archive():
  if project.verbose: print 'ARCHIVE'

  day = project.date - timedelta(days=abs(project.task['days']))

  if 'storage' in project.task:
    for file_name in object_list(
        project.task['auth'],
        project.task['storage']['bucket'] + ':' + project.task['storage']['path'],
        files_only=True):

      file_day = parse_yyyymmdd(file_name)
      if file_day and file_day <= day:
        if project.task.get('delete', False) == False:
          if project.verbose: print 'ARCHIVING FILE:', file_name
          object_move(project.task['auth'], file_name, file_name.replace(':', ':archive/'))
        else:
          if project.verbose: print 'DELETING FILE:', file_name
          object_delete(project.task['auth'], file_name)


if __name__ == "__main__":
  project.load('archive')
  archive()
See SCOPES in util/auth/__init__.py or review util/auth/README.md

Arguments

  --client / -c - path to client credentials file used to authenticate
  --user / -u - path to user credentials file to be created if it does not exist.

Example

  python auth/helper.py -u [user credentials path] -c [client credentials path]

"""

import json
import argparse

from util.project import project
from util.auth import get_profile


if __name__ == "__main__":

  # all parameters come from project ( forces ignore of json file )
  parser = argparse.ArgumentParser()

  # initialize project
  project.load(parser=parser)

  # get profile
  print 'Profile:', json.dumps(get_profile(), indent=2, sort_keys=True)
#
#  https://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
###########################################################################

from util.project import project
from util.bigquery import datasets_create, datasets_access


def dataset():
  if project.verbose: print "DATASET", project.id, project.task['dataset']

  # create dataset
  datasets_create(project.task['auth'], project.id, project.task['dataset'])
  datasets_access(
      project.task['auth'],
      project.id,
      project.task['dataset'],
      emails=project.task.get('emails', []),
      groups=project.task.get('groups', []))


if __name__ == "__main__":
  project.load('dataset')
  dataset()
def twitter():
  if project.verbose: print 'TWITTER'

  rows = None

  if 'trends' in project.task:
    if 'places' in project.task['trends']:
      rows = twitter_trends_places()
      project.task['out']['bigquery']['schema'] = TWITTER_TRENDS_PLACE_SCHEMA
      project.task['out']['bigquery']['skip_rows'] = 0
    elif 'closest' in project.task['trends']:
      rows = twitter_trends_closest()
      project.task['out']['bigquery']['schema'] = TWITTER_TRENDS_CLOSEST_SCHEMA
      project.task['out']['bigquery']['skip_rows'] = 0
    else:
      rows = twitter_trends_available()
      project.task['out']['bigquery']['schema'] = TWITTER_TRENDS_AVAILABLE_SCHEMA
      project.task['out']['bigquery']['skip_rows'] = 0

  if rows:
    put_rows(project.task['auth'], project.task['out'], 'twitter_%s.csv' % project.date, rows)


if __name__ == "__main__":
  project.load('twitter')
  twitter()
  # if a report exists
  for report in reports:
    for report_frag in report:
      if project.verbose: print 'DS FILE', report_frag['name']

      # read data and clean up the report
      # TODO change to fully streaming @jfno
      rows = report_to_rows(
          report_read_data(
              project.task['auth'],
              report_frag['report_id'],
              report_frag['report_fragment']))

      # upload to cloud if data
      if rows:
        #put_rows(project.task['auth'], project.task['out'], report_frag['name'], rows)
        put_rows(project.task['auth'], project.task['out'], None, rows)


def ds():
  if project.verbose: print 'DS'

  if 'report' in project.task:
    day = project.date - timedelta(days=abs(project.task['days']))
    _one_report(day)


if __name__ == "__main__":
  project.load('ds')
  ds()