Example No. 1
def write_report(report, dataset, table):
  # turn report file into rows
  rows = report_to_rows(report)
  rows = report_clean(rows)

  if rows:
    if project.verbose: print("DYNAMIC COSTS WRITTEN:", table)

    # pull DCM schema automatically
    try:
      schema = report_schema(next(rows))
    except StopIteration: # report is empty
      raise ValueError("REPORT DID NOT RUN")

    # write report to bigquery
    rows_to_table(
      project.task['out']["auth"],
      project.id,
      project.task['out']["dataset"],
      table,
      rows,
      schema,
      0
    )

  else:
    if project.verbose: print("DYNAMIC COSTS REPORT NOT READY:", table)
Example No. 2
def create_cm_site_segmentation(config, task):
  # Read sheet to bq table
  sheet_rows = sheets_read(config, task['auth_sheets'], task['sheet'], 'CM_Site_Segments', 'A:C', retries=10)
  if not sheet_rows:
    sheet_rows = []

  schema = [
    { "type": "STRING", "name": "Site_Dcm", "mode": "NULLABLE" },
    { "type": "INTEGER", "name": "Impressions", "mode": "NULLABLE" },
    { "type": "STRING", "name": "Site_Type", "mode": "NULLABLE" }
  ]

  rows_to_table(
    config,
    auth=task['auth_bq'],
    project_id=config.project,
    dataset_id=task['dataset'],
    table_id=CM_SITE_SEGMENTATION_SHEET_TABLE,
    rows=sheet_rows,
    schema=schema,
    skip_rows=1,
    disposition='WRITE_TRUNCATE'
  )

  # Get Site_Type from the sheet
  run_query_from_file(config, task, Queries.cm_site_segmentation.replace('{{dataset}}', task['dataset']), CM_SITE_SEGMENTATION_TABLE)

  # Move Table back to sheets
  query = 'SELECT * from `' + config.project + '.' + task['dataset'] + '.' + CM_SITE_SEGMENTATION_TABLE + '`'
  rows = query_to_rows(config, task['auth_bq'], config.project, task['dataset'], query, legacy=False)

  # makes sure types are correct in sheet
  rows = rows_to_type(rows)
  sheets_clear(config, task['auth_sheets'], task['sheet'], 'CM_Site_Segments', 'A2:C')
  sheets_write(config, task['auth_sheets'], task['sheet'], 'CM_Site_Segments', 'A2:C', rows)
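
The round trip ends with rows_to_type before writing back to the sheet; that helper is not shown here. A hedged sketch of the kind of normalization it presumably performs, coercing numeric strings so the sheet receives numbers rather than text (names and behaviour are assumptions, not the library's implementation):

def normalize_cell(value):
  # hypothetical: turn '1200' into 1200 and '1.5' into 1.5, leave other strings alone
  if isinstance(value, str):
    for cast in (int, float):
      try:
        return cast(value)
      except ValueError:
        pass
  return value

def normalize_rows(rows):
  for row in rows:
    yield [normalize_cell(cell) for cell in row]

print(list(normalize_rows([['example.com', '1200', 'Display']])))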
Example No. 3
def anonymize_table(table_id):

    if project.verbose:
        print('ANONYMIZE TABLE', project.task['bigquery']['to']['dataset'],
              table_id)

    schema = API_BigQuery(project.task['auth']).tables().get(
        projectId=project.task['bigquery']['from']['project'],
        datasetId=project.task['bigquery']['from']['dataset'],
        tableId=table_id).execute()['schema']['fields']

    rows = table_to_rows(project.task['auth'],
                         project.task['bigquery']['from']['project'],
                         project.task['bigquery']['from']['dataset'], table_id)

    rows = anonymize_rows(rows, schema)

    rows_to_table(project.task['auth'],
                  project.task['bigquery']['to']['project'],
                  project.task['bigquery']['to']['dataset'],
                  table_id,
                  rows,
                  schema,
                  skip_rows=0,
                  disposition='WRITE_TRUNCATE')
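
anonymize_rows itself is not shown in this example. A purely illustrative sketch of what a schema-driven anonymizer could look like, hashing STRING columns and passing everything else through (an assumption for illustration, not the actual StarThinker implementation):

import hashlib

def anonymize_rows_sketch(rows, schema):
    # hash STRING columns so values stay joinable but are no longer readable
    string_columns = [i for i, field in enumerate(schema) if field['type'] == 'STRING']
    for row in rows:
        yield [
            hashlib.sha256(str(cell).encode('utf-8')).hexdigest()[:12]
            if i in string_columns else cell
            for i, cell in enumerate(row)
        ]

schema = [{'name': 'Email', 'type': 'STRING'}, {'name': 'Clicks', 'type': 'INTEGER'}]
print(list(anonymize_rows_sketch([['user@example.com', 42]], schema)))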
Example No. 4
def dcm_log(config, task):
    if config.verbose:
        print('DCM LOG')

    accounts = list(get_rows(config, 'user', task['accounts']))

    # determine start log date
    if table_exists(config, task['out']['auth'], task['out']['project'],
                    task['out']['dataset'], CHANGELOGS_TABLE):
        start = next(
            query_to_rows(
                config, task['out']['auth'], task['out']['project'],
                task['out']['dataset'],
                'SELECT FORMAT_TIMESTAMP("%%Y-%%m-%%dT%%H:%%M:%%S-00:00", MAX(changeTime), "UTC") FROM `%s`'
                % CHANGELOGS_TABLE, 1, False))[0]
        disposition = 'WRITE_APPEND'

    else:
        start = (datetime.utcnow() - timedelta(days=int(task['days']))
                 ).strftime('%Y-%m-%dT%H:%M:%S-00:00')
        disposition = 'WRITE_TRUNCATE'

    # load new logs
    rows = get_changelogs(config, task, accounts, start)
    if rows:
        rows_to_table(config, task['out']['auth'], task['out']['project'],
                      task['out']['dataset'], CHANGELOGS_TABLE, rows,
                      CHANGELOGS_SCHEMA, 0, disposition)
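
The incremental pattern above: if the changelog table already exists, resume from MAX(changeTime) and WRITE_APPEND; otherwise start task['days'] days back and WRITE_TRUNCATE. The fallback start timestamp can be reproduced with the standard library alone, using the same format string:

from datetime import datetime, timedelta

days = 7  # stand-in for int(task['days'])
start = (datetime.utcnow() - timedelta(days=days)).strftime('%Y-%m-%dT%H:%M:%S-00:00')
disposition = 'WRITE_TRUNCATE'  # first run; subsequent runs switch to 'WRITE_APPEND'
print(start, disposition)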
Example No. 5
def sdf_to_bigquery(config,
                    auth,
                    sdf_zip_file,
                    project_id,
                    dataset,
                    time_partitioned_table,
                    create_single_day_table,
                    table_suffix=''):
    with zipfile.ZipFile(sdf_zip_file, 'r', zipfile.ZIP_DEFLATED) as d:
        file_names = d.namelist()
        for file_name in file_names:
            if config.verbose:
                print('SDF: Loading: ' + file_name)
            with d.open(file_name) as sdf_file:
                rows = csv_to_rows(sdf_file.read().decode('utf-8'))
                if not rows:
                    if config.verbose:
                        print('SDF: Empty file ' + file_name)
                    continue
                table_name = file_name.split('.')[0].replace(
                    '-', '_') + table_suffix
                schema = sdf_schema(next(rows))

                # Check if each SDF should have a dated table
                if create_single_day_table:
                    table_name_dated = table_name + date.today().strftime(
                        '%Y_%m_%d')

                    # Create table and upload data
                    table_create(config, auth, project_id, dataset, table_name_dated)
                    rows_to_table(config,
                                  auth,
                                  project_id,
                                  dataset,
                                  table_name_dated,
                                  rows,
                                  schema=schema,
                                  skip_rows=1,
                                  disposition='WRITE_TRUNCATE')

                # Create end result table if it doesn't already exist
                if not table_exists(config, auth, project_id, dataset,
                                    table_name):
                    table_create(config,
                                 auth,
                                 project_id,
                                 dataset,
                                 table_name,
                                 is_time_partition=time_partitioned_table)

                rows_to_table(config,
                              auth,
                              project_id,
                              dataset,
                              table_name,
                              rows,
                              schema=schema,
                              skip_rows=1,
                              disposition='WRITE_APPEND'
                              if time_partitioned_table else 'WRITE_TRUNCATE')
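
A self-contained sketch of the file handling used above: open a zip, decode one member as CSV, peek the header, and derive a BigQuery-friendly table name from the file name (standard library only, no StarThinker helpers assumed):

import csv, io, zipfile
from datetime import date

buffer = io.BytesIO()
with zipfile.ZipFile(buffer, 'w') as z:
    z.writestr('SDF-LineItems.csv', 'Line Item Id,Name\n1,Example\n')

with zipfile.ZipFile(buffer, 'r') as d:
    for file_name in d.namelist():
        with d.open(file_name) as sdf_file:
            rows = csv.reader(io.StringIO(sdf_file.read().decode('utf-8')))
            header = next(rows)  # consumed to build the schema, as with sdf_schema(next(rows))
            table_name = file_name.split('.')[0].replace('-', '_')
            table_name_dated = table_name + date.today().strftime('%Y_%m_%d')
            print(table_name, table_name_dated, header)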
Example No. 6
def sov_dbm():
    # 1 - create one report for the client and one report for peers
    client_name, peer_name = sov_create_reports()

    # 2 - Download the reports and aggregate into one table
    sov_rows = sov_process_client(client_name)
    sov_rows.extend(sov_process_peer(peer_name))

    # 3 - Save the report to the specified table
    rows_to_table(project.task['auth'], project.id, project.task['dataset'],
                  project.task['table'], sov_rows, SCHEMA, 0)
Example No. 7
def create_cm_site_segmentation(project):
    # Read sheet to bq table
    sheet_rows = sheets_read('user',
                             project.task['sheet'],
                             'CM_Site_Segments',
                             'A:C',
                             retries=10)
    if not sheet_rows:
        sheet_rows = []

    schema = [{
        "type": "STRING",
        "name": "Site_Dcm",
        "mode": "NULLABLE"
    }, {
        "type": "INTEGER",
        "name": "Impressions",
        "mode": "NULLABLE"
    }, {
        "type": "STRING",
        "name": "Site_Type",
        "mode": "NULLABLE"
    }]

    rows_to_table(auth='service',
                  project_id=project.id,
                  dataset_id=project.task['dataset'],
                  table_id=CM_SITE_SEGMENTATION_SHEET_TABLE,
                  rows=sheet_rows,
                  schema=schema,
                  skip_rows=1,
                  disposition='WRITE_TRUNCATE')

    # Get Site_Type from the sheet
    run_query_from_file(
        os.path.join(os.path.dirname(__file__),
                     SQL_DIRECTORY + CM_SITE_SEGMENTATION_FILENAME),
        project.id, project.task['dataset'], CM_SITE_SEGMENTATION_TABLE)

    # Move Table back to sheets
    query = 'SELECT * from `' + project.id + '.' + project.task[
        'dataset'] + '.' + CM_SITE_SEGMENTATION_TABLE + '`'
    rows = query_to_rows('service',
                         project.id,
                         project.task['dataset'],
                         query,
                         legacy=False)

    # makes sure types are correct in sheet
    rows = rows_to_type(rows)
    sheets_clear('user', project.task['sheet'], 'CM_Site_Segments', 'A2:C')
    sheets_write('user', project.task['sheet'], 'CM_Site_Segments', 'A2:C',
                 rows)
Example No. 8
def create_dv360_segments(config, task):
  a1_notation = 'A:N'
  schema = [
      { "type": "STRING", "name": "Advertiser", "mode": "NULLABLE" },
      { "type": "INTEGER", "name": "Advertiser_Id", "mode": "NULLABLE" },
      { "type": "STRING", "name": "Campaign", "mode": "NULLABLE" },
      { "type": "INTEGER", "name": "Campaign_Id", "mode": "NULLABLE" },
      { "type": "STRING", "name": "Insertion_Order", "mode": "NULLABLE" },
      { "type": "INTEGER", "name": "Insertion_Order_Id", "mode": "NULLABLE" },
      { "type": "STRING", "name": "Line_Item", "mode": "NULLABLE" },
      { "type": "INTEGER", "name": "Line_Item_Id", "mode": "NULLABLE" },
      { "type": "STRING", "name": "Line_Item_Type", "mode": "NULLABLE" },
      { "type": "INTEGER", "name": "Impressions", "mode": "NULLABLE" },
      { "type": "STRING", "name": "SegmentAutoGen", "mode": "NULLABLE" },
      { "type": "STRING", "name": "Segment1", "mode": "NULLABLE" },
      { "type": "STRING", "name": "Segment2", "mode": "NULLABLE" },
      { "type": "STRING", "name": "Segment3", "mode": "NULLABLE" }
    ]

  sheet_rows = sheets_read(config, task['auth_sheets'], task['sheet'], 'DV3 Segments', a1_notation, retries=10)

  if not sheet_rows:
    sheet_rows = []

  print('DV360 SEGMENT SHEET TABLE WRITE')

  rows_to_table(
    config,
    auth=task['auth_bq'],
    project_id=config.project,
    dataset_id=task['dataset'],
    table_id=DV360_CUSTOM_SEGMENTS_SHEET_TABLE,
    rows=sheet_rows,
    schema=schema,
    skip_rows=1,
    disposition='WRITE_TRUNCATE'
    )

  # Run Query
  if config.verbose:
    print('RUN DV360 Custom Segments Query')
  run_query_from_file(config, task, Queries.dv360_custom_segments.replace('{{dataset}}', task['dataset']), DV360_CUSTOM_SEGMENTS_TABLE)

  # Move Table back to sheets
  query = 'SELECT * from `' + config.project + '.' + task['dataset'] + '.' + DV360_CUSTOM_SEGMENTS_TABLE + '`'
  rows = query_to_rows(config, task['auth_bq'], config.project, task['dataset'], query, legacy=False)

  # makes sure types are correct in sheet
  a1_notation = a1_notation[:1] + '2' + a1_notation[1:]
  rows = rows_to_type(rows)
  sheets_clear(config, task['auth_sheets'], task['sheet'], 'DV3 Segments', a1_notation)
  sheets_write(config, task['auth_sheets'], task['sheet'], 'DV3 Segments', a1_notation, rows)
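
One detail worth calling out: the write-back range is derived from the read range by splicing a '2' after the first column letter, so the sheet's header row is never cleared or overwritten (this assumes a single-letter starting column):

a1_notation = 'A:N'
a1_notation = a1_notation[:1] + '2' + a1_notation[1:]
print(a1_notation)  # A2:N -> clear and write from row 2 down, keeping the header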
Example No. 9
def bigquery_values(config, task):
    """Write explicit values to a table.

  TODO: Replace with get_rows.

  See: scripts/bigquery_run_query.json
  """

    if config.verbose:
        print('VALUES', task['from']['values'])

    rows = data.get_rows(config, task['auth'], task['from'])
    bq.rows_to_table(config, task['to'].get('auth',
                                            task['auth']), config.project,
                     task['to']['dataset'], task['to']['table'], rows,
                     task.get('schema', []), 0)
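
A hedged sketch of the kind of task block this function reads, based only on the keys referenced above (task['from']['values'], task['to'], optional 'schema'); the authoritative format is defined by scripts/bigquery_run_query.json, so the names here are illustrative:

task = {
    'auth': 'service',
    'from': {
        'values': [
            ['example.com', 1200],
            ['sample.org', 800],
        ]
    },
    'to': {
        'dataset': 'my_dataset',  # hypothetical dataset and table names
        'table': 'my_table',
    },
    'schema': [
        {'name': 'Site', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'Impressions', 'type': 'INTEGER', 'mode': 'NULLABLE'},
    ],
}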
Example No. 10
def bigquery_values():
    """Write explicit values to a table.

  TODO: Replace with get_rows.

  See: scripts/bigquery_run_query.json
  """

    if project.verbose:
        print('VALUES', project.task['from']['values'])

    rows = get_rows(project.task['auth'], project.task['from'])
    rows_to_table(project.task['to'].get('auth', project.task['auth']),
                  project.id, project.task['to']['dataset'],
                  project.task['to']['table'], rows,
                  project.task.get('schema', []), 0)
Example No. 11
def create_cm_site_segmentation(project):
  # Read sheet to bq table
  sheet_rows = sheets_read(
      'user', project.task['sheet'], 'CM_Site_Segments', 'A:C', retries=10)
  if not sheet_rows:
    sheet_rows = []

  schema = [{
      'type': 'STRING',
      'name': 'Site_Dcm',
      'mode': 'NULLABLE'
  }, {
      'type': 'INTEGER',
      'name': 'Impressions',
      'mode': 'NULLABLE'
  }, {
      'type': 'STRING',
      'name': 'Site_Type',
      'mode': 'NULLABLE'
  }]

  rows_to_table(
      auth='service',
      project_id=project.id,
      dataset_id=project.task['dataset'],
      table_id=CM_SITE_SEGMENTATION_SHEET_TABLE,
      rows=sheet_rows,
      schema=schema,
      skip_rows=1,
      disposition='WRITE_TRUNCATE')

  # Get Site_Type from the sheet
  run_query_from_file(Queries.cm_site_segmentation, CM_SITE_SEGMENTATION_TABLE)

  # Move Table back to sheets
  query = 'SELECT * from `' + project.id + '.' + project.task[
      'dataset'] + '.' + CM_SITE_SEGMENTATION_TABLE + '`'
  rows = query_to_rows(
      'service', project.id, project.task['dataset'], query, legacy=False)

  # makes sure types are correct in sheet
  rows = rows_to_type(rows)
  sheets_clear('user', project.task['sheet'], 'CM_Site_Segments', 'A2:C')
  sheets_write('user', project.task['sheet'], 'CM_Site_Segments', 'A2:C', rows)
Example No. 12
def monthly_budget_mover():
	if project.verbose: print('MONTHLY BUDGET MOVER')

	report = report_to_list(project.task['auth'], project.task['spend_report_id'])

	categories = remove_excluded_ios_from_categories(project.task['budget_categories'], project.task['excluded_ios'])

	categories_spend = aggregate_io_spend_to_categories(report, categories)

	sdf = list(sdf_read(project.task['auth'], project.task['sdf']['file_types'].split(','), project.task['sdf']['filter_type'], project.task['sdf']['filter_ids']))

	categories_budget = aggregate_io_budget_to_categories(sdf, categories)

	category_budget_deltas = calc_budget_spend_deltas(categories_budget, categories_spend, categories)

	new_sdf,changes = apply_category_budgets(sdf, category_budget_deltas, categories)

	# Set Schemas for BQ tables
	schema = make_schema(new_sdf[0])
	schema_changes = make_schema(['Io Id','Category','Old Value','Delta','New Value'])
	
	# Write old sdf to table
	rows_to_table(project.task['auth'], project.id, project.task['out']['dataset'], project.task['out']['old_sdf_table_name'], sdf, schema, skip_rows=1, disposition='WRITE_TRUNCATE')
	
	# Write new sdf to table
	rows_to_table(project.task['auth'], project.id, project.task['out']['dataset'], project.task['out']['new_sdf_table_name'], new_sdf, schema, skip_rows=1, disposition='WRITE_TRUNCATE')
	
	# Write log file to table
	rows_to_table(project.task['auth'], project.id, project.task['out']['dataset'], project.task['out']['changes_table_name'], iter(changes), schema_changes, skip_rows=0, disposition='WRITE_TRUNCATE')
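
make_schema is not shown here; from the two calls above it builds a BigQuery schema from a header row (the new SDF's first row, or an explicit list of column names). A hypothetical minimal version, assuming every column is treated as a nullable STRING:

def make_schema_sketch(header):
  # hypothetical: one nullable STRING field per header cell
  return [
    {'name': name.replace(' ', '_'), 'type': 'STRING', 'mode': 'NULLABLE'}
    for name in header
  ]

print(make_schema_sketch(['Io Id', 'Category', 'Old Value', 'Delta', 'New Value']))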
Example No. 13
def write_report(config, task, report, dataset, table):
    # turn report file into rows
    rows = report_to_rows(report)
    rows = report_clean(rows)

    if rows:
        if config.verbose:
            print('DYNAMIC COSTS WRITTEN:', table)

        # pull DCM schema automatically
        try:
            schema = report_schema(next(rows))
        except StopIteration:  # report is empty
            raise ValueError('REPORT DID NOT RUN')

        # write report to bigquery
        rows_to_table(config, task['out']['auth'], config.project,
                      task['out']['dataset'], table, rows, schema, 0)

    else:
        if config.verbose:
            print('DYNAMIC COSTS REPORT NOT READY:', table)
Example No. 14
def anonymize_table(table_id):

    if project.verbose:
        print('ANONYMIZE TABLE', project.task['bigquery']['to']['dataset'],
              table_id)

    schema = API_BigQuery(project.task['auth']).tables().get(
        projectId=project.task['bigquery']['from']['project'],
        datasetId=project.task['bigquery']['from']['dataset'],
        tableId=table_id).execute()['schema']['fields']

    is_object = any([s['type'] == 'RECORD' for s in schema])

    rows = table_to_rows(project.task['auth'],
                         project.task['bigquery']['from']['project'],
                         project.task['bigquery']['from']['dataset'],
                         table_id,
                         as_object=is_object)

    rows = anonymize_rows(rows, schema, is_object)

    if is_object:
        json_to_table(project.task['auth'],
                      project.task['bigquery']['to']['project'],
                      project.task['bigquery']['to']['dataset'],
                      table_id,
                      rows,
                      schema,
                      disposition='WRITE_TRUNCATE')
    else:
        rows_to_table(project.task['auth'],
                      project.task['bigquery']['to']['project'],
                      project.task['bigquery']['to']['dataset'],
                      table_id,
                      rows,
                      schema,
                      skip_rows=0,
                      disposition='WRITE_TRUNCATE')
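
The branch here hinges on whether the source schema contains nested RECORD fields: flat rows go through rows_to_table, nested data is loaded as JSON objects via json_to_table. The detection itself is a one-liner over the schema:

schema = [
    {'name': 'Id', 'type': 'INTEGER'},
    {'name': 'Events', 'type': 'RECORD', 'fields': [{'name': 'Time', 'type': 'TIMESTAMP'}]},
]
is_object = any(s['type'] == 'RECORD' for s in schema)
print(is_object)  # True -> use json_to_table with objects instead of flat rows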
Example No. 15
def create_dv360_segments(project):
  a1_notation = 'A:M'
  schema = [{
      'type': 'STRING',
      'name': 'Advertiser',
      'mode': 'NULLABLE'
  }, {
      'type': 'INTEGER',
      'name': 'Advertiser_Id',
      'mode': 'NULLABLE'
  }, {
      'type': 'STRING',
      'name': 'Campaign',
      'mode': 'NULLABLE'
  }, {
      'type': 'INTEGER',
      'name': 'Campaign_Id',
      'mode': 'NULLABLE'
  }, {
      'type': 'STRING',
      'name': 'Insertion_Order',
      'mode': 'NULLABLE'
  }, {
      'type': 'INTEGER',
      'name': 'Insertion_Order_Id',
      'mode': 'NULLABLE'
  }, {
      'type': 'STRING',
      'name': 'Line_Item',
      'mode': 'NULLABLE'
  }, {
      'type': 'INTEGER',
      'name': 'Line_Item_Id',
      'mode': 'NULLABLE'
  }, {
      'type': 'STRING',
      'name': 'Line_Item_Type',
      'mode': 'NULLABLE'
  }, {
      'type': 'INTEGER',
      'name': 'Impressions',
      'mode': 'NULLABLE'
  }, {
      'type': 'STRING',
      'name': 'Segment1',
      'mode': 'NULLABLE'
  }, {
      'type': 'STRING',
      'name': 'Segment2',
      'mode': 'NULLABLE'
  }, {
      'type': 'STRING',
      'name': 'Segment3',
      'mode': 'NULLABLE'
  }]

  sheet_rows = sheets_read(
      'user', project.task['sheet'], 'DV3 Segments', a1_notation, retries=10)

  if not sheet_rows:
    sheet_rows = []

  print('DV360 SEGMENT SHEET TABLE WRITE')

  rows_to_table(
      auth='service',
      project_id=project.id,
      dataset_id=project.task['dataset'],
      table_id=DV360_CUSTOM_SEGMENTS_SHEET_TABLE,
      rows=sheet_rows,
      schema=schema,
      skip_rows=1,
      disposition='WRITE_TRUNCATE')

  # Run Query
  if project.verbose:
    print('RUN DV360 Custom Segments Query')
  run_query_from_file(Queries.dv360_custom_segments, DV360_CUSTOM_SEGMENTS_TABLE)

  # Move Table back to sheets
  query = 'SELECT * from `' + project.id + '.' + project.task[
      'dataset'] + '.' + DV360_CUSTOM_SEGMENTS_TABLE + '`'
  rows = query_to_rows(
      'service', project.id, project.task['dataset'], query, legacy=False)

  # makes sure types are correct in sheet
  a1_notation = a1_notation[:1] + '2' + a1_notation[1:]
  rows = rows_to_type(rows)
  sheets_clear('user', project.task['sheet'], 'DV3 Segments', a1_notation)
  sheets_write('user', project.task['sheet'], 'DV3 Segments', a1_notation, rows)
Example No. 16
def bigquery():

    if 'function' in project.task:
        query = None

        if project.task['function'] == 'pearson_significance_test':
            query = pearson_significance_test()

        if query:
            run_query(project.task['auth'], project.id, query, False,
                      project.task['to']['dataset'])

    elif 'run' in project.task and 'query' in project.task.get('run', {}):
        if project.verbose:
            print('QUERY', project.task['run']['query'])
        run_query(
            project.task['auth'],
            project.id,
            query_parameters(project.task['run']['query'],
                             project.task['run'].get('parameters')),
            project.task['run'].get('legacy', True),
        )

    elif 'values' in project.task['from']:
        rows = get_rows(project.task['auth'], project.task['from'])

        rows_to_table(project.task['to'].get('auth', project.task['auth']),
                      project.id, project.task['to']['dataset'],
                      project.task['to']['table'], rows,
                      project.task.get('schema', []), 0)

    elif 'query' in project.task['from']:

        if 'table' in project.task['to']:
            if project.verbose:
                print('QUERY TO TABLE', project.task['to']['table'])

            query_to_table(
                project.task['auth'],
                project.id,
                project.task['to']['dataset'],
                project.task['to']['table'],
                query_parameters(project.task['from']['query'],
                                 project.task['from'].get('parameters')),
                disposition=project.task['write_disposition']
                if 'write_disposition' in project.task else 'WRITE_TRUNCATE',
                legacy=project.task['from'].get(
                    'legacy', project.task['from'].get(
                        'useLegacySql', True)),  # DEPRECATED: useLegacySql,
                target_project_id=project.task['to'].get(
                    'project_id', project.id))

        elif 'sheet' in project.task['to']:
            if project.verbose:
                print('QUERY TO SHEET', project.task['to']['sheet'])
            rows = query_to_rows(
                project.task['auth'],
                project.id,
                project.task['from']['dataset'],
                query_parameters(project.task['from']['query'],
                                 project.task['from'].get('parameters')),
                legacy=project.task['from'].get('legacy', True))

            # makes sure types are correct in sheet
            rows = rows_to_type(rows)

            sheets_clear(project.task['to'].get('auth', project.task['auth']),
                         project.task['to']['sheet'],
                         project.task['to']['tab'],
                         project.task['to'].get('range', 'A2'))
            sheets_write(project.task['to'].get('auth', project.task['auth']),
                         project.task['to']['sheet'],
                         project.task['to']['tab'],
                         project.task['to'].get('range', 'A2'), rows)

        elif 'sftp' in project.task['to']:
            rows = query_to_rows(
                project.task['auth'],
                project.id,
                project.task['from']['dataset'],
                query_parameters(project.task['from']['query'],
                                 project.task['from'].get('parameters')),
                legacy=project.task['from'].get('use_legacy_sql', True))

            if rows:
                if project.verbose:
                    print('QUERY TO SFTP')
                put_rows(project.task['auth'], project.task['to'], rows)

        else:
            if project.verbose:
                print('QUERY TO VIEW', project.task['to']['view'])
            query_to_view(
                project.task['auth'],
                project.id,
                project.task['to']['dataset'],
                project.task['to']['view'],
                query_parameters(project.task['from']['query'],
                                 project.task['from'].get('parameters')),
                project.task['from'].get(
                    'legacy', project.task['from'].get(
                        'useLegacySql', True)),  # DEPRECATED: useLegacySql
                project.task['to'].get('replace', False))

    else:
        if project.verbose:
            print('STORAGE TO TABLE', project.task['to']['table'])
        storage_to_table(
            project.task['auth'], project.id, project.task['to']['dataset'],
            project.task['to']['table'], project.task['from']['bucket'] + ':' +
            project.task['from']['path'], project.task.get('schema', []),
            project.task.get('skip_rows', 1),
            project.task.get('structure', 'CSV'),
            project.task.get('disposition', 'WRITE_TRUNCATE'))
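
For orientation, a hedged sketch of the smallest task block that would take the "query in 'from', table in 'to'" branch above; the keys mirror what the code reads, but the full recipe schema lives in the StarThinker script definitions, so treat the values as illustrative:

task = {
    'auth': 'service',
    'from': {
        'query': 'SELECT Site, SUM(Impressions) AS Impressions FROM `my_dataset.my_table` GROUP BY Site',
        'legacy': False,
    },
    'to': {
        'dataset': 'my_dataset',  # hypothetical dataset and table names
        'table': 'my_summary',
    },
    'write_disposition': 'WRITE_TRUNCATE',
}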
Example No. 17
def bigquery():

    if 'run' in project.task and 'query' in project.task.get('run', {}):
        if project.verbose: print("QUERY", project.task['run']['query'])
        run_query(
            project.task['auth'],
            project.id,
            project.task['run']['query'],
            project.task['run'].get('legacy', True),
            #project.task['run'].get('billing_project_id', None)
        )

    elif 'values' in project.task['from']:
        rows = get_rows(project.task['auth'], project.task['from'])

        rows_to_table(project.task['to'].get('auth', project.task['auth']),
                      project.id, project.task['to']['dataset'],
                      project.task['to']['table'], rows,
                      project.task.get('schema', []), 0)

    elif 'query' in project.task['from']:
        if 'table' in project.task['to']:
            if project.verbose:
                print("QUERY TO TABLE", project.task['to']['table'])

            if 'pre_process_query' in project.task['to']:
                print('executing statement')
                execute_statement(project.task['auth'],
                                  project.id,
                                  project.task['to']['dataset'],
                                  project.task['to']['pre_process_query'],
                                  use_legacy_sql=project.task['from'].get(
                                      'legacy', project.task['from'].get(
                                          'useLegacySql', True)))
            query_to_table(
                project.task['auth'],
                project.id,
                project.task['to']['dataset'],
                project.task['to']['table'],
                query_parameters(project.task['from']['query'],
                                 project.task['from'].get('parameters')),
                disposition=project.task['write_disposition']
                if 'write_disposition' in project.task else 'WRITE_TRUNCATE',
                legacy=project.task['from'].get(
                    'legacy', project.task['from'].get(
                        'useLegacySql', True)),  # DEPRECATED: useLegacySql,
                target_project_id=project.task['to'].get(
                    'project_id', project.id))
        # NOT USED SO RIPPING IT OUT
        # Mauriciod: Yes, it is used, look at project/mauriciod/target_winrate.json
        elif 'storage' in project.task['to']:
            if project.verbose:
                print("QUERY TO STORAGE", project.task['to']['storage'])
            local_file_name = '/tmp/%s' % str(uuid.uuid1())
            rows = query_to_rows(project.task['auth'], project.id,
                                 project.task['from']['dataset'],
                                 project.task['from']['query'])

            f = open(local_file_name, 'wb')
            writer = csv.writer(f)
            writer.writerows(rows)
            f.close()

            f = open(local_file_name, 'rb')
            object_put(project.task['auth'], project.task['to']['storage'], f)
            f.close()

            os.remove(local_file_name)
        elif 'sheet' in project.task['to']:
            if project.verbose:
                print("QUERY TO SHEET", project.task['to']['sheet'])
            rows = query_to_rows(project.task['auth'],
                                 project.id,
                                 project.task['from']['dataset'],
                                 project.task['from']['query'],
                                 legacy=project.task['from'].get(
                                     'legacy', True))

            # makes sure types are correct in sheet
            rows = rows_to_type(rows)

            sheets_clear(project.task['auth'], project.task['to']['sheet'],
                         project.task['to']['tab'],
                         project.task['to'].get('range', 'A2'))
            sheets_write(project.task['auth'], project.task['to']['sheet'],
                         project.task['to']['tab'],
                         project.task['to'].get('range', 'A2'), rows)
        elif 'sftp' in project.task['to']:
            rows = query_to_rows(project.task['auth'],
                                 project.id,
                                 project.task['from']['dataset'],
                                 project.task['from']['query'],
                                 legacy=project.task['from'].get(
                                     'use_legacy_sql', True))

            if rows:
                if project.verbose: print("QUERY TO SFTP")
                put_rows(project.task['auth'], project.task['to'], rows)
        else:
            if project.verbose:
                print("QUERY TO VIEW", project.task['to']['view'])
            query_to_view(
                project.task['auth'],
                project.id,
                project.task['to']['dataset'],
                project.task['to']['view'],
                query_parameters(project.task['from']['query'],
                                 project.task['from'].get('parameters')),
                project.task['from'].get(
                    'legacy', project.task['from'].get(
                        'useLegacySql', True)),  # DEPRECATED: useLegacySql
                project.task['to'].get('replace', False))
    else:
        if project.verbose:
            print("STORAGE TO TABLE", project.task['to']['table'])
        storage_to_table(
            project.task['auth'], project.id, project.task['to']['dataset'],
            project.task['to']['table'], project.task['from']['bucket'] + ':' +
            project.task['from']['path'], project.task.get('schema', []),
            project.task.get('skip_rows', 1),
            project.task.get('structure', 'CSV'),
            project.task.get('disposition', 'WRITE_TRUNCATE'))
Example No. 18
def put_rows(auth, destination, rows, variant=''):
    """Processes standard write JSON block for dynamic export of data.

  Allows us to quickly write the results of a script to a destination.  For
  example, write the results of a DCM report into BigQuery.

  - Will write to multiple destinations if specified.
  - Extensible, add a handler to define a new destination ( be kind update the
  documentation json ).

  Include the following JSON in a recipe, then in the run.py handler when
  encountering that block pass it to this function and use the returned results.

    from utils.data import put_rows

    var_json = {
      "out":{
        "bigquery":{
          "dataset": [ string ],
          "table": [ string ]
          "schema": [ json - standard bigquery schema json ],
          "skip_rows": [ integer - for removing header ]
          "disposition": [ string - same as BigQuery documentation ]
        },
        "sheets":{
          "sheet":[ string - full URL, suggest using share link ],
          "tab":[ string ],
          "range":[ string - A1:A notation ]
          "delete": [ boolean - if sheet range should be cleared before writing
          ]
        },
        "storage":{
          "bucket": [ string ],
          "path": [ string ]
        },
        "file":[ string - full path to place to write file ]
      }
    }

    values = put_rows('user', var_json)

  Or you can use it directly with project singleton.

    from util.project import project
    from utils.data import put_rows

    @project.from_parameters
    def something():
      values = get_rows(project.task['auth'], project.task['out'])

    if __name__ == "__main__":
      something()

  Args:
    auth: (string) The type of authentication to use, user or service.
    destination: (json) A json block resembling var_json described above.
    rows: (list) The data being written as a list object.
    variant: (string) Appended to destination to differentiate multiple objects.

  Returns:
    If single_cell is False: Returns a list of row values [[v1], [v2], ... ]
    If single_cell is True: Returns a list of values [v1, v2, ...]
"""

    if 'bigquery' in destination:

        if destination['bigquery'].get('format', 'CSV') == 'JSON':
            json_to_table(
                destination['bigquery'].get('auth', auth),
                destination['bigquery'].get('project_id', project.id),
                destination['bigquery']['dataset'],
                destination['bigquery']['table'] + variant,
                rows,
                destination['bigquery'].get('schema', []),
                destination['bigquery'].get('disposition', 'WRITE_TRUNCATE'),
            )

        elif destination['bigquery'].get('is_incremental_load', False) == True:
            incremental_rows_to_table(
                destination['bigquery'].get('auth', auth),
                destination['bigquery'].get('project_id', project.id),
                destination['bigquery']['dataset'],
                destination['bigquery']['table'] + variant,
                rows,
                destination['bigquery'].get('schema', []),
                destination['bigquery'].get(
                    'skip_rows',
                    1),  #0 if 'schema' in destination['bigquery'] else 1),
                destination['bigquery'].get('disposition', 'WRITE_APPEND'),
                billing_project_id=project.id)

        else:
            rows_to_table(
                destination['bigquery'].get('auth', auth),
                destination['bigquery'].get('project_id', project.id),
                destination['bigquery']['dataset'],
                destination['bigquery']['table'] + variant,
                rows,
                destination['bigquery'].get('schema', []),
                destination['bigquery'].get(
                    'skip_rows',
                    1),  #0 if 'schema' in destination['bigquery'] else 1),
                destination['bigquery'].get('disposition', 'WRITE_TRUNCATE'),
            )

    if 'sheets' in destination:
        if destination['sheets'].get('delete', False):
            sheets_clear(
                auth,
                destination['sheets']['sheet'],
                destination['sheets']['tab'] + variant,
                destination['sheets']['range'],
            )

        sheets_write(auth, destination['sheets']['sheet'],
                     destination['sheets']['tab'] + variant,
                     destination['sheets']['range'], rows)

    if 'file' in destination:
        path_out, file_ext = destination['file'].rsplit('.', 1)
        file_out = path_out + variant + '.' + file_ext
        if project.verbose:
            print('SAVING', file_out)
        makedirs_safe(parse_path(file_out))
        with open(file_out, 'w') as save_file:
            save_file.write(rows_to_csv(rows).read())

    if 'storage' in destination and destination['storage'].get(
            'bucket') and destination['storage'].get('path'):
        # create the bucket
        bucket_create(auth, project.id, destination['storage']['bucket'])

        # put the file
        file_out = destination['storage']['bucket'] + ':' + destination[
            'storage']['path'] + variant
        if project.verbose:
            print('SAVING', file_out)
        object_put(auth, file_out, rows_to_csv(rows))

    if 'sftp' in destination:
        try:
            cnopts = pysftp.CnOpts()
            cnopts.hostkeys = None

            path_out, file_out = destination['sftp']['file'].rsplit('.', 1)
            file_out = path_out + variant + file_out

            sftp = pysftp.Connection(host=destination['sftp']['host'],
                                     username=destination['sftp']['username'],
                                     password=destination['sftp']['password'],
                                     port=destination['sftp']['port'],
                                     cnopts=cnopts)

            if '/' in file_out:
                dir_out, file_out = file_out.rsplit('/', 1)
                sftp.cwd(dir_out)

            sftp.putfo(rows_to_csv(rows), file_out)

        except Exception as e:
            print(str(e))
            traceback.print_exc()
Example No. 19
    def handle(self, *args, **kwargs):

        impact = [
        ]  #{ 'day': DATE, 'deployment':INT, 'account': INT, 'product': STRING, 'recipe': STRING, 'user': STRING }
        missing = {}
        id_max = 0

        project.initialize(_service=settings.UI_SERVICE, _verbose=True)

        if table_exists('service', 'google.com:starthinker', 'dashboard',
                        'ST_Scripts'):
            id_max = next(
                query_to_rows('service',
                              'google.com:starthinker',
                              'dashboard',
                              'SELECT MAX(Deployment) FROM ST_Scripts',
                              legacy=False))[0]

        for recipe in Recipe.objects.filter(
                id__gt=id_max).order_by('id')[:kwargs['recipes']]:

            project.initialize(_user=recipe.account.get_credentials_path(),
                               _service=settings.UI_SERVICE,
                               _verbose=True)

            values = recipe.get_values()
            for v in values:
                if v['tag'] in ('dcm_to_bigquery', 'dcm_to_sheets',
                                'dcm_to_storage', 'dcm_run',
                                'conversion_upload_from_bigquery',
                                'conversion_upload_from_sheets'):
                    impact.append({
                        'day':
                        recipe.birthday,
                        'deployment':
                        recipe.id,
                        'account':
                        v['values'].get('account'),
                        'script':
                        v['tag'],
                        'product':
                        'dcm',
                        'user':
                        recipe.account.email.replace('@google.com', '')
                    })
                elif v['tag'] in ('dbm_to_bigquery', 'dbm_to_sheets',
                                  'dbm_to_storage'):
                    for partner in account_from_dbm_report(
                            v['values'].get('dbm_report_id'),
                            v['values'].get('dbm_report_name')):
                        impact.append({
                            'day':
                            recipe.birthday,
                            'deployment':
                            recipe.id,
                            'account':
                            partner,
                            'script':
                            v['tag'],
                            'product':
                            'dbm',
                            'user':
                            recipe.account.email.replace('@google.com', '')
                        })
                elif v['tag'] in ('dt', ):
                    impact.append({
                        'day':
                        recipe.birthday,
                        'deployment':
                        recipe.id,
                        'account':
                        account_from_dt(v['values']),
                        'script':
                        v['tag'],
                        'product':
                        'dcm',
                        'user':
                        recipe.account.email.replace('@google.com', '')
                    })
                elif v['tag'] == 'barnacle':
                    for account in v['values']['accounts']:
                        impact.append({
                            'day':
                            recipe.birthday,
                            'deployment':
                            recipe.id,
                            'account':
                            account,
                            'script':
                            v['tag'],
                            'product':
                            'dcm',
                            'user':
                            recipe.account.email.replace('@google.com', '')
                        })
                elif v['tag'] in ('entity', ):
                    for partner in v['values']['partners']:
                        impact.append({
                            'day':
                            recipe.birthday,
                            'deployment':
                            recipe.id,
                            'account':
                            partner,
                            'script':
                            v['tag'],
                            'product':
                            'dbm',
                            'user':
                            recipe.account.email.replace('@google.com', '')
                        })
                elif v['tag'] == 'itp':
                    impact.append({
                        'day':
                        recipe.birthday,
                        'deployment':
                        recipe.id,
                        'account':
                        v['values']['dcm_account'],
                        'script':
                        v['tag'],
                        'product':
                        'dcm',
                        'user':
                        recipe.account.email.replace('@google.com', '')
                    })
                    impact.append({
                        'day':
                        recipe.birthday,
                        'deployment':
                        recipe.id,
                        'account':
                        v['values']['dbm_partner'],
                        'script':
                        v['tag'],
                        'product':
                        'dbm',
                        'user':
                        recipe.account.email.replace('@google.com', '')
                    })
                elif v['tag'] == 'itp_audit':
                    impact.append({
                        'day':
                        recipe.birthday,
                        'deployment':
                        recipe.id,
                        'account':
                        v['values']['cm_account_id'],
                        'script':
                        v['tag'],
                        'product':
                        'dcm',
                        'user':
                        recipe.account.email.replace('@google.com', '')
                    })
                    for partner in account_from_dbm_report(
                            None, v['values'].get('dv360_report_name')):
                        impact.append({
                            'day':
                            recipe.birthday,
                            'deployment':
                            recipe.id,
                            'account':
                            partner,
                            'script':
                            v['tag'],
                            'product':
                            'dbm',
                            'user':
                            recipe.account.email.replace('@google.com', '')
                        })
                else:
                    impact.append({
                        'day':
                        recipe.birthday,
                        'deployment':
                        recipe.id,
                        'account':
                        None,
                        'script':
                        v['tag'],
                        'product':
                        None,
                        'user':
                        recipe.account.email.replace('@google.com', '')
                    })
                    missing.setdefault(v['tag'], 0)
                    missing[v['tag']] += 1

        if impact:
            if kwargs['test']:
                print(impact)
            else:

                print('WRITING TO ST_Scripts')
                rows_to_table('service',
                              'google.com:starthinker',
                              'dashboard',
                              'ST_Scripts',
                              [(i['day'], i['deployment'], i['user'],
                                i['product'], i['script'], i['account'])
                               for i in impact],
                              schema=[
                                  {
                                      'mode': 'REQUIRED',
                                      'name': 'Day',
                                      'type': 'DATE'
                                  },
                                  {
                                      'mode': 'REQUIRED',
                                      'name': 'Deployment',
                                      'type': 'INTEGER'
                                  },
                                  {
                                      'mode': 'REQUIRED',
                                      'name': 'User',
                                      'type': 'STRING'
                                  },
                                  {
                                      'mode': 'NULLABLE',
                                      'name': 'Product',
                                      'type': 'STRING'
                                  },
                                  {
                                      'mode': 'NULLABLE',
                                      'name': 'Recipe',
                                      'type': 'STRING'
                                  },
                                  {
                                      'mode': 'NULLABLE',
                                      'name': 'Account',
                                      'type': 'INTEGER'
                                  },
                              ],
                              skip_rows=0,
                              disposition='WRITE_TRUNCATE'
                              if id_max == 0 else 'WRITE_APPEND',
                              wait=True)

            print('MISSING', missing)
            print('Coverage:',
                  (len(impact) * 100) / (len(missing) + len(impact)))
        else:
            print('No recipes newer than:', id_max)
Example No. 20
def put_rows(auth, destination, filename, rows, variant=''):
    """Processes standard write JSON block for dynamic export of data.
  
  Allows us to quickly write the results of a script to a destination.  For example
  write the results of a DCM report into BigQuery.

  - Will write to multiple destinations if specified.
  - Extensible, add a handler to define a new destination ( be kind update the documentation json ).

  Include the following JSON in a recipe, then in the run.py handler when
  encountering that block pass it to this function and use the returned results.
  
    from utils.data import put_rows
  
    var_json = {
      "out":{
        "bigquery":{
          "dataset": [ string ],
          "table": [ string ]
          "schema": [ json - standard bigquery schema json ],
          "skip_rows": [ integer - for removing header ]
          "disposition": [ string - same as BigQuery documentation ]
        },
        "sheets":{
          "url":[ string - full URL, suggest using share link ],
          "tab":[ string ],
          "range":[ string - A1:A notation ]
          "delete": [ boolean - if sheet range should be cleared before writing ]
        },
        "storage":{
          "bucket": [ string ],
          "path": [ string ]
        },
        "directory":[ string - full path to place to write file ]
      } 
    } 
  
    values = put_rows('user', var_json)
  
  Or you can use it directly with project singleton.
  
    from util.project import project
    from utils.data import put_rows
  
    @project.from_parameters
    def something():
      values = get_rows(project.task['auth'], project.task['out'])
  
    if __name__ == "__main__":
      something()
  
  Args:
    auth: (string) The type of authentication to use, user or service.
    destination: (json) A json block resembling var_json described above.
    filename: (string) A unique filename if writing to a medium requiring one. Usually generated by the script.
    rows ( list ) The data being written as a list object.
    variant ( string ) Appends this to the destination name to create a variant ( for example when downloading multiple tabs in a sheet ).

  Returns:
    If single_cell is False: Returns a list of row values [[v1], [v2], ... ]
    If single_cell is True: Returns a list of values [v1, v2, ...]
"""

    if 'bigquery' in destination:

        if destination['bigquery'].get('format', 'CSV') == 'JSON':
            json_to_table(
                destination['bigquery'].get('auth', auth),
                destination['bigquery'].get('project_id', project.id),
                destination['bigquery']['dataset'],
                destination['bigquery']['table'] + variant,
                rows,
                destination['bigquery'].get('schema', []),
                destination['bigquery'].get('disposition', 'WRITE_TRUNCATE'),
            )

        elif destination['bigquery'].get('is_incremental_load', False) == True:
            incremental_rows_to_table(
                destination['bigquery'].get('auth', auth),
                destination['bigquery'].get('project_id', project.id),
                destination['bigquery']['dataset'],
                destination['bigquery']['table'] + variant,
                rows,
                destination['bigquery'].get('schema', []),
                destination['bigquery'].get(
                    'skip_rows',
                    1),  #0 if 'schema' in destination['bigquery'] else 1),
                destination['bigquery'].get('disposition', 'WRITE_APPEND'),
                billing_project_id=project.id)

        else:
            rows_to_table(
                destination['bigquery'].get('auth', auth),
                destination['bigquery'].get('project_id', project.id),
                destination['bigquery']['dataset'],
                destination['bigquery']['table'] + variant,
                rows,
                destination['bigquery'].get('schema', []),
                destination['bigquery'].get(
                    'skip_rows',
                    1),  #0 if 'schema' in destination['bigquery'] else 1),
                destination['bigquery'].get('disposition', 'WRITE_TRUNCATE'),
            )

    if 'sheets' in destination:
        if destination['sheets'].get('delete', False):
            sheets_clear(auth, destination['sheets']['sheet'],
                         destination['sheets']['tab'] + variant,
                         destination['sheets']['range'])
        sheets_write(auth, destination['sheets']['sheet'],
                     destination['sheets']['tab'] + variant,
                     destination['sheets']['range'], rows)

    if 'directory' in destination:
        file_out = destination['directory'] + variant + filename
        if project.verbose: print 'SAVING', file_out
        makedirs_safe(parse_path(file_out))
        with open(file_out, 'wb') as save_file:
            save_file.write(rows_to_csv(rows).read())

    if 'storage' in destination and destination['storage'].get(
            'bucket') and destination['storage'].get('path'):
        # create the bucket
        bucket_create(auth, project.id, destination['storage']['bucket'])

        # put the file
        file_out = destination['storage']['bucket'] + ':' + destination[
            'storage']['path'] + variant + filename
        if project.verbose: print 'SAVING', file_out
        object_put(auth, file_out, rows_to_csv(rows))

    # deprecated do not use
    if 'trix' in destination:
        trix_update(auth, destination['trix']['sheet_id'],
                    destination['trix']['sheet_range'], rows_to_csv(rows),
                    destination['trix']['clear'])

    if 'email' in destination:
        pass

    if 'sftp' in destination:
        try:
            sys.stderr = StringIO()

            cnopts = pysftp.CnOpts()
            cnopts.hostkeys = None

            file_prefix = 'report'
            if 'file_prefix' in destination['sftp']:
                file_prefix = destination['sftp'].get('file_prefix')
                del destination['sftp']['file_prefix']

            #sftp_configs = destination['sftp']
            #sftp_configs['cnopts'] = cnopts
            #sftp = pysftp.Connection(**sftp_configs)

            sftp = pysftp.Connection(host=destination['sftp']['host'],
                                     username=destination['sftp']['username'],
                                     password=destination['sftp']['password'],
                                     port=destination['sftp']['port'],
                                     cnopts=cnopts)

            if 'directory' in destination['sftp']:
                sftp.cwd(destination['sftp']['directory'])

            tmp_file_name = '/tmp/%s_%s.csv' % (
                file_prefix,
                datetime.datetime.now().strftime('%Y-%m-%dT%H-%M-%S'))

            tmp_file = open(tmp_file_name, 'wb')
            tmp_file.write(rows_to_csv(rows).read())
            tmp_file.close()

            sftp.put(tmp_file_name)

            os.remove(tmp_file_name)

            sys.stderr = sys.__stderr__
        except Exception as e:
            print(str(e))
            traceback.print_exc()
Example No. 21
def put_rows(config, auth, destination, rows, schema=None, variant=''):
  """Processes standard write JSON block for dynamic export of data.

  Allows us to quickly write the results of a script to a destination.  For
  example, write the results of a DCM report into BigQuery.

  - Will write to multiple destinations if specified.
  - Extensible, add a handler to define a new destination ( be kind update the
  documentation json ).

  Include the following JSON in a recipe, then in the run.py handler when
  encountering that block pass it to this function and use the returned results.

    from utils.data import put_rows

    var_json = {
      "out":{
        "bigquery":{
          "auth":"[ user or service ]",
          "dataset": [ string ],
          "table": [ string ]
          "schema": [ json - standard bigquery schema json ],
          "header": [ boolean - true if header exists in rows ]
          "disposition": [ string - same as BigQuery documentation ]
        },
        "sheets":{
          "auth":"[ user or service ]",
          "sheet":[ string - full URL, suggest using share link ],
          "tab":[ string ],
          "range":[ string - A1:A notation ]
          "append": [ boolean - if sheet range should be appended to ]
          "delete": [ boolean - if sheet range should be cleared before writing ]
          ]
        },
        "storage":{
          "auth":"[ user or service ]",
          "bucket": [ string ],
          "path": [ string ]
        },
        "file":[ string - full path to place to write file ]
      }
    }

    values = put_rows('user', var_json)

  Args:
    auth: (string) The type of authentication to use, user or service.
    rows: ( iterator ) The list of rows to be written, if NULL no action is performed.
    schema: (json) A bigquery schema definition.
    destination: (json) A json block resembling var_json described above.
    variant: (string) Appended to the destination name to differentiate multiple objects.

  Returns:
    If unnest is False: Returns a list of row values [[v1], [v2], ... ]
    If unnest is True: Returns a list of values [v1, v2, ...]
"""

  if rows is None:
    if config.verbose:
      print('PUT ROWS: Rows is None, ignoring write.')
    return

  if 'bigquery' in destination:

    if not schema:
      schema = destination['bigquery'].get('schema')

    skip_rows = 1 if destination['bigquery'].get('header') and schema else 0

    if destination['bigquery'].get('format', 'CSV') == 'JSON':
      json_to_table(
          config,
          destination['bigquery'].get('auth', auth),
          destination['bigquery'].get('project_id', config.project),
          destination['bigquery']['dataset'],
          destination['bigquery']['table'] + variant,
          rows,
          schema,
          destination['bigquery'].get('disposition', 'WRITE_TRUNCATE'),
      )

    elif destination['bigquery'].get('is_incremental_load', False):
      incremental_rows_to_table(
          config,
          destination['bigquery'].get('auth', auth),
          destination['bigquery'].get('project_id', config.project),
          destination['bigquery']['dataset'],
          destination['bigquery']['table'] + variant,
          rows,
          schema,
          destination['bigquery'].get('skip_rows', skip_rows),
          destination['bigquery'].get('disposition', 'WRITE_APPEND'),
          billing_project_id=config.project)

    else:

      rows_to_table(
          config,
          destination['bigquery'].get('auth', auth),
          destination['bigquery'].get('project_id', config.project),
          destination['bigquery']['dataset'],
          destination['bigquery']['table'] + variant,
          rows,
          schema,
          destination['bigquery'].get('skip_rows', skip_rows),
          destination['bigquery'].get('disposition', 'WRITE_TRUNCATE'),
      )

  if 'sheets' in destination:
    if destination['sheets'].get('delete', False):
      sheets_clear(
        config,
        destination['sheets'].get('auth', auth),
        destination['sheets']['sheet'],
        destination['sheets']['tab'] + variant,
        destination['sheets']['range'],
      )

    sheets_write(
      config,
      destination['sheets'].get('auth', auth),
      destination['sheets']['sheet'],
      destination['sheets']['tab'] + variant,
      destination['sheets']['range'],
      rows_to_type(rows),
      destination['sheets'].get('append', False),
    )

  if 'file' in destination:
    path_out, file_ext = destination['file'].rsplit('.', 1)
    file_out = path_out + variant + '.' + file_ext
    if config.verbose:
      print('SAVING', file_out)
    makedirs_safe(parse_path(file_out))
    with open(file_out, 'w') as save_file:
      save_file.write(rows_to_csv(rows).read())

  if 'storage' in destination and destination['storage'].get(
      'bucket') and destination['storage'].get('path'):
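    # ensure the destination bucket exists, then upload the CSV as an object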
    bucket_create(
      config,
      destination['storage'].get('auth', auth),
      config.project,
      destination['storage']['bucket']
    )

    # put the file
    file_out = destination['storage']['bucket'] + ':' + destination['storage'][
        'path'] + variant
    if config.verbose:
      print('SAVING', file_out)
    object_put(config, auth, file_out, rows_to_csv(rows))

  if 'sftp' in destination:
    try:
      cnopts = pysftp.CnOpts()
      cnopts.hostkeys = None

      path_out, file_ext = destination['sftp']['file'].rsplit('.', 1)
      file_out = path_out + variant + '.' + file_ext

      sftp = pysftp.Connection(
          host=destination['sftp']['host'],
          username=destination['sftp']['username'],
          password=destination['sftp']['password'],
          port=destination['sftp']['port'],
          cnopts=cnopts)

      if '/' in file_out:
        dir_out, file_out = file_out.rsplit('/', 1)
        sftp.cwd(dir_out)

      sftp.putfo(rows_to_csv(rows), file_out)

    except Exception as e:
      print(str(e))
      traceback.print_exc()
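
A usage sketch ( not from the source ): the call below drives put_rows with a simple 'file' destination. Only the signature above is taken from the code; the Configuration import path, argument values, and output path are assumptions.

# Hypothetical usage sketch for put_rows ( import paths and values are assumptions ).
from utils.data import put_rows                 # import path as shown in the docstring above
from utils.configuration import Configuration   # assumed import path

config = Configuration(service='service.json', project='my-project', verbose=True)  # hypothetical values

destination = { 'file': '/tmp/example_report.csv' }  # hypothetical output path

rows = [
    ['Campaign', 'Impressions'],
    ['Example Campaign', 1000],
]

put_rows(config, 'service', destination, rows)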
Exemplo n.º 22
0
def create_dv360_segments(project):
    a1_notation = 'A:M'
    schema = [
        { "type": "STRING", "name": "Advertiser", "mode": "NULLABLE" },
        { "type": "INTEGER", "name": "Advertiser_Id", "mode": "NULLABLE" },
        { "type": "STRING", "name": "Campaign", "mode": "NULLABLE" },
        { "type": "INTEGER", "name": "Campaign_Id", "mode": "NULLABLE" },
        { "type": "STRING", "name": "Insertion_Order", "mode": "NULLABLE" },
        { "type": "INTEGER", "name": "Insertion_Order_Id", "mode": "NULLABLE" },
        { "type": "STRING", "name": "Line_Item", "mode": "NULLABLE" },
        { "type": "INTEGER", "name": "Line_Item_Id", "mode": "NULLABLE" },
        { "type": "STRING", "name": "Line_Item_Type", "mode": "NULLABLE" },
        { "type": "INTEGER", "name": "Impressions", "mode": "NULLABLE" },
        { "type": "STRING", "name": "Segment1", "mode": "NULLABLE" },
        { "type": "STRING", "name": "Segment2", "mode": "NULLABLE" },
        { "type": "STRING", "name": "Segment3", "mode": "NULLABLE" }
    ]

    sheet_rows = sheets_read('user',
                             project.task['sheet'],
                             'DV3 Segments',
                             a1_notation,
                             retries=10)

    if not sheet_rows:
        sheet_rows = []

    print('DV360 SEGMENT SHEET TABLE WRITE')

    rows_to_table(auth='service',
                  project_id=project.id,
                  dataset_id=project.task['dataset'],
                  table_id=DV360_CUSTOM_SEGMENTS_SHEET_TABLE,
                  rows=sheet_rows,
                  schema=schema,
                  skip_rows=1,
                  disposition='WRITE_TRUNCATE')

    # Run Query
    path = os.path.join(os.path.dirname(__file__),
                        SQL_DIRECTORY + DV360_CUSTOM_SEGMENTS_FILENAME)
    query = ''
    with open(path, 'r') as file:
        data = file.read().replace('\n', ' ')
        query = data.replace("{{project_id}}",
                             project.id).replace("{{dataset}}",
                                                 project.task['dataset'])

    print('DV360 CUSTOM SEGMENT TABLE')

    query_to_table('service',
                   project.id,
                   project.task['dataset'],
                   DV360_CUSTOM_SEGMENTS_TABLE,
                   query,
                   legacy=False)

    # Move Table back to sheets
    query = 'SELECT * from `' + project.id + '.' + project.task[
        'dataset'] + '.' + DV360_CUSTOM_SEGMENTS_TABLE + '`'
    rows = query_to_rows('service',
                         project.id,
                         project.task['dataset'],
                         query,
                         legacy=False)

    # makes sure types are correct in sheet
    a1_notation = a1_notation[:1] + '2' + a1_notation[1:]
    rows = rows_to_type(rows)
    sheets_clear('user', project.task['sheet'], 'DV3 Segments', a1_notation)
    sheets_write('user', project.task['sheet'], 'DV3 Segments', a1_notation,
                 rows)
Exemplo n.º 23
0
def sheets():
    if project.verbose: print('SHEETS')

    # if sheet or tab is missing, don't do anything
    if not project.task.get('sheet') or not project.task.get('tab'):
        if project.verbose: print('Missing Sheet and/or Tab, skipping task.')
        return

    # delete if specified, will delete sheet if no more tabs remain
    if project.task.get('delete', False):
        sheets_tab_delete(project.task['auth'], project.task['sheet'],
                          project.task['tab'])

    # create a sheet and tab if specified, if template
    if 'template' in project.task:
        sheets_create(
            project.task['auth'],
            project.task['sheet'],
            project.task['tab'],
            project.task['template'].get('sheet'),
            project.task['template'].get('tab'),
        )

    # copy template if specified ( clear in this context means overwrite )
    #if project.task.get('template', {}).get('sheet'):
    #  sheets_tab_copy(
    #    project.task['auth'],
    #    project.task['template']['sheet'],
    #    project.task['template']['tab'],
    #    project.task['sheet'],
    #    project.task['tab'],
    #    project.task.get('clear', False)
    #  )

    # if no template at least create tab
    #else:
    #  sheets_tab_create(
    #    project.task['auth'],
    #    project.task['sheet'],
    #    project.task['tab']
    #  )

    # clear if specified
    if project.task.get('clear', False):
        sheets_clear(project.task['auth'], project.task['sheet'],
                     project.task['tab'], project.task.get('range', 'A1'))

    # write data if specified
    if 'write' in project.task:
        rows = get_rows(project.task['auth'], project.task['write'])
        sheets_write(project.task['auth'],
                     project.task['sheet'],
                     project.task['tab'],
                     project.task['range'],
                     rows,
                     append=False)

    # append data if specified
    if 'append' in project.task:
        rows = get_rows(project.task['auth'], project.task['append'])
        sheets_write(project.task['auth'],
                     project.task['sheet'],
                     project.task['tab'],
                     project.task['range'],
                     rows,
                     append=True)

    # move data if specified
    if 'out' in project.task:
        rows = sheets_read(project.task['auth'],
                           project.task['sheet'], project.task['tab'],
                           project.task.get('range', 'A1'))

        if rows:
            schema = None

            # RECOMMENDED: define schema in json
            if project.task['out']['bigquery'].get('schema'):
                if project.verbose: print('SHEETS SCHEMA DEFINED')
                schema = project.task['out']['bigquery']['schema']

            # NOT RECOMMENDED: determine schema if missing
            else:
                if project.verbose:
                    print(
                        'SHEETS SCHEMA DETECT ( Not Recommended - Define Schema In JSON )'
                    )
                # cast rows to types ( for schema detection )
                rows = rows_to_type(rows)
                rows, schema = get_schema(rows,
                                          project.task.get('header', False),
                                          infer_type=project.task.get(
                                              'infer_type', True))

            # write to table ( not using put because no use cases for other destinations )
            rows_to_table(
                auth=project.task['out'].get('auth', project.task['auth']),
                project_id=project.id,
                dataset_id=project.task['out']['bigquery']['dataset'],
                table_id=project.task['out']['bigquery']['table'],
                rows=rows,
                schema=schema,
                skip_rows=1 if project.task.get('header', False) else 0,
                disposition=project.task['out']['bigquery'].get(
                    'disposition', 'WRITE_TRUNCATE'))

        else:
            print('SHEET EMPTY')
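
For reference, a minimal sketch ( not from the source ) of the task block this handler reads; the key names mirror the project.task lookups above, while the sheet URL, dataset, and table values are hypothetical placeholders.

# Hypothetical task block for the sheets handler above; all values are placeholders.
task = {
    'auth': 'user',
    'sheet': 'https://docs.google.com/spreadsheets/d/placeholder/edit',  # hypothetical sheet URL
    'tab': 'Sheet1',
    'range': 'A1:C',
    'header': True,
    'out': {
        'auth': 'service',
        'bigquery': {
            'dataset': 'example_dataset',  # hypothetical dataset
            'table': 'example_table'       # hypothetical table
            # RECOMMENDED: add a 'schema' list here instead of relying on detection
        }
    }
}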
Exemplo n.º 24
0
def main():
    # get parameters
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent("""\
    Command line to get table schema from BigQuery.

    Helps developers upload data to BigQuery and pull schemas.  These are the
    most common BigQuery tasks when developing solutions.

    Examples:
      Display table schema: `python helper.py --project [id] --dataset [name] --table [name] -s [credentials]`
      Upload csv table: `python helper.py --project [id] --dataset [name] --table [name] --csv [file] --schema [json] -s [credentials]`
      Upload excel sheet: `python helper.py --project [id] --dataset [name] --table [name] --excel_workbook [file] --excel_sheet [name] --schema [json] -s [credentials]`

  """))

    parser.add_argument('--dataset',
                        help='name of BigQuery dataset',
                        default=None)
    parser.add_argument('--table', help='name of BigQuery table', default=None)
    parser.add_argument('--csv', help='CSV file path', default=None)
    parser.add_argument('--schema', help='SCHEMA as JSON text', default=None)
    parser.add_argument('--excel_workbook',
                        help='Excel file path',
                        default=None)
    parser.add_argument('--excel_sheet', help='Excel sheet name', default=None)

    # initialize project
    parser = commandline_parser(parser,
                                arguments=('-u', '-c', '-s', '-v', '-p'))
    args = parser.parse_args()
    config = Configuration(user=args.user,
                           client=args.client,
                           service=args.service,
                           verbose=args.verbose,
                           project=args.project)

    auth = 'service' if args.service else 'user'

    schema = json.loads(args.schema) if args.schema else None

    if args.csv:

        with open(args.csv, 'r') as csv_file:
            rows = csv_to_rows(csv_file.read())

            if not schema:
                rows, schema = get_schema(rows)
                print('DETECTED SCHEMA', json.dumps(schema))
                print('Please run again with the above schema provided.')
                exit()

            rows_to_table(config, auth, config.project, args.dataset,
                          args.table, rows, schema)

    elif args.excel_workbook and args.excel_sheet:
        with open(args.excel_workbook, 'rb') as excel_file:
            rows = excel_to_rows(excel_file, args.excel_sheet)

            if not schema:
                rows, schema = get_schema(rows)
                print('DETECTED SCHEMA', json.dumps(schema))
                print('Please run again with the above schema provided.')
                exit()

            rows_to_table(config, auth, config.project, args.dataset,
                          args.table, rows, schema)

    else:
        # print schema
        print(
            json.dumps(table_to_schema(config, auth, config.project,
                                       args.dataset, args.table),
                       indent=2))
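
Since the helper parses --schema with json.loads, the flag takes the schema as inline JSON text. A hypothetical value, using the standard BigQuery field list format seen in these examples:

[
  { "name": "Campaign", "type": "STRING", "mode": "NULLABLE" },
  { "name": "Impressions", "type": "INTEGER", "mode": "NULLABLE" },
  { "name": "Cost", "type": "FLOAT", "mode": "NULLABLE" }
]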
Exemplo n.º 25
0
def sheets():
    if project.verbose: print('SHEETS')

    # clear if specified
    if project.task.get('clear', False):
        sheets_clear(project.task['auth'], project.task['sheet'],
                     project.task['tab'], project.task['range'])

    # delete if specified ( after clear to prevent errors in case both are given )
    if project.task.get('delete', False):
        sheets_tab_delete(project.task['auth'], project.task['sheet'],
                          project.task['tab'])

    # create or copy if specified
    if 'template' in project.task:
        sheets_tab_copy(project.task['auth'],
                        project.task['template']['sheet'],
                        project.task['template']['tab'], project.task['sheet'],
                        project.task['tab'])
    else:
        sheets_tab_create(project.task['auth'], project.task['sheet'],
                          project.task['tab'])

    # write data if specified
    if 'write' in project.task:
        rows = get_rows(project.task['auth'], project.task['write'])
        sheets_write(project.task['auth'], project.task['sheet'],
                     project.task['tab'], project.task['range'], rows)

    # move if specified
    if 'out' in project.task:
        rows = sheets_read(project.task['auth'], project.task['sheet'],
                           project.task['tab'], project.task['range'])

        if rows:
            schema = None

            # RECOMMENDED: define schema in json
            if project.task['out']['bigquery'].get('schema'):
                if project.verbose: print('SHEETS SCHEMA DEFINED')
                schema = project.task['out']['bigquery']['schema']
            # NOT RECOMMENDED: determine schema if missing
            else:
                if project.verbose:
                    print('SHEETS SCHEMA DETECT ( Not Recommended - Define Schema In JSON )')
                # cast rows to types ( for schema detection )
                rows = rows_to_type(rows)
                rows, schema = get_schema(rows,
                                          project.task.get('header', False),
                                          infer_type=project.task.get(
                                              'infer_type', True))

            # write to table ( not using put because no use cases for other destinations )
            rows_to_table(
                auth=project.task['out'].get('auth', project.task['auth']),
                project_id=project.id,
                dataset_id=project.task['out']['bigquery']['dataset'],
                table_id=project.task['out']['bigquery']['table'],
                rows=rows,
                schema=schema,
                skip_rows=1 if project.task.get('header', False) else 0,
                disposition=project.task['out']['bigquery'].get(
                    'disposition', 'WRITE_TRUNCATE'))

        else:
            print('SHEET EMPTY')