from time import sleep

# API_BigQuery, project, and row_to_json are assumed to be provided elsewhere
# in this module/package (they are referenced but not defined in this section).


def table_to_rows(auth, project_id, dataset_id, table_id, fields=None, row_start=0, row_max=None, as_object=False):
  if project.verbose:
    print('BIGQUERY ROWS:', project_id, dataset_id, table_id)

  # Fetch table metadata to learn the schema and whether this is a table or a view.
  table = API_BigQuery(auth).tables().get(
    projectId=project_id,
    datasetId=dataset_id,
    tableId=table_id
  ).execute()

  table_schema = table['schema'].get('fields', [])
  table_type = table['type']
  table_legacy = table.get('view', {}).get('useLegacySql', False)

  if table_type == 'TABLE':
    # Plain tables can be read directly with tabledata().list.
    for row in API_BigQuery(auth, iterate=True).tabledata().list(
      projectId=project_id,
      datasetId=dataset_id,
      tableId=table_id,
      selectedFields=fields,
      startIndex=row_start,
      maxResults=row_max,
    ).execute():
      yield row_to_json(row, table_schema, as_object)
  else:
    # Views cannot be read with tabledata().list, so fall back to running a query.
    yield from query_to_rows(auth, project_id, dataset_id, 'SELECT * FROM %s' % table_id, row_max, table_legacy, as_object)
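# A minimal usage sketch, not part of the library API: the 'service' auth mode
# and the project/dataset/table names below are placeholder assumptions.
def _example_table_to_rows():
  for row in table_to_rows('service', 'my-project', 'my_dataset', 'my_table', fields='id,name', row_max=10):
    # Each yielded row is a list of converted values, or a dict keyed by
    # column name when as_object=True.
    print(row)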
def query_to_rows(auth, project_id, dataset_id, query, row_max=None, legacy=True, as_object=False):
  if project.verbose:
    print('BIGQUERY QUERY:', project_id, dataset_id)

  # Create the query
  body = {
    'kind': 'bigquery#queryRequest',
    'query': query,
    'timeoutMs': 10000,
    'dryRun': False,
    'useQueryCache': True,
    'useLegacySql': legacy
  }

  if row_max:
    body['maxResults'] = row_max

  if dataset_id:
    body['defaultDataset'] = {'projectId': project_id, 'datasetId': dataset_id}

  # wait for query to complete
  response = API_BigQuery(auth).jobs().query(projectId=project_id, body=body).execute()

  while not response['jobComplete']:
    sleep(5)
    response = API_BigQuery(auth).jobs().getQueryResults(
      projectId=project_id,
      jobId=response['jobReference']['jobId']
    ).execute(iterate=False)

  # fetch query results
  schema = response.get('schema', {}).get('fields', None)
  row_count = 0
  while 'rows' in response:
    for row in response['rows']:
      yield row_to_json(row, schema, as_object)
      row_count += 1

    # Note: the API returns the next page token as 'pageToken' (lowercase p);
    # checking 'PageToken' would never match and left this branch dead.
    if 'pageToken' in response:
      response = API_BigQuery(auth).jobs().getQueryResults(
        projectId=project_id,
        jobId=response['jobReference']['jobId'],
        pageToken=response['pageToken']
      ).execute(iterate=False)
    elif row_count < int(response['totalRows']):
      # Fall back to index-based paging when no token is present.
      response = API_BigQuery(auth).jobs().getQueryResults(
        projectId=project_id,
        jobId=response['jobReference']['jobId'],
        startIndex=row_count
      ).execute(iterate=False)
    else:
      break
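# A minimal usage sketch, not part of the library API: credentials, project,
# dataset, and table names are placeholder assumptions. Standard SQL is
# requested with legacy=False; as_object=True yields dicts keyed by column.
def _example_query_to_rows():
  rows = query_to_rows(
    'service', 'my-project', 'my_dataset',
    'SELECT id, name FROM `my-project.my_dataset.my_table`',
    row_max=100,
    legacy=False,
    as_object=True
  )
  for row in rows:
    print(row['id'], row['name'])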