Example #1
def _pipeline_cell(args, cell_body):
  """Implements the BigQuery cell magic used to validate, execute or deploy BQ pipelines.

   The supported syntax is:
   %%bigquery pipeline [-q|--sql <query identifier>] <other args> <action>
   [<YAML or JSON cell_body or inline SQL>]

  Args:
    args: the arguments following '%bigquery pipeline'.
    cell_body: optional contents of the cell interpreted as YAML or JSON.
  Returns:
    The QueryResultsTable for a 'run' action, or QueryStats for a 'dryrun' action.
  """
  if args['action'] == 'deploy':
    raise Exception('Deploying a pipeline is not yet supported')

  env = {}
  for key, value in _utils.notebook_environment().items():
    if isinstance(value, gcp.bigquery._udf.UDF):
      env[key] = value

  query = _get_query_argument(args, cell_body, env)
  if args['verbose']:
    print(query.sql)
  if args['action'] == 'dryrun':
    result = query.execute_dry_run()
    return gcp.bigquery._query_stats.QueryStats(total_bytes=result['totalBytesProcessed'],
                                                is_cached=result['cacheHit'])
  if args['action'] == 'run':
    return query.execute(args['target'], table_mode=args['mode'], use_cache=not args['nocache'],
                         allow_large_results=args['large']).results
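
A notebook cell exercising this magic might look like the sketch below. The -q|--sql flag and the trailing action come from the docstring; the --target spelling is an assumption inferred from the args keys read above, and the dataset and table names are hypothetical.

%%bigquery pipeline --target mydataset.word_counts run
SELECT word, COUNT(*) AS n
FROM [publicdata:samples.shakespeare]
GROUP BY word

Passing dryrun instead of run would route through execute_dry_run and return the QueryStats wrapper rather than a results table.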
Example #2
def _pipeline_cell(args, cell_body):
  """Implements the BigQuery cell magic used to validate, execute or deploy BQ pipelines.

   The supported syntax is:
   %%bigquery pipeline [-q|--sql <query identifier>] <other args> <action>
   [<YAML or JSON cell_body or inline SQL>]

  Args:
    args: the arguments following '%bigquery pipeline'.
    cell_body: optional contents of the cell interpreted as YAML or JSON.
  Returns:
    The QueryResultsTable for a 'run' action, or QueryStats for a 'dryrun' action.
  """
  if args['action'] == 'deploy':
    raise Exception('Deploying a pipeline is not yet supported')

  env = {}
  for key, value in _utils.notebook_environment().items():
    if isinstance(value, gcp.bigquery._udf.FunctionCall):
      env[key] = value

  query = _get_query_argument(args, cell_body, env)
  if args['verbose']:
    print(query.sql)
  if args['action'] == 'dryrun':
    result = query.execute_dry_run()
    return gcp.bigquery._query_stats.QueryStats(total_bytes=result['totalBytesProcessed'],
                                                is_cached=result['cacheHit'])
  if args['action'] == 'run':
    return query.execute(args['target'], table_mode=args['mode'], use_cache=not args['nocache'],
                         allow_large_results=args['large']).results
Example #3
def bigquery(line, cell=None):
  """Implements the bigquery cell magic for ipython notebooks.

    The supported syntax is:

      %%bigquery <command> [<args>]
      <cell>

    or:

      %bigquery <command> [<args>]

    Use %bigquery --help for a list of commands, or %bigquery <command> --help for help
    on a specific command.

  Args:
    line: the magic line.
    cell: the body of the notebook cell.

  Returns:
    The result of processing the magic.
  """
  namespace = {}
  if '$' in line:
    # We likely have variables to expand; get the appropriate context.
    namespace = _utils.notebook_environment()

  return _utils.handle_magic_line(line, cell, _bigquery_parser, namespace=namespace)
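
For context, the two forms described in the docstring look like this in a notebook; the subcommands shown are ones implemented elsewhere in this module, and the table names are hypothetical.

# Cell form: the body below the magic line becomes the `cell` argument.
%%bigquery dryrun
SELECT COUNT(*) FROM [mydataset.mytable]

# Line form: `cell` is None. The '$' is what makes the dispatcher pull in the
# notebook environment above so $table can be expanded (the expansion itself
# happens inside handle_magic_line).
%bigquery sample --table $table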
Example #4
def _create_cell(args, cell_body):
  """Implements the BigQuery cell magic used to create datasets and tables.

   The supported syntax is:

     %%bigquery create dataset -n|--name <name> [-f|--friendly <friendlyname>]
     [<description>]

   or:

     %%bigquery create table -n|--name <tablename> [--overwrite]
     [<YAML or JSON cell_body defining schema to use for tables>]

  Args:
    args: the arguments following '%bigquery create <command>'.
    cell_body: optional contents of the cell; the dataset description, or the
        YAML or JSON schema for a new table.
  """
  if args['command'] == 'dataset':
    try:
      gcp.bigquery.DataSet(args['name']).create(friendly_name=args['friendly'],
                                                description=cell_body)
    except Exception as e:
      print('Failed to create dataset %s: %s' % (args['name'], e))
  else:
    if cell_body is None:
      print('Failed to create %s: no schema specified' % args['name'])
    else:
      try:
        record = _utils.parse_config(cell_body, _utils.notebook_environment(), as_dict=False)
        schema = gcp.bigquery.Schema(record)
        gcp.bigquery.Table(args['name']).create(schema=schema, overwrite=args['overwrite'])
      except Exception as e:
        print('Failed to create table %s: %s' % (args['name'], e))
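
Both forms, as a sketch; the YAML list-of-fields shape for the table schema is an assumption about what _utils.parse_config accepts, and all names are hypothetical.

%%bigquery create dataset -n scratch -f "Scratch data"
A dataset for throwaway notebook experiments.

%%bigquery create table -n scratch.events --overwrite
- name: timestamp
  type: TIMESTAMP
- name: event
  type: STRING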
Example #5
def _sample_cell(args, cell_body):
  """Implements the bigquery sample cell magic for ipython notebooks.

  Args:
    args: the optional arguments following '%%bigquery sample'.
    cell_body: optional contents of the cell interpreted as SQL, YAML or JSON.
  Returns:
    The results of executing the sampling query, or a profile of the sample data.
  """

  env = _utils.notebook_environment()
  query = None
  table = None
  view = None

  if args['query']:
    query = _get_query_argument(args, cell_body, env)
  elif args['table']:
    table = _get_table(args['table'])
  elif args['view']:
    view = _utils.get_notebook_item(args['view'])
    if not isinstance(view, gcp.bigquery.View):
      raise Exception('%s is not a view' % args['view'])
  else:
    query = gcp.bigquery.Query(cell_body, values=env)

  count = args['count']
  method = args['method']
  if method == 'random':
    sampling = gcp.bigquery.Sampling.random(percent=args['percent'], count=count)
  elif method == 'hashed':
    sampling = gcp.bigquery.Sampling.hashed(field_name=args['field'],
                                            percent=args['percent'],
                                            count=count)
  elif method == 'sorted':
    ascending = args['order'] == 'ascending'
    sampling = gcp.bigquery.Sampling.sorted(args['field'],
                                            ascending=ascending,
                                            count=count)
  else:
    # 'limit' and any unrecognized method fall back to the default count-based sampling.
    sampling = gcp.bigquery.Sampling.default(count=count)

  if query:
    results = query.sample(sampling=sampling)
  elif view:
    results = view.sample(sampling=sampling)
  else:
    results = table.sample(sampling=sampling)
  if args['verbose']:
    print(results.sql)
  if args['profile']:
    return _utils.profile_df(results.to_dataframe())
  else:
    return results
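
One hypothetical invocation, sampling a table with the hashed method; the flag spellings are inferred from the args keys read above (table, method, field, percent, count), and the table name is made up.

%%bigquery sample --table mydataset.logs --method hashed --field user_id --percent 1 --count 500

Adding --profile would return the _utils.profile_df(...) report over the sample instead of the raw results table.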
Example #6
def _execute_cell(args, cell_body):
  """Implements the BigQuery cell magic used to execute BQ queries.

   The supported syntax is:
   %%bigquery execute [-q|--sql <query identifier>] <other args>
   [<YAML or JSON cell_body or inline SQL>]

  Args:
    args: the arguments following '%bigquery execute'.
    cell_body: optional contents of the cell interpreted as YAML or JSON.
  Returns:
    The QueryResultsTable
  """
  query = _get_query_argument(args, cell_body, _utils.notebook_environment())
  if args['verbose']:
    print(query.sql)
  return query.execute(args['target'], table_mode=args['mode'], use_cache=not args['nocache'],
                       allow_large_results=args['large']).results
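
Bypassing the magic plumbing, the handler can also be exercised directly once argument parsing has produced the expected dict. The keys below mirror those read by the code above; the values, including the 'create' table_mode, are assumptions.

# Hypothetical direct invocation; normally the magic's argparse layer builds this dict.
args = {'sql': None,                     # no named query, so cell_body supplies the SQL
        'target': 'mydataset.results',   # destination table
        'mode': 'create',                # table_mode value is an assumption
        'nocache': False,
        'large': False,
        'verbose': True}
results = _execute_cell(args, 'SELECT 17 AS answer')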
Example #7
def _dryrun_cell(args, cell_body):
  """Implements the BigQuery cell magic used to dry run BQ queries.

   The supported syntax is:
   %%bigquery dryrun [-q|--sql <query identifier>]
   [<YAML or JSON cell_body or inline SQL>]

  Args:
    args: the argument following '%bigquery dryrun'.
    cell_body: optional contents of the cell interpreted as YAML or JSON.
  Returns:
    The response wrapped in a DryRunStats object
  """
  query = _get_query_argument(args, cell_body, _utils.notebook_environment())

  if args['verbose']:
    print(query.sql)
  result = query.execute_dry_run()
  return gcp.bigquery._query_stats.QueryStats(total_bytes=result['totalBytesProcessed'],
                                              is_cached=result['cacheHit'])
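
The returned QueryStats wraps the two fields pulled from the dry-run response. A minimal round-trip sketch, assuming 'my_query' names a Query object defined earlier in the notebook:

stats = _dryrun_cell({'sql': 'my_query', 'verbose': False}, None)
# stats carries totalBytesProcessed and cacheHit from the response, exposed
# through the QueryStats constructor arguments shown above.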
Example #8
def _udf_cell(args, js):
  """Implements the bigquery_udf cell magic for ipython notebooks.

  The supported syntax is:
  %%bigquery udf --module <var>
  <js function>

  Args:
    args: the optional arguments following '%%bigquery udf'.
    js: the UDF declaration (inputs and outputs) and implementation in JavaScript.
  Returns:
    None. The resulting UDF object is stored in the notebook environment under
    the variable name passed via --module.
  """
  variable_name = args['module']
  if not variable_name:
    raise Exception('Declaration must be of the form %%bigquery udf --module <variable name>')

  # Parse out the input and output specification
  spec_pattern = r'\{\{([^}]+)\}\}'
  spec_part_pattern = r'[a-z_][a-z0-9_]*'

  specs = re.findall(spec_pattern, js)
  if len(specs) < 2:
    raise Exception('The JavaScript must declare the input row and output emitter parameters '
                    'using valid jsdoc format comments.\n'
                    'The input row param declaration must be typed as {{field:type, field2:type}} '
                    'and the output emitter param declaration must be typed as '
                    'function({{field:type, field2:type}}).')

  inputs = []
  input_spec_parts = re.findall(spec_part_pattern, specs[0], flags=re.IGNORECASE)
  if len(input_spec_parts) % 2 != 0:
    raise Exception('Invalid input row param declaration. The jsdoc type expression must '
                    'define an object with field and type pairs.')
  for n, t in zip(input_spec_parts[0::2], input_spec_parts[1::2]):
    inputs.append((n, t))

  outputs = []
  output_spec_parts = re.findall(spec_part_pattern, specs[1], flags=re.IGNORECASE)
  if len(output_spec_parts) % 2 != 0:
    raise Exception('Invalid output emitter param declaration. The jsdoc type expression must '
                    'define a function accepting an object with field and type pairs.')
  for n, t in zip(output_spec_parts[0::2], output_spec_parts[1::2]):
    outputs.append((n, t))

  # Look for imports. We use a non-standard @import keyword; we could alternatively use @requires.
  # Object names can contain any characters except \r and \n.
  import_pattern = r'@import[\s]+(gs://[a-z\d][a-z\d_\.\-]*[a-z\d]/[^\n\r]+)'
  imports = re.findall(import_pattern, js)

  # Split the cell if necessary. We look for a 'function(' with no name and a header comment
  # block with @param and assume this is the primary function, up to a closing '}' at the start
  # of the line. The remaining cell content is used as support code.
  split_pattern = r'(.*)(/\*.*?@param.*?@param.*?\*/\w*\n\w*function\w*\(.*?^}\n?)(.*)'
  parts = re.match(split_pattern, js, re.MULTILINE|re.DOTALL)
  support_code = ''
  if parts:
    support_code = (parts.group(1) + parts.group(3)).strip()
    if support_code:
      js = parts.group(2)

  # Finally build the UDF object
  udf = gcp.bigquery.UDF(inputs, outputs, variable_name, js, support_code, imports)
  _utils.notebook_environment()[variable_name] = udf
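
A cell that satisfies the parsing above might look like this sketch: the two {{...}} jsdoc type expressions supply the input and output specs, the anonymous function between the comment block and the line-initial closing brace becomes the UDF body, and the field and module names are hypothetical.

%%bigquery udf --module times_two
/**
 * @param {{x: integer}} row
 * @param function({{x: integer, doubled: integer}}) emitter
 */
function(row, emitter) {
  emitter({x: row.x, doubled: row.x * 2});
}

After the cell runs, times_two can be referenced from subsequent queries because the last line of the handler stores it in the notebook environment.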