def test_datalab_load_table_from_gcs_csv(to_delete):
    import time

    # [START bigquery_migration_datalab_load_table_from_gcs_csv]
    import google.datalab.bigquery as bq

    # Create the dataset
    dataset_id = 'import_sample'
    # [END bigquery_migration_datalab_load_table_from_gcs_csv]
    # Use a unique dataset ID to avoid collisions when running tests
    dataset_id = 'test_dataset_{}'.format(int(time.time() * 1000))
    to_delete.append(dataset_id)
    # [START bigquery_migration_datalab_load_table_from_gcs_csv]
    bq.Dataset(dataset_id).create()

    # Create the table
    schema = [
        {'name': 'name', 'type': 'STRING'},
        {'name': 'post_abbr', 'type': 'STRING'},
    ]
    table = bq.Table(
        '{}.us_states'.format(dataset_id)).create(schema=schema)
    table.load(
        'gs://cloud-samples-data/bigquery/us-states/us-states.csv',
        mode='append',
        source_format='csv',
        csv_options=bq.CSVOptions(skip_leading_rows=1)
    )  # Waits for the job to complete
    # [END bigquery_migration_datalab_load_table_from_gcs_csv]

    assert table.length == 50

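# The `to_delete` fixture is defined elsewhere in the test module. A minimal
# sketch of what it is assumed to look like (hypothetical, for illustration):
# it yields a list that the test appends dataset IDs to, then drops each
# dataset after the test finishes.
import pytest


@pytest.fixture
def to_delete():
    import google.datalab.bigquery as bq
    doomed = []
    yield doomed
    for dataset_id in doomed:
        # Assumes Dataset.delete accepts delete_contents to drop tables too.
        bq.Dataset(dataset_id).delete(delete_contents=True)
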
def execute(self, context):
    table = bq.Table(self.table, context=None)
    # Create the destination table on first use, with the configured schema.
    if not table.exists():
        table.create(schema=self.schema)
    # Map the operator's csv_options dict onto bq.CSVOptions keyword arguments.
    kwargs = {}
    if 'delimiter' in self.csv_options:
        kwargs['delimiter'] = self.csv_options['delimiter']
    if 'skip' in self.csv_options:
        kwargs['skip_leading_rows'] = self.csv_options['skip']
    if 'strict' in self.csv_options:
        kwargs['allow_jagged_rows'] = self.csv_options['strict']
    if 'quote' in self.csv_options:
        kwargs['quote'] = self.csv_options['quote']
    csv_options = bq.CSVOptions(**kwargs)
    job = table.load(
        self.path,
        mode=self.mode,
        source_format=(
            'csv' if self.format == 'csv' else 'NEWLINE_DELIMITED_JSON'),
        csv_options=csv_options,
        ignore_unknown_values=not self.csv_options.get('strict'))
    if job.failed:
        raise Exception('Load failed: %s' % str(job.fatal_error))
    elif job.errors:
        raise Exception('Load completed with errors: %s' % str(job.errors))
    return {'result': job.result()}

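# Sketch of the csv_options mapping this operator expects (key names inferred
# from the checks in execute above). Every key is optional; omitted keys fall
# back to pydatalab's CSVOptions defaults. The values here are illustrative only.
csv_options = {
    'delimiter': ',',   # becomes CSVOptions(delimiter=...)
    'skip': 1,          # becomes CSVOptions(skip_leading_rows=...)
    'strict': False,    # becomes CSVOptions(allow_jagged_rows=...) and drives ignore_unknown_values
    'quote': '"',       # becomes CSVOptions(quote=...)
}
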
def execute(self, context):
    if self._table:
        pydatalab_context = google.datalab.Context.default()
        table = bq.Table(self._table, context=pydatalab_context)

    if self._mode == 'create':
        if table.exists():
            raise Exception(
                "%s already exists; mode should be 'append' or 'overwrite'" % self._table)
        if not self._schema:
            raise Exception(
                '%s does not exist, and no schema specified in cell; cannot load.' % self._table)
        table.create(schema=self._schema)
    elif not table.exists():
        raise Exception("%s does not exist; mode should be 'create'" % self._table)

    csv_options = bq.CSVOptions(
        delimiter=self._csv_options.get('delimiter'),
        skip_leading_rows=self._csv_options.get('skip'),
        allow_jagged_rows=self._csv_options.get('strict'),
        quote=self._csv_options.get('quote'))
    job = table.load(
        self._path,
        mode=self._mode,
        source_format=(
            'csv' if self._format == 'csv' else 'NEWLINE_DELIMITED_JSON'),
        csv_options=csv_options,
        ignore_unknown_values=not self._csv_options.get('strict'))
    if job.failed:
        raise Exception('Load failed: %s' % str(job.fatal_error))
    elif job.errors:
        raise Exception('Load completed with errors: %s' % str(job.errors))

def _load_cell(args, cell_body):
    """Implements the BigQuery load magic used to load data from GCS to a table.

    The supported syntax is:

        %bq load <optional args>

    Args:
      args: the arguments following '%bq load'.
      cell_body: optional contents of the cell interpreted as YAML or JSON.
    Returns:
      A message about whether the load succeeded or failed.
    """
    name = args['table']
    table = _get_table(name)
    if not table:
        table = bigquery.Table(name)

    if args['mode'] == 'create':
        if table.exists():
            raise Exception(
                'table %s already exists; use "append" or "overwrite" as mode.' % name)
        if not cell_body or 'schema' not in cell_body:
            raise Exception(
                'Table does not exist, and no schema specified in cell; cannot load.')

        env = google.datalab.utils.commands.notebook_environment()
        config = google.datalab.utils.commands.parse_config(cell_body, env, False)
        schema = config['schema']
        # schema can be an instance of bigquery.Schema.
        # For example, a user can run "my_schema = bigquery.Schema.from_data(df)" in a
        # previous cell and specify "schema: $my_schema" in the cell input.
        if not isinstance(schema, bigquery.Schema):
            jsonschema.validate(config, BigQuerySchema.TABLE_SCHEMA_SCHEMA)
            schema = bigquery.Schema(schema)
        table.create(schema=schema)
    elif not table.exists():
        raise Exception('table %s does not exist; use "create" as mode.' % name)

    csv_options = bigquery.CSVOptions(delimiter=args['delimiter'],
                                      skip_leading_rows=args['skip'],
                                      allow_jagged_rows=not args['strict'],
                                      quote=args['quote'])
    job = table.load(args['path'], mode=args['mode'], source_format=args['format'],
                     csv_options=csv_options, ignore_unknown_values=not args['strict'])
    if job.failed:
        raise Exception('Load failed: %s' % str(job.fatal_error))
    elif job.errors:
        raise Exception('Load completed with errors: %s' % str(job.errors))

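# Hypothetical direct invocation of _load_cell (normally the %bq load magic
# builds `args` from its command-line flags). The key names below are inferred
# from the function body above; the values are illustrative only.
load_args = {
    'table': 'import_sample.us_states',
    'mode': 'create',
    'path': 'gs://cloud-samples-data/bigquery/us-states/us-states.csv',
    'format': 'csv',
    'delimiter': ',',
    'skip': 1,
    'strict': True,
    'quote': '"',
}
load_cell_body = """
schema:
  - name: name
    type: STRING
  - name: post_abbr
    type: STRING
"""
_load_cell(load_args, load_cell_body)
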
def _create_external_data_source(self, skip_header_rows):
    import google.datalab.bigquery as bq

    df = self.browse(1, None)
    # Read each column as STRING because we only want to sample rows.
    schema_train = bq.Schema([{'name': name, 'type': 'STRING'}
                              for name in df.keys()])
    options = bq.CSVOptions(
        skip_leading_rows=(1 if skip_header_rows == True else 0))
    return bq.ExternalDataSource(
        self.path, csv_options=options, schema=schema_train, max_bad_records=0)

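# Minimal usage sketch (assumed GCS path and column names, not taken from this
# class): an ExternalDataSource like the one returned above is queried by
# registering it under a name in a Query's data_sources mapping, mirroring the
# pattern used by the query operator that follows.
import google.datalab.bigquery as bq

csv_source = bq.ExternalDataSource(
    'gs://cloud-samples-data/bigquery/us-states/us-states.csv',
    csv_options=bq.CSVOptions(skip_leading_rows=1),
    schema=bq.Schema([{'name': 'name', 'type': 'STRING'},
                      {'name': 'post_abbr', 'type': 'STRING'}]),
    max_bad_records=0)
results = bq.Query(
    'SELECT name, post_abbr FROM states_csv',
    data_sources={'states_csv': csv_source}).execute().result()
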
def execute(self, context):
    if self.data_source:
        kwargs = {}
        if self.csv_options:
            csv_kwargs = {}
            if 'delimiter' in self.csv_options:
                csv_kwargs['delimiter'] = self.csv_options['delimiter']
            if 'skip' in self.csv_options:
                csv_kwargs['skip_leading_rows'] = self.csv_options['skip']
            if 'strict' in self.csv_options:
                csv_kwargs['allow_jagged_rows'] = self.csv_options['strict']
            if 'quote' in self.csv_options:
                csv_kwargs['quote'] = self.csv_options['quote']
            kwargs['csv_options'] = bq.CSVOptions(**csv_kwargs)
        if self.format:
            kwargs['source_format'] = self.format
        if self.max_bad_records:
            kwargs['max_bad_records'] = self.max_bad_records
        external_data_source = bq.ExternalDataSource(
            source=self.path, schema=bq.Schema(self.schema), **kwargs)
        query = bq.Query(
            sql=self.sql,
            data_sources={self.data_source: external_data_source})
    else:
        query = bq.Query(sql=self.sql)

    # use_cache is False since this is most likely the case in pipeline scenarios
    # allow_large_results can be True only if table is specified (i.e. when it's not None)
    kwargs = {}
    if self.mode is not None:
        kwargs['mode'] = self.mode
    output_options = bq.QueryOutput.table(
        name=self.table, use_cache=False,
        allow_large_results=self.table is not None, **kwargs)
    query_params = bq.Query.get_query_parameters(self.parameters)
    job = query.execute(output_options=output_options, query_params=query_params)
    # Returning the table name here makes it available to downstream task instances.
    return {'table': job.result().full_name}

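# Assumed shape of self.parameters (its construction is outside this snippet):
# pydatalab query parameters are typically a list of name/type/value mappings,
# which bq.Query.get_query_parameters converts into BigQuery query parameters.
# Illustrative values only.
parameters = [
    {'name': 'state_name', 'type': 'STRING', 'value': 'Ohio'},
]
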