def test_extract_cell_table(self, mock_get_notebook_item, mock_get_table,
                            mock_table_extract):
  args = {'table': 'test-table', 'path': 'test-path', 'format': 'json',
          'delimiter': None, 'header': None, 'compress': None,
          'nocache': None}
  mock_get_table.return_value = None
  with self.assertRaisesRegexp(Exception, 'Could not find table test-table'):
    bq.commands._bigquery._extract_cell(args, None)

  mock_get_table.return_value = bq.Table('project.test.table',
                                         self._create_context())
  mock_table_extract.return_value.result = lambda: 'test-results'
  mock_table_extract.return_value.failed = False
  mock_table_extract.return_value.errors = None
  self.assertEqual(bq.commands._bigquery._extract_cell(args, None),
                   'test-results')
  mock_table_extract.assert_called_with('test-path', format='json',
                                        csv_delimiter=None, csv_header=None,
                                        compress=None)
def test_datalab_load_table_from_dataframe(to_delete):
  # [START bigquery_migration_datalab_load_table_from_dataframe]
  import google.datalab.bigquery as bq
  import pandas

  # Create the dataset
  dataset_id = 'import_sample'
  # [END bigquery_migration_datalab_load_table_from_dataframe]
  # Use unique dataset ID to avoid collisions when running tests
  dataset_id = 'test_dataset_{}'.format(int(time.time() * 1000))
  to_delete.append(dataset_id)
  # [START bigquery_migration_datalab_load_table_from_dataframe]
  bq.Dataset(dataset_id).create()

  # Create the table and load the data
  dataframe = pandas.DataFrame([
      {'title': 'The Meaning of Life', 'release_year': 1983},
      {'title': 'Monty Python and the Holy Grail', 'release_year': 1975},
      {'title': 'Life of Brian', 'release_year': 1979},
      {
          'title': 'And Now for Something Completely Different',
          'release_year': 1971
      },
  ])
  schema = bq.Schema.from_data(dataframe)
  table = bq.Table(
      '{}.monty_python'.format(dataset_id)).create(schema=schema)
  table.insert(dataframe)  # Starts streaming insert of data
def test_table_viewer(self, mock_table_exists, mock_get_field_list,
                      mock_get_data, mock_tables_get, mock_render_chart_data,
                      mock_next_id):
  test_table = bq.Table('testproject.test.table', self._create_context())

  mock_table_exists.return_value = False
  with self.assertRaisesRegexp(Exception, 'does not exist'):
    bq.commands._bigquery._table_viewer(test_table)

  mock_table_exists.return_value = True
  mock_get_field_list.return_value = ['col1']
  mock_get_data.return_value = ({'cols': ['col1'], 'rows': ['val1']}, 1)
  mock_render_chart_data.return_value = 'test_chart_data'
  mock_next_id.return_value = 'test_id'

  viewer = bq.commands._bigquery._table_viewer(test_table)
  mock_table_exists.assert_called()
  mock_get_field_list.assert_called()
  mock_render_chart_data.assert_called()

  expected_html_header = '''
    <div class="bqtv" id="test_id">test_chart_data</div>
    <br />(testproject.test.table)<br />
  '''
  self.assertIn(expected_html_header, viewer)
def execute(self, context):
  table = bq.Table(self.table, context=None)
  if not table.exists():
    table.create(schema=self.schema)

  kwargs = {}
  if 'delimiter' in self.csv_options:
    kwargs['delimiter'] = self.csv_options['delimiter']
  if 'skip' in self.csv_options:
    kwargs['skip_leading_rows'] = self.csv_options['skip']
  if 'strict' in self.csv_options:
    kwargs['allow_jagged_rows'] = self.csv_options['strict']
  if 'quote' in self.csv_options:
    kwargs['quote'] = self.csv_options['quote']
  csv_options = bq.CSVOptions(**kwargs)

  job = table.load(
      self.path,
      mode=self.mode,
      source_format=('csv' if self.format == 'csv'
                     else 'NEWLINE_DELIMITED_JSON'),
      csv_options=csv_options,
      ignore_unknown_values=not self.csv_options.get('strict'))

  if job.failed:
    raise Exception('Load failed: %s' % str(job.fatal_error))
  elif job.errors:
    raise Exception('Load completed with errors: %s' % str(job.errors))
  return {'result': job.result()}
def execute(self, context):
  if self._table:
    pydatalab_context = google.datalab.Context.default()
    table = bq.Table(self._table, context=pydatalab_context)

  if self._mode == 'create':
    if table.exists():
      raise Exception(
          "%s already exists; mode should be \'append\' or \'overwrite\'" %
          self._table)
    if not self._schema:
      raise Exception(
          '%s does not exist, and no schema specified in cell; cannot load.' %
          self._table)
    table.create(schema=self._schema)
  elif not table.exists():
    raise Exception('%s does not exist; mode should be \'create\'' %
                    self._table)

  csv_options = bq.CSVOptions(
      delimiter=self._csv_options.get('delimiter'),
      skip_leading_rows=self._csv_options.get('skip'),
      allow_jagged_rows=self._csv_options.get('strict'),
      quote=self._csv_options.get('quote'))
  job = table.load(
      self._path,
      mode=self._mode,
      source_format=('csv' if self._format == 'csv'
                     else 'NEWLINE_DELIMITED_JSON'),
      csv_options=csv_options,
      ignore_unknown_values=not self._csv_options.get('strict'))
  if job.failed:
    raise Exception('Load failed: %s' % str(job.fatal_error))
  elif job.errors:
    raise Exception('Load completed with errors: %s' % str(job.errors))
def test_datalab_load_table_from_gcs_csv(to_delete):
  # [START bigquery_migration_datalab_load_table_from_gcs_csv]
  import google.datalab.bigquery as bq

  # Create the dataset
  dataset_id = 'import_sample'
  # [END bigquery_migration_datalab_load_table_from_gcs_csv]
  # Use unique dataset ID to avoid collisions when running tests
  dataset_id = 'test_dataset_{}'.format(int(time.time() * 1000))
  to_delete.append(dataset_id)
  # [START bigquery_migration_datalab_load_table_from_gcs_csv]
  bq.Dataset(dataset_id).create()

  # Create the table
  schema = [
      {'name': 'name', 'type': 'STRING'},
      {'name': 'post_abbr', 'type': 'STRING'},
  ]
  table = bq.Table(
      '{}.us_states'.format(dataset_id)).create(schema=schema)
  table.load(
      'gs://cloud-samples-data/bigquery/us-states/us-states.csv',
      mode='append',
      source_format='csv',
      csv_options=bq.CSVOptions(skip_leading_rows=1)
  )  # Waits for the job to complete
  # [END bigquery_migration_datalab_load_table_from_gcs_csv]

  assert table.length == 50
def test_table_cell_list_dataset(self, mock_dataset, mock_default_context):
  args = {'command': 'list', 'filter': '', 'dataset': 'test-dataset',
          'project': None}
  tables = [bq.Table('project.test.' + name) for name in ['t1', 't2']]
  mock_dataset.return_value = iter(tables)
  self.assertEqual(
      bq.commands._bigquery._table_cell(args, None),
      '<ul><li>project.test.t1</li><li>project.test.t2</li></ul>')
def run_analysis(args):
  """Builds an analysis file for training.

  Uses BigQuery tables to do the analysis.

  Args:
    args: command line args

  Raises:
    ValueError if schema contains unknown types.
  """
  import google.datalab.bigquery as bq
  if args.bigquery_table:
    table = bq.Table(args.bigquery_table)
    schema_list = table.schema._bq_schema
  else:
    schema_list = json.loads(
        file_io.read_file_to_string(args.schema_file).decode())
    table = bq.ExternalDataSource(
        source=args.input_file_pattern,
        schema=bq.Schema(schema_list))

  # Check the schema is supported.
  for col_schema in schema_list:
    col_type = col_schema['type'].lower()
    if col_type != 'string' and col_type != 'integer' and col_type != 'float':
      raise ValueError('Schema contains an unsupported type %s.' % col_type)

  run_numerical_analysis(table, schema_list, args)
  run_categorical_analysis(table, schema_list, args)

  # Save a copy of the schema to the output location.
  file_io.write_string_to_file(
      os.path.join(args.output_dir, SCHEMA_FILE),
      json.dumps(schema_list, indent=2, separators=(',', ': ')))
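# Illustrative shape of a schema_list that passes the type check above
# (column names are hypothetical); only STRING, INTEGER, and FLOAT column
# types are accepted by run_analysis:
#
#   [{'name': 'user_id', 'type': 'INTEGER'},
#    {'name': 'country', 'type': 'STRING'},
#    {'name': 'score', 'type': 'FLOAT'}]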
def test_get_bq_extract_operator_definition(self, mock_table):
  mock_table.return_value = bq.Table(
      'foo_project.foo_dataset.foo_table',
      context=PipelineTest._create_context())
  task_id = 'foo'
  task_details = {}
  task_details['type'] = 'BigQueryToCloudStorage'
  task_details['table'] = 'foo_project.foo_dataset.foo_table'
  task_details['path'] = 'foo_path'
  task_details['format'] = 'csv'
  task_details['delimiter'] = '$'
  task_details['header'] = False
  task_details['compress'] = True

  operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
      task_id, task_details, None)
  self.assertEqual(
      operator_def,
      """foo = BigQueryToCloudStorageOperator(task_id='foo_id', compression=\"\"\"GZIP\"\"\", destination_cloud_storage_uris=[\'foo_path\'], export_format=\"\"\"CSV\"\"\", field_delimiter=\"\"\"$\"\"\", print_header=False, source_project_dataset_table=\"\"\"foo_project.foo_dataset.foo_table\"\"\", dag=dag)
""")  # noqa

  task_details['format'] = 'json'
  operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
      task_id, task_details, None)
  self.assertEqual(
      operator_def,
      """foo = BigQueryToCloudStorageOperator(task_id='foo_id', compression=\"\"\"GZIP\"\"\", destination_cloud_storage_uris=[\'foo_path\'], export_format=\"\"\"NEWLINE_DELIMITED_JSON\"\"\", field_delimiter=\"\"\"$\"\"\", print_header=False, source_project_dataset_table=\"\"\"foo_project.foo_dataset.foo_table\"\"\", dag=dag)
""")  # noqa
def test_get_bq_load_operator_definition(self, mock_table):
  mock_table.return_value = bq.Table(
      'foo_project.foo_dataset.foo_table',
      context=PipelineTest._create_context())
  task_id = 'foo'
  task_details = {}
  task_details['type'] = 'GoogleCloudStorageToBigQuery'
  task_details['table'] = 'foo_project.foo_dataset.foo_table'
  task_details['path'] = 'gs://foo_bucket/foo_file.csv'
  task_details['format'] = 'csv'
  task_details['delimiter'] = '$'
  task_details['skip'] = False

  operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
      task_id, task_details, None)
  self.assertEqual(
      operator_def,
      """foo = GoogleCloudStorageToBigQueryOperator(task_id='foo_id', bucket=\"\"\"foo_bucket\"\"\", destination_project_dataset_table=\"\"\"foo_project.foo_dataset.foo_table\"\"\", export_format=\"\"\"CSV\"\"\", field_delimiter=\"\"\"$\"\"\", skip_leading_rows=False, source_objects=\"\"\"foo_file.csv\"\"\", dag=dag)
""")  # noqa

  task_details['format'] = 'json'
  operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
      task_id, task_details, None)
  self.assertEqual(
      operator_def,
      """foo = GoogleCloudStorageToBigQueryOperator(task_id='foo_id', bucket=\"\"\"foo_bucket\"\"\", destination_project_dataset_table=\"\"\"foo_project.foo_dataset.foo_table\"\"\", export_format=\"\"\"NEWLINE_DELIMITED_JSON\"\"\", field_delimiter=\"\"\"$\"\"\", skip_leading_rows=False, source_objects=\"\"\"foo_file.csv\"\"\", dag=dag)
""")  # noqa
def main(argv=None):
  args = parse_arguments(sys.argv if argv is None else argv)

  if args.schema:
    schema = json.loads(
        file_io.read_file_to_string(args.schema).decode())
  else:
    import google.datalab.bigquery as bq
    schema = bq.Table(args.bigquery).schema._bq_schema
  features = json.loads(
      file_io.read_file_to_string(args.features).decode())

  file_io.recursive_create_dir(args.output)

  if args.cloud:
    run_cloud_analysis(
        output_dir=args.output,
        csv_file_pattern=args.csv,
        bigquery_table=args.bigquery,
        schema=schema,
        features=features)
  else:
    feature_analysis.run_local_analysis(
        output_dir=args.output,
        csv_file_pattern=args.csv,
        schema=schema,
        features=features)
def _get_bq_load_params(operator_task_details):
  if 'table' in operator_task_details:
    table = bigquery.commands._bigquery._get_table(
        operator_task_details['table'])
    if not table:
      table = bigquery.Table(operator_task_details['table'])
      # TODO(rajivpb): Ensure that mode == create here.
    operator_task_details['destination_project_dataset_table'] = \
        table.full_name
    del operator_task_details['table']

  if 'format' in operator_task_details:
    operator_task_details['export_format'] = (
        'CSV' if operator_task_details['format'] == 'csv'
        else 'NEWLINE_DELIMITED_JSON')
    del operator_task_details['format']

  if 'delimiter' in operator_task_details:
    operator_task_details['field_delimiter'] = operator_task_details[
        'delimiter']
    del operator_task_details['delimiter']

  if 'skip' in operator_task_details:
    operator_task_details['skip_leading_rows'] = operator_task_details['skip']
    del operator_task_details['skip']

  if 'path' in operator_task_details:
    bucket, source_object = Pipeline._get_bucket_and_source_object(
        operator_task_details['path'])
    operator_task_details['bucket'] = bucket
    operator_task_details['source_objects'] = source_object
    del operator_task_details['path']

  return operator_task_details
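# A sketch of the key renaming _get_bq_load_params performs, with hypothetical
# values (the bucket/object split follows the behavior exercised in the
# GoogleCloudStorageToBigQuery operator test above):
#
#   {'table': 'p.d.t', 'format': 'csv', 'delimiter': ',',
#    'skip': 1, 'path': 'gs://my-bucket/data.csv'}
#
# becomes
#
#   {'destination_project_dataset_table': 'p.d.t', 'export_format': 'CSV',
#    'field_delimiter': ',', 'skip_leading_rows': 1,
#    'bucket': 'my-bucket', 'source_objects': 'data.csv'}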
def test_numerics(self):
  """Build a BQ table, and then call analyze on it."""
  schema = [{'name': 'col1', 'type': 'INTEGER'},
            {'name': 'col2', 'type': 'FLOAT'}]
  project_id = dl.Context.default().project_id
  dataset_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
  table_name = 'temp_table'
  full_table_name = '%s.%s.%s' % (project_id, dataset_name, table_name)

  output_folder = tempfile.mkdtemp()

  try:
    # Make a dataset, a table, and insert data.
    db = bq.Dataset((project_id, dataset_name))
    db.create()

    table = bq.Table(full_table_name)
    table.create(schema=bq.Schema(schema), overwrite=True)

    data = [{'col1': i, 'col2': 10 * i + 0.5} for i in range(100)]
    table.insert(data)

    analyze_data.run_cloud_analysis(
        output_dir=output_folder,
        csv_file_pattern=None,
        bigquery_table=full_table_name,
        schema=schema,
        features={'col1': {'transform': 'scale'},
                  'col2': {'transform': 'identity'}})

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder, analyze_data.STATS_FILE)).decode())

    self.assertEqual(stats['num_examples'], 100)
    col = stats['column_stats']['col1']
    self.assertAlmostEqual(col['max'], 99.0)
    self.assertAlmostEqual(col['min'], 0.0)
    self.assertAlmostEqual(col['mean'], 49.5)

    col = stats['column_stats']['col2']
    self.assertAlmostEqual(col['max'], 990.5)
    self.assertAlmostEqual(col['min'], 0.5)
    self.assertAlmostEqual(col['mean'], 495.5)
  finally:
    shutil.rmtree(output_folder)
    db.delete(delete_contents=True)
def test_table_cell_view(self, mock_get_table, mock_default_context):
  args = {'command': 'view', 'name': 'test-table'}
  table = bq.Table('project.test.table')
  mock_get_table.return_value = None
  with self.assertRaisesRegexp(Exception, 'Could not find table test-table'):
    bq.commands._bigquery._table_cell(args, None)

  mock_get_table.return_value = table
  self.assertEqual(table, bq.commands._bigquery._table_cell(args, None))
def _load_cell(args, cell_body):
  """Implements the BigQuery load magic used to load data from GCS to a table.

  The supported syntax is:

      %bq load <optional args>

  Args:
    args: the arguments following '%bq load'.
    cell_body: optional contents of the cell interpreted as YAML or JSON.

  Returns:
    A message about whether the load succeeded or failed.
  """
  name = args['table']
  table = _get_table(name)
  if not table:
    table = bigquery.Table(name)

  if args['mode'] == 'create':
    if table.exists():
      raise Exception('table %s already exists; use "append" or "overwrite" '
                      'as mode.' % name)
    if not cell_body or 'schema' not in cell_body:
      raise Exception(
          'Table does not exist, and no schema specified in cell; '
          'cannot load.')

    env = google.datalab.utils.commands.notebook_environment()
    config = google.datalab.utils.commands.parse_config(cell_body, env, False)
    schema = config['schema']
    # schema can be an instance of bigquery.Schema.
    # For example, user can run "my_schema = bigquery.Schema.from_data(df)"
    # in a previous cell and specify "schema: $my_schema" in cell input.
    if not isinstance(schema, bigquery.Schema):
      jsonschema.validate(config, BigQuerySchema.TABLE_SCHEMA_SCHEMA)
      schema = bigquery.Schema(schema)
    table.create(schema=schema)
  elif not table.exists():
    raise Exception('table %s does not exist; use "create" as mode.' % name)

  csv_options = bigquery.CSVOptions(
      delimiter=args['delimiter'],
      skip_leading_rows=args['skip'],
      allow_jagged_rows=not args['strict'],
      quote=args['quote'])
  job = table.load(
      args['path'],
      mode=args['mode'],
      source_format=args['format'],
      csv_options=csv_options,
      ignore_unknown_values=not args['strict'])
  if job.failed:
    raise Exception('Load failed: %s' % str(job.fatal_error))
  elif job.errors:
    raise Exception('Load completed with errors: %s' % str(job.errors))
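# An illustrative notebook cell for this magic. The table name, GCS path, and
# exact flag spellings here are assumptions for the sketch, not confirmed
# argument names; the cell body supplies the schema as YAML when mode is
# 'create':
#
#   %%bq load --table my_project.my_dataset.my_table --mode create \
#       --path gs://my-bucket/data.csv --format csv --skip 1
#   schema:
#     - name: col1
#       type: STRING
#     - name: col2
#       type: INTEGER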
def test_load_cell(self, mock_get_table, mock_table_load, mock_table_exists,
                   mock_table_create, mock_default_context):
  args = {'table': 'project.test.table', 'mode': 'create',
          'path': 'test/path_%(_ds)s', 'skip': None, 'csv': None,
          'delimiter': None, 'format': 'csv', 'strict': None, 'quote': None}
  context = self._create_context()
  mock_get_table.return_value = bq.Table('project.test.table')
  job = bq._query_job.QueryJob('test_id', 'project.test.table', 'test_sql',
                               context)

  mock_table_exists.return_value = True
  with self.assertRaisesRegexp(
      Exception, 'already exists; use "append" or "overwrite" as mode.'):
    bq.commands._bigquery._load_cell(args, None)

  mock_table_exists.return_value = False
  with self.assertRaisesRegexp(
      Exception, 'Table does not exist, and no schema specified'):
    bq.commands._bigquery._load_cell(args, None)

  cell_body = {
      'schema': [
          {'name': 'col1', 'type': 'int64', 'mode': 'NULLABLE',
           'description': 'description1'},
          {'name': 'col1', 'type': 'STRING', 'mode': 'required',
           'description': 'description1'}
      ],
      'parameters': [
          {'name': 'custom', 'type': 'FLOAT', 'value': 4.23}
      ]
  }

  mock_table_load.return_value = job
  job._is_complete = True
  job._fatal_error = 'fatal error'
  with self.assertRaisesRegexp(Exception, 'Load failed: fatal error'):
    bq.commands._bigquery._load_cell(args, json.dumps(cell_body))

  job._fatal_error = None
  job._errors = 'error'
  with self.assertRaisesRegexp(Exception, 'Load completed with errors: error'):
    bq.commands._bigquery._load_cell(args, json.dumps(cell_body))

  job._errors = None
  bq.commands._bigquery._load_cell(args, json.dumps(cell_body))

  today = datetime.now().date().isoformat()
  mock_table_load.assert_called_with('test/path_{0}'.format(today),
                                     mode='create', source_format='csv',
                                     csv_options=mock.ANY,
                                     ignore_unknown_values=True)

  mock_get_table.return_value = None
  mock_table_exists.return_value = True
  args['mode'] = 'append'
  args['format'] = 'csv'
  bq.commands._bigquery._load_cell(args, None)
  mock_table_load.assert_called_with('test/path_{0}'.format(today),
                                     mode='append', source_format='csv',
                                     csv_options=mock.ANY,
                                     ignore_unknown_values=True)
def test_table_cell_list_bad_filter(self, mock_datasets,
                                    mock_default_context):
  args = {'command': 'list', 'filter': 't7', 'dataset': None, 'project': None}
  tables = [bq.Table('project.test.' + name) for name in ['t1', 't2', 't11']]
  ds1 = mock.MagicMock()
  ds1.__iter__.return_value = iter([tables[0], tables[1]])
  ds2 = mock.MagicMock()
  ds2.__iter__.return_value = iter([tables[2]])
  mock_datasets.return_value = iter([ds1, ds2])
  self.assertEqual(
      bq.commands._bigquery._table_cell(args, None),
      '<pre>&lt;empty&gt;</pre>')
def test_table_cell_list_project(self, mock_datasets, mock_default_context):
  args = {'command': 'list', 'filter': '', 'dataset': None,
          'project': 'test-project'}
  tables = [bq.Table('project.test.' + name) for name in ['t1', 't2', 't3']]
  ds1 = mock.MagicMock()
  ds1.__iter__.return_value = iter([tables[0], tables[1]])
  ds2 = mock.MagicMock()
  ds2.__iter__.return_value = iter([tables[2]])
  mock_datasets.return_value = iter([ds1, ds2])
  self.assertEqual(
      bq.commands._bigquery._table_cell(args, None),
      '<ul><li>project.test.t1</li><li>project.test.t2</li>'
      '<li>project.test.t3</li></ul>')
def test_get_bq_execute_operator_definition(self, mock_table):
  mock_table.return_value = bq.Table(
      'foo_project.foo_dataset.foo_table',
      context=PipelineTest._create_context())
  task_id = 'foo'
  task_details = {}
  task_details['type'] = 'BigQuery'
  task_details['query'] = google.datalab.bigquery.Query(
      'SELECT * FROM publicdata.samples.wikipedia LIMIT 5')

  operator_def = pipeline.Pipeline(None, None)._get_operator_definition(
      task_id, task_details)
  self.assertEqual(operator_def, "foo = BigQueryOperator(task_id='foo_id', bql='SELECT * FROM publicdata.samples.wikipedia LIMIT 5', use_legacy_sql=False, dag=dag)\n")  # noqa
def test_local_bigquery_transform(self):
  """Test transform locally, but the data comes from bigquery."""
  try:
    self._create_test_data()

    # Make a BQ table, and insert 1 row.
    project_id = dl.Context.default().project_id
    dataset_name = 'test_transform_raw_data_%s' % uuid.uuid4().hex
    table_name = 'tmp_table'

    dataset = bq.Dataset((project_id, dataset_name)).create()
    table = bq.Table((project_id, dataset_name, table_name))
    table.create([{'name': 'num_col', 'type': 'FLOAT'},
                  {'name': 'img_col', 'type': 'STRING'}])
    table.insert(data=[{'num_col': 23.0, 'img_col': self.img_filepath}])

    tfex_dir = os.path.join(self.output_folder, 'test_results')
    cmd = ['python ' + os.path.join(CODE_PATH, 'transform_raw_data.py'),
           '--bigquery-table=%s.%s.%s' % (project_id, dataset_name,
                                          table_name),
           '--analyze-output-dir=' + self.output_folder,
           '--output-filename-prefix=features',
           '--project-id=' + project_id,
           '--output-dir=' + tfex_dir]
    subprocess.check_call(' '.join(cmd), shell=True)

    # Read the tf record file. There should only be one file.
    record_filepath = os.path.join(tfex_dir,
                                   'features-00000-of-00001.tfrecord.gz')
    options = tf.python_io.TFRecordOptions(
        compression_type=tf.python_io.TFRecordCompressionType.GZIP)
    serialized_example = next(
        tf.python_io.tf_record_iterator(record_filepath, options=options))
    example = tf.train.Example()
    example.ParseFromString(serialized_example)

    transformed_number = example.features.feature['num_col'].float_list.value[0]
    self.assertAlmostEqual(transformed_number, 24.0)

    image_bytes = example.features.feature['img_col'].bytes_list.value[0]
    raw_img = Image.open(self.img_filepath).convert('RGB')
    img_file = six.BytesIO()
    raw_img.save(img_file, 'jpeg')
    expected_image_bytes = img_file.getvalue()
    self.assertEqual(image_bytes, expected_image_bytes)
  finally:
    dataset.delete(delete_contents=True)
    shutil.rmtree(self.output_folder)
def test_table_cell_describe(self, mock_get_table, mock_default_context):
  args = {'command': 'describe', 'name': 'test-table', 'overwrite': None}
  mock_get_table.return_value = None
  with self.assertRaisesRegexp(Exception, 'Could not find table'):
    bq.commands._bigquery._table_cell(args, None)

  mock_get_table.return_value = bq.Table('project.test.table')
  schema = bq.Schema([{'name': 'col1', 'type': 'string'}])
  mock_get_table.return_value._schema = schema
  rendered = bq.commands._bigquery._table_cell(args, None)
  expected_html1 = 'bq.renderSchema(dom, [{"type": "string", "name": "col1"}]);'
  expected_html2 = 'bq.renderSchema(dom, [{"name": "col1", "type": "string"}]);'
  self.assertTrue(expected_html1 in rendered or expected_html2 in rendered)
def test_get_bq_execute_operator_definition(self, mock_table):
  mock_table.return_value = bq.Table(
      'foo_project.foo_dataset.foo_table',
      context=PipelineTest._create_context())
  task_id = 'foo'
  task_details = {}
  task_details['type'] = 'BigQuery'
  # Adding newlines to the query to mimic actual usage of %%bq query ...
  task_details['query'] = google.datalab.bigquery.Query("""SELECT *
FROM publicdata.samples.wikipedia
LIMIT 5""")

  operator_def = pipeline.PipelineGenerator._get_operator_definition(
      task_id, task_details, None)
  self.assertEqual(
      operator_def,
      """foo = BigQueryOperator(task_id='foo_id', bql=\"\"\"SELECT *\nFROM publicdata.samples.wikipedia\nLIMIT 5\"\"\", use_legacy_sql=False, dag=dag)
""")  # noqa
def main(argv=None):
  args = parse_arguments(sys.argv if argv is None else argv)

  if args.csv_schema_file:
    schema = json.loads(
        file_io.read_file_to_string(args.csv_schema_file).decode())
  else:
    import google.datalab.bigquery as bq
    schema = bq.Table(args.bigquery_table).schema._bq_schema
  features = json.loads(
      file_io.read_file_to_string(args.features_file).decode())

  expand_defaults(schema, features)  # features are updated.
  check_schema_transforms_match(schema, features)
  file_io.recursive_create_dir(args.output_dir)

  if args.cloud:
    run_cloud_analysis(
        output_dir=args.output_dir,
        csv_file_pattern=args.csv_file_pattern,
        bigquery_table=args.bigquery_table,
        schema=schema,
        features=features)
  else:
    run_local_analysis(
        output_dir=args.output_dir,
        csv_file_pattern=args.csv_file_pattern,
        schema=schema,
        features=features)

  # Also writes the transform fn and tft metadata.
  make_transform_graph(args.output_dir, schema, features)

  # Save a copy of the schema and features in the output folder.
  file_io.write_string_to_file(
      os.path.join(args.output_dir, SCHEMA_FILE),
      json.dumps(schema, indent=2))
  file_io.write_string_to_file(
      os.path.join(args.output_dir, FEATURES_FILE),
      json.dumps(features, indent=2))
def _get_table(name):
  """Given a variable or table name, get a Table if it exists.

  Args:
    name: the name of the Table or a variable referencing the Table.

  Returns:
    The Table, if found.
  """
  # If name is a variable referencing a table, use that.
  item = google.datalab.utils.commands.get_notebook_item(name)
  if isinstance(item, bigquery.Table):
    return item
  # Else treat this as a BQ table name and return the (cached) table if it
  # exists.
  try:
    return _existing_table_cache[name]
  except KeyError:
    table = bigquery.Table(name)
    if table.exists():
      _existing_table_cache[name] = table
      return table
  return None
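# Illustrative behavior of the cache above (the table name is hypothetical):
# the first lookup for a plain table name pays a table.exists() round trip to
# the BigQuery API; later lookups for the same name are answered from
# _existing_table_cache without another API call.
#
#   t = _get_table('my_project.my_dataset.my_table')  # hits the API
#   t = _get_table('my_project.my_dataset.my_table')  # served from the cache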
def test_get_table(self, mock_get_notebook_item, mock_table_exists,
                   mock_get_credentials, mock_default_context):
  # test bad name
  mock_get_notebook_item.return_value = None
  mock_table_exists.return_value = False
  t = bq.commands._bigquery._get_table('bad.name')
  self.assertIsNone(t)

  # test good table name
  test_table_name = 'testproject.test.table'
  mock_get_notebook_item.return_value = bq.Table(test_table_name)
  t = bq.commands._bigquery._get_table(test_table_name)
  self.assertEqual(t.full_name, test_table_name)

  # test table name reference
  mock_get_notebook_item.return_value = test_table_name
  mock_table_exists.return_value = True
  t = bq.commands._bigquery._get_table(test_table_name)
  self.assertEqual(t.full_name, test_table_name)
  self.assertIn(test_table_name, bq.commands._bigquery._existing_table_cache)
def main(argv=None):
  args = parse_arguments(sys.argv if argv is None else argv)

  if args.schema:
    schema = json.loads(
        file_io.read_file_to_string(args.schema).decode())
  else:
    import google.datalab.bigquery as bq
    schema = bq.Table(args.bigquery).schema._bq_schema
  features = json.loads(
      file_io.read_file_to_string(args.features).decode())

  expand_defaults(schema, features)  # features are updated.
  inverted_features = invert_features(features)
  check_schema_transforms_match(schema, inverted_features)
  file_io.recursive_create_dir(args.output)

  if args.cloud:
    run_cloud_analysis(
        output_dir=args.output,
        csv_file_pattern=args.csv,
        bigquery_table=args.bigquery,
        schema=schema,
        inverted_features=inverted_features)
  else:
    run_local_analysis(
        output_dir=args.output,
        csv_file_pattern=args.csv,
        schema=schema,
        inverted_features=inverted_features)

  # Save a copy of the schema and features in the output folder.
  file_io.write_string_to_file(
      os.path.join(args.output, constant.SCHEMA_FILE),
      json.dumps(schema, indent=2))
  file_io.write_string_to_file(
      os.path.join(args.output, constant.FEATURES_FILE),
      json.dumps(features, indent=2))
def BigQuery_exportation(df, bigquery_dataset_name, bigquery_table_name):
  print('\nBigQuery exportation started ...')
  start_time = time()

  # Export to BigQuery: define the BigQuery dataset and table.
  dataset = bq.Dataset(bigquery_dataset_name)
  table = bq.Table(bigquery_dataset_name + '.' + bigquery_table_name)

  # Create or overwrite the existing table if it exists.
  table_schema = bq.Schema.from_data(df)
  table.create(schema=table_schema, overwrite=True)

  # Write the DataFrame to a BigQuery table.
  table.insert(df)

  print('BigQuery Exportation Finished. \nTotal exportation time = {:0.2f} min'
        .format((time() - start_time) / 60))
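# Example call (the DataFrame and the dataset/table names are hypothetical):
#
#   import pandas as pd
#   df = pd.DataFrame({'id': [1, 2], 'label': ['a', 'b']})
#   BigQuery_exportation(df, 'my_dataset', 'my_table')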
# ---------- Rework of the classification with the hard-coded rules ----------
# Apply these rules to the dataframe.
Final_predicted_df['FLAG_RUPTURE'] = Final_predicted_df.apply(
    hard_coded_rules.flag_rupture, axis=1)
Final_predicted_df['Class_Prediction'] = Final_predicted_df.apply(
    hard_coded_rules.rejet_cause_prev, axis=1)
Final_predicted_df['Class_Prediction'] = Final_predicted_df.apply(
    hard_coded_rules.flag_livraison, axis=1)

# --------------------- BigQuery Exportation ----------------------------
print('Export to BigQuery table...')
start_time = time()

# Export to BigQuery.
bigquery_dataset_name = 'electric-armor-213817.Donnees_journalieres'
bigquery_table_name = 'Classification_journaliere'

# Define BigQuery dataset and table.
dataset = bq.Dataset(bigquery_dataset_name)
table = bq.Table(bigquery_dataset_name + '.' + bigquery_table_name)

# Create or overwrite the existing table if it exists.
table_schema = bq.Schema.from_data(Final_predicted_df)
table.create(schema=table_schema, overwrite=True)

# Write the DataFrame to a BigQuery table.
table.insert(Final_predicted_df)

print('BigQuery export finished. \nExporting process took {:0.2f}min'.format(
    (time() - start_time) / 60))
def _table_cell(args, cell_body):
  """Implements the BigQuery table magic subcommand used to operate on tables.

  The supported syntax is:

      %%bq tables <command> <args>

  Commands:
    {list, create, delete, describe, view}

  Args:
    args: the optional arguments following '%%bq tables command'.
    cell_body: optional contents of the cell interpreted as SQL, YAML or JSON.

  Returns:
    The HTML rendering for the table of datasets.
  """
  if args['command'] == 'list':
    filter_ = args['filter'] if args['filter'] else '*'
    if args['dataset']:
      if args['project'] is None:
        datasets = [bigquery.Dataset(args['dataset'])]
      else:
        context = google.datalab.Context(
            args['project'], google.datalab.Context.default().credentials)
        datasets = [bigquery.Dataset(args['dataset'], context)]
    else:
      default_context = google.datalab.Context.default()
      context = google.datalab.Context(default_context.project_id,
                                       default_context.credentials)
      if args['project']:
        context.set_project_id(args['project'])
      datasets = bigquery.Datasets(context)

    tables = []
    for dataset in datasets:
      tables.extend([table.full_name
                     for table in dataset
                     if fnmatch.fnmatch(table.full_name, filter_)])

    return _render_list(tables)

  elif args['command'] == 'create':
    if cell_body is None:
      print('Failed to create %s: no schema specified' % args['name'])
    else:
      try:
        record = google.datalab.utils.commands.parse_config(
            cell_body, google.datalab.utils.commands.notebook_environment(),
            as_dict=False)
        jsonschema.validate(record, BigQuerySchema.TABLE_SCHEMA_SCHEMA)
        schema = bigquery.Schema(record['schema'])
        bigquery.Table(args['name']).create(schema=schema,
                                            overwrite=args['overwrite'])
      except Exception as e:
        print('Failed to create table %s: %s' % (args['name'], e))

  elif args['command'] == 'describe':
    name = args['name']
    table = _get_table(name)
    if not table:
      raise Exception('Could not find table %s' % name)

    html = _repr_html_table_schema(table.schema)
    return IPython.core.display.HTML(html)

  elif args['command'] == 'delete':
    try:
      bigquery.Table(args['name']).delete()
    except Exception as e:
      print('Failed to delete table %s: %s' % (args['name'], e))

  elif args['command'] == 'view':
    name = args['name']
    table = _get_table(name)
    if not table:
      raise Exception('Could not find table %s' % name)
    return table
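# Illustrative invocations of the subcommands handled above. The dataset and
# table names, and the exact flag spellings, are assumptions for the sketch:
#
#   %bq tables list --dataset my_dataset
#   %bq tables describe --name my_project.my_dataset.my_table
#   %bq tables delete --name my_project.my_dataset.my_table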
def test_local_bigquery_transform(self):
  """Test transform locally, but the data comes from bigquery."""
  # Make a BQ table, and insert 1 row.
  try:
    bucket_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
    bucket_root = 'gs://%s' % bucket_name
    bucket = storage.Bucket(bucket_name)
    bucket.create()

    project_id = dl.Context.default().project_id
    dataset_name = 'test_transform_raw_data_%s' % uuid.uuid4().hex
    table_name = 'tmp_table'

    dataset = bq.Dataset((project_id, dataset_name)).create()
    table = bq.Table((project_id, dataset_name, table_name))
    table.create([{'name': 'key_col', 'type': 'INTEGER'},
                  {'name': 'target_col', 'type': 'FLOAT'},
                  {'name': 'cat_col', 'type': 'STRING'},
                  {'name': 'num_col', 'type': 'FLOAT'},
                  {'name': 'img_col', 'type': 'STRING'}])

    img1_file = os.path.join(self.source_dir, 'img1.jpg')
    dest_file = os.path.join(bucket_root, 'img1.jpg')
    file_io.copy(img1_file, dest_file)

    data = [
        {
            'key_col': 1,
            'target_col': 1.0,
            'cat_col': 'Monday',
            'num_col': 23.0,
            'img_col': dest_file,
        },
    ]
    table.insert(data=data)

    cmd = ['python ' + os.path.join(CODE_PATH, 'transform.py'),
           '--bigquery=%s.%s.%s' % (project_id, dataset_name, table_name),
           '--analysis=' + self.analysis_dir,
           '--prefix=features',
           '--project-id=' + project_id,
           '--output=' + self.output_dir]
    print('cmd ', ' '.join(cmd))
    subprocess.check_call(' '.join(cmd), shell=True)

    # Read the tf record file. There should only be one file.
    record_filepath = os.path.join(self.output_dir,
                                   'features-00000-of-00001.tfrecord.gz')
    options = tf.python_io.TFRecordOptions(
        compression_type=tf.python_io.TFRecordCompressionType.GZIP)
    serialized_examples = list(
        tf.python_io.tf_record_iterator(record_filepath, options=options))
    self.assertEqual(len(serialized_examples), 1)

    example = tf.train.Example()
    example.ParseFromString(serialized_examples[0])

    transformed_number = example.features.feature['num_col'].float_list.value[0]
    self.assertAlmostEqual(transformed_number, 23.0)
    transformed_category = example.features.feature['cat_col'].int64_list.value[0]
    self.assertEqual(transformed_category, 2)
    image_bytes = example.features.feature['img_col'].float_list.value
    self.assertEqual(len(image_bytes), 2048)
    self.assertTrue(any(x != 0.0 for x in image_bytes))
  finally:
    dataset.delete(delete_contents=True)
    for obj in bucket.objects():
      obj.delete()
    bucket.delete()