def run_analysis(args): """Builds an analysis file for training. Uses BiqQuery tables to do the analysis. Args: args: command line args Raises: ValueError if schema contains unknown types. """ import google.datalab.bigquery as bq if args.bigquery_table: table = bq.Table(args.bigquery_table) schema_list = table.schema._bq_schema else: schema_list = json.loads( file_io.read_file_to_string(args.schema_file).decode()) table = bq.ExternalDataSource(source=args.input_file_pattern, schema=bq.Schema(schema_list)) # Check the schema is supported. for col_schema in schema_list: col_type = col_schema['type'].lower() if col_type != 'string' and col_type != 'integer' and col_type != 'float': raise ValueError('Schema contains an unsupported type %s.' % col_type) run_numerical_analysis(table, schema_list, args) run_categorical_analysis(table, schema_list, args) # Save a copy of the schema to the output location. file_io.write_string_to_file( os.path.join(args.output_dir, SCHEMA_FILE), json.dumps(schema_list, indent=2, separators=(',', ': ')))
def _datasource_cell(args, cell_body): """Implements the BigQuery datasource cell magic for ipython notebooks. The supported syntax is %%bq datasource --name <var> --paths <url> [--format <CSV|JSON>] <schema> Args: args: the optional arguments following '%%bq datasource' cell_body: the datasource's schema in json/yaml """ name = args['name'] paths = args['paths'] data_format = (args['format'] or 'CSV').lower() compressed = args['compressed'] or False # Get the source schema from the cell body record = google.datalab.utils.commands.parse_config( cell_body, google.datalab.utils.commands.notebook_environment(), as_dict=False) jsonschema.validate(record, BigQuerySchema.TABLE_SCHEMA_SCHEMA) schema = bigquery.Schema(record['schema']) # Finally build the datasource object datasource = bigquery.ExternalDataSource(source=paths, source_format=data_format, compressed=compressed, schema=schema) google.datalab.utils.commands.notebook_environment()[name] = datasource
def test_numerics(self): """Build a BQ table, and then call analyze on it.""" schema = [{ 'name': 'col1', 'type': 'INTEGER' }, { 'name': 'col2', 'type': 'FLOAT' }] project_id = dl.Context.default().project_id dataset_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex table_name = 'temp_table' full_table_name = '%s.%s.%s' % (project_id, dataset_name, table_name) output_folder = tempfile.mkdtemp() try: # Make a dataset, a table, and insert data. db = bq.Dataset((project_id, dataset_name)) db.create() table = bq.Table(full_table_name) table.create(schema=bq.Schema(schema), overwrite=True) data = [{'col1': i, 'col2': 10 * i + 0.5} for i in range(100)] table.insert(data) analyze_data.run_cloud_analysis(output_dir=output_folder, csv_file_pattern=None, bigquery_table=full_table_name, schema=schema, features={ 'col1': { 'transform': 'scale' }, 'col2': { 'transform': 'identity' } }) stats = json.loads( file_io.read_file_to_string( os.path.join(output_folder, analyze_data.STATS_FILE)).decode()) self.assertEqual(stats['num_examples'], 100) col = stats['column_stats']['col1'] self.assertAlmostEqual(col['max'], 99.0) self.assertAlmostEqual(col['min'], 0.0) self.assertAlmostEqual(col['mean'], 49.5) col = stats['column_stats']['col2'] self.assertAlmostEqual(col['max'], 990.5) self.assertAlmostEqual(col['min'], 0.5) self.assertAlmostEqual(col['mean'], 495.5) finally: shutil.rmtree(output_folder) db.delete(delete_contents=True)
def _load_cell(args, cell_body): """Implements the BigQuery load magic used to load data from GCS to a table. The supported syntax is: %bq load <optional args> Args: args: the arguments following '%bq load'. cell_body: optional contents of the cell interpreted as YAML or JSON. Returns: A message about whether the load succeeded or failed. """ name = args['table'] table = _get_table(name) if not table: table = bigquery.Table(name) if args['mode'] == 'create': if table.exists(): raise Exception( 'table %s already exists; use "append" or "overwrite" as mode.' % name) if not cell_body or 'schema' not in cell_body: raise Exception( 'Table does not exist, and no schema specified in cell; cannot load.' ) env = google.datalab.utils.commands.notebook_environment() config = google.datalab.utils.commands.parse_config( cell_body, env, False) schema = config['schema'] # schema can be an instance of bigquery.Schema. # For example, user can run "my_schema = bigquery.Schema.from_data(df)" in a previous cell and # specify "schema: $my_schema" in cell input. if not isinstance(schema, bigquery.Schema): jsonschema.validate(config, BigQuerySchema.TABLE_SCHEMA_SCHEMA) schema = bigquery.Schema(schema) table.create(schema=schema) elif not table.exists(): raise Exception('table %s does not exist; use "create" as mode.' % name) csv_options = bigquery.CSVOptions(delimiter=args['delimiter'], skip_leading_rows=args['skip'], allow_jagged_rows=not args['strict'], quote=args['quote']) job = table.load(args['path'], mode=args['mode'], source_format=args['format'], csv_options=csv_options, ignore_unknown_values=not args['strict']) if job.failed: raise Exception('Load failed: %s' % str(job.fatal_error)) elif job.errors: raise Exception('Load completed with errors: %s' % str(job.errors))
def test_table_cell_create(self, mock_table, mock_default_context): args = {'command': 'create', 'name': 'test-table', 'overwrite': None} cell_body = { 'schema': [ {'name': 'col1', 'type': 'int64', 'mode': 'NULLABLE', 'description': 'description1'}, {'name': 'col1', 'type': 'STRING', 'mode': 'required', 'description': 'description1'} ] } bq.commands._bigquery._table_cell(args, json.dumps(cell_body)) call_kwargs = mock_table.return_value.create.call_args[1] self.assertEqual(None, call_kwargs['overwrite']) self.assertEqual(bq.Schema(cell_body['schema']), call_kwargs['schema'])
def _create_external_data_source(self, skip_header_rows): import google.datalab.bigquery as bq df = self.browse(1, None) # read each column as STRING because we only want to sample rows. schema_train = bq.Schema([{ 'name': name, 'type': 'STRING' } for name in df.keys()]) options = bq.CSVOptions( skip_leading_rows=(1 if skip_header_rows == True else 0)) return bq.ExternalDataSource(self.path, csv_options=options, schema=schema_train, max_bad_records=0)
def test_table_cell_describe(self, mock_get_table, mock_default_context): args = {'command': 'describe', 'name': 'test-table', 'overwrite': None} mock_get_table.return_value = None with self.assertRaisesRegexp(Exception, 'Could not find table'): bq.commands._bigquery._table_cell(args, None) mock_get_table.return_value = bq.Table('project.test.table') schema = bq.Schema([{'name': 'col1', 'type': 'string'}]) mock_get_table.return_value._schema = schema rendered = bq.commands._bigquery._table_cell(args, None) expected_html1 = 'bq.renderSchema(dom, [{"type": "string", "name": "col1"}]);' expected_html2 = 'bq.renderSchema(dom, [{"name": "col1", "type": "string"}]);' self.assertTrue(expected_html1 in rendered or expected_html2 in rendered)
def execute(self, context): if self.data_source: kwargs = {} if self.csv_options: csv_kwargs = {} if 'delimiter' in self.csv_options: csv_kwargs['delimiter'] = self.csv_options['delimiter'] if 'skip' in self.csv_options: csv_kwargs['skip_leading_rows'] = self.csv_options['skip'] if 'strict' in self.csv_options: csv_kwargs['allow_jagged_rows'] = self.csv_options[ 'strict'] if 'quote' in self.csv_options: csv_kwargs['quote'] = self.csv_options['quote'] kwargs['csv_options'] = bq.CSVOptions(**csv_kwargs) if self.format: kwargs['source_format'] = self.format if self.max_bad_records: kwargs['max_bad_records'] = self.max_bad_records external_data_source = bq.ExternalDataSource(source=self.path, schema=bq.Schema( self.schema), **kwargs) query = bq.Query( sql=self.sql, data_sources={self.data_source: external_data_source}) else: query = bq.Query(sql=self.sql) # use_cache is False since this is most likely the case in pipeline scenarios # allow_large_results can be True only if table is specified (i.e. when it's not None) kwargs = {} if self.mode is not None: kwargs['mode'] = self.mode output_options = bq.QueryOutput.table(name=self.table, use_cache=False, allow_large_results=self.table is not None, **kwargs) query_params = bq.Query.get_query_parameters(self.parameters) job = query.execute(output_options=output_options, query_params=query_params) # Returning the table-name here makes it available for downstream task instances. return {'table': job.result().full_name}
def run_cloud_analysis(output_dir, csv_file_pattern, bigquery_table, schema, inverted_features): """Use BigQuery to analyze input date. Only one of csv_file_pattern or bigquery_table should be non-None. Args: output_dir: output folder csv_file_pattern: list of csv file paths, may contain wildcards bigquery_table: project_id.dataset_name.table_name schema: schema list inverted_features: inverted_features dict """ def _execute_sql(sql, table): """Runs a BigQuery job and dowloads the results into local memeory. Args: sql: a SQL string table: bq.ExternalDataSource or bq.Table Returns: A Pandas dataframe. """ import google.datalab.bigquery as bq if isinstance(table, bq.ExternalDataSource): query = bq.Query(sql, data_sources={'csv_table': table}) else: query = bq.Query(sql) return query.execute().result().to_dataframe() import google.datalab.bigquery as bq if bigquery_table: table_name = '`%s`' % bigquery_table table = None else: table_name = 'csv_table' table = bq.ExternalDataSource( source=csv_file_pattern, schema=bq.Schema(schema)) # Make a copy of inverted_features and update the target transform to be # identity or one hot depending on the schema. inverted_features_target = copy.deepcopy(inverted_features) for name, transform_set in six.iteritems(inverted_features_target): if transform_set == set([constant.TARGET_TRANSFORM]): target_schema = next(col['type'].lower() for col in schema if col['name'] == name) if target_schema in constant.NUMERIC_SCHEMA: inverted_features_target[name] = {constant.IDENTITY_TRANSFORM} else: inverted_features_target[name] = {constant.ONE_HOT_TRANSFORM} numerical_vocab_stats = {} for col_name, transform_set in six.iteritems(inverted_features_target): sys.stdout.write('Analyzing column %s...' % col_name) sys.stdout.flush() # All transforms in transform_set require the same analysis. So look # at the first transform. transform_name = next(iter(transform_set)) if (transform_name in constant.CATEGORICAL_TRANSFORMS or transform_name in constant.TEXT_TRANSFORMS): if transform_name in constant.TEXT_TRANSFORMS: # Split strings on space, then extract labels and how many rows each # token is in. This is done by making two temp tables: # SplitTable: each text row is made into an array of strings. The # array may contain repeat tokens # TokenTable: SplitTable with repeated tokens removed per row. # Then to flatten the arrays, TokenTable has to be joined with itself. # See the sections 'Flattening Arrays' and 'Filtering Arrays' at # https://cloud.google.com/bigquery/docs/reference/standard-sql/arrays sql = ('WITH SplitTable AS ' ' (SELECT SPLIT({name}, \' \') as token_array FROM {table}), ' ' TokenTable AS ' ' (SELECT ARRAY(SELECT DISTINCT x ' ' FROM UNNEST(token_array) AS x) AS unique_tokens_per_row ' ' FROM SplitTable) ' 'SELECT token, COUNT(token) as token_count ' 'FROM TokenTable ' 'CROSS JOIN UNNEST(TokenTable.unique_tokens_per_row) as token ' 'WHERE LENGTH(token) > 0 ' 'GROUP BY token ' 'ORDER BY token_count DESC, token ASC').format(name=col_name, table=table_name) else: # Extract label and frequency sql = ('SELECT {name} as token, count(*) as count ' 'FROM {table} ' 'WHERE {name} IS NOT NULL ' 'GROUP BY {name} ' 'ORDER BY count DESC, token ASC').format(name=col_name, table=table_name) df = _execute_sql(sql, table) # Save the vocab string_buff = six.StringIO() df.to_csv(string_buff, index=False, header=False) file_io.write_string_to_file( os.path.join(output_dir, constant.VOCAB_ANALYSIS_FILE % col_name), string_buff.getvalue()) numerical_vocab_stats[col_name] = {'vocab_size': len(df)} # free memeory del string_buff del df elif transform_name in constant.NUMERIC_TRANSFORMS: # get min/max/average sql = ('SELECT max({name}) as max_value, min({name}) as min_value, ' 'avg({name}) as avg_value from {table}').format(name=col_name, table=table_name) df = _execute_sql(sql, table) numerical_vocab_stats[col_name] = {'min': df.iloc[0]['min_value'], 'max': df.iloc[0]['max_value'], 'mean': df.iloc[0]['avg_value']} sys.stdout.write('done.\n') sys.stdout.flush() # get num examples sql = 'SELECT count(*) as num_examples from {table}'.format(table=table_name) df = _execute_sql(sql, table) num_examples = df.iloc[0]['num_examples'] # Write the stats file. stats = {'column_stats': numerical_vocab_stats, 'num_examples': num_examples} file_io.write_string_to_file( os.path.join(output_dir, constant.STATS_FILE), json.dumps(stats, indent=2, separators=(',', ': ')))
def _table_cell(args, cell_body): """Implements the BigQuery table magic subcommand used to operate on tables The supported syntax is: %%bq tables <command> <args> Commands: {list, create, delete, describe, view} Args: args: the optional arguments following '%%bq tables command'. cell_body: optional contents of the cell interpreted as SQL, YAML or JSON. Returns: The HTML rendering for the table of datasets. """ if args['command'] == 'list': filter_ = args['filter'] if args['filter'] else '*' if args['dataset']: if args['project'] is None: datasets = [bigquery.Dataset(args['dataset'])] else: context = google.datalab.Context(args['project'], google.datalab.Context.default().credentials) datasets = [bigquery.Dataset(args['dataset'], context)] else: default_context = google.datalab.Context.default() context = google.datalab.Context(default_context.project_id, default_context.credentials) if args['project']: context.set_project_id(args['project']) datasets = bigquery.Datasets(context) tables = [] for dataset in datasets: tables.extend([table.full_name for table in dataset if fnmatch.fnmatch(table.full_name, filter_)]) return _render_list(tables) elif args['command'] == 'create': if cell_body is None: print('Failed to create %s: no schema specified' % args['name']) else: try: record = google.datalab.utils.commands.parse_config( cell_body, google.datalab.utils.commands.notebook_environment(), as_dict=False) jsonschema.validate(record, BigQuerySchema.TABLE_SCHEMA_SCHEMA) schema = bigquery.Schema(record['schema']) bigquery.Table(args['name']).create(schema=schema, overwrite=args['overwrite']) except Exception as e: print('Failed to create table %s: %s' % (args['name'], e)) elif args['command'] == 'describe': name = args['name'] table = _get_table(name) if not table: raise Exception('Could not find table %s' % name) html = _repr_html_table_schema(table.schema) return IPython.core.display.HTML(html) elif args['command'] == 'delete': try: bigquery.Table(args['name']).delete() except Exception as e: print('Failed to delete table %s: %s' % (args['name'], e)) elif args['command'] == 'view': name = args['name'] table = _get_table(name) if not table: raise Exception('Could not find table %s' % name) return table
def run_cloud_analysis(output_dir, csv_file_pattern, bigquery_table, schema, features): """Use BigQuery to analyze input date. Only one of csv_file_pattern or bigquery_table should be non-None. Args: output_dir: output folder csv_file_pattern: csv file path, may contain wildcards bigquery_table: project_id.dataset_name.table_name schema: schema list features: features dict """ def _execute_sql(sql, table): """Runs a BigQuery job and dowloads the results into local memeory. Args: sql: a SQL string table: bq.ExternalDataSource or bq.Table Returns: A Pandas dataframe. """ import google.datalab.bigquery as bq if isinstance(table, bq.ExternalDataSource): query = bq.Query(sql, data_sources={'csv_table': table}) else: query = bq.Query(sql) return query.execute().result().to_dataframe() import google.datalab.bigquery as bq if bigquery_table: table_name = '`%s`' % bigquery_table table = None else: table_name = 'csv_table' table = bq.ExternalDataSource(source=csv_file_pattern, schema=bq.Schema(schema)) numerical_vocab_stats = {} for col_schema in schema: col_name = col_schema['name'] col_type = col_schema['type'].lower() transform = features[col_name]['transform'] # Map the target transfrom into one_hot or identity. if transform == TARGET_TRANSFORM: if col_type == STRING_SCHEMA: transform = ONE_HOT_TRANSFORM elif col_type in NUMERIC_SCHEMA: transform = IDENTITY_TRANSFORM else: raise ValueError('Unknown schema type') if transform in (TEXT_TRANSFORMS + CATEGORICAL_TRANSFORMS): if transform in TEXT_TRANSFORMS: # Split strings on space, then extract labels and how many rows each # token is in. This is done by making two temp tables: # SplitTable: each text row is made into an array of strings. The # array may contain repeat tokens # TokenTable: SplitTable with repeated tokens removed per row. # Then to flatten the arrays, TokenTable has to be joined with itself. # See the sections 'Flattening Arrays' and 'Filtering Arrays' at # https://cloud.google.com/bigquery/docs/reference/standard-sql/arrays sql = ( 'WITH SplitTable AS ' ' (SELECT SPLIT({name}, \' \') as token_array FROM {table}), ' ' TokenTable AS ' ' (SELECT ARRAY(SELECT DISTINCT x ' ' FROM UNNEST(token_array) AS x) AS unique_tokens_per_row ' ' FROM SplitTable) ' 'SELECT token, COUNT(token) as token_count ' 'FROM TokenTable ' 'CROSS JOIN UNNEST(TokenTable.unique_tokens_per_row) as token ' 'WHERE LENGTH(token) > 0 ' 'GROUP BY token ' 'ORDER BY token_count DESC, token ASC').format( name=col_name, table=table_name) else: # Extract label and frequency sql = ('SELECT {name} as token, count(*) as count ' 'FROM {table} ' 'WHERE {name} IS NOT NULL ' 'GROUP BY {name} ' 'ORDER BY count DESC, token ASC').format( name=col_name, table=table_name) df = _execute_sql(sql, table) # Save the vocab string_buff = six.StringIO() df.to_csv(string_buff, index=False, header=False) file_io.write_string_to_file( os.path.join(output_dir, VOCAB_ANALYSIS_FILE % col_name), string_buff.getvalue()) numerical_vocab_stats[col_name] = {'vocab_size': len(df)} # free memeory del string_buff del df elif transform in NUMERIC_TRANSFORMS: # get min/max/average sql = ( 'SELECT max({name}) as max_value, min({name}) as min_value, ' 'avg({name}) as avg_value from {table}').format( name=col_name, table=table_name) df = _execute_sql(sql, table) numerical_vocab_stats[col_name] = { 'min': df.iloc[0]['min_value'], 'max': df.iloc[0]['max_value'], 'mean': df.iloc[0]['avg_value'] } elif transform == IMAGE_TRANSFORM: pass elif transform == KEY_TRANSFORM: pass else: raise ValueError('Unknown transform %s' % transform) # get num examples sql = 'SELECT count(*) as num_examples from {table}'.format( table=table_name) df = _execute_sql(sql, table) num_examples = df.iloc[0]['num_examples'] # Write the stats file. stats = { 'column_stats': numerical_vocab_stats, 'num_examples': num_examples } file_io.write_string_to_file( os.path.join(output_dir, STATS_FILE), json.dumps(stats, indent=2, separators=(',', ': ')))