def _setup_new_types_env(self):
  table_schema = bigquery.TableSchema()
  table_field = bigquery.TableFieldSchema()
  table_field.name = 'bytes'
  table_field.type = 'BYTES'
  table_schema.fields.append(table_field)
  table_field = bigquery.TableFieldSchema()
  table_field.name = 'date'
  table_field.type = 'DATE'
  table_schema.fields.append(table_field)
  table_field = bigquery.TableFieldSchema()
  table_field.name = 'time'
  table_field.type = 'TIME'
  table_schema.fields.append(table_field)
  table = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId=self.project,
          datasetId=self.dataset_id,
          tableId=NEW_TYPES_INPUT_TABLE),
      schema=table_schema)
  request = bigquery.BigqueryTablesInsertRequest(
      projectId=self.project, datasetId=self.dataset_id, table=table)
  self.bigquery_client.client.tables.Insert(request)
  table_data = [
      {'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'},
      {'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'},
      {'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd', 'date': '3000-12-31',
       'time': '23:59:59.990000'},
      {'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'}
  ]
  # the API Tools bigquery client expects byte values to be base-64 encoded
  # TODO BEAM-4850: upgrade to google-cloud-bigquery which does not require
  # handling the encoding in beam
  for row in table_data:
    row['bytes'] = base64.b64encode(row['bytes']).decode('utf-8')
  self.bigquery_client.insert_rows(
      self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)

def _setup_new_types_env(self):
  table_schema = bigquery.TableSchema()
  table_field = bigquery.TableFieldSchema()
  table_field.name = 'bytes'
  table_field.type = 'BYTES'
  table_schema.fields.append(table_field)
  table_field = bigquery.TableFieldSchema()
  table_field.name = 'date'
  table_field.type = 'DATE'
  table_schema.fields.append(table_field)
  table_field = bigquery.TableFieldSchema()
  table_field.name = 'time'
  table_field.type = 'TIME'
  table_schema.fields.append(table_field)
  table = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId=self.project,
          datasetId=self.dataset_id,
          tableId=NEW_TYPES_INPUT_TABLE),
      schema=table_schema)
  request = bigquery.BigqueryTablesInsertRequest(
      projectId=self.project, datasetId=self.dataset_id, table=table)
  self.bigquery_client.client.tables.Insert(request)
  table_data = [
      {'bytes': b'xyw=', 'date': '2011-01-01', 'time': '23:59:59.999999'},
      {'bytes': b'abc=', 'date': '2000-01-01', 'time': '00:00:00'},
      {'bytes': b'dec=', 'date': '3000-12-31', 'time': '23:59:59.990000'}
  ]
  self.bigquery_client.insert_rows(
      self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)

def test_add_format_fields_reserved_field(self):
  calls_record_with_desc = bigquery.TableFieldSchema(
      name=bigquery_util.ColumnKeyConstants.CALLS,
      type=bigquery_util.TableFieldConstants.TYPE_RECORD,
      mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
      description='One record for each call.')
  calls_record_with_desc.fields.append(
      bigquery.TableFieldSchema(
          name='GQ',
          type=bigquery_util.TableFieldConstants.TYPE_INTEGER,
          mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
          description='bigquery desc'))
  formats = OrderedDict()
  schema_converter._add_format_fields(calls_record_with_desc, formats)
  expected_formats = OrderedDict(
      [('GQ', createFormat('GQ', 1, 'Integer', 'bigquery desc'))])
  self.assertEqual(formats, expected_formats)

  calls_record_without_desc = bigquery.TableFieldSchema(
      name=bigquery_util.ColumnKeyConstants.CALLS,
      type=bigquery_util.TableFieldConstants.TYPE_RECORD,
      mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
      description='One record for each call.')
  calls_record_without_desc.fields.append(
      bigquery.TableFieldSchema(
          name='GQ',
          type=bigquery_util.TableFieldConstants.TYPE_INTEGER,
          mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
          description=''))
  formats = OrderedDict()
  schema_converter._add_format_fields(calls_record_without_desc, formats)
  expected_formats = OrderedDict([
      ('GQ', createFormat('GQ', 1, 'Integer', 'Conditional genotype quality'))
  ])
  self.assertEqual(formats, expected_formats)

def test_simple_schemas(self):
  schema1 = bigquery.TableSchema(fields=[])
  self.assertTrue(check_schema_equal(schema1, schema1))

  schema2 = bigquery.TableSchema(fields=[
      bigquery.TableFieldSchema(name="a", mode="NULLABLE", type="INT64")
  ])
  self.assertTrue(check_schema_equal(schema2, schema2))
  self.assertFalse(check_schema_equal(schema1, schema2))

  schema3 = bigquery.TableSchema(fields=[
      bigquery.TableFieldSchema(
          name="b",
          mode="REPEATED",
          type="RECORD",
          fields=[
              bigquery.TableFieldSchema(name="c", mode="REQUIRED", type="BOOL")
          ])
  ])
  self.assertTrue(check_schema_equal(schema3, schema3))
  self.assertFalse(check_schema_equal(schema2, schema3))

def test_add_info_fields_reserved_field(self):
  field_with_desc = bigquery.TableFieldSchema(
      name='AA',
      type=bigquery_util.TableFieldConstants.TYPE_STRING,
      mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
      description='bigquery desc')
  infos = OrderedDict()
  schema_converter._add_info_fields(field_with_desc, infos)
  expected_infos = OrderedDict(
      [('AA', Info('AA', 1, 'String', 'bigquery desc', None, None))])
  self.assertEqual(infos, expected_infos)

  field_without_desc = bigquery.TableFieldSchema(
      name='AA',
      type=bigquery_util.TableFieldConstants.TYPE_STRING,
      mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
      description='')
  infos = OrderedDict()
  schema_converter._add_info_fields(field_without_desc, infos)
  expected_infos = OrderedDict(
      [('AA', Info('AA', 1, 'String', 'Ancestral allele', None, None))])
  self.assertEqual(infos, expected_infos)

def test_add_info_fields_from_alternate_bases_schema_compatibility(self):
  schema_conflict_info = bigquery.TableFieldSchema(
      name=bigquery_util.ColumnKeyConstants.ALTERNATE_BASES,
      type=bigquery_util.TableFieldConstants.TYPE_RECORD,
      mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
      description='One record for each alternate base (if any).')
  schema_conflict_info.fields.append(
      bigquery.TableFieldSchema(
          name='AF',
          type=bigquery_util.TableFieldConstants.TYPE_INTEGER,
          mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
          description='desc'))
  with self.assertRaises(ValueError):
    schema_converter._add_info_fields(schema_conflict_info, OrderedDict())

  infos_allow_incompatible_schema = OrderedDict()
  schema_converter._add_info_fields(
      schema_conflict_info,
      infos_allow_incompatible_schema,
      allow_incompatible_schema=True)
  expected_infos = OrderedDict(
      [('AF', Info('AF', field_counts['A'], 'Integer', 'desc', None, None))])
  self.assertEqual(infos_allow_incompatible_schema, expected_infos)

def test_get_or_create_table_intermittent_exception(self):
  client = mock.Mock()
  client.tables.Insert.side_effect = [
      HttpError(response={'status': '408'}, url='', content=''), 'table_id'
  ]
  client.tables.Get.side_effect = [None, 'table_id']
  wrapper = beam.io.gcp.bigquery_tools.BigQueryWrapper(client)

  new_table = wrapper.get_or_create_table(
      'project_id',
      'dataset_id',
      'table_id',
      bigquery.TableSchema(
          fields=[
              bigquery.TableFieldSchema(
                  name='b', type='BOOLEAN', mode='REQUIRED')
          ]),
      False,
      False)

  self.assertEqual(new_table, 'table_id')

def test_get_beam_bigquery_schema(self) -> None:
  """Test making a bigquery schema for beam's table writing."""
  test_field = {
      'string_field': ('string', 'nullable'),
      'int_field': ('integer', 'repeated'),
  }

  table_schema = beam_tables._get_beam_bigquery_schema(test_field)

  expected_field_schema_1 = beam_bigquery.TableFieldSchema()
  expected_field_schema_1.name = 'string_field'
  expected_field_schema_1.type = 'string'
  expected_field_schema_1.mode = 'nullable'

  expected_field_schema_2 = beam_bigquery.TableFieldSchema()
  expected_field_schema_2.name = 'int_field'
  expected_field_schema_2.type = 'integer'
  expected_field_schema_2.mode = 'repeated'

  expected_table_schema = beam_bigquery.TableSchema()
  expected_table_schema.fields.append(expected_field_schema_1)
  expected_table_schema.fields.append(expected_field_schema_2)

  self.assertEqual(table_schema, expected_table_schema)

def test_write_row(self):
  schema = bigquery.TableSchema(
      fields=[
          bigquery.TableFieldSchema(name='stamp', type='TIMESTAMP'),
          bigquery.TableFieldSchema(
              name='number', type='FLOAT', mode='REQUIRED'),
      ])
  stamp = datetime.datetime(2020, 2, 25, 12, 0, 0, tzinfo=pytz.utc)

  with io.BytesIO() as buf:
    # Mock close() so we can access the buffer contents
    # after AvroRowWriter is closed.
    with mock.patch.object(buf, 'close') as mock_close:
      writer = AvroRowWriter(buf, schema)
      writer.write({'stamp': stamp, 'number': float('NaN')})
      writer.close()

      mock_close.assert_called_once()

    buf.seek(0)
    records = [r for r in fastavro.reader(buf)]

  self.assertEqual(len(records), 1)
  self.assertTrue(math.isnan(records[0]['number']))
  self.assertEqual(records[0]['stamp'], stamp)

def create_schema(fields):
  table_schema = bigquery.TableSchema()
  for field in fields:
    tmp_schema = bigquery.TableFieldSchema()
    tmp_schema.name = field
    tmp_schema.mode = "nullable"
    if field == "cookies":
      tmp_schema.type = "STRING"
    elif field == "visitDate":
      tmp_schema.type = "DATE"
    else:
      tmp_schema.type = "INTEGER"
    table_schema.fields.append(tmp_schema)
  return table_schema

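# A minimal usage sketch (not from the original source) for create_schema
# above; the column names are illustrative. Only 'cookies' and 'visitDate'
# are special-cased, so any other name falls through to INTEGER.
example_schema = create_schema(['cookies', 'visitDate', 'pageViews'])
assert [f.type for f in example_schema.fields] == ['STRING', 'DATE', 'INTEGER']
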
def test_get_or_create_table(self):
  client = mock.Mock()
  client.tables.Insert.return_value = 'table_id'
  client.tables.Get.side_effect = [None, 'table_id']
  wrapper = beam.io.gcp.bigquery_tools.BigQueryWrapper(client)

  new_table = wrapper.get_or_create_table(
      'project_id',
      'dataset_id',
      'table_id',
      bigquery.TableSchema(
          fields=[
              bigquery.TableFieldSchema(
                  name='b', type='BOOLEAN', mode='REQUIRED')
          ]),
      False,
      False)

  self.assertEqual(new_table, 'table_id')

def test_generate_header_fields_from_schema_invalid_description(self):
  schema = bigquery.TableSchema()
  schema.fields.append(
      bigquery.TableFieldSchema(
          name='invalid_description',
          type=bigquery_util.TableFieldConstants.TYPE_STRING,
          mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
          description='Desc\nThis is added intentionally.'))
  header = schema_converter.generate_header_fields_from_schema(schema)

  infos = OrderedDict([
      ('invalid_description',
       Info('invalid_description', 1, 'String',
            'Desc This is added intentionally.', None, None))
  ])
  expected_header = vcf_header_io.VcfHeader(infos=infos,
                                            formats=OrderedDict())
  self.assertEqual(header, expected_header)

def test_row_as_table_row(self):
  schema_definition = [('s', 'STRING'), ('i', 'INTEGER'), ('f', 'FLOAT'),
                       ('b', 'BOOLEAN'), ('n', 'NUMERIC'), ('r', 'RECORD'),
                       ('g', 'GEOGRAPHY')]
  data_definition = [
      'abc',
      123,
      123.456,
      True,
      decimal.Decimal('987654321.987654321'),
      {'a': 'b'},
      'LINESTRING(1 2, 3 4, 5 6, 7 8)'
  ]
  str_def = (
      '{"s": "abc", '
      '"i": 123, '
      '"f": 123.456, '
      '"b": true, '
      '"n": "987654321.987654321", '
      '"r": {"a": "b"}, '
      '"g": "LINESTRING(1 2, 3 4, 5 6, 7 8)"}')
  schema = bigquery.TableSchema(
      fields=[
          bigquery.TableFieldSchema(name=k, type=v)
          for k, v in schema_definition
      ])
  coder = TableRowJsonCoder(table_schema=schema)

  def value_or_decimal_to_json(val):
    if isinstance(val, decimal.Decimal):
      return to_json_value(str(val))
    else:
      return to_json_value(val)

  test_row = bigquery.TableRow(
      f=[
          bigquery.TableCell(v=value_or_decimal_to_json(e))
          for e in data_definition
      ])

  self.assertEqual(str_def, coder.encode(test_row))
  self.assertEqual(test_row, coder.decode(coder.encode(test_row)))
  # A coder without schema can still decode.
  self.assertEqual(
      test_row, TableRowJsonCoder().decode(coder.encode(test_row)))

def test_get_annotation_names_multiple_annotations(self):
  schema = bigquery.TableSchema()
  alternate_bases_record = bigquery.TableFieldSchema(
      name=bigquery_util.ColumnKeyConstants.ALTERNATE_BASES,
      type=bigquery_util.TableFieldConstants.TYPE_RECORD,
      mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
      description='One record for each alternate base (if any).')
  annotation_record_1 = bigquery.TableFieldSchema(
      name='CSQ_1',
      type=bigquery_util.TableFieldConstants.TYPE_RECORD,
      mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
      description='desc')
  annotation_record_1.fields.append(
      bigquery.TableFieldSchema(
          name='allele',
          type=bigquery_util.TableFieldConstants.TYPE_STRING,
          mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
          description='desc.'))
  annotation_record_1.fields.append(
      bigquery.TableFieldSchema(
          name='Consequence',
          type=bigquery_util.TableFieldConstants.TYPE_STRING,
          mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
          description='desc.'))
  alternate_bases_record.fields.append(annotation_record_1)

  annotation_record_2 = bigquery.TableFieldSchema(
      name='CSQ_2',
      type=bigquery_util.TableFieldConstants.TYPE_RECORD,
      mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
      description='desc')
  annotation_record_2.fields.append(
      bigquery.TableFieldSchema(
          name='allele',
          type=bigquery_util.TableFieldConstants.TYPE_STRING,
          mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
          description='desc.'))
  annotation_record_2.fields.append(
      bigquery.TableFieldSchema(
          name='IMPACT',
          type=bigquery_util.TableFieldConstants.TYPE_STRING,
          mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
          description='desc.'))
  alternate_bases_record.fields.append(annotation_record_2)
  schema.fields.append(alternate_bases_record)

  self.assertEqual(
      bq_to_vcf._extract_annotation_names(schema),
      {'CSQ_1': ['allele', 'Consequence'], 'CSQ_2': ['allele', 'IMPACT']})

def build_bq_schema():
  table_schema = bigquery.TableSchema()

  text_field = bigquery.TableFieldSchema()
  text_field.name = 'text'
  text_field.type = 'string'
  text_field.mode = 'nullable'
  table_schema.fields.append(text_field)

  created_at_field = bigquery.TableFieldSchema()
  created_at_field.name = 'created_at'
  created_at_field.type = 'datetime'
  created_at_field.mode = 'nullable'
  table_schema.fields.append(created_at_field)

  sentiment_field = bigquery.TableFieldSchema()
  sentiment_field.name = 'sentiment'
  sentiment_field.type = 'integer'
  sentiment_field.mode = 'nullable'
  table_schema.fields.append(sentiment_field)

  # nested field
  job_field = bigquery.TableFieldSchema()
  job_field.name = 'job'
  job_field.type = 'record'
  job_field.mode = 'nullable'

  job_id_field = bigquery.TableFieldSchema()
  job_id_field.name = 'job_id'
  job_id_field.type = 'string'
  job_id_field.mode = 'nullable'
  job_field.fields.append(job_id_field)

  query_field = bigquery.TableFieldSchema()
  query_field.name = 'query'
  query_field.type = 'string'
  query_field.mode = 'nullable'
  job_field.fields.append(query_field)

  created_at_job_field = bigquery.TableFieldSchema()
  created_at_job_field.name = 'created_at'
  created_at_job_field.type = 'datetime'
  created_at_job_field.mode = 'nullable'
  job_field.fields.append(created_at_job_field)

  table_schema.fields.append(job_field)
  return table_schema

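# A hedged usage sketch (not from the original source): the nested 'job'
# record built by build_bq_schema carries its own child fields alongside
# the three top-level columns.
schema = build_bq_schema()
assert [f.name for f in schema.fields] == ['text', 'created_at', 'sentiment', 'job']
assert [f.name for f in schema.fields[3].fields] == ['job_id', 'query', 'created_at']
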
def _json_to_table_schema(self, from_json, child_schema):
  """Recursively converts a JSON object to BigQuery TableFieldSchema fields.

  Args:
    from_json: the JSON object to convert
    child_schema: the TableFieldSchema the converted fields are appended to

  Returns:
    A TableFieldSchema that can be appended to an existing TableFieldSchema
  """
  for k, v in from_json.items():
    no = bigquery.TableFieldSchema()
    no.name = k
    no.type = "string"
    no.mode = 'nullable'
    if isinstance(v, dict):
      # Nested objects become RECORD fields with their own children.
      no.type = "record"
      self._json_to_table_schema(v, no)
    child_schema.fields.append(no)
  return child_schema

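# A hedged usage sketch (not from the original source): _json_to_table_schema
# only touches `self` to recurse, so given some instance of the enclosing
# class (`parser` here is a hypothetical name) it turns a nested dict into a
# RECORD field with child fields.
record = bigquery.TableFieldSchema()
record.name = 'payload'
record.type = 'record'
record.mode = 'nullable'
parser._json_to_table_schema({'id': '1', 'meta': {'key': 'value'}}, record)
# record.fields now holds 'id' (string) and 'meta' (a nested record with 'key').
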
def test_merge_field_schemas_merge_record_fields(self):
  call_record_1 = bigquery.TableFieldSchema(
      name=ColumnKeyConstants.CALLS,
      type=TableFieldConstants.TYPE_RECORD,
      mode=TableFieldConstants.MODE_REPEATED,
      description='One record for each call.')
  call_record_1.fields.append(bigquery.TableFieldSchema(
      name='FB',
      type=TableFieldConstants.TYPE_BOOLEAN,
      mode=TableFieldConstants.MODE_NULLABLE,
      description='FORMAT foo desc'))
  field_schemas_1 = [call_record_1]

  call_record_2 = bigquery.TableFieldSchema(
      name=ColumnKeyConstants.CALLS,
      type=TableFieldConstants.TYPE_RECORD,
      mode=TableFieldConstants.MODE_REPEATED,
      description='One record for each call.')
  call_record_2.fields.append(bigquery.TableFieldSchema(
      name='GQ',
      type=TableFieldConstants.TYPE_INTEGER,
      mode=TableFieldConstants.MODE_NULLABLE,
      description='FORMAT foo desc'))
  field_schemas_2 = [call_record_2]

  call_record_3 = bigquery.TableFieldSchema(
      name=ColumnKeyConstants.CALLS,
      type=TableFieldConstants.TYPE_RECORD,
      mode=TableFieldConstants.MODE_REPEATED,
      description='One record for each call.')
  call_record_3.fields.append(bigquery.TableFieldSchema(
      name='FB',
      type=TableFieldConstants.TYPE_BOOLEAN,
      mode=TableFieldConstants.MODE_NULLABLE,
      description='FORMAT foo desc'))
  call_record_3.fields.append(bigquery.TableFieldSchema(
      name='GQ',
      type=TableFieldConstants.TYPE_INTEGER,
      mode=TableFieldConstants.MODE_NULLABLE,
      description='FORMAT foo desc'))
  expected_merged_field_schemas = [call_record_3]

  self.assertEqual(
      bigquery_util._get_merged_field_schemas(field_schemas_1,
                                              field_schemas_2),
      expected_merged_field_schemas)

def runner_schema(self):
  return bigquery.TableSchema(fields=[
      bigquery.TableFieldSchema(
          name='timestamp', type='TIMESTAMP', mode='REQUIRED'),
      bigquery.TableFieldSchema(
          name='selection_id', type='INTEGER', mode='REQUIRED'),
      bigquery.TableFieldSchema(name='name', type='STRING', mode='REQUIRED'),
      bigquery.TableFieldSchema(
          name='market_id', type='STRING', mode='REQUIRED'),
      bigquery.TableFieldSchema(
          name='event_id', type='STRING', mode='REQUIRED'),
      bigquery.TableFieldSchema(
          name='sort_priority', type='INTEGER', mode='REQUIRED'),
      bigquery.TableFieldSchema(
          name='handicap', type='FLOAT', mode='REQUIRED'),
  ])

def test_row_as_table_row(self):
  schema_definition = [('s', 'STRING'), ('i', 'INTEGER'), ('f', 'FLOAT'),
                       ('b', 'BOOLEAN'), ('r', 'RECORD')]
  data_definition = ['abc', 123, 123.456, True, {'a': 'b'}]
  str_def = '{"s": "abc", "i": 123, "f": 123.456, "b": true, "r": {"a": "b"}}'
  schema = bigquery.TableSchema(fields=[
      bigquery.TableFieldSchema(name=k, type=v)
      for k, v in schema_definition
  ])
  coder = TableRowJsonCoder(table_schema=schema)
  test_row = bigquery.TableRow(f=[
      bigquery.TableCell(v=to_json_value(e)) for e in data_definition
  ])

  self.assertEqual(str_def, coder.encode(test_row))
  self.assertEqual(test_row, coder.decode(coder.encode(test_row)))
  # A coder without schema can still decode.
  self.assertEqual(
      test_row, TableRowJsonCoder().decode(coder.encode(test_row)))

def test_generate_header_fields_from_schema_schema_compatibility(self):
  schema_conflict = bigquery.TableSchema()
  schema_conflict.fields.append(
      bigquery.TableFieldSchema(
          name='AA',
          type=bigquery_util.TableFieldConstants.TYPE_INTEGER,
          mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
          description='desc'))
  with self.assertRaises(ValueError):
    bigquery_vcf_schema_converter.generate_header_fields_from_schema(
        schema_conflict)

  header = bigquery_vcf_schema_converter.generate_header_fields_from_schema(
      schema_conflict, allow_incompatible_schema=True)
  infos = OrderedDict([('AA', Info('AA', 1, 'Integer', 'desc', None, None))])
  expected_header = vcf_header_io.VcfHeader(infos=infos,
                                            formats=OrderedDict())
  self.assertEqual(header, expected_header)

def parseSchema(self):
  """Reads the schema of the given rows, using row2 to detect hidden JSON strings.

  Returns:
    A tuple (json_columns, table_schema) where
      - json_columns is a list of the columns
        [{'name': <COLUMN_NAME>, 'is_json': <CONTAINS_A_JSON_STRING>}]
      - table_schema is the actual bigquery.TableSchema
  """
  column_names = self.row1.split(self.delimiter)
  for idx, cn in enumerate(column_names):
    column_schema = bigquery.TableFieldSchema()
    column_schema.name = cn
    column_schema.type = "string"
    column_schema.mode = 'nullable'
    is_json = False
    if self.row2 != "":
      data_values = re.findall(
          r'(?:[^\s' + self.delimiter + '"]|"(?:\\.|[^"])*"|(?<=,))+',
          self.row2)
      # Note: str.decode('string-escape') is a Python 2 idiom.
      check_json = self._is_json(
          data_values[idx].decode('string-escape').strip('"'))
      if check_json:
        # Mark the column as JSON and convert the parsed object into a
        # nested record schema.
        column_schema.type = 'record'
        is_json = True
        column_schema = self._json_to_table_schema(check_json, column_schema)
    # Record the column name and whether it holds JSON.
    self.json_columns.append({'name': cn, 'is_json': is_json})
    # Add the field to the schema.
    self.table_schema.fields.append(column_schema)
  return self.json_columns, self.getSchema()

def expand(self, pcoll):
  table_schema = bigquery.TableSchema()

  head_schema = bigquery.TableFieldSchema()
  head_schema.name = 'head'
  head_schema.type = 'string'
  head_schema.mode = 'nullable'
  table_schema.fields.append(head_schema)

  head_type_schema = bigquery.TableFieldSchema()
  head_type_schema.name = 'head_type'
  head_type_schema.type = 'string'
  head_type_schema.mode = 'nullable'
  table_schema.fields.append(head_type_schema)

  relation_schema = bigquery.TableFieldSchema()
  relation_schema.name = 'relation'
  relation_schema.type = 'string'
  relation_schema.mode = 'nullable'
  table_schema.fields.append(relation_schema)

  tail_schema = bigquery.TableFieldSchema()
  tail_schema.name = 'tail'
  tail_schema.type = 'string'
  tail_schema.mode = 'nullable'
  table_schema.fields.append(tail_schema)

  tail_type_schema = bigquery.TableFieldSchema()
  tail_type_schema.name = 'tail_type'
  tail_type_schema.type = 'string'
  tail_type_schema.mode = 'nullable'
  table_schema.fields.append(tail_type_schema)

  sentence_schema = bigquery.TableFieldSchema()
  sentence_schema.name = 'sentence'
  sentence_schema.type = 'string'
  sentence_schema.mode = 'nullable'
  table_schema.fields.append(sentence_schema)

  return (
      pcoll
      | 'Parse the json lines' >> beam.ParDo(Parse_json())
      | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
          table='relation_extraction_data',
          dataset=DATASET_ID,
          project=PROJECT_ID,
          schema=table_schema,  # Pass the defined table_schema.
          create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
          write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

def test_merge_field_schemas_same_id_no_conflicts(self):
  field_schemas_1 = [
      bigquery.TableFieldSchema(
          name='II',
          type=TableFieldConstants.TYPE_INTEGER,
          mode=TableFieldConstants.MODE_NULLABLE,
          description='INFO foo desc'),
      bigquery.TableFieldSchema(
          name='IFR',
          type=TableFieldConstants.TYPE_FLOAT,
          mode=TableFieldConstants.MODE_REPEATED,
          description='INFO foo desc')
  ]
  field_schemas_2 = [
      bigquery.TableFieldSchema(
          name='II',
          type=TableFieldConstants.TYPE_INTEGER,
          mode=TableFieldConstants.MODE_NULLABLE,
          description='INFO foo desc'),
      bigquery.TableFieldSchema(
          name='AB',
          type=TableFieldConstants.TYPE_FLOAT,
          mode=TableFieldConstants.MODE_NULLABLE,
          description='INFO foo desc')
  ]
  merged_field_schemas = bigquery_util._get_merged_field_schemas(
      field_schemas_1, field_schemas_2)
  expected_merged_field_schemas = [
      bigquery.TableFieldSchema(
          name='II',
          type=TableFieldConstants.TYPE_INTEGER,
          mode=TableFieldConstants.MODE_NULLABLE,
          description='INFO foo desc'),
      bigquery.TableFieldSchema(
          name='IFR',
          type=TableFieldConstants.TYPE_FLOAT,
          mode=TableFieldConstants.MODE_REPEATED,
          description='INFO foo desc'),
      bigquery.TableFieldSchema(
          name='AB',
          type=TableFieldConstants.TYPE_FLOAT,
          mode=TableFieldConstants.MODE_NULLABLE,
          description='INFO foo desc')
  ]
  self.assertEqual(merged_field_schemas, expected_merged_field_schemas)

def event_schema(self):
  return bigquery.TableSchema(fields=[
      bigquery.TableFieldSchema(
          name='timestamp', type='TIMESTAMP', mode='REQUIRED'),
      bigquery.TableFieldSchema(name='id', type='STRING', mode='REQUIRED'),
      bigquery.TableFieldSchema(name='name', type='STRING', mode='REQUIRED'),
      bigquery.TableFieldSchema(
          name='timezone', type='STRING', mode='REQUIRED'),
      bigquery.TableFieldSchema(
          name='market_count', type='INTEGER', mode='REQUIRED'),
      bigquery.TableFieldSchema(
          name='open_date_str', type='STRING', mode='REQUIRED'),
      bigquery.TableFieldSchema(
          name='country_code', type='STRING', mode='REQUIRED'),
  ])

def test_get_or_create_table_invalid_tablename(self):
  invalid_names = ['big-query', 'table name', 'a' * 1025]
  for table_id in invalid_names:
    client = mock.Mock()
    client.tables.Get.side_effect = [None]
    wrapper = beam.io.gcp.bigquery_tools.BigQueryWrapper(client)

    self.assertRaises(
        ValueError,
        wrapper.get_or_create_table,
        'project_id',
        'dataset_id',
        table_id,
        bigquery.TableSchema(
            fields=[
                bigquery.TableFieldSchema(
                    name='b', type='BOOLEAN', mode='REQUIRED')
            ]),
        False,
        False)

def _get_beam_bigquery_schema(
    fields: Dict[str, Tuple[str, str]]) -> beam_bigquery.TableSchema:
  """Return a beam bigquery schema for the output table.

  Args:
    fields: dict of {'field_name': ('column_type', 'column_mode')}

  Returns:
    A bigquery table schema
  """
  table_schema = beam_bigquery.TableSchema()

  for (name, (field_type, mode)) in fields.items():
    field_schema = beam_bigquery.TableFieldSchema()
    field_schema.name = name
    field_schema.type = field_type
    field_schema.mode = mode
    table_schema.fields.append(field_schema)

  return table_schema

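# A minimal usage sketch (not from the original source); the field spec
# mirrors the {'field_name': ('column_type', 'column_mode')} shape from the
# docstring, and the names here are illustrative. Each entry becomes one
# TableFieldSchema, appended in dict iteration order.
example = _get_beam_bigquery_schema({
    'domain': ('string', 'nullable'),
    'ports': ('integer', 'repeated'),
})
assert [f.name for f in example.fields] == ['domain', 'ports']
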
def market_schema(self):
  return bigquery.TableSchema(fields=[
      bigquery.TableFieldSchema(
          name='timestamp', type='TIMESTAMP', mode='REQUIRED'),
      bigquery.TableFieldSchema(name='id', type='STRING', mode='REQUIRED'),
      bigquery.TableFieldSchema(name='name', type='STRING', mode='REQUIRED'),
      bigquery.TableFieldSchema(
          name='total_matched', type='FLOAT', mode='REQUIRED'),
      bigquery.TableFieldSchema(
          name='event_id', type='STRING', mode='REQUIRED'),
      bigquery.TableFieldSchema(
          name='start_time', type='TIMESTAMP', mode='NULLABLE'),
  ])

def _parse_schema_field(field):
  """Parse a single schema field from a dictionary.

  Args:
    field: Dictionary object containing serialized schema.

  Returns:
    A TableFieldSchema for a single column in BigQuery.
  """
  schema = bigquery.TableFieldSchema()
  schema.name = field['name']
  schema.type = field['type']
  if 'mode' in field:
    schema.mode = field['mode']
  else:
    schema.mode = 'NULLABLE'
  if 'description' in field:
    schema.description = field['description']
  if 'fields' in field:
    # Nested fields recurse into child TableFieldSchema objects.
    schema.fields = [_parse_schema_field(x) for x in field['fields']]
  return schema

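# A hedged usage sketch (the field dict shape is assumed from the docstring):
# 'mode' defaults to NULLABLE when omitted, and a nested 'fields' list is
# parsed recursively.
parsed = _parse_schema_field({
    'name': 'record_field',
    'type': 'RECORD',
    'fields': [{'name': 'child', 'type': 'STRING', 'description': 'a child'}],
})
assert parsed.mode == 'NULLABLE' and parsed.fields[0].name == 'child'
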
def get_bigquery_schema():
  """Returns a bigquery schema."""
  from apache_beam.io.gcp.internal.clients import bigquery

  table_schema = bigquery.TableSchema()
  columns = (('topics', 'string', 'nullable'),
             ('title', 'string', 'nullable'),
             ('content', 'string', 'nullable'),
             ('title_embed', 'float', 'repeated'),
             ('content_embed', 'float', 'repeated'))

  for column in columns:
    column_schema = bigquery.TableFieldSchema()
    column_schema.name = column[0]
    column_schema.type = column[1]
    column_schema.mode = column[2]
    table_schema.fields.append(column_schema)

  return table_schema

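# A minimal usage sketch (not from the original source): the generated schema
# mirrors the `columns` tuples above, one field per tuple.
schema = get_bigquery_schema()
assert len(schema.fields) == 5
assert schema.fields[3].mode == 'repeated'  # title_embed
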
def _get_table_schema(self):
  # type: () -> bigquery.TableSchema
  schema = bigquery.TableSchema()
  schema.fields.append(
      bigquery.TableFieldSchema(
          name='field_1',
          type=Consts.TYPE_STRING,
          mode=Consts.MODE_NULLABLE))
  schema.fields.append(
      bigquery.TableFieldSchema(
          name='field_2',
          type=Consts.TYPE_INTEGER,
          mode=Consts.MODE_REPEATED))
  # Record field.
  record_field = bigquery.TableFieldSchema(
      name='record_1',
      type=Consts.TYPE_RECORD,
      mode=Consts.MODE_REPEATED)
  record_field.fields.append(
      bigquery.TableFieldSchema(
          name='record_1_field_1',
          type=Consts.TYPE_BOOLEAN,
          mode=Consts.MODE_NULLABLE))
  record_field.fields.append(
      bigquery.TableFieldSchema(
          name='record_1_field_2',
          type=Consts.TYPE_FLOAT,
          mode=Consts.MODE_REPEATED))
  # Record field, two levels deep.
  deep_record_field = bigquery.TableFieldSchema(
      name='record_1-record_2',
      type=Consts.TYPE_RECORD,
      mode=Consts.MODE_REPEATED)
  deep_record_field.fields.append(
      bigquery.TableFieldSchema(
          name='record_1-record_2_field_1',
          type=Consts.TYPE_BOOLEAN,
          mode=Consts.MODE_NULLABLE))
  record_field.fields.append(deep_record_field)
  schema.fields.append(record_field)
  return schema