def set_mean(accumulator, metric_update):
  if accumulator.count:
    metric_update.meanSum = to_json_value(accumulator.sum, with_type=True)
    metric_update.meanCount = to_json_value(accumulator.count, with_type=True)
  else:
    # A denominator of 0 will raise an error in the service.
    # What it means is we have nothing to report yet, so don't.
    metric_update.kind = None
def test_row_as_table_row(self):
  schema_definition = [
      ('s', 'STRING'),
      ('i', 'INTEGER'),
      ('f', 'FLOAT'),
      ('b', 'BOOLEAN'),
      ('r', 'RECORD')]
  data_definition = ['abc', 123, 123.456, True, {'a': 'b'}]
  str_def = '{"s": "abc", "i": 123, "f": 123.456, "b": true, "r": {"a": "b"}}'
  schema = bigquery.TableSchema(
      fields=[bigquery.TableFieldSchema(name=k, type=v)
              for k, v in schema_definition])
  coder = TableRowJsonCoder(table_schema=schema)
  test_row = bigquery.TableRow(
      f=[bigquery.TableCell(v=to_json_value(e)) for e in data_definition])

  self.assertEqual(str_def, coder.encode(test_row))
  self.assertEqual(test_row, coder.decode(coder.encode(test_row)))
  # A coder without schema can still decode.
  self.assertEqual(
      test_row, TableRowJsonCoder().decode(coder.encode(test_row)))
def test_rows_are_written(self):
  client = mock.Mock()
  table = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId='project', datasetId='dataset', tableId='table'),
      schema=bigquery.TableSchema())
  client.tables.Get.return_value = table
  write_disposition = beam.io.BigQueryDisposition.WRITE_APPEND

  insert_response = mock.Mock()
  insert_response.insertErrors = []
  client.tabledata.InsertAll.return_value = insert_response

  with beam.io.BigQuerySink(
      'project:dataset.table',
      write_disposition=write_disposition).writer(client) as writer:
    writer.Write({'i': 1, 'b': True, 's': 'abc', 'f': 3.14})

  sample_row = {'i': 1, 'b': True, 's': 'abc', 'f': 3.14}
  expected_rows = []
  json_object = bigquery.JsonObject()
  for k, v in sample_row.iteritems():
    json_object.additionalProperties.append(
        bigquery.JsonObject.AdditionalProperty(
            key=k, value=to_json_value(v)))
  expected_rows.append(
      bigquery.TableDataInsertAllRequest.RowsValueListEntry(
          insertId='_1',  # First row ID generated with prefix ''
          json=json_object))
  client.tabledata.InsertAll.assert_called_with(
      bigquery.BigqueryTabledataInsertAllRequest(
          projectId='project', datasetId='dataset', tableId='table',
          tableDataInsertAllRequest=bigquery.TableDataInsertAllRequest(
              rows=expected_rows)))
def insert_rows(self, project_id, dataset_id, table_id, rows):
  """Inserts rows into the specified table.

  Args:
    project_id: The project id owning the table.
    dataset_id: The dataset id owning the table.
    table_id: The table id.
    rows: A list of plain Python dictionaries. Each dictionary is a row and
      each key in it is the name of a field.

  Returns:
    A tuple (bool, errors). If the first element is False then the second
    element will be a bigquery.InsertErrorsValueListEntry instance containing
    specific errors.
  """
  # Prepare rows for insertion. Of special note is the row ID that we add to
  # each row in order to help BigQuery avoid inserting a row multiple times.
  # BigQuery will do a best-effort de-duplication of rows that carry the same
  # unique ID; duplicates can happen during retries on failures.
  # TODO(silviuc): Must add support to writing TableRow's instead of dicts.
  final_rows = []
  for row in rows:
    json_object = bigquery.JsonObject()
    for k, v in row.iteritems():
      json_object.additionalProperties.append(
          bigquery.JsonObject.AdditionalProperty(
              key=k, value=to_json_value(v)))
    final_rows.append(
        bigquery.TableDataInsertAllRequest.RowsValueListEntry(
            insertId=str(self.unique_row_id),
            json=json_object))
  result, errors = self._insert_all_rows(
      project_id, dataset_id, table_id, final_rows)
  return result, errors
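# A minimal usage sketch for insert_rows (illustrative only, not part of the
# source): `bq_wrapper` is a hypothetical instance of the class that defines
# insert_rows above, and the project/dataset/table IDs and row dicts are
# made-up example values.
def example_insert_rows(bq_wrapper):
  rows = [
      {'name': 'alice', 'score': 10},
      {'name': 'bob', 'score': 7}]
  passed, errors = bq_wrapper.insert_rows(
      'my-project', 'my_dataset', 'my_table', rows)
  if not passed:
    # On failure the second element carries the service-reported insert errors.
    raise RuntimeError('BigQuery insert failed: %s' % errors)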
def test_row_and_no_schema(self):
  coder = TableRowJsonCoder()
  test_row = bigquery.TableRow(
      f=[bigquery.TableCell(v=to_json_value(e))
         for e in ['abc', 123, 123.456, True]])
  with self.assertRaises(AttributeError) as ctx:
    coder.encode(test_row)
  self.assertTrue(
      ctx.exception.message.startswith('The TableRowJsonCoder requires'))
def json_compliance_exception(self, value):
  with self.assertRaises(ValueError) as exn:
    schema_definition = [('f', 'FLOAT')]
    schema = bigquery.TableSchema(
        fields=[bigquery.TableFieldSchema(name=k, type=v)
                for k, v in schema_definition])
    coder = TableRowJsonCoder(table_schema=schema)
    test_row = bigquery.TableRow(
        f=[bigquery.TableCell(v=to_json_value(value))])
    coder.encode(test_row)
  self.assertTrue(bigquery.JSON_COMPLIANCE_ERROR in exn.exception.message)
def test_metric_update_basic(self):
  metric_update = dataflow.MetricUpdate()
  metric_update.name = dataflow.MetricStructuredName()
  metric_update.name.name = 'metric1'
  metric_update.name.origin = 'origin1'

  metric_update.cumulative = False
  metric_update.kind = 'sum'
  metric_update.scalar = to_json_value(1, with_type=True)

  name_matcher = message_matchers.MetricStructuredNameMatcher(
      name='metric1', origin='origin1')
  matcher = message_matchers.MetricUpdateMatcher(
      name=name_matcher, kind='sum', scalar=1)

  hc.assert_that(metric_update, hc.is_(matcher))

  with self.assertRaises(AssertionError):
    matcher.kind = 'suma'
    hc.assert_that(metric_update, hc.is_(matcher))
def test_large_integer(self):
  num = 1 << 35
  self.assertEquals(num, from_json_value(to_json_value(num)))
  self.assertEquals(long(num), from_json_value(to_json_value(long(num))))
def test_long_value(self):
  self.assertEquals(long(27), from_json_value(to_json_value(long(27))))
def test_with_type(self):
  rt = from_json_value(to_json_value('abcd', with_type=True))
  self.assertEquals('http://schema.org/Text', rt['@type'])
  self.assertEquals('abcd', rt['value'])
def test_none_from(self):
  self.assertIsNone(from_json_value(to_json_value(None)))
def test_true_to(self):
  self.assertEquals(JsonValue(boolean_value=True), to_json_value(True))
def test_float_from(self):
  self.assertEquals(4.5, from_json_value(to_json_value(4.5)))
def set_scalar(accumulator, metric_update):
  metric_update.scalar = to_json_value(accumulator.value, with_type=True)
def add_property(self, name, value, with_type=False):
  self._additional_properties.append((name, value, with_type))
  self.proto.properties.additionalProperties.append(
      dataflow.Step.PropertiesValue.AdditionalProperty(
          key=name, value=to_json_value(value, with_type=with_type)))
def test_none_to(self):
  self.assertEquals(JsonValue(is_null=True), to_json_value(None))
def decode(self, encoded_table_row):
  od = json.loads(
      encoded_table_row, object_pairs_hook=collections.OrderedDict)
  return bigquery.TableRow(
      f=[bigquery.TableCell(v=to_json_value(e)) for e in od.itervalues()])
def test_float_to(self):
  self.assertEquals(JsonValue(double_value=2.75), to_json_value(2.75))
def test_int_to(self):
  self.assertEquals(JsonValue(integer_value=14), to_json_value(14))
def test_false_to(self):
  self.assertEquals(JsonValue(boolean_value=False), to_json_value(False))
def test_too_long_value(self):
  with self.assertRaises(TypeError):
    to_json_value(long(1 << 64))
def test_false_from(self):
  self.assertEquals(False, from_json_value(to_json_value(False)))
def get_test_rows(self):
  now = time.time()
  dt = datetime.datetime.utcfromtimestamp(float(now))
  ts = dt.strftime('%Y-%m-%d %H:%M:%S.%f UTC')
  expected_rows = [{
      'i': 1,
      's': 'abc',
      'f': 2.3,
      'b': True,
      't': ts,
      'dt': '2016-10-31',
      'ts': '22:39:12.627498',
      'dt_ts': '2008-12-25T07:30:00',
      'r': {'s2': 'b'},
      'rpr': [{
          's3': 'c',
          'rpr2': [{
              'rs': ['d', 'e'],
              's4': 'f'
          }]
      }]
  }, {
      'i': 10,
      's': 'xyz',
      'f': -3.14,
      'b': False,
      'rpr': []
  }]

  nested_schema = [
      bigquery.TableFieldSchema(name='s2', type='STRING', mode='NULLABLE')]
  nested_schema_2 = [
      bigquery.TableFieldSchema(name='s3', type='STRING', mode='NULLABLE'),
      bigquery.TableFieldSchema(
          name='rpr2', type='RECORD', mode='REPEATED',
          fields=[
              bigquery.TableFieldSchema(
                  name='rs', type='STRING', mode='REPEATED'),
              bigquery.TableFieldSchema(
                  name='s4', type='STRING', mode='NULLABLE')])]

  schema = bigquery.TableSchema(fields=[
      bigquery.TableFieldSchema(name='b', type='BOOLEAN', mode='REQUIRED'),
      bigquery.TableFieldSchema(name='f', type='FLOAT', mode='REQUIRED'),
      bigquery.TableFieldSchema(name='i', type='INTEGER', mode='REQUIRED'),
      bigquery.TableFieldSchema(name='s', type='STRING', mode='REQUIRED'),
      bigquery.TableFieldSchema(name='t', type='TIMESTAMP', mode='NULLABLE'),
      bigquery.TableFieldSchema(name='dt', type='DATE', mode='NULLABLE'),
      bigquery.TableFieldSchema(name='ts', type='TIME', mode='NULLABLE'),
      bigquery.TableFieldSchema(name='dt_ts', type='DATETIME', mode='NULLABLE'),
      bigquery.TableFieldSchema(
          name='r', type='RECORD', mode='NULLABLE', fields=nested_schema),
      bigquery.TableFieldSchema(
          name='rpr', type='RECORD', mode='REPEATED',
          fields=nested_schema_2)])

  table_rows = [
      bigquery.TableRow(f=[
          bigquery.TableCell(v=to_json_value('true')),
          bigquery.TableCell(v=to_json_value(str(2.3))),
          bigquery.TableCell(v=to_json_value(str(1))),
          bigquery.TableCell(v=to_json_value('abc')),
          # For timestamps cannot use str() because it will truncate the
          # number representing the timestamp.
          bigquery.TableCell(v=to_json_value('%f' % now)),
          bigquery.TableCell(v=to_json_value('2016-10-31')),
          bigquery.TableCell(v=to_json_value('22:39:12.627498')),
          bigquery.TableCell(v=to_json_value('2008-12-25T07:30:00')),
          # For record we cannot use dict because it doesn't create nested
          # schemas correctly so we have to use this f,v based format.
          bigquery.TableCell(v=to_json_value({'f': [{'v': 'b'}]})),
          bigquery.TableCell(v=to_json_value([{'v': {'f': [
              {'v': 'c'},
              {'v': [{'v': {'f': [
                  {'v': [{'v': 'd'}, {'v': 'e'}]},
                  {'v': 'f'}]}}]}]}}]))
      ]),
      bigquery.TableRow(f=[
          bigquery.TableCell(v=to_json_value('false')),
          bigquery.TableCell(v=to_json_value(str(-3.14))),
          bigquery.TableCell(v=to_json_value(str(10))),
          bigquery.TableCell(v=to_json_value('xyz')),
          bigquery.TableCell(v=None),
          bigquery.TableCell(v=None),
          bigquery.TableCell(v=None),
          bigquery.TableCell(v=None),
          bigquery.TableCell(v=None),
          bigquery.TableCell(v=to_json_value([]))
      ])
  ]
  return table_rows, schema, expected_rows
def test_int_from(self):
  self.assertEquals(-27, from_json_value(to_json_value(-27)))
def test_string_from(self):
  self.assertEquals('WXYZ', from_json_value(to_json_value('WXYZ')))
def test_string_to(self):
  self.assertEquals(JsonValue(string_value='abc'), to_json_value('abc'))
def test_true_from(self):
  self.assertEquals(True, from_json_value(to_json_value(True)))
def __init__(self, packages, options, environment_version):
  self.standard_options = options.view_as(StandardOptions)
  self.google_cloud_options = options.view_as(GoogleCloudOptions)
  self.worker_options = options.view_as(WorkerOptions)
  self.debug_options = options.view_as(DebugOptions)
  self.proto = dataflow.Environment()
  self.proto.clusterManagerApiService = GoogleCloudOptions.COMPUTE_API_SERVICE
  self.proto.dataset = '{}/cloud_dataflow'.format(
      GoogleCloudOptions.BIGQUERY_API_SERVICE)
  self.proto.tempStoragePrefix = (
      self.google_cloud_options.temp_location.replace(
          'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE))
  # User agent information.
  self.proto.userAgent = dataflow.Environment.UserAgentValue()
  self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

  if self.google_cloud_options.service_account_email:
    self.proto.serviceAccountEmail = (
        self.google_cloud_options.service_account_email)

  sdk_name, version_string = get_sdk_name_and_version()

  self.proto.userAgent.additionalProperties.extend([
      dataflow.Environment.UserAgentValue.AdditionalProperty(
          key='name', value=to_json_value(sdk_name)),
      dataflow.Environment.UserAgentValue.AdditionalProperty(
          key='version', value=to_json_value(version_string))])
  # Version information.
  self.proto.version = dataflow.Environment.VersionValue()
  if self.standard_options.streaming:
    job_type = 'PYTHON_STREAMING'
  else:
    job_type = 'PYTHON_BATCH'
  self.proto.version.additionalProperties.extend([
      dataflow.Environment.VersionValue.AdditionalProperty(
          key='job_type', value=to_json_value(job_type)),
      dataflow.Environment.VersionValue.AdditionalProperty(
          key='major', value=to_json_value(environment_version))])
  # Experiments
  if self.debug_options.experiments:
    for experiment in self.debug_options.experiments:
      self.proto.experiments.append(experiment)
  # Worker pool(s) information.
  package_descriptors = []
  for package in packages:
    package_descriptors.append(
        dataflow.Package(
            location='%s/%s' % (
                self.google_cloud_options.staging_location.replace(
                    'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE),
                package),
            name=package))

  pool = dataflow.WorkerPool(
      kind='local' if self.local else 'harness',
      packages=package_descriptors,
      taskrunnerSettings=dataflow.TaskRunnerSettings(
          parallelWorkerSettings=dataflow.WorkerSettings(
              baseUrl=GoogleCloudOptions.DATAFLOW_ENDPOINT,
              servicePath=self.google_cloud_options.dataflow_endpoint)))
  pool.autoscalingSettings = dataflow.AutoscalingSettings()
  # Set worker pool options received through command line.
  if self.worker_options.num_workers:
    pool.numWorkers = self.worker_options.num_workers
  if self.worker_options.max_num_workers:
    pool.autoscalingSettings.maxNumWorkers = (
        self.worker_options.max_num_workers)
  if self.worker_options.autoscaling_algorithm:
    values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
    pool.autoscalingSettings.algorithm = {
        'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
        'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
    }.get(self.worker_options.autoscaling_algorithm)
  if self.worker_options.machine_type:
    pool.machineType = self.worker_options.machine_type
  if self.worker_options.disk_size_gb:
    pool.diskSizeGb = self.worker_options.disk_size_gb
  if self.worker_options.disk_type:
    pool.diskType = self.worker_options.disk_type
  if self.worker_options.zone:
    pool.zone = self.worker_options.zone
  if self.worker_options.network:
    pool.network = self.worker_options.network
  if self.worker_options.worker_harness_container_image:
    pool.workerHarnessContainerImage = (
        self.worker_options.worker_harness_container_image)
  else:
    # Default to using the worker harness container image for the current
    # SDK version.
    pool.workerHarnessContainerImage = (
        'dataflow.gcr.io/v1beta3/python:%s' %
        get_required_container_version())
  if self.worker_options.use_public_ips is not None:
    if self.worker_options.use_public_ips:
      pool.ipConfiguration = (
          dataflow.WorkerPool
          .IpConfigurationValueValuesEnum.WORKER_IP_PUBLIC)
    else:
      pool.ipConfiguration = (
          dataflow.WorkerPool
          .IpConfigurationValueValuesEnum.WORKER_IP_PRIVATE)

  if self.standard_options.streaming:
    # Use separate data disk for streaming.
    disk = dataflow.Disk()
    if self.local:
      disk.diskType = 'local'
    # TODO(ccy): allow customization of disk.
    pool.dataDisks.append(disk)
  self.proto.workerPools.append(pool)

  sdk_pipeline_options = options.get_all_options()
  if sdk_pipeline_options:
    self.proto.sdkPipelineOptions = (
        dataflow.Environment.SdkPipelineOptionsValue())

    options_dict = {k: v
                    for k, v in sdk_pipeline_options.iteritems()
                    if v is not None}
    self.proto.sdkPipelineOptions.additionalProperties.append(
        dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
            key='options', value=to_json_value(options_dict)))

    dd = DisplayData.create_from_options(options)
    items = [item.get_dict() for item in dd.items]
    self.proto.sdkPipelineOptions.additionalProperties.append(
        dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
            key='display_data', value=to_json_value(items)))