def testReadandValidateMultipleNestedSchemaFromFile(self):
  infile = os.path.join(self.dirname, 'test_multiple_nested_schema_file')
  f = open(infile, 'wt')
  f.write(test_util.GetJobsSchemaString())
  f.close()
  read_schema = load_lib.ReadSchemaFile(infile)
  load_lib._ValidateExtendedSchema(read_schema)
  expected_schema = json.loads(test_util.GetJobsSchemaString())
  self.assertEquals(expected_schema, read_schema)
  # Append some non-JSON text and check that reading now fails.
  f = open(infile, 'at')
  f.write('bogus')
  f.close()
  try:
    load_lib.ReadSchemaFile(infile)
    self.fail()
  except ValueError:
    pass  # success
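
# --- Illustrative sketch (not part of the original source) ---
# The test above pins down a small contract for load_lib.ReadSchemaFile:
# it parses the file contents as JSON and raises ValueError on malformed
# input (such as the appended 'bogus' text). A minimal stand-in honoring
# that contract could look like the hypothetical helper below; the real
# load_lib implementation may do more (e.g. extended-schema handling).

import json


def _read_schema_file_sketch(schema_path):
  """Hypothetical stand-in for load_lib.ReadSchemaFile."""
  with open(schema_path, 'rt') as f:
    # json.loads raises ValueError (JSONDecodeError in Python 3) when
    # trailing non-JSON text is present, which is exactly the failure
    # the test provokes.
    return json.loads(f.read())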
def CreateTable(self, reference, ignore_existing=False, schema=None,
                description=None, friendly_name=None, expiration=None):
  """Create a table corresponding to TableReference.

  Arguments:
    reference: the TableReference to create.
    ignore_existing: (boolean, default False) If False, raise an exception
      if the table already exists.
    schema: a required schema (a master key is also required).
    description: an optional table description.
    friendly_name: an optional friendly name for the table.
    expiration: optional expiration time in milliseconds since the epoch.

  Raises:
    TypeError: if reference is not a TableReference.
    BigqueryDuplicateError: if reference exists and ignore_existing is False.
  """
  if schema is None:
    raise bigquery_client.BigqueryNotFoundError(
        'A schema must be specified when making a table.', None, None, None)
  self._CheckKeyfileFlag()
  schema = load_lib.ReadSchemaFile(schema)
  master_key = load_lib.ReadMasterKeyFile(self.master_key_filename, True)
  # pylint: disable=too-many-function-args
  hashed_key = base64.b64encode(hashlib.sha1(master_key).digest())
  cipher = ecrypto.ProbabilisticCipher(master_key)
  # Compress and encrypt the plaintext schema; it is stored, base64-encoded,
  # in the table description alongside a hash of the master key.
  pretty_schema = json.dumps(schema)
  pretty_schema = pretty_schema.encode('utf-8')
  pretty_schema = zlib.compress(pretty_schema)
  encrypted_schema = base64.b64encode(cipher.Encrypt(pretty_schema))
  if description is None:
    description = ''
  new_description = util.ConstructTableDescription(
      description, hashed_key, util.EBQ_TABLE_VERSION, encrypted_schema)
  new_schema = load_lib.RewriteSchema(schema)
  super(EncryptedBigqueryClient, self).CreateTable(
      reference, ignore_existing, new_schema, new_description,
      friendly_name, expiration)
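
# --- Illustrative sketch (not part of the original source) ---
# The core of CreateTable above is the schema-protection pipeline:
# json.dumps -> utf-8 encode -> zlib.compress -> encrypt -> base64.
# The sketch below demonstrates that pipeline and its inverse using only
# the standard library. _XorCipher is a toy, NON-secure placeholder for
# ecrypto.ProbabilisticCipher, which is project-specific.

import base64
import json
import zlib


class _XorCipher(object):
  """Toy reversible cipher; XOR with a repeating key (illustration only)."""

  def __init__(self, key):
    self._key = key

  def Encrypt(self, data):
    key = self._key
    return bytes(b ^ key[i % len(key)] for i, b in enumerate(data))

  Decrypt = Encrypt  # XOR is its own inverse.


def _pack_schema(schema, cipher):
  """Compress, encrypt, and base64 a schema, as CreateTable does."""
  raw = json.dumps(schema).encode('utf-8')
  return base64.b64encode(cipher.Encrypt(zlib.compress(raw)))


def _unpack_schema(blob, cipher):
  """Inverse pipeline, mirroring the schema check performed in Load."""
  raw = zlib.decompress(cipher.Decrypt(base64.b64decode(blob)))
  return json.loads(raw.decode('utf-8'))


# Round trip, mirroring what Load later verifies against the description:
#   cipher = _XorCipher(b'master-key-bytes')
#   blob = _pack_schema([{'name': 'x', 'type': 'STRING'}], cipher)
#   assert _unpack_schema(blob, cipher) == [{'name': 'x', 'type': 'STRING'}]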
def Load(self, destination_table, source, schema=None, **kwds):
  """Encrypt the given data and then load it into BigQuery.

  The job will execute synchronously if sync=True is provided as an argument.

  Args:
    destination_table: TableReference to load data into.
    source: String specifying source data to load.
    schema: The schema that defines fields to be loaded.
    **kwds: Passed on to self.ExecuteJob.

  Returns:
    The resulting job info.
  """
  self._CheckKeyfileFlag()
  self._CheckSchemaFile(schema)
  # To make encryption more secure, we use a different key for each table
  # and cipher. Generating a per-table key requires a distinct identifier
  # for each table. A table name is not safe to use, since a table can be
  # deleted and re-created with the same name (and would thus get the same
  # key). The only distinct identifier is the creation time, so we create
  # the table if it does not already exist and use its creation time when
  # encrypting values.
  try:
    self.CreateTable(destination_table, schema=schema)
  except bigquery_client.BigqueryDuplicateError:
    pass  # Table already exists.
  temp_dir = tempfile.mkdtemp()
  orig_schema = load_lib.ReadSchemaFile(schema)
  new_schema = load_lib.RewriteSchema(orig_schema)
  new_schema_file = '%s/schema.enc_schema' % temp_dir
  # Write the rewritten schema as a JSON file.
  with open(new_schema_file, 'wt') as f:
    json.dump(new_schema, f, indent=2)
  new_source_file = '%s/data.enc_data' % temp_dir
  # TODO(user): Put the filepath to the master key in .bigqueryrc file.
  master_key = load_lib.ReadMasterKeyFile(self.master_key_filename, True)
  table_name = str(destination_table).split(':')[-1]
  table_id = '%s_%s' % (
      table_name, self._GetTableCreationTime(str(destination_table)))
  hashed_table_key, table_version, table_schema = self._GetEBQTableInfo(
      str(destination_table))
  hashed_master_key = hashlib.sha1(master_key)
  # pylint: disable=too-many-function-args
  hashed_master_key = base64.b64encode(hashed_master_key.digest())
  if hashed_master_key != hashed_table_key:
    raise bigquery_client.BigqueryAccessDeniedError(
        'Invalid master key for this table.', None, None, None)
  if table_version != util.EBQ_TABLE_VERSION:
    raise bigquery_client.BigqueryNotFoundError(
        'Invalid table version.', None, None, None)
  # TODO(user): Generate a different key.
  cipher = ecrypto.ProbabilisticCipher(master_key)
  # Recover the plaintext schema stored in the table description and check
  # that it matches the schema we were asked to load against.
  table_schema = cipher.Decrypt(base64.b64decode(table_schema), raw=True)
  table_schema = zlib.decompress(table_schema)
  table_schema = table_schema.decode('utf-8')
  table_schema = json.loads(table_schema)
  if table_schema != orig_schema:
    raise bigquery_client.BigqueryAccessDeniedError(
        'Invalid schema for this table.', None, None, None)
  if kwds['source_format'] == 'NEWLINE_DELIMITED_JSON':
    load_lib.ConvertJsonDataFile(
        orig_schema, master_key, table_id, source, new_source_file)
  elif kwds['source_format'] == 'CSV' or not kwds['source_format']:
    load_lib.ConvertCsvDataFile(
        orig_schema, master_key, table_id, source, new_source_file)
  else:
    raise app.UsageError(
        'Currently, we do not allow loading from file types other than\n'
        'NEWLINE_DELIMITED_JSON and CSV.')
  job = super(EncryptedBigqueryClient, self).Load(
      destination_table, new_source_file, schema=new_schema_file, **kwds)
  try:
    shutil.rmtree(temp_dir)
  except OSError:
    raise OSError('Temp file deleted by user before termination.')
  return job
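
# --- Illustrative sketch (not part of the original source) ---
# Load above gates everything on a fingerprint of the master key that was
# stored in the table description at CreateTable time. That fingerprint is
# just base64(sha1(master_key)), which the helper below reproduces; note
# that sha1 here identifies the key, it does not protect it.

import base64
import hashlib


def _key_fingerprint(master_key):
  """Fingerprint compared against the table's stored hashed key."""
  return base64.b64encode(hashlib.sha1(master_key).digest())


# A hypothetical caller-side view of the flow; client construction details
# (credentials, keyfile flags) are assumptions, only the Load signature and
# the two accepted source formats come from the method itself:
#
#   client = EncryptedBigqueryClient(...)  # keyfile/credentials omitted
#   job = client.Load(
#       destination_table,        # a TableReference
#       'data.csv',               # local file to encrypt and load
#       schema='schema.json',     # extended EBQ schema file
#       source_format='CSV',      # or 'NEWLINE_DELIMITED_JSON'
#       sync=True)                # block until the job completes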