def testReadMasterKeyFile(self):
  infile = os.path.join(self.dirname, 'created_master_key_file')
  self.assertFalse(os.path.exists(infile))
  load_lib._CreateAndStoreMasterKeyFile(infile)
  self.assertTrue(os.path.exists(infile))
  with open(infile, 'rt') as f:
    master_key = base64.b64decode(f.read())
  self.assertEqual(master_key, load_lib.ReadMasterKeyFile(infile))
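
# A minimal sketch of the key-file layout the test above exercises: the file
# stores a single base64-encoded master key, so reading it back is one
# read-and-decode. _write_key_file/_read_key_file are hypothetical stand-ins
# for load_lib's helpers, and the 32-byte key size is an assumption.
import base64
import os


def _write_key_file(path, key_bytes):
  # Store the raw key as base64 text, mirroring what the test decodes.
  with open(path, 'wt') as f:
    f.write(base64.b64encode(key_bytes).decode('ascii'))


def _read_key_file(path):
  with open(path, 'rt') as f:
    return base64.b64decode(f.read())


if __name__ == '__main__':
  key = os.urandom(32)  # Assumed key size; load_lib may differ.
  _write_key_file('/tmp/demo_master_key', key)
  assert _read_key_file('/tmp/demo_master_key') == key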
def Query(self, query, **kwds):
  """Execute the given query, returning the created job and info for print.

  Arguments:
    query: Query to execute.
    **kwds: Passed on to BigqueryClient.ExecuteJob.

  Returns:
    The resulting job info. Printing info is installed on the client's
    table printer as a side effect.
  """
  self._CheckKeyfileFlag()
  master_key = load_lib.ReadMasterKeyFile(self.master_key_filename)
  try:
    clauses = parser.ParseQuery(query)
  except ParseException as e:
    raise bigquery_client.BigqueryInvalidQueryError(e, None, None, None)
  if clauses['FROM']:
    table_id = '%s_%s' % (
        clauses['FROM'][0],
        self._GetTableCreationTime(clauses['FROM'][0]))
    hashed_table_key, table_version, table_schema = self._GetEBQTableInfo(
        clauses['FROM'][0])
    hashed_master_key = hashlib.sha1(master_key)
    # pylint: disable=too-many-function-args
    hashed_master_key = base64.b64encode(hashed_master_key.digest())
    if hashed_master_key != hashed_table_key:
      raise bigquery_client.BigqueryAccessDeniedError(
          'Invalid master key for this table.', None, None, None)
    if table_version != util.EBQ_TABLE_VERSION:
      raise bigquery_client.BigqueryNotFoundError(
          'Invalid table version.', None, None, None)
    cipher = ecrypto.ProbabilisticCipher(master_key)
    orig_schema = zlib.decompress(
        cipher.Decrypt(base64.b64decode(table_schema), raw=True))
    orig_schema = json.loads(orig_schema.decode('utf-8'))
  else:
    table_id = None
    orig_schema = []
  manifest = query_lib.QueryManifest.Generate()
  rewritten_query, print_args = query_lib.RewriteQuery(
      clauses, orig_schema, master_key, table_id, manifest)
  job = super(EncryptedBigqueryClient, self).Query(rewritten_query, **kwds)
  self._LoadJobStatistics(manifest, job)
  printer = EncryptedTablePrinter(**print_args)
  bq.Factory.ClientTablePrinter.SetTablePrinter(printer)
  return job
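
# The access check in Query compares key fingerprints, not keys: the table's
# description stores base64(sha1(master_key)), so the client can verify it
# holds the right key without the service ever seeing the key itself. A
# minimal sketch of that comparison (function names are illustrative):
import base64
import hashlib


def _key_fingerprint(master_key):
  # Matches the stored form: base64 of the SHA-1 digest of the raw key.
  return base64.b64encode(hashlib.sha1(master_key).digest())


def _check_key(master_key, hashed_table_key):
  if _key_fingerprint(master_key) != hashed_table_key:
    raise ValueError('Invalid master key for this table.')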
def CreateTable(self,
                reference,
                ignore_existing=False,
                schema=None,
                description=None,
                friendly_name=None,
                expiration=None):
  """Create a table corresponding to TableReference.

  Arguments:
    reference: the TableReference to create.
    ignore_existing: (boolean, default False) If False, raise an exception
      if the table already exists.
    schema: a required schema (also requires a master key).
    description: an optional table description.
    friendly_name: an optional friendly name for the table.
    expiration: optional expiration time in milliseconds since the epoch.

  Raises:
    TypeError: if reference is not a TableReference.
    BigqueryDuplicateError: if reference exists and ignore_existing
      is False.
  """
  if schema is None:
    raise bigquery_client.BigqueryNotFoundError(
        'A schema must be specified when making a table.', None, None, None)
  self._CheckKeyfileFlag()
  schema = load_lib.ReadSchemaFile(schema)
  master_key = load_lib.ReadMasterKeyFile(self.master_key_filename, True)
  # pylint: disable=too-many-function-args
  hashed_key = base64.b64encode(hashlib.sha1(master_key).digest())
  cipher = ecrypto.ProbabilisticCipher(master_key)
  pretty_schema = json.dumps(schema)
  pretty_schema = pretty_schema.encode('utf-8')
  pretty_schema = zlib.compress(pretty_schema)
  encrypted_schema = base64.b64encode(cipher.Encrypt(pretty_schema))
  if description is None:
    description = ''
  new_description = util.ConstructTableDescription(
      description, hashed_key, util.EBQ_TABLE_VERSION, encrypted_schema)
  new_schema = load_lib.RewriteSchema(schema)
  super(EncryptedBigqueryClient, self).CreateTable(
      reference, ignore_existing, new_schema, new_description,
      friendly_name, expiration)
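
# CreateTable layers the cleartext schema as json -> utf-8 -> zlib ->
# encrypt -> base64 before stashing it in the table description, and Query
# peels those layers off in reverse. A round-trip sketch of the layering,
# with _toy_encrypt/_toy_decrypt as hypothetical stand-ins for
# ecrypto.ProbabilisticCipher (XOR only illustrates the ordering; the real
# cipher is a keyed probabilistic scheme):
import base64
import json
import zlib


def _toy_encrypt(key, data):
  return bytes(b ^ key[i % len(key)] for i, b in enumerate(data))

_toy_decrypt = _toy_encrypt  # XOR is its own inverse.


def _pack_schema(key, schema):
  compressed = zlib.compress(json.dumps(schema).encode('utf-8'))
  return base64.b64encode(_toy_encrypt(key, compressed))


def _unpack_schema(key, packed):
  compressed = _toy_decrypt(key, base64.b64decode(packed))
  return json.loads(zlib.decompress(compressed).decode('utf-8'))


if __name__ == '__main__':
  schema = [{'name': 'plaintext_field', 'type': 'STRING'}]
  assert _unpack_schema(b'key', _pack_schema(b'key', schema)) == schema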
def UpdateTable(self,
                reference,
                schema=None,
                description=None,
                friendly_name=None,
                expiration=None):
  """Updates a table.

  Arguments:
    reference: the TableReference to update.
    schema: an optional schema.
    description: an optional table description.
    friendly_name: an optional friendly name for the table.
    expiration: optional expiration time in milliseconds since the epoch.
  """
  if schema:
    self._CheckKeyfileFlag()
  if description:
    hashed_table_key, table_version, table_schema = (
        self._GetEBQTableInfo(str(reference)))
    if schema:
      master_key = load_lib.ReadMasterKeyFile(self.master_key_filename)
      # pylint: disable=too-many-function-args
      hashed_key = base64.b64encode(hashlib.sha1(master_key).digest())
      if hashed_key != hashed_table_key:
        raise bigquery_client.BigqueryAccessDeniedError(
            'Invalid master key for this table.', None, None, None)
      cipher = ecrypto.ProbabilisticCipher(master_key)
      real_schema = json.dumps(load_lib.RewriteSchema(schema))
      real_schema = real_schema.encode('utf-8')
      table_schema = base64.b64encode(
          cipher.Encrypt(zlib.compress(real_schema)))
    description = util.ConstructTableDescription(
        description, hashed_table_key, table_version, table_schema)
  # Rewrite the schema if the schema is to be updated.
  if schema:
    schema = load_lib.RewriteSchema(schema)
  super(EncryptedBigqueryClient, self).UpdateTable(
      reference, schema, description, friendly_name, expiration)
def Load(self, destination_table, source, schema=None, **kwds):
  """Encrypt the given data and then load it into BigQuery.

  The job will execute synchronously if sync=True is provided as
  an argument.

  Args:
    destination_table: TableReference to load data into.
    source: String specifying source data to load.
    schema: The schema that defines fields to be loaded.
    **kwds: Passed on to self.ExecuteJob.

  Returns:
    The resulting job info.
  """
  self._CheckKeyfileFlag()
  self._CheckSchemaFile(schema)
  # To make encryption more secure, we use different keys for each table
  # and cipher. To generate a different key for each table, we need a
  # distinct identifier for each table. A table name is not secure, since
  # a table can be deleted and re-created with the same name and would
  # thus reuse the same key. The only distinct identifier is the creation
  # time, so we must create the table if it does not exist and use its
  # creation time to encrypt values.
  try:
    self.CreateTable(destination_table, schema=schema)
  except bigquery_client.BigqueryDuplicateError:
    pass  # Table already exists.
  temp_dir = tempfile.mkdtemp()
  orig_schema = load_lib.ReadSchemaFile(schema)
  new_schema = load_lib.RewriteSchema(orig_schema)
  new_schema_file = '%s/schema.enc_schema' % temp_dir
  # Write the new schema as a json file.
  with open(new_schema_file, 'wt') as f:
    json.dump(new_schema, f, indent=2)
  new_source_file = '%s/data.enc_data' % temp_dir
  # TODO(user): Put the filepath to the master key in .bigqueryrc file.
  master_key = load_lib.ReadMasterKeyFile(self.master_key_filename, True)
  table_name = str(destination_table).split(':')[-1]
  table_id = '%s_%s' % (
      table_name, self._GetTableCreationTime(str(destination_table)))
  hashed_table_key, table_version, table_schema = self._GetEBQTableInfo(
      str(destination_table))
  hashed_master_key = hashlib.sha1(master_key)
  # pylint: disable=too-many-function-args
  hashed_master_key = base64.b64encode(hashed_master_key.digest())
  if hashed_master_key != hashed_table_key:
    raise bigquery_client.BigqueryAccessDeniedError(
        'Invalid master key for this table.', None, None, None)
  if table_version != util.EBQ_TABLE_VERSION:
    raise bigquery_client.BigqueryNotFoundError(
        'Invalid table version.', None, None, None)
  # TODO(user): Generate a different key.
  cipher = ecrypto.ProbabilisticCipher(master_key)
  table_schema = cipher.Decrypt(base64.b64decode(table_schema), raw=True)
  table_schema = zlib.decompress(table_schema)
  table_schema = table_schema.decode('utf-8')
  table_schema = json.loads(table_schema)
  if table_schema != orig_schema:
    raise bigquery_client.BigqueryAccessDeniedError(
        'Invalid schema for this table.', None, None, None)
  source_format = kwds.get('source_format')
  if source_format == 'NEWLINE_DELIMITED_JSON':
    load_lib.ConvertJsonDataFile(
        orig_schema, master_key, table_id, source, new_source_file)
  elif source_format == 'CSV' or not source_format:
    load_lib.ConvertCsvDataFile(
        orig_schema, master_key, table_id, source, new_source_file)
  else:
    raise app.UsageError(
        'Currently, we do not allow loading from file types other than\n'
        'NEWLINE_DELIMITED_JSON and CSV.')
  job = super(EncryptedBigqueryClient, self).Load(
      destination_table, new_source_file, schema=new_schema_file, **kwds)
  try:
    shutil.rmtree(temp_dir)
  except OSError:
    raise OSError('Temp file deleted by user before termination.')
  return job
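
# Ciphertexts produced by Load are bound to a table_id of the form
# '<table name>_<creation time>', so dropping and re-creating a table never
# silently reuses key material. A sketch of how a per-table key could be
# derived from that identifier with HMAC; the derivation shown is an
# illustrative assumption, not ebq's actual scheme (which lives in
# ecrypto/load_lib):
import hashlib
import hmac


def _per_table_key(master_key, table_name, creation_time_ms):
  # Bind the derived key to both the name and the creation timestamp.
  table_id = '%s_%s' % (table_name, creation_time_ms)
  return hmac.new(master_key, table_id.encode('utf-8'),
                  hashlib.sha256).digest()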