    def _GetTableCreationTime(self, identifier):
        """Return the creation time of the table named by identifier."""
        reference = super(EncryptedBigqueryClient,
                          self).GetReference(identifier)
        object_info = super(EncryptedBigqueryClient,
                            self).GetObjectInfo(reference)
        if object_info is None:
            raise bigquery_client.BigqueryNotFoundError(
                'Table %s not found.' % identifier, None, None, None)
        if 'creationTime' not in object_info:
            raise bigquery_client.BigqueryNotFoundError(
                'Could not gather creation time from table.', None, None, None)
        return object_info['creationTime']
def ReadMasterKeyFile(filepath, create=False):
    """Read and return master key from file else create and store key in file."""
    if not filepath:
        raise bigquery_client.BigqueryNotFoundError(
            'Master key file not specified.', None, None, None)
    if not os.path.exists(filepath):
        if not create:
            raise bigquery_client.BigqueryNotFoundError(
                'Master key file does not exist.', None, None, None)
        print('Key file does not exist. Generating a new key now.')
        _CreateAndStoreMasterKeyFile(filepath)
    with open(filepath, 'rt') as f:
        master_key = base64.b64decode(f.read())
        if len(master_key) < 16:
            raise EncryptConvertError(
                'Key in file %s is too short and may be corrupted. '
                'Please supply a proper key file.' % filepath)
    return master_key
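
# _CreateAndStoreMasterKeyFile is referenced above but not shown in this
# snippet. A minimal sketch of what such a helper might look like, assuming
# the key is random bytes stored base64-encoded to match the read path in
# ReadMasterKeyFile; this is an illustration, not the project's actual
# implementation.
def _CreateAndStoreMasterKeyFile(filepath):
    """Sketch: generate a random master key and store it base64-encoded."""
    master_key = os.urandom(32)  # 32 random bytes; the read path rejects keys shorter than 16.
    with open(filepath, 'wt') as f:
        f.write(base64.b64encode(master_key))
    return master_key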
    def Query(self, query, **kwds):
        """Execute the given query, returning the created job and info for print.

        Arguments:
          query: Query to execute.
          **kwds: Passed on to BigqueryClient.ExecuteJob.

        Returns:
          The resulting job info and other info necessary for printing.
        """
        self._CheckKeyfileFlag()
        master_key = load_lib.ReadMasterKeyFile(self.master_key_filename)

        try:
            clauses = parser.ParseQuery(query)
        except ParseException as e:
            raise bigquery_client.BigqueryInvalidQueryError(
                e, None, None, None)
        if clauses['FROM']:
            table_id = '%s_%s' % (clauses['FROM'][0],
                                  self._GetTableCreationTime(
                                      clauses['FROM'][0]))
            hashed_table_key, table_version, table_schema = self._GetEBQTableInfo(
                clauses['FROM'][0])
            hashed_master_key = hashlib.sha1(master_key)
            # pylint: disable=too-many-function-args
            hashed_master_key = base64.b64encode(hashed_master_key.digest())
            if hashed_master_key != hashed_table_key:
                raise bigquery_client.BigqueryAccessDeniedError(
                    'Invalid master key for this table.', None, None, None)
            if table_version != util.EBQ_TABLE_VERSION:
                raise bigquery_client.BigqueryNotFoundError(
                    'Invalid table version.', None, None, None)
            cipher = ecrypto.ProbabilisticCipher(master_key)
            orig_schema = zlib.decompress(
                cipher.Decrypt(base64.b64decode(table_schema), raw=True))
            orig_schema = json.loads(orig_schema.decode('utf-8'))
        else:
            table_id = None
            orig_schema = []

        manifest = query_lib.QueryManifest.Generate()
        rewritten_query, print_args = query_lib.RewriteQuery(
            clauses, orig_schema, master_key, table_id, manifest)
        job = super(EncryptedBigqueryClient,
                    self).Query(rewritten_query, **kwds)
        self._LoadJobStatistics(manifest, job)

        printer = EncryptedTablePrinter(**print_args)
        bq.Factory.ClientTablePrinter.SetTablePrinter(printer)

        return job
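
    # Query (above) and Load (below) perform the same master-key and table
    # version check. A hypothetical helper, not part of the original client,
    # that captures the shared logic under the same assumptions:
    def _VerifyMasterKey(self, master_key, hashed_table_key, table_version):
        """Sketch: verify the master key and table version for a table."""
        hashed_master_key = base64.b64encode(hashlib.sha1(master_key).digest())
        if hashed_master_key != hashed_table_key:
            raise bigquery_client.BigqueryAccessDeniedError(
                'Invalid master key for this table.', None, None, None)
        if table_version != util.EBQ_TABLE_VERSION:
            raise bigquery_client.BigqueryNotFoundError(
                'Invalid table version.', None, None, None)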
    def _GetEBQTableInfo(self, identifier):
        """Return the stored (hashed master key, version, encrypted schema) for a table."""
        reference = super(EncryptedBigqueryClient,
                          self).GetReference(identifier)
        object_info = super(EncryptedBigqueryClient,
                            self).GetObjectInfo(reference)
        if object_info is None:
            raise bigquery_client.BigqueryNotFoundError(
                'Table %s not found.' % identifier, None, None, None)
        if 'description' not in object_info:
            raise bigquery_client.BigqueryNotFoundError(
                'Could not get essential EBQ info from description. Only use ebq '
                'update to edit table descriptions. Using bq will cause the table '
                'to be unusable.', None, None, None)
        description = object_info['description'].split('||')
        try:
            hashed_key = description[-3].split('Hash of master key: ')[1]
            version_number = description[-2].split('Version: ')[1]
            schema = description[-1].split('Schema: ')[1]
        except IndexError:
            # A marker or segment is missing from the description.
            raise bigquery_client.BigqueryNotFoundError(
                'Description containing essential EBQ info is corrupt.', None,
                None, None)
        return hashed_key, version_number, schema
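
    # The split logic above implies that the EBQ metadata occupies the last
    # three '||'-separated segments of the table description. A sketch of a
    # matching description (the exact output of util.ConstructTableDescription
    # may differ):
    #
    #   'user description||Hash of master key: <base64 SHA-1 digest>'
    #   '||Version: <EBQ_TABLE_VERSION>||Schema: <base64 encrypted schema>'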
    def CreateTable(self,
                    reference,
                    ignore_existing=False,
                    schema=None,
                    description=None,
                    friendly_name=None,
                    expiration=None):
        """Create a table corresponding to TableReference.

        Arguments:
          reference: the TableReference to create.
          ignore_existing: (boolean, default False) If False, raise an exception
            if the table already exists.
          schema: a required schema (also requires a master key).
          description: an optional table description.
          friendly_name: an optional friendly name for the table.
          expiration: optional expiration time in milliseconds since the epoch.

        Raises:
          TypeError: if reference is not a TableReference.
          BigqueryDuplicateError: if reference exists and ignore_existing
            is False.
        """
        if schema is None:
            raise bigquery_client.BigqueryNotFoundError(
                'A schema must be specified when making a table.', None, None,
                None)
        self._CheckKeyfileFlag()
        schema = load_lib.ReadSchemaFile(schema)
        master_key = load_lib.ReadMasterKeyFile(self.master_key_filename, True)
        # pylint: disable=too-many-function-args
        hashed_key = base64.b64encode(hashlib.sha1(master_key).digest())
        cipher = ecrypto.ProbabilisticCipher(master_key)
        pretty_schema = json.dumps(schema)
        pretty_schema = pretty_schema.encode('utf-8')
        pretty_schema = zlib.compress(pretty_schema)
        encrypted_schema = base64.b64encode(cipher.Encrypt(pretty_schema))
        if description is None:
            description = ''
        new_description = util.ConstructTableDescription(
            description, hashed_key, util.EBQ_TABLE_VERSION, encrypted_schema)
        new_schema = load_lib.RewriteSchema(schema)
        super(EncryptedBigqueryClient,
              self).CreateTable(reference, ignore_existing, new_schema,
                                new_description, friendly_name, expiration)
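
    # Round-trip sketch of the schema encoding above, assuming the
    # ecrypto.ProbabilisticCipher Encrypt/Decrypt calls behave as used
    # elsewhere in this file (Query decrypts what CreateTable stores):
    #
    #   stored = base64.b64encode(cipher.Encrypt(zlib.compress(
    #       json.dumps(schema).encode('utf-8'))))
    #   recovered = json.loads(zlib.decompress(cipher.Decrypt(
    #       base64.b64decode(stored), raw=True)).decode('utf-8'))
    #   assert recovered == schema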
    def Load(self, destination_table, source, schema=None, **kwds):
        """Encrypt the given data and then load it into BigQuery.

        The job will execute synchronously if sync=True is provided as an
        argument.

        Args:
          destination_table: TableReference to load data into.
          source: String specifying source data to load.
          schema: The schema that defines fields to be loaded.
          **kwds: Passed on to self.ExecuteJob.

        Returns:
          The resulting job info.
        """
        self._CheckKeyfileFlag()
        self._CheckSchemaFile(schema)

        # To make encryption more secure, we use a different key for each table
        # and cipher. Deriving a per-table key requires a distinct identifier
        # for each table. A table name is not sufficient, since a table can be
        # deleted and recreated under the same name and would thus reuse the
        # same key. The only distinct identifier available is the creation
        # time, so we create the table first if it does not exist and use its
        # creation time when encrypting values.
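        # For example (hypothetical values): a table 'mydataset.mytable'
        # created at timestamp 1388534400000 would yield the per-table
        # identifier 'mydataset.mytable_1388534400000', as constructed below.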
        try:
            self.CreateTable(destination_table, schema=schema)
        except bigquery_client.BigqueryDuplicateError:
            pass  # Table already exists.

        temp_dir = tempfile.mkdtemp()
        orig_schema = load_lib.ReadSchemaFile(schema)
        new_schema = load_lib.RewriteSchema(orig_schema)
        new_schema_file = os.path.join(temp_dir, 'schema.enc_schema')
        # Write the rewritten schema as a JSON file.
        with open(new_schema_file, 'wt') as f:
            json.dump(new_schema, f, indent=2)
        new_source_file = os.path.join(temp_dir, 'data.enc_data')
        # TODO(user): Put the filepath to the master key in .bigqueryrc file.
        master_key = load_lib.ReadMasterKeyFile(self.master_key_filename, True)
        table_name = str(destination_table).split(':')[-1]
        table_id = '%s_%s' % (
            table_name, self._GetTableCreationTime(str(destination_table)))
        hashed_table_key, table_version, table_schema = self._GetEBQTableInfo(
            str(destination_table))
        hashed_master_key = hashlib.sha1(master_key)
        # pylint: disable=too-many-function-args
        hashed_master_key = base64.b64encode(hashed_master_key.digest())
        if hashed_master_key != hashed_table_key:
            raise bigquery_client.BigqueryAccessDeniedError(
                'Invalid master key for this table.', None, None, None)
        if table_version != util.EBQ_TABLE_VERSION:
            raise bigquery_client.BigqueryNotFoundError(
                'Invalid table version.', None, None, None)
        # TODO(user): Generate a different key.
        cipher = ecrypto.ProbabilisticCipher(master_key)
        table_schema = cipher.Decrypt(base64.b64decode(table_schema), raw=True)
        table_schema = zlib.decompress(table_schema)
        table_schema = table_schema.decode('utf-8')
        table_schema = json.loads(table_schema)
        if table_schema != orig_schema:
            raise bigquery_client.BigqueryAccessDeniedError(
                'Invalid schema for this table.', None, None, None)
        source_format = kwds.get('source_format')
        if source_format == 'NEWLINE_DELIMITED_JSON':
            load_lib.ConvertJsonDataFile(orig_schema, master_key, table_id,
                                         source, new_source_file)
        elif source_format == 'CSV' or not source_format:
            load_lib.ConvertCsvDataFile(orig_schema, master_key, table_id,
                                        source, new_source_file)
        else:
            raise app.UsageError(
                'Currently, we do not allow loading from file types other than\n'
                'NEWLINE_DELIMITED_JSON and CSV.')
        job = super(EncryptedBigqueryClient, self).Load(destination_table,
                                                        new_source_file,
                                                        schema=new_schema_file,
                                                        **kwds)
        try:
            shutil.rmtree(temp_dir)
        except OSError:
            raise OSError('Temp file deleted by user before termination.')
        return job
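
    # A more defensive cleanup pattern (a sketch, not the original code) would
    # wrap the conversion and load in try/finally so the temporary directory
    # is removed even when conversion or the load job raises:
    #
    #   temp_dir = tempfile.mkdtemp()
    #   try:
    #       ...  # rewrite schema, convert data, run the load job
    #   finally:
    #       shutil.rmtree(temp_dir, ignore_errors=True)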