예제 #1
0
 def testReadandValidateMultipleNestedSchemaFromFile(self):
     """Round-trip a nested schema through a file, then corrupt the file.

     Writes the jobs schema string to a file under self.dirname, reads it
     back with load_lib.ReadSchemaFile, validates it, and checks that it
     equals the JSON-decoded original. Afterwards non-JSON text is appended
     and reading the file is expected to raise ValueError.
     """
     infile = os.path.join(self.dirname, 'test_multiple_nested_schema_file')
     # Context managers guarantee the handle is closed even if write() fails.
     with open(infile, 'wt') as f:
         f.write(test_util.GetJobsSchemaString())
     read_schema = load_lib.ReadSchemaFile(infile)
     load_lib._ValidateExtendedSchema(read_schema)
     expected_schema = json.loads(test_util.GetJobsSchemaString())
     # assertEqual rather than the deprecated assertEquals alias.
     self.assertEqual(expected_schema, read_schema)
     # Append some non-JSON text and check that reading now fails.
     with open(infile, 'at') as f:
         f.write('bogus')
     self.assertRaises(ValueError, load_lib.ReadSchemaFile, infile)
    def CreateTable(self,
                    reference,
                    ignore_existing=False,
                    schema=None,
                    description=None,
                    friendly_name=None,
                    expiration=None):
        """Create an encrypted table corresponding to a TableReference.

        The schema read from `schema` is JSON-serialized, UTF-8 encoded,
        zlib-compressed, encrypted with a cipher built from the master key
        and base64-encoded. That blob, a base64-encoded SHA-1 digest of the
        master key and util.EBQ_TABLE_VERSION are combined with the
        caller's description via util.ConstructTableDescription. The table
        itself is created by the parent class using the rewritten schema.

        Arguments:
          reference: the TableReference to create.
          ignore_existing: (boolean, default False) If False, raise an
            exception if the table already exists.
          schema: a required schema, handed to load_lib.ReadSchemaFile
            (a master key file is required as well).
          description: an optional table description; None is treated as ''.
          friendly_name: an optional friendly name for the table.
          expiration: optional expiration time in milliseconds since the
            epoch.

        Raises:
          BigqueryNotFoundError: if schema is None.
          TypeError: if reference is not a TableReference.
          BigqueryDuplicateError: if reference exists and ignore_existing
            is False.
        """
        if schema is None:
            raise bigquery_client.BigqueryNotFoundError(
                'A schema must be specified when making a table.', None, None,
                None)
        self._CheckKeyfileFlag()

        plain_schema = load_lib.ReadSchemaFile(schema)
        key = load_lib.ReadMasterKeyFile(self.master_key_filename, True)

        # Digest of the master key, stored alongside the table.
        # pylint: disable=too-many-function-args
        key_digest = hashlib.sha1(key).digest()
        key_fingerprint = base64.b64encode(key_digest)

        # Serialize -> encode -> compress -> encrypt -> base64 the schema.
        schema_cipher = ecrypto.ProbabilisticCipher(key)
        packed_schema = zlib.compress(
            json.dumps(plain_schema).encode('utf-8'))
        schema_blob = base64.b64encode(schema_cipher.Encrypt(packed_schema))

        base_description = '' if description is None else description
        table_description = util.ConstructTableDescription(
            base_description, key_fingerprint, util.EBQ_TABLE_VERSION,
            schema_blob)
        rewritten_schema = load_lib.RewriteSchema(plain_schema)

        parent = super(EncryptedBigqueryClient, self)
        parent.CreateTable(reference, ignore_existing, rewritten_schema,
                           table_description, friendly_name, expiration)
    def Load(self, destination_table, source, schema=None, **kwds):
        """Encrypt the given data and then load it into BigQuery.

        The job will execute synchronously if sync=True is provided as an
        argument.

        The destination table is created first if it does not exist. The
        master key and schema are then checked against the key hash, table
        version and encrypted schema returned by self._GetEBQTableInfo.
        Finally the source file is converted into a temporary directory and
        the converted file is loaded through the parent class. The temporary
        directory is removed whether or not the load succeeds.

        Args:
          destination_table: TableReference to load data into.
          source: String specifying source data to load.
          schema: The schema that defines fields to be loaded; handed to
            load_lib.ReadSchemaFile.
          **kwds: Forwarded to the parent class's Load. 'source_format' is
            also read here to pick the converter: 'NEWLINE_DELIMITED_JSON'
            or 'CSV'. A missing or falsy value is treated as CSV.

        Returns:
          The resulting job info.

        Raises:
          BigqueryAccessDeniedError: if the master key hash or the schema
            does not match what is stored for the table.
          BigqueryNotFoundError: if the stored table version differs from
            util.EBQ_TABLE_VERSION.
          app.UsageError: if source_format is neither JSON nor CSV.
          OSError: if the temporary directory cannot be removed after a
            successful load.
        """
        self._CheckKeyfileFlag()
        self._CheckSchemaFile(schema)

        # To make encrypting more secure, we use different keys for each table
        # and cipher. To generate a different key for each table, we need a
        # distinct table identifier for each table. A table name is not secure
        # since a table can be deleted and created with the same name and,
        # thus the same key. The only distinct identifier happens to be
        # creation time. Therefore, we must construct a table if it does not
        # exist so we can use the creation time to encrypt values.
        try:
            self.CreateTable(destination_table, schema=schema)
        except bigquery_client.BigqueryDuplicateError:
            pass  # Table already exists.

        temp_dir = tempfile.mkdtemp()
        try:
            orig_schema = load_lib.ReadSchemaFile(schema)
            new_schema = load_lib.RewriteSchema(orig_schema)
            new_schema_file = '%s/schema.enc_schema' % temp_dir
            # Write the new schema as a json file.
            with open(new_schema_file, 'wt') as f:
                json.dump(new_schema, f, indent=2)
            new_source_file = '%s/data.enc_data' % temp_dir
            # TODO(user): Put the filepath to the master key in .bigqueryrc
            # file.
            master_key = load_lib.ReadMasterKeyFile(
                self.master_key_filename, True)
            table_name = str(destination_table).split(':')[-1]
            table_id = '%s_%s' % (
                table_name,
                self._GetTableCreationTime(str(destination_table)))
            hashed_table_key, table_version, table_schema = (
                self._GetEBQTableInfo(str(destination_table)))
            # pylint: disable=too-many-function-args
            hashed_master_key = base64.b64encode(
                hashlib.sha1(master_key).digest())
            if hashed_master_key != hashed_table_key:
                raise bigquery_client.BigqueryAccessDeniedError(
                    'Invalid master key for this table.', None, None, None)
            if table_version != util.EBQ_TABLE_VERSION:
                raise bigquery_client.BigqueryNotFoundError(
                    'Invalid table version.', None, None, None)
            # TODO(user): Generate a different key.
            cipher = ecrypto.ProbabilisticCipher(master_key)
            # Reverse of the encoding done in CreateTable:
            # base64 -> decrypt -> decompress -> utf-8 -> JSON.
            table_schema = cipher.Decrypt(
                base64.b64decode(table_schema), raw=True)
            table_schema = zlib.decompress(table_schema)
            table_schema = table_schema.decode('utf-8')
            table_schema = json.loads(table_schema)
            if table_schema != orig_schema:
                raise bigquery_client.BigqueryAccessDeniedError(
                    'Invalid schema for this table.', None, None, None)

            # .get() so an omitted source_format falls through to the CSV
            # branch instead of raising KeyError.
            source_format = kwds.get('source_format')
            if source_format == 'NEWLINE_DELIMITED_JSON':
                load_lib.ConvertJsonDataFile(orig_schema, master_key,
                                             table_id, source,
                                             new_source_file)
            elif source_format == 'CSV' or not source_format:
                load_lib.ConvertCsvDataFile(orig_schema, master_key,
                                            table_id, source,
                                            new_source_file)
            else:
                raise app.UsageError(
                    'Currently, we do not allow loading from file types other than\n'
                    'NEWLINE_DELIMITED_JSON and CSV.')
            job = super(EncryptedBigqueryClient, self).Load(
                destination_table,
                new_source_file,
                schema=new_schema_file,
                **kwds)
        except BaseException:
            # Do not leave converted data behind on failure. Cleanup is
            # best-effort here so it can never mask the original error.
            shutil.rmtree(temp_dir, ignore_errors=True)
            raise

        try:
            shutil.rmtree(temp_dir)
        except OSError:
            raise OSError('Temp file deleted by user before termination.')
        return job