Example #1
 def testConvertJsonDataFileWhenTypeChanges(self):
     """Test ConvertJsonDataFile()."""
     infile = tempfile.NamedTemporaryFile(mode='rw+')
     outfile = tempfile.NamedTemporaryFile(mode='w+')
     json_before = '{"age": "22", "fullname": "John Doe"    }\n'
     # change: 22 is now an int.
     json_after = {'age': 22, 'fullname': 'John Doe'}
     infile.seek(0)
     infile.write(json_before)
     infile.seek(0)
     master_key = '%s' % _MASTER_KEY
     schema = [
         {
             'mode': 'nullable',
             'name': 'age',
             'type': 'integer',
             'encrypt': 'none'
         },
         {
             'mode': 'nullable',
             'name': 'fullname',
             'type': 'string',
             'encrypt': 'none'
         },
     ]
     table_id = _TABLE_ID
     load_lib.ConvertJsonDataFile(schema, master_key, table_id, infile.name,
                                  outfile.name)
     # Compare parsed dicts; key order in serialized JSON is not stable.
     json_output = json.loads(outfile.read())
     self.assertEqual(json_output, json_after)
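The test above relies on ConvertJsonDataFile() coercing each value to the
type declared in the schema. A minimal sketch of that idea (illustrative
only, not ebq's implementation; _CoerceRecord and _CASTS are hypothetical
names):

    import json

    # Hypothetical: coerce one flat JSON record to its schema's types.
    _CASTS = {'integer': int, 'float': float}

    def _CoerceRecord(schema, record):
        coerced = dict(record)
        for field in schema:
            cast = _CASTS.get(field['type'])
            value = record.get(field['name'])
            if cast is not None and value is not None:
                coerced[field['name']] = cast(value)
        return coerced

    record = json.loads('{"age": "22", "fullname": "John Doe"}')
    schema = [{'name': 'age', 'type': 'integer', 'mode': 'nullable'},
              {'name': 'fullname', 'type': 'string', 'mode': 'nullable'}]
    assert _CoerceRecord(schema, record) == {'age': 22, 'fullname': 'John Doe'}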
Example #2
 def testConvertJsonDataFileUSuffixRegression(self):
     """Test ConvertJsonDataFile() for regression of str last-u fix."""
     infile = tempfile.NamedTemporaryFile(mode='a+')
     outfile = tempfile.NamedTemporaryFile(mode='w+')
     # test utf8 and unicode stability while here.
     csym = u'\u00a9'  # unicode: (C)
     csym_utf8 = csym.encode('utf-8')
     json_before = '{"ustr": "foo%s", "bstr": "foou" }\n' % csym_utf8
     json_after = {'ustr': u'foo%s' % csym, 'bstr': 'foou'}
     infile.seek(0)
     infile.write(json_before)
     infile.seek(0)
     master_key = '%s' % _MASTER_KEY
     schema = [
         {
             'mode': 'nullable',
             'name': 'ustr',
             'type': 'string',
             'encrypt': 'none'
         },
         {
             'mode': 'nullable',
             'name': 'bstr',
             'type': 'string',
             'encrypt': 'none'
         },
     ]
     table_id = _TABLE_ID
     load_lib.ConvertJsonDataFile(schema, master_key, table_id, infile.name,
                                  outfile.name)
     # Compare parsed structures; the serialized form is not stable.
     json_output = json.loads(outfile.read())
     self.assertEqual(json_output, json_after)
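The 'foou' value guards against a regression in which a trailing 'u' was
mistaken for Python 2's unicode-literal prefix and stripped. A small Python 2
demonstration (illustrative only) of why these tests compare parsed
structures rather than serialized strings:

    import json

    d = {'ustr': u'foo\u00a9', 'bstr': 'foou'}
    # str() exposes u'' prefixes and arbitrary key order...
    print str(d)    # e.g. {'ustr': u'foo\xa9', 'bstr': 'foou'}
    # ...while a JSON round trip yields a directly comparable structure.
    assert json.loads(json.dumps(d)) == d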
Example #3
    def testConvertJsonDataFile(self):
        schema = json.loads(test_util.GetPlacesSchemaString())
        infile = self._WriteTempPlacesJsonFile()
        outfile = os.path.join(self.dirname, 'places.enc_data')
        master_key = base64.b64decode(_MASTER_KEY)
        string_hasher = ecrypto.StringHash(
            ecrypto.GenerateStringHashKey(master_key, _TABLE_ID))
        load_lib.ConvertJsonDataFile(schema, master_key, _TABLE_ID, infile,
                                     outfile)
        # Validate the new data file against the rewritten schema.
        new_schema = json.loads(_PLACES_REWRITTEN_SCHEMA)
        load_lib._ValidateJsonDataFile(new_schema, outfile)
        fin = open(outfile, 'rt')
        # Only the first record is needed for the assertions below.
        data = json.loads(fin.readline())
        self.assertEqual(data['kind'], 'person')
        self.assertIn(util.SEARCHWORDS_PREFIX + u'gender', data)
        (model_iv,
         model_hash) = data[util.SEARCHWORDS_PREFIX + u'gender'].split(' ')
        expected_model_key_hash = string_hasher.GetStringKeyHash(
            util.SEARCHWORDS_PREFIX + u'gender', u'Male'.lower())
        expected_model_hash = base64.b64encode(
            hashlib.sha1(model_iv + expected_model_key_hash).digest()[:8])
        self.assertEqual(expected_model_hash, model_hash)
        self.assertIn(util.SEARCHWORDS_PREFIX + u'place',
                      data['citiesLived'][0])
        (model_iv,
         model_hash) = data['citiesLived'][0][util.SEARCHWORDS_PREFIX +
                                              u'place'].split(' ')
        expected_model_key_hash = string_hasher.GetStringKeyHash(
            util.SEARCHWORDS_PREFIX + u'place', u'Seattle'.lower())
        expected_model_hash = base64.b64encode(
            hashlib.sha1(model_iv + expected_model_key_hash).digest()[:8])
        self.assertEqual(expected_model_hash, model_hash)
        self.assertEqual(data['spouse']['spouseAge'], 23)
        checked = []

        # Look for lat/long pairs in citiesLived.
        found_any = False
        for city in data['citiesLived']:
            checked.append(city)
            if city.get('lat', None) is None:
                continue
            found_any = True
            self.assertIsInstance(city['lat'], float)
            self.assertIsInstance(city['long'], float)
            self.assertGreaterEqual(city['lat'], 0.0)
            self.assertGreaterEqual(city['long'], 0.0)
        self.assertTrue(
            found_any, 'found_any %s checked ( %s )' %
            (found_any, ' , '.join(map(str, checked))))
        fin.close()
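Each searchwords field stores a space-separated IV and a truncated keyed
hash, and the test recomputes the hash from the stored IV. A hypothetical
helper (the name and signature are illustrative, not part of load_lib)
capturing the check performed twice above:

    import base64
    import hashlib

    def _VerifySearchword(string_hasher, field_name, word, stored_value):
        # stored_value is 'iv hash' as emitted by ConvertJsonDataFile.
        iv, stored_hash = stored_value.split(' ')[:2]
        key_hash = string_hasher.GetStringKeyHash(field_name, word.lower())
        expected = base64.b64encode(hashlib.sha1(iv + key_hash).digest()[:8])
        return expected == stored_hash

With such a helper, each of the two hash checks above reduces to a single
assertTrue(_VerifySearchword(...)) call.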
Example #4
 def testConvertComplexJsonDataFile(self):
     schema = json.loads(test_util.GetJobsSchemaString())
     infile = self._WriteTempJobsJsonFile()
     outfile = os.path.join(self.dirname, 'jobs.enc_data')
     master_key = base64.b64decode(_MASTER_KEY)
     string_hasher = ecrypto.StringHash(
         ecrypto.GenerateStringHashKey(master_key, _TABLE_ID))
     load_lib.ConvertJsonDataFile(schema, master_key, _TABLE_ID, infile,
                                  outfile)
     # Validate the new data file against the rewritten schema.
     new_schema = json.loads(_JOBS_REWRITTEN_SCHEMA)
     load_lib._ValidateJsonDataFile(new_schema, outfile)
     fin = open(outfile, 'rt')
     # Only the first record is needed for the assertions below.
     data = json.loads(fin.readline())
     self.assertEqual(data['kind'], 'person')
     self.assertIn(util.SEARCHWORDS_PREFIX + u'gender', data)
     (model_iv,
      model_hash) = data[util.SEARCHWORDS_PREFIX + u'gender'].split(' ')
     expected_model_key_hash = string_hasher.GetStringKeyHash(
         util.SEARCHWORDS_PREFIX + u'gender', u'Male'.lower())
     expected_model_hash = base64.b64encode(
         hashlib.sha1(model_iv + expected_model_key_hash).digest()[:8])
     self.assertEqual(expected_model_hash, model_hash)
     self.assertIn(util.SEARCHWORDS_PREFIX + u'place',
                   data['citiesLived'][0])
     (model_iv,
      model_hash) = data['citiesLived'][0][util.SEARCHWORDS_PREFIX +
                                           u'place'].split(' ')
     expected_model_key_hash = string_hasher.GetStringKeyHash(
         util.SEARCHWORDS_PREFIX + u'place', u'Seattle'.lower())
     expected_model_hash = base64.b64encode(
         hashlib.sha1(model_iv + expected_model_key_hash).digest()[:8])
     self.assertEqual(expected_model_hash, model_hash)
     self.assertEqual(data['citiesLived'][0]['job'][0]['jobRank'], 1)
     self.assertEqual(data['citiesLived'][1]['job'], [])
     self.assertEqual(
         len(data['citiesLived'][0]['job'][0][util.SEARCHWORDS_PREFIX +
                                              u'manager']), 3)
     self.assertEqual(
         len(data['citiesLived'][0]['job'][0][util.SEARCHWORDS_PREFIX +
                                              u'manager'][0].split(' ')), 4)
     fin.close()
Example #5
    def Load(self, destination_table, source, schema=None, **kwds):
        """Encrypt the given data and then load it into BigQuery.

        The job will execute synchronously if sync=True is provided as an
        argument.

        Args:
          destination_table: TableReference to load data into.
          source: String specifying source data to load.
          schema: The schema that defines fields to be loaded.
          **kwds: Passed on to self.ExecuteJob.

        Returns:
          The resulting job info.
        """
        self._CheckKeyfileFlag()
        self._CheckSchemaFile(schema)

        # To make encryption more secure, we use a different key for each
        # table and cipher. Generating a per-table key requires a distinct
        # identifier for each table. A table name is not sufficient, since a
        # table can be deleted and recreated with the same name (and would
        # then reuse the same key). The only distinct identifier available is
        # the creation time, so we must create the table if it does not yet
        # exist and use its creation time when encrypting values.
        try:
            self.CreateTable(destination_table, schema=schema)
        except bigquery_client.BigqueryDuplicateError:
            pass  # Table already exists.

        temp_dir = tempfile.mkdtemp()
        orig_schema = load_lib.ReadSchemaFile(schema)
        new_schema = load_lib.RewriteSchema(orig_schema)
        new_schema_file = '%s/schema.enc_schema' % temp_dir
        # Write the rewritten schema as a JSON file.
        with open(new_schema_file, 'wt') as f:
            json.dump(new_schema, f, indent=2)
        new_source_file = '%s/data.enc_data' % temp_dir
        # TODO(user): Put the filepath to the master key in .bigqueryrc file.
        master_key = load_lib.ReadMasterKeyFile(self.master_key_filename, True)
        table_name = str(destination_table).split(':')[-1]
        table_id = '%s_%s' % (
            table_name, self._GetTableCreationTime(str(destination_table)))
        hashed_table_key, table_version, table_schema = self._GetEBQTableInfo(
            str(destination_table))
        hashed_master_key = hashlib.sha1(master_key)
        # pylint: disable=too-many-function-args
        hashed_master_key = base64.b64encode(hashed_master_key.digest())
        if hashed_master_key != hashed_table_key:
            raise bigquery_client.BigqueryAccessDeniedError(
                'Invalid master key for this table.', None, None, None)
        if table_version != util.EBQ_TABLE_VERSION:
            raise bigquery_client.BigqueryNotFoundError(
                'Invalid table version.', None, None, None)
        # TODO(user): Generate a different key.
        cipher = ecrypto.ProbabilisticCipher(master_key)
        table_schema = cipher.Decrypt(base64.b64decode(table_schema), raw=True)
        table_schema = zlib.decompress(table_schema)
        table_schema = table_schema.decode('utf-8')
        table_schema = json.loads(table_schema)
        if table_schema != orig_schema:
            raise bigquery_client.BigqueryAccessDeniedError(
                'Invalid schema for this table.', None, None, None)
        source_format = kwds.get('source_format')
        if source_format == 'NEWLINE_DELIMITED_JSON':
            load_lib.ConvertJsonDataFile(orig_schema, master_key, table_id,
                                         source, new_source_file)
        elif source_format == 'CSV' or not source_format:
            load_lib.ConvertCsvDataFile(orig_schema, master_key, table_id,
                                        source, new_source_file)
        else:
            raise app.UsageError(
                'Loading is currently supported only for '
                'NEWLINE_DELIMITED_JSON and CSV source files.')
        job = super(EncryptedBigqueryClient, self).Load(destination_table,
                                                        new_source_file,
                                                        schema=new_schema_file,
                                                        **kwds)
        try:
            shutil.rmtree(temp_dir)
        except OSError:
            raise OSError('Temp directory was deleted before cleanup could '
                          'complete.')
        return job
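For context, a hedged usage sketch of Load(); the project, dataset, table,
and file names below are illustrative, not from the source:

    # client is an EncryptedBigqueryClient configured with a master key file.
    table = bigquery_client.ApiClientHelper.TableReference.Create(
        projectId='my-project', datasetId='my_dataset', tableId='people')
    job = client.Load(table, 'people.json', schema='people.schema',
                      source_format='NEWLINE_DELIMITED_JSON', sync=True)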