def testConvertCsvDataFile(self):
    """Round-trips one CSV row through ConvertCsvDataFile and spot-checks it."""
    self._SetupTestFlags()
    schema = json.loads(test_util.GetCarsSchemaString())
    infile = self._WriteTempCarsCsvFile()
    outfile = os.path.join(self.dirname, 'cars.enc_data')
    master_key = base64.b64decode(_MASTER_KEY)
    string_hasher = ecrypto.StringHash(
        ecrypto.GenerateStringHashKey(master_key, _TABLE_ID))
    pseudonym_cipher = ecrypto.PseudonymCipher(
        ecrypto.GeneratePseudonymCipherKey(master_key, _TABLE_ID))
    load_lib.ConvertCsvDataFile(schema, master_key, _TABLE_ID, infile, outfile)
    # validate new data file against new rewritten schema.
    new_schema = json.loads(_CARS_REWRITTEN_SCHEMA)
    load_lib._ValidateCsvDataFile(new_schema, outfile)
    # Sanity check one row entries. Entries for semantic encrypted fields cannot
    # be checked because the values are randomized.
    # Use a with-statement so the file is closed even when an assertion below
    # fails (the original open()/close() pair leaked the handle on failure).
    with open(outfile, 'rt') as fout:
        row0 = fout.readline()
    self.assertTrue('1997' in row0)
    self.assertTrue(pseudonym_cipher.Encrypt(unicode('Ford')) in row0)
    # Get iv and hash for Model searchwords field whose value is 'E350'
    (model_iv, model_hash) = row0.split(',')[2].split(' ')
    # Calculate expected key hash value for 'E350'
    expected_model_key_hash = string_hasher.GetStringKeyHash(
        util.SEARCHWORDS_PREFIX + u'Model', u'E350'.lower())
    # Calculate outer sha1 using model_iv and expected key hash.
    expected_model_hash = base64.b64encode(
        hashlib.sha1(model_iv + expected_model_key_hash).digest()[:8])
    # assertEqual: assertEquals is a deprecated alias.
    self.assertEqual(expected_model_hash, model_hash)
def testDecryptValues(self):
    """Test _DecryptValues()."""
    cars_schema = test_util.GetCarsSchema()
    jobs_schema = test_util.GetJobsSchema()
    master_key = test_util.GetMasterKey()

    # Homomorphic-int column; a NULL entry is appended after encryption.
    field = '%sInvoice_Price' % util.HOMOMORPHIC_INT_PREFIX
    cipher = ecrypto.HomomorphicIntCipher(master_key)
    ciphers = {util.HOMOMORPHIC_INT_PREFIX: cipher}
    table = self._EncryptTable(cipher, [[1], [2], [3]], 0)
    table.append([None])
    column = encrypted_bigquery_client._DecryptValues(
        field, table, 0, ciphers, cars_schema, util.HOMOMORPHIC_INT_PREFIX)
    self.assertEqual(column, [1, 2, 3, util.LiteralToken('null', None)])

    # Pseudonym column; a NULL row is inserted into the middle.
    field = 'citiesLived.job.%sposition' % util.PSEUDONYM_PREFIX
    cipher = ecrypto.PseudonymCipher(master_key)
    ciphers = {util.PSEUDONYM_PREFIX: cipher}
    table = self._EncryptTable(
        cipher,
        [[0, unicode('Hello')], [1, unicode('My')], [-1, unicode('job')]],
        1)
    table.insert(1, [100, None])
    column = encrypted_bigquery_client._DecryptValues(
        field, table, 1, ciphers, jobs_schema, util.PSEUDONYM_PREFIX)
    self.assertEqual(column,
                     [util.StringLiteralToken('"Hello"'),
                      util.LiteralToken('null', None),
                      util.StringLiteralToken('"My"'),
                      util.StringLiteralToken('"job"')])

    # A field missing from the schema must raise.
    field = '%snonexistent_field' % util.HOMOMORPHIC_FLOAT_PREFIX
    self.assertRaises(ValueError,
                      encrypted_bigquery_client._DecryptValues,
                      field, table, 1, ciphers, cars_schema,
                      util.HOMOMORPHIC_FLOAT_PREFIX)
def ConvertJsonDataFile(schema, master_key, table_id, infile, outfile):
  """Encrypts data in a json file based on schema provided.

  Arguments:
    schema: User defined values and types.
    master_key: Key to provide ciphers.
    table_id: Used to derive a unique key for each table.
    infile: File to be encrypted.
    outfile: Location where the encrypted file is written.
  """
  prob_cipher = ecrypto.ProbabilisticCipher(
      ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
  pseudonym_cipher = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
  # TODO(user): ciphers and hash should not use the same key.
  string_hasher = ecrypto.StringHash(
      ecrypto.GenerateStringHashKey(master_key, table_id))
  homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  _ValidateJsonDataFile(schema, infile)
  # Encrypt line-by-line: each input line is one json record.
  with open(infile, 'rb') as in_file:
    with open(outfile, 'wb') as out_file:
      for line in in_file:
        record = _StrToUnicode(json.loads(line))
        encrypted_record = _ConvertJsonField(
            record, schema, prob_cipher, pseudonym_cipher, string_hasher,
            homomorphic_int_cipher, homomorphic_float_cipher)
        out_file.write(json.dumps(encrypted_record) + '\n')
def testDecryptGroupConcatValues(self):
    """Test _DecryptGroupConcatValues() for both encryption modes and errors."""
    cars_schema = test_util.GetCarsSchema()
    jobs_schema = test_util.GetJobsSchema()
    master_key = test_util.GetMasterKey()
    unencrypted_values = [['A', 'B', 'C', 'D'],
                          ['1', '2', '3', '4'],
                          ['Hello', 'Bye']]

    # Probabilistic GROUP_CONCAT; second column holds unrelated random data.
    query = 'GROUP_CONCAT(%sModel)' % util.PROBABILISTIC_PREFIX
    cipher = ecrypto.ProbabilisticCipher(master_key)
    ciphers = {util.PROBABILISTIC_PREFIX: cipher}
    table = []
    for values in unencrypted_values:
        joined = ','.join(cipher.Encrypt(unicode(token)) for token in values)
        table.append([joined, random.random()])
    table.insert(0, [None, None])
    column = encrypted_bigquery_client._DecryptGroupConcatValues(
        query, table, 0, ciphers, cars_schema, util.PROBABILISTIC_PREFIX)
    self.assertEqual(column,
                     [util.LiteralToken('null', None),
                      util.StringLiteralToken('"A,B,C,D"'),
                      util.StringLiteralToken('"1,2,3,4"'),
                      util.StringLiteralToken('"Hello,Bye"')])

    # Pseudonym GROUP_CONCAT inside a repeated record.
    query = ('GROUP_CONCAT(citiesLived.job.%sposition) within citiesLived.job'
             % util.PSEUDONYM_PREFIX)
    cipher = ecrypto.PseudonymCipher(master_key)
    ciphers = {util.PSEUDONYM_PREFIX: cipher}
    table = []
    for values in unencrypted_values:
        joined = ','.join(cipher.Encrypt(unicode(token)) for token in values)
        table.append([joined])
    column = encrypted_bigquery_client._DecryptGroupConcatValues(
        query, table, 0, ciphers, jobs_schema, util.PSEUDONYM_PREFIX)
    self.assertEqual(column,
                     [util.StringLiteralToken('"A,B,C,D"'),
                      util.StringLiteralToken('"1,2,3,4"'),
                      util.StringLiteralToken('"Hello,Bye"')])

    # A query that is not a GROUP_CONCAT is a programming error.
    query = '%sModel' % util.PROBABILISTIC_PREFIX
    self.assertRaises(ValueError,
                      encrypted_bigquery_client._DecryptGroupConcatValues,
                      query, table, 0, ciphers, cars_schema,
                      util.PROBABILISTIC_PREFIX)
    # GROUP_CONCAT over a homomorphic (numeric) field is an invalid query.
    query = ('GROUP_CONCAT(citiesLived.%snumberOfYears) within citiesLived'
             % util.HOMOMORPHIC_FLOAT_PREFIX)
    self.assertRaises(bigquery_client.BigqueryInvalidQueryError,
                      encrypted_bigquery_client._DecryptGroupConcatValues,
                      query, table, 0, ciphers, jobs_schema,
                      util.HOMOMORPHIC_FLOAT_PREFIX)
def ConvertJsonDataFile(schema, master_key, table_id, infile, outfile):
  """Encrypts data in a json file based on schema provided.

  Arguments:
    schema: User defined values and types.
    master_key: Key to provide ciphers.
    table_id: Used to derive a unique key for each table.
    infile: File to be encrypted.
    outfile: Location of encrypted file to outputted.
  """
  prob_cipher = ecrypto.ProbabilisticCipher(
      ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
  pseudonym_cipher = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
  # TODO(user): ciphers and hash should not use the same key.
  string_hasher = ecrypto.StringHash(
      ecrypto.GenerateStringHashKey(master_key, table_id))
  homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  _ValidateJsonDataFile(schema, infile)
  with open(infile, 'rb') as in_file:
    with open(outfile, 'wb') as out_file:
      for line in in_file:
        data = json.loads(line)
        data = _StrToUnicode(data)
        rewritten_data = _ConvertJsonField(data, schema, prob_cipher,
                                           pseudonym_cipher, string_hasher,
                                           homomorphic_int_cipher,
                                           homomorphic_float_cipher)
        # Serialize with json.dumps, which always emits valid JSON. The
        # previous str()-and-replace approach wrote Python repr output and
        # produced invalid JSON whenever a value contained an apostrophe,
        # a double quote, or non-ASCII characters (repr escapes).
        out_file.write(json.dumps(rewritten_data) + '\n')
def _GenerateRelatedCiphers(schema, master_key, default_cipher): """Reads schema for pseudonym encrypt types and adds generating ciphers. Args: schema: list of dict, the db schema. modified by master_key: str, the master key default_cipher: obj, cipher that encrypt() can be called on. Returns: dict, mapping field names to index in schema. """ map_name_to_index = {} for i in xrange(len(schema)): logging.warning(schema[i]) map_name_to_index[schema[i]['name']] = i if schema[i].get('encrypt', None) == 'pseudonym': related = schema[i].get('related', None) if related is not None: pseudonym_cipher_related = ecrypto.PseudonymCipher( ecrypto.GeneratePseudonymCipherKey( master_key, str(related).encode('utf-8'))) schema[i]['cipher'] = pseudonym_cipher_related else: schema[i]['cipher'] = default_cipher return map_name_to_index
def setUp(self):
    """Run once for each test in the class."""
    # Each test gets a fresh pseudonym cipher built from the test key.
    self.cipher = ecrypto.PseudonymCipher(_KEY1)
def RewriteSelectionCriteria(stack, schema, master_key, table_id):
  """Rewrites selection criteria (arguments of WHERE and HAVING clause).

  Arguments:
    stack: The postfix expression that is the where/having expression.
    schema: The user defined values and encryption.
    master_key: Used to get ciphers for encryption.
    table_id: Used to generate a proper key.

  Returns:
    An infix version of the <stack>. The expression is rewritten so that
    it can be sent to the BigQuery server.

  Raises:
    bigquery_client.BigqueryInvalidQueryError: If the expression is invalid
    (such as searching non-searchable encrypted fields, etc).
  """
  pseudonym_cipher = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
  string_hasher = ecrypto.StringHash(
      ecrypto.GenerateStringHashKey(master_key, table_id))

  def FailIfEncrypted(tokens):
    # Encrypted operands are only legal in equality and contains forms
    # handled explicitly below; anywhere else is an error.
    if util.IsEncryptedExpression(tokens):
      raise bigquery_client.BigqueryInvalidQueryError(
          'Invalid where/having expression.', None, None, None)

  def FailIfDeterministic(tokens):
    if util.IsDeterministicExpression(tokens):
      raise bigquery_client.BigqueryInvalidQueryError(
          'Cannot do equality on probabilistic encryption, '
          'only pseudonym encryption.', None, None, None)

  def RewritePseudonymEncryption(token):
    # Encrypt the literal so the server-side equality against the stored
    # pseudonym ciphertext still matches; non-literals pass through.
    if isinstance(token, util.StringLiteralToken):
      return '"%s"' % pseudonym_cipher.Encrypt(unicode(token[1:-1]))
    else:
      return token

  def RewriteSearchwordsEncryption(field, literal):
    """Rewrites the literal such that it can be checked for containment.

    Arguments:
      field: The field which is being checked if literal is contained within.
      literal: Substring being searched for.

    Returns:
      A tuple containing both field and literal rewritten.

    Raises:
      ValueError: Try to rewrite non-searchwords encryption.
    """
    if (not isinstance(field, util.SearchwordsToken) and
        not isinstance(field, util.ProbabilisticToken)):
      raise ValueError('Invalid encryption to check containment.')
    field = field.original_name
    row = util.GetEntryFromSchema(field, schema)
    # Re-point the leaf of the (possibly dotted) field path at the
    # searchwords-prefixed column.
    modified_field = util.SEARCHWORDS_PREFIX + row['name']
    field = field.split('.')
    field[-1] = modified_field
    modified_field = '.'.join(field)
    if 'searchwords_separator' in row:
      searchwords_separator = row['searchwords_separator']
    else:
      searchwords_separator = None
    word_list = ecrypto.CleanUnicodeString(unicode(literal.value),
                                           separator=searchwords_separator)
    if searchwords_separator is None:
      word_seq = ' '.join(word_list)
    else:
      word_seq = searchwords_separator.join(word_list)
    keyed_hash = (u'\'%s\'' % string_hasher.GetStringKeyHash(
        modified_field.split('.')[-1], word_seq))
    # Server-side expression recomputing the stored hash: sha1 over the
    # first 24 chars of the stored value (presumably the base64 iv prefix —
    # mirrors the load-time iv+keyed-hash scheme) concatenated with the
    # keyed hash, truncated to 8 bytes and base64'd.
    modified_string = (
        u'to_base64(left(bytes(sha1(concat(left(%s, 24), %s))), 8))'
        % (modified_field, keyed_hash))
    return (modified_field, modified_string)

  def CheckSearchableField(op1):
    """Checks if the operand is a searchable encrypted field.

    Arguments:
      op1: The operand that is being checked if it is searchable.

    Returns:
      True iff op1 is searchable.
    """
    if isinstance(op1, util.SearchwordsToken):
      return True
    elif not isinstance(op1, util.ProbabilisticToken):
      return False
    op1 = op1.original_name
    row = util.GetEntryFromSchema(op1, schema)
    if row['encrypt'] in ['probabilistic_searchwords', 'searchwords']:
      return True
    else:
      return False
    return False  # NOTE(review): unreachable — both branches above return.

  def RewriteContainsOrFail(op1, op2):
    """Tries to rewrite a contains expression.

    Arguments:
      op1: The first operand of the contains binary operator.
      op2: The second operand of the contians binary operator.

    Returns:
      The rewritten versions of both operands.

    Raises:
      bigquery_client.BigqueryInvalidQueryError: If the contains expressions
      is invalid.
    """
    if not isinstance(op1, util.EncryptedToken):
      return (op1, op2)
    if not CheckSearchableField(op1):
      raise bigquery_client.BigqueryInvalidQueryError(
          'Cannot do contains on an encrypted field that is not searchable.',
          None, None, None)
    elif not isinstance(op2, util.StringLiteralToken):
      raise bigquery_client.BigqueryInvalidQueryError(
          'The substring to be checked must be a literal.', None, None, None)
    return RewriteSearchwordsEncryption(op1, op2)

  def CheckAndRewriteStack(postfix):
    # Recursively consumes operands off the top of the postfix stack and
    # returns the equivalent rewritten infix string.
    if not postfix:
      raise bigquery_client.BigqueryInvalidQueryError(
          'Not enough arguments.', None, None, None)
    top = postfix.pop()
    if isinstance(top, util.OperatorToken):
      args = []
      for unused_i in range(top.num_args):
        args.append(CheckAndRewriteStack(postfix))
      # Operands pop in reverse order; restore left-to-right order.
      args.reverse()
      if top.num_args == 1:
        return '%s %s' % (str(top), args[0])
      elif str(top) in ['=', '==', '!=']:
        FailIfDeterministic(args)
        if (isinstance(args[0], util.PseudonymToken) or
            isinstance(args[1], util.PseudonymToken)):
          args[0] = RewritePseudonymEncryption(args[0])
          args[1] = RewritePseudonymEncryption(args[1])
      elif str(top) == 'contains':
        FailIfEncrypted([args[1]])
        args[0], args[1] = RewriteContainsOrFail(args[0], args[1])
      else:
        FailIfEncrypted(args)
      return '(%s %s %s)' % (args[0], str(top), args[1])
    elif isinstance(top, util.BuiltInFunctionToken):
      func_name = str(top)
      if func_name in _ZERO_ARGUMENT_FUNCTIONS:
        return '%s()' % func_name
      elif func_name in _ONE_ARGUMENT_FUNCTIONS:
        op = CheckAndRewriteStack(postfix)
        FailIfEncrypted([op])
        return '%s(%s)' % (func_name, op)
      elif func_name in _TWO_ARGUMENT_FUNCTIONS:
        # Postfix order: the last-pushed operand pops first.
        op2 = CheckAndRewriteStack(postfix)
        op1 = CheckAndRewriteStack(postfix)
        FailIfEncrypted([op1, op2])
        return '%s(%s, %s)' % (func_name, op1, op2)
      elif func_name in _THREE_ARGUMENT_FUNCTIONS:
        op3 = CheckAndRewriteStack(postfix)
        op2 = CheckAndRewriteStack(postfix)
        op1 = CheckAndRewriteStack(postfix)
        FailIfEncrypted([op1, op2, op3])
        return '%s(%s, %s, %s)' % (func_name, op1, op2, op3)
      else:
        raise bigquery_client.BigqueryInvalidQueryError(
            '%s function does not exist.' % func_name, None, None, None)
    elif not isinstance(top, basestring):
      return str(top)
    else:
      return top

  # Work on a copy so the caller's stack is not consumed.
  temp_stack = list(stack)
  new_expression = CheckAndRewriteStack(temp_stack)
  if temp_stack:
    raise bigquery_client.BigqueryInvalidQueryError(
        'Too many arguments.', None, None, None)
  return new_expression
def ConvertCsvDataFile(schema, master_key, table_id, infile, outfile):
  """Reads utf8 csv data, encrypts and stores into a new csv utf8 data file.

  Arguments:
    schema: list of dict, per-column name/type/encrypt specification.
    master_key: Key used to derive all per-table ciphers.
    table_id: Used to derive a unique key for each table.
    infile: utf8 csv file to be encrypted.
    outfile: Location where the encrypted csv file is written.

  Raises:
    EncryptConvertError: If a row does not match the schema, or a column
        specifies an encryption mode that cannot be applied.
  """
  prob_cipher = ecrypto.ProbabilisticCipher(
      ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
  pseudonym_cipher = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
  # TODO(user): ciphers and hash should not use the same key.
  string_hasher = ecrypto.StringHash(
      ecrypto.GenerateStringHashKey(master_key, table_id))
  homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))

  def SearchwordsHash(column, value):
    # Shared by 'searchwords' and 'probabilistic_searchwords': returns the
    # iv/hash encoding of value for this column.
    if 'searchwords_separator' in column:
      searchwords_separator = column['searchwords_separator']
    else:
      searchwords_separator = None
    if 'max_word_sequence' in column:
      max_word_sequence = column['max_word_sequence']
    else:
      max_word_sequence = 5
    return string_hasher.GetHashesForWordSubsequencesWithIv(
        util.SEARCHWORDS_PREFIX + column['name'], value,
        separator=searchwords_separator,
        max_sequence_len=max_word_sequence).encode('utf-8')

  with open(infile, 'rb') as in_file:
    with open(outfile, 'wb') as out_file:
      num_columns = len(schema)
      csv_writer = csv.writer(out_file)
      _ValidateCsvDataFile(schema, infile)
      csv_reader = _Utf8CsvReader(in_file, csv_writer)
      for row in csv_reader:
        new_row = []
        if len(row) != num_columns:
          raise EncryptConvertError('Number of fields in schema do not match '
                                    'in row: %s' % row)
        for i in xrange(num_columns):
          encrypt_mode = schema[i]['encrypt']
          if encrypt_mode == 'none':
            new_row.append(row[i].encode('utf-8'))
          elif encrypt_mode == 'probabilistic':
            new_row.append(prob_cipher.Encrypt(row[i]).encode('utf-8'))
          elif encrypt_mode == 'pseudonym':
            new_row.append(pseudonym_cipher.Encrypt(row[i]).encode('utf-8'))
          elif encrypt_mode == 'homomorphic' and schema[i]['type'] == 'integer':
            new_row.append(
                homomorphic_int_cipher.Encrypt(long(row[i])).encode('utf-8'))
          elif encrypt_mode == 'homomorphic' and schema[i]['type'] == 'float':
            new_row.append(
                homomorphic_float_cipher.Encrypt(float(row[i])).encode('utf-8'))
          elif encrypt_mode == 'searchwords':
            new_row.append(SearchwordsHash(schema[i], row[i]))
          elif encrypt_mode == 'probabilistic_searchwords':
            # Emits two output columns: searchwords hashes plus a
            # recoverable probabilistic encryption of the original value.
            new_row.append(SearchwordsHash(schema[i], row[i]))
            new_row.append(prob_cipher.Encrypt(row[i]).encode('utf-8'))
          else:
            # Previously an unrecognized mode (or 'homomorphic' with a
            # non-numeric type) fell through silently, dropping the column
            # and misaligning the output row. Fail loudly instead.
            raise EncryptConvertError(
                'Cannot encrypt column %s: unsupported encrypt mode %s for '
                'type %s.' % (schema[i]['name'], encrypt_mode,
                              schema[i].get('type')))
        csv_writer.writerow(new_row)
def _DecryptRows(fields, rows, master_key, table_id, schema, query_list,
                 aggregation_query_list, unencrypted_query_list,
                 manifest=None):
  """Decrypts all values in rows.

  Arguments:
    fields: Column names.
    rows: Table values.
    master_key: Key to get ciphers.
    table_id: Used to generate keys.
    schema: Represents information about fields.
    query_list: List of fields that were queried.
    aggregation_query_list: List of aggregations of fields that were queried.
    unencrypted_query_list: List of unencrypted expressions.
    manifest: optional, query_lib.QueryManifest instance.

  Returns:
    A dictionary that returns for each query, a list of decrypted values.

  Raises:
    bigquery_client.BigqueryInvalidQueryError: User trying to query for a
      SEARCHWORD encrypted field. SEARCHWORD encrypted fields cannot be
      decrypted.
  """
  # create ciphers for decryption
  prob_cipher = ecrypto.ProbabilisticCipher(
      ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
  pseudonym_cipher = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
  homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  # The prefix on a column name selects which cipher decrypts it.
  ciphers = {
      util.PROBABILISTIC_PREFIX: prob_cipher,
      util.PSEUDONYM_PREFIX: pseudonym_cipher,
      util.HOMOMORPHIC_INT_PREFIX: homomorphic_int_cipher,
      util.HOMOMORPHIC_FLOAT_PREFIX: homomorphic_float_cipher,
  }
  queried_values = {}
  # Seed the result dict with one empty list per requested output expression.
  for query in query_list:
    # Strip a trailing 'AS <alias>' so results key on the underlying field.
    if len(query.split(' ')) >= 3 and query.split(' ')[-2] == 'AS':
      queried_values[' '.join(query.split(' ')[:-2])] = []
    else:
      queried_values[query] = []
  for query in aggregation_query_list:
    queried_values[query] = []
  for i in xrange(len(unencrypted_query_list)):
    queried_values['%s%d_' % (util.UNENCRYPTED_ALIAS_PREFIX, i)] = []
  # If a manifest is supplied rewrite the column names according to any
  # computed aliases that were used. Otherwise, resort to the old scheme
  # of substituting the '.' in multidimensional schemas in/out.
  if manifest is not None:
    for i in xrange(len(fields)):
      # TODO(user): This is a hash lookup on every column name.
      # The lookup is efficient and the column names are sufficiently random
      # as compared to likely human language column names such that false
      # hits should not be possible. However this may need future revision.
      n = manifest.GetColumnNameForAlias(fields[i]['name'])
      if n is not None:
        fields[i]['name'] = n
  else:
    for i in xrange(len(fields)):
      fields[i]['name'] = fields[i]['name'].replace(
          util.PERIOD_REPLACEMENT, '.')
  for i in xrange(len(fields)):
    # The encryption mode is encoded as a prefix on the leaf column name.
    encrypted_name = fields[i]['name'].split('.')[-1]
    if fields[i]['type'] == 'TIMESTAMP':
      queried_values[fields[i]['name']] = _GetTimestampValues(rows, i)
    elif encrypted_name.startswith(util.PROBABILISTIC_PREFIX):
      queried_values[fields[i]['name']] = (_DecryptValues(
          fields[i]['name'], rows, i, ciphers, schema,
          util.PROBABILISTIC_PREFIX))
    elif encrypted_name.startswith(util.PSEUDONYM_PREFIX):
      queried_values[fields[i]['name']] = (_DecryptValues(
          fields[i]['name'], rows, i, ciphers, schema, util.PSEUDONYM_PREFIX))
    elif encrypted_name.startswith(util.SEARCHWORDS_PREFIX):
      raise bigquery_client.BigqueryInvalidQueryError(
          'Cannot decrypt searchwords encryption. Decryption of SEARCHWORDS '
          'is limited to PROBABILISTIC_SEARCHWORDS encryption.',
          None, None, None)
    elif encrypted_name.startswith(util.HOMOMORPHIC_INT_PREFIX):
      queried_values[fields[i]['name']] = (_DecryptValues(
          fields[i]['name'], rows, i, ciphers, schema,
          util.HOMOMORPHIC_INT_PREFIX))
    elif encrypted_name.startswith(util.HOMOMORPHIC_FLOAT_PREFIX):
      queried_values[fields[i]['name']] = (_DecryptValues(
          fields[i]['name'], rows, i, ciphers, schema,
          util.HOMOMORPHIC_FLOAT_PREFIX))
    elif (encrypted_name.startswith(util.UNENCRYPTED_ALIAS_PREFIX) and
          encrypted_name.endswith('_')):
      queried_values[fields[i]['name']] = (_GetUnencryptedValuesWithType(
          rows, i, fields[i]['type']))
    elif encrypted_name.startswith('f') and encrypted_name.endswith('_'):
      # Server-generated aggregation alias of the form f<index>_: recover
      # the original aggregation expression it stands for.
      index = int(fields[i]['name'][1:-1])
      original_fieldname = aggregation_query_list[index]
      original_fieldname = original_fieldname.strip()
      # Strip a trailing 'WITHIN <record>' qualifier if present.
      if (len(original_fieldname.split(' ')) >= 3 and
          original_fieldname.split(' ')[-2].lower() == 'within'):
        actual_field = original_fieldname.split(' ')[:-2]
        actual_field = ' '.join(actual_field)
      else:
        actual_field = original_fieldname
      if original_fieldname.startswith(util.GROUP_CONCAT_PREFIX):
        concat_field = actual_field.split(
            util.GROUP_CONCAT_PREFIX)[1][:-1].strip()
        encrypted_name = concat_field.split('.')[-1]
        if encrypted_name.startswith(util.PROBABILISTIC_PREFIX):
          queried_values[original_fieldname] = (
              _DecryptGroupConcatValues(original_fieldname, rows, i, ciphers,
                                        schema, util.PROBABILISTIC_PREFIX))
        elif encrypted_name.startswith(util.PSEUDONYM_PREFIX):
          queried_values[original_fieldname] = (
              _DecryptGroupConcatValues(original_fieldname, rows, i, ciphers,
                                        schema, util.PSEUDONYM_PREFIX))
        elif (encrypted_name.startswith(util.HOMOMORPHIC_INT_PREFIX) or
              encrypted_name.startswith(util.HOMOMORPHIC_FLOAT_PREFIX)):
          raise bigquery_client.BigqueryInvalidQueryError(
              'GROUP_CONCAT only accepts string type.', None, None, None)
        elif encrypted_name.startswith(util.SEARCHWORDS_PREFIX):
          raise bigquery_client.BigqueryInvalidQueryError(
              'Invalid query, cannot recover searchwords encryption.',
              None, None, None)
        else:
          # Unencrypted GROUP_CONCAT: pass the values through untouched.
          for j in xrange(len(rows)):
            queried_values[original_fieldname].append(rows[j][i])
      elif (original_fieldname.startswith('COUNT(') or
            original_fieldname.startswith('AVG(') or
            original_fieldname.startswith('SUM(')):
        queried_values[original_fieldname] = (
            _GetUnencryptedValuesWithType(rows, i, fields[i]['type']))
      elif original_fieldname.startswith('TOP('):
        # TOP(field, ...): only pseudonym fields are decryptable here.
        fieldname = actual_field.split('TOP(')[1][:-1].strip()
        fieldname = fieldname.split(',')[0].strip()
        if fieldname.split('.')[-1].startswith(util.PSEUDONYM_PREFIX):
          queried_values[original_fieldname] = (_DecryptValues(
              fieldname, rows, i, ciphers, schema, util.PSEUDONYM_PREFIX))
        else:
          queried_values[original_fieldname] = (
              _GetUnencryptedValues(original_fieldname, rows, i, schema))
      elif original_fieldname.startswith(util.PAILLIER_SUM_PREFIX):
        # Paillier sums decrypt with the homomorphic cipher matching the
        # summed field's prefix.
        sum_argument = original_fieldname.split(
            util.PAILLIER_SUM_PREFIX)[1]
        sum_argument = sum_argument.split(',')[0][:-1]
        sum_argument = sum_argument.split('.')[-1]
        real_fieldname = original_fieldname.split(
            util.PAILLIER_SUM_PREFIX)[1]
        real_fieldname = real_fieldname.split(',')[0][:-1]
        if sum_argument.startswith(util.HOMOMORPHIC_INT_PREFIX):
          queried_values[original_fieldname] = (_DecryptValues(
              real_fieldname, rows, i, ciphers, schema,
              util.HOMOMORPHIC_INT_PREFIX))
        elif sum_argument.startswith(util.HOMOMORPHIC_FLOAT_PREFIX):
          queried_values[original_fieldname] = (_DecryptValues(
              real_fieldname, rows, i, ciphers, schema,
              util.HOMOMORPHIC_FLOAT_PREFIX))
        else:
          queried_values[fields[i]['name']] = (
              _GetUnencryptedValuesWithType(rows, i, fields[i]['type']))
    else:
      queried_values[fields[i]['name']] = (_GetUnencryptedValuesWithType(
          rows, i, fields[i]['type']))
  return queried_values
def _DecryptRows(fields, rows, master_key, table_id, schema, query_list,
                 aggregation_query_list, unencrypted_query_list):
  """Decrypts all values in rows.

  Arguments:
    fields: Column names.
    rows: Table values.
    master_key: Key to get ciphers.
    table_id: Used to generate keys.
    schema: Represents information about fields.
    query_list: List of fields that were queried.
    aggregation_query_list: List of aggregations of fields that were queried.
    unencrypted_query_list: List of unencrypted expressions.

  Returns:
    A dictionary that returns for each query, a list of decrypted values.

  Raises:
    bigquery_client.BigqueryInvalidQueryError: User trying to query for a
      SEARCHWORD encrypted field. SEARCHWORD encrypted fields cannot be
      decrypted.
  """
  # create ciphers for decryption
  prob_cipher = ecrypto.ProbabilisticCipher(
      ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
  pseudonym_cipher = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
  homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  # The prefix on a column name selects which cipher decrypts it.
  ciphers = {
      util.PROBABILISTIC_PREFIX: prob_cipher,
      util.PSEUDONYM_PREFIX: pseudonym_cipher,
      util.HOMOMORPHIC_INT_PREFIX: homomorphic_int_cipher,
      util.HOMOMORPHIC_FLOAT_PREFIX: homomorphic_float_cipher,
  }
  queried_values = {}
  # Seed the result dict with one empty list per requested output expression.
  for query in query_list:
    # Strip a trailing 'AS <alias>' so results key on the underlying field.
    if len(query.split(' ')) >= 3 and query.split(' ')[-2] == 'AS':
      queried_values[' '.join(query.split(' ')[:-2])] = []
    else:
      queried_values[query] = []
  for query in aggregation_query_list:
    queried_values[query] = []
  for i in xrange(len(unencrypted_query_list)):
    queried_values['%s%d_' % (util.UNENCRYPTED_ALIAS_PREFIX, i)] = []
  for i in xrange(len(fields)):
    # Undo the '.' substitution applied when the query was rewritten.
    fields[i]['name'] = fields[i]['name'].replace(util.PERIOD_REPLACEMENT,
                                                  '.')
    # The encryption mode is encoded as a prefix on the leaf column name.
    encrypted_name = fields[i]['name'].split('.')[-1]
    if fields[i]['type'] == 'TIMESTAMP':
      queried_values[fields[i]['name']] = _GetTimestampValues(rows, i)
    elif encrypted_name.startswith(util.PROBABILISTIC_PREFIX):
      queried_values[fields[i]['name']] = (_DecryptValues(
          fields[i]['name'], rows, i, ciphers, schema,
          util.PROBABILISTIC_PREFIX))
    elif encrypted_name.startswith(util.PSEUDONYM_PREFIX):
      queried_values[fields[i]['name']] = (_DecryptValues(
          fields[i]['name'], rows, i, ciphers, schema, util.PSEUDONYM_PREFIX))
    elif encrypted_name.startswith(util.SEARCHWORDS_PREFIX):
      raise bigquery_client.BigqueryInvalidQueryError(
          'Cannot decrypt searchwords encryption. Decryption of SEARCHWORDS '
          'is limited to PROBABILISTIC_SEARCHWORDS encryption.',
          None, None, None)
    elif encrypted_name.startswith(util.HOMOMORPHIC_INT_PREFIX):
      queried_values[fields[i]['name']] = (_DecryptValues(
          fields[i]['name'], rows, i, ciphers, schema,
          util.HOMOMORPHIC_INT_PREFIX))
    elif encrypted_name.startswith(util.HOMOMORPHIC_FLOAT_PREFIX):
      queried_values[fields[i]['name']] = (_DecryptValues(
          fields[i]['name'], rows, i, ciphers, schema,
          util.HOMOMORPHIC_FLOAT_PREFIX))
    elif (encrypted_name.startswith(util.UNENCRYPTED_ALIAS_PREFIX) and
          encrypted_name.endswith('_')):
      queried_values[fields[i]['name']] = (_GetUnencryptedValuesWithType(
          rows, i, fields[i]['type']))
    elif encrypted_name.startswith('f') and encrypted_name.endswith('_'):
      # Server-generated aggregation alias of the form f<index>_: recover
      # the original aggregation expression it stands for.
      index = int(fields[i]['name'][1:-1])
      original_fieldname = aggregation_query_list[index]
      original_fieldname = original_fieldname.strip()
      # Strip a trailing 'WITHIN <record>' qualifier if present.
      if (len(original_fieldname.split(' ')) >= 3 and
          original_fieldname.split(' ')[-2].lower() == 'within'):
        actual_field = original_fieldname.split(' ')[:-2]
        actual_field = ' '.join(actual_field)
      else:
        actual_field = original_fieldname
      if original_fieldname.startswith(util.GROUP_CONCAT_PREFIX):
        concat_field = actual_field.split(
            util.GROUP_CONCAT_PREFIX)[1][:-1].strip()
        encrypted_name = concat_field.split('.')[-1]
        if encrypted_name.startswith(util.PROBABILISTIC_PREFIX):
          queried_values[original_fieldname] = (
              _DecryptGroupConcatValues(original_fieldname, rows, i, ciphers,
                                        schema, util.PROBABILISTIC_PREFIX))
        elif encrypted_name.startswith(util.PSEUDONYM_PREFIX):
          queried_values[original_fieldname] = (
              _DecryptGroupConcatValues(original_fieldname, rows, i, ciphers,
                                        schema, util.PSEUDONYM_PREFIX))
        elif (encrypted_name.startswith(util.HOMOMORPHIC_INT_PREFIX) or
              encrypted_name.startswith(util.HOMOMORPHIC_FLOAT_PREFIX)):
          raise bigquery_client.BigqueryInvalidQueryError(
              'GROUP_CONCAT only accepts string type.', None, None, None)
        elif encrypted_name.startswith(util.SEARCHWORDS_PREFIX):
          raise bigquery_client.BigqueryInvalidQueryError(
              'Invalid query, cannot recover searchwords encryption.',
              None, None, None)
        else:
          # Unencrypted GROUP_CONCAT: pass the values through untouched.
          for j in xrange(len(rows)):
            queried_values[original_fieldname].append(rows[j][i])
      elif (original_fieldname.startswith('COUNT(') or
            original_fieldname.startswith('AVG(') or
            original_fieldname.startswith('SUM(')):
        queried_values[original_fieldname] = (
            _GetUnencryptedValuesWithType(rows, i, fields[i]['type']))
      elif original_fieldname.startswith('TOP('):
        # TOP(field, ...): only pseudonym fields are decryptable here.
        fieldname = actual_field.split('TOP(')[1][:-1].strip()
        fieldname = fieldname.split(',')[0].strip()
        if fieldname.split('.')[-1].startswith(util.PSEUDONYM_PREFIX):
          queried_values[original_fieldname] = (_DecryptValues(
              fieldname, rows, i, ciphers, schema, util.PSEUDONYM_PREFIX))
        else:
          queried_values[original_fieldname] = (
              _GetUnencryptedValues(original_fieldname, rows, i, schema))
      elif original_fieldname.startswith(util.PAILLIER_SUM_PREFIX):
        # Paillier sums decrypt with the homomorphic cipher matching the
        # summed field's prefix.
        sum_argument = original_fieldname.split(
            util.PAILLIER_SUM_PREFIX)[1]
        sum_argument = sum_argument.split(',')[0][:-1]
        sum_argument = sum_argument.split('.')[-1]
        real_fieldname = original_fieldname.split(
            util.PAILLIER_SUM_PREFIX)[1]
        real_fieldname = real_fieldname.split(',')[0][:-1]
        if sum_argument.startswith(util.HOMOMORPHIC_INT_PREFIX):
          queried_values[original_fieldname] = (_DecryptValues(
              real_fieldname, rows, i, ciphers, schema,
              util.HOMOMORPHIC_INT_PREFIX))
        elif sum_argument.startswith(util.HOMOMORPHIC_FLOAT_PREFIX):
          queried_values[original_fieldname] = (_DecryptValues(
              real_fieldname, rows, i, ciphers, schema,
              util.HOMOMORPHIC_FLOAT_PREFIX))
        else:
          queried_values[fields[i]['name']] = (
              _GetUnencryptedValuesWithType(rows, i, fields[i]['type']))
    else:
      queried_values[fields[i]['name']] = (_GetUnencryptedValuesWithType(
          rows, i, fields[i]['type']))
  return queried_values