def ConvertJsonDataFile(schema, master_key, table_id, infile, outfile):
  """Encrypts a line-delimited json file according to the given schema.

  The input is validated against the schema, then read one line at a time;
  every line is parsed as json, converted, and written out as one json line.

  Arguments:
    schema: User defined values and types.
    master_key: Key from which the cipher and hash keys are generated.
    table_id: Used together with master_key to generate keys for the table.
    infile: Path of the file to be encrypted.
    outfile: Path the encrypted file is written to.
  """
  prob_key = ecrypto.GenerateProbabilisticCipherKey(master_key, table_id)
  prob_cipher = ecrypto.ProbabilisticCipher(prob_key)
  pseudonym_key = ecrypto.GeneratePseudonymCipherKey(master_key, table_id)
  pseudonym_cipher = ecrypto.PseudonymCipher(pseudonym_key)
  # TODO(user): ciphers and hash should not use the same key.
  hash_key = ecrypto.GenerateStringHashKey(master_key, table_id)
  string_hasher = ecrypto.StringHash(hash_key)
  int_key = ecrypto.GenerateHomomorphicCipherKey(master_key, table_id)
  homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(int_key)
  float_key = ecrypto.GenerateHomomorphicCipherKey(master_key, table_id)
  homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(float_key)

  _ValidateJsonDataFile(schema, infile)

  with open(infile, 'rb') as source:
    with open(outfile, 'wb') as sink:
      for raw_record in source:
        record = _StrToUnicode(json.loads(raw_record))
        encrypted_record = _ConvertJsonField(
            record, schema, prob_cipher, pseudonym_cipher, string_hasher,
            homomorphic_int_cipher, homomorphic_float_cipher)
        sink.write(json.dumps(encrypted_record) + '\n')
def ConvertJsonDataFile(schema, master_key, table_id, infile, outfile):
  """Encrypts data in a json file based on schema provided.

  The input file is validated against the schema and then processed one line
  at a time: each line is parsed as json, converted, and written to the
  output file as a single line of json.

  Arguments:
    schema: User defined values and types.
    master_key: Key to provide ciphers.
    table_id: Used to generate a unique key for each table.
    infile: Path of the file to be encrypted.
    outfile: Path the encrypted file is written to.
  """
  prob_cipher = ecrypto.ProbabilisticCipher(
      ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
  pseudonym_cipher = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
  # TODO(user): ciphers and hash should not use the same key.
  string_hasher = ecrypto.StringHash(
      ecrypto.GenerateStringHashKey(master_key, table_id))
  homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))

  _ValidateJsonDataFile(schema, infile)
  with open(infile, 'rb') as in_file:
    with open(outfile, 'wb') as out_file:
      for line in in_file:
        data = json.loads(line)
        data = _StrToUnicode(data)
        rewritten_data = _ConvertJsonField(data, schema, prob_cipher,
                                           pseudonym_cipher, string_hasher,
                                           homomorphic_int_cipher,
                                           homomorphic_float_cipher)
        # Serialize with json.dumps instead of patching up the Python repr.
        # Rewriting quotes in str(obj) corrupts any value that contains an
        # apostrophe or a 'u' directly before a quote, and it emits
        # True/False/None and Python string escapes, none of which are
        # valid json.
        out_file.write(json.dumps(rewritten_data) + '\n')
def setUp(self):
  """Builds the homomorphic float cipher used by each test in the class."""
  float_cipher = ecrypto.HomomorphicFloatCipher(_KEY1)
  self.cipher = float_cipher
def ConvertCsvDataFile(schema, master_key, table_id, infile, outfile):
  """Reads utf8 csv data, encrypts and stores into a new csv utf8 data file.

  Every input row must have exactly one field per schema entry. Each field is
  transformed according to the 'encrypt' mode of its schema entry and the
  resulting row is written to the output file.

  Arguments:
    schema: Sequence of column descriptions; each is indexed by 'encrypt',
      'name' and 'type', and may carry 'searchwords_separator' and
      'max_word_sequence'.
    master_key: Key used to generate the cipher and hash keys.
    table_id: Used together with master_key to generate the keys.
    infile: Path of the csv file to be encrypted.
    outfile: Path the encrypted csv file is written to.

  Raises:
    EncryptConvertError: A row does not have as many fields as the schema.
  """
  prob_cipher = ecrypto.ProbabilisticCipher(
      ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
  pseudonym_cipher = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
  # TODO(user): ciphers and hash should not use the same key.
  string_hasher = ecrypto.StringHash(
      ecrypto.GenerateStringHashKey(master_key, table_id))
  homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))

  def _SearchwordHashes(column, value):
    """Returns the utf-8 encoded searchword hashes of value for column."""
    return string_hasher.GetHashesForWordSubsequencesWithIv(
        util.SEARCHWORDS_PREFIX + column['name'],
        value,
        separator=column.get('searchwords_separator'),
        max_sequence_len=column.get('max_word_sequence', 5)).encode('utf-8')

  # Validate before the output file is opened: opening with 'wb' truncates
  # (or creates) the file, so validating afterwards would clobber the output
  # even when the input is rejected. This matches ConvertJsonDataFile.
  _ValidateCsvDataFile(schema, infile)

  num_columns = len(schema)
  with open(infile, 'rb') as in_file:
    with open(outfile, 'wb') as out_file:
      csv_writer = csv.writer(out_file)
      csv_reader = _Utf8CsvReader(in_file, csv_writer)
      for row in csv_reader:
        if len(row) != num_columns:
          raise EncryptConvertError(
              'Number of fields in schema do not match '
              'in row: %s' % row)
        new_row = []
        for column, value in zip(schema, row):
          encrypt_mode = column['encrypt']
          if encrypt_mode == 'none':
            new_row.append(value.encode('utf-8'))
          elif encrypt_mode == 'probabilistic':
            new_row.append(prob_cipher.Encrypt(value).encode('utf-8'))
          elif encrypt_mode == 'pseudonym':
            new_row.append(pseudonym_cipher.Encrypt(value).encode('utf-8'))
          elif encrypt_mode == 'homomorphic' and column['type'] == 'integer':
            new_row.append(
                homomorphic_int_cipher.Encrypt(long(value)).encode('utf-8'))
          elif encrypt_mode == 'homomorphic' and column['type'] == 'float':
            new_row.append(
                homomorphic_float_cipher.Encrypt(float(value)).encode('utf-8'))
          elif encrypt_mode == 'searchwords':
            new_row.append(_SearchwordHashes(column, value))
          elif encrypt_mode == 'probabilistic_searchwords':
            # This mode produces two output fields: the searchword hashes
            # followed by the probabilistically encrypted value.
            new_row.append(_SearchwordHashes(column, value))
            new_row.append(prob_cipher.Encrypt(value).encode('utf-8'))
          # NOTE(review): any other mode/type combination adds no output
          # field for this column; presumably the schema is checked before
          # this point -- confirm.
        csv_writer.writerow(new_row)
def _DecryptRows(fields, rows, master_key, table_id, schema, query_list,
                 aggregation_query_list, unencrypted_query_list,
                 manifest=None):
  """Decrypts all values in rows.

  Each column in fields is dispatched on its type and on the prefix of the
  last dotted component of its name, and the values obtained for that column
  are stored in the returned dictionary.

  Note that the 'name' entries of fields are rewritten in place.

  Arguments:
    fields: Column descriptions; each one is indexed by 'name' and 'type'.
    rows: Table values, indexed as rows[row_index][column_index].
    master_key: Key to get ciphers.
    table_id: Used to generate keys.
    schema: Represents information about fields.
    query_list: List of fields that were queried.
    aggregation_query_list: List of aggregations of fields that were queried.
    unencrypted_query_list: List of unencrypted expressions.
    manifest: optional, query_lib.QueryManifest instance.

  Returns:
    A dictionary that returns for each query, a list of decrypted values.

  Raises:
    bigquery_client.BigqueryInvalidQueryError: User trying to query for a
      SEARCHWORD encrypted field (SEARCHWORD encrypted fields cannot be
      decrypted), or GROUP_CONCAT applied to a homomorphic field.
  """
  # create ciphers for decryption
  prob_cipher = ecrypto.ProbabilisticCipher(
      ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
  pseudonym_cipher = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
  homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))

  # Ciphers keyed by column-name prefix; handed to the decrypt helpers.
  ciphers = {
      util.PROBABILISTIC_PREFIX: prob_cipher,
      util.PSEUDONYM_PREFIX: pseudonym_cipher,
      util.HOMOMORPHIC_INT_PREFIX: homomorphic_int_cipher,
      util.HOMOMORPHIC_FLOAT_PREFIX: homomorphic_float_cipher,
  }

  # Seed an empty list for every queried expression. A trailing
  # '<expr> AS <alias>' is keyed by '<expr>' only.
  queried_values = {}
  for query in query_list:
    if len(query.split(' ')) >= 3 and query.split(' ')[-2] == 'AS':
      queried_values[' '.join(query.split(' ')[:-2])] = []
    else:
      queried_values[query] = []
  for query in aggregation_query_list:
    queried_values[query] = []
  # Unencrypted expressions are keyed by a generated alias built from their
  # position in the list.
  for i in xrange(len(unencrypted_query_list)):
    queried_values['%s%d_' % (util.UNENCRYPTED_ALIAS_PREFIX, i)] = []

  # If a manifest is supplied rewrite the column names according to any
  # computed aliases that were used. Otherwise, resort to the old scheme
  # of substituting the '.' in multidimensional schemas in/out.
  if manifest is not None:
    for i in xrange(len(fields)):
      # TODO(user): This is a hash lookup on every column name.
      # The lookup is efficient and the column names are sufficiently random
      # as compared to likely human language column names such that false
      # hits should not be possible. However this may need future revision.
      n = manifest.GetColumnNameForAlias(fields[i]['name'])
      if n is not None:
        fields[i]['name'] = n
  else:
    for i in xrange(len(fields)):
      fields[i]['name'] = fields[i]['name'].replace(
          util.PERIOD_REPLACEMENT, '.')

  for i in xrange(len(fields)):
    # Only the last dotted component of the name is checked for a prefix.
    encrypted_name = fields[i]['name'].split('.')[-1]
    if fields[i]['type'] == 'TIMESTAMP':
      queried_values[fields[i]['name']] = _GetTimestampValues(rows, i)
    elif encrypted_name.startswith(util.PROBABILISTIC_PREFIX):
      queried_values[fields[i]['name']] = (_DecryptValues(
          fields[i]['name'], rows, i, ciphers, schema,
          util.PROBABILISTIC_PREFIX))
    elif encrypted_name.startswith(util.PSEUDONYM_PREFIX):
      queried_values[fields[i]['name']] = (_DecryptValues(
          fields[i]['name'], rows, i, ciphers, schema,
          util.PSEUDONYM_PREFIX))
    elif encrypted_name.startswith(util.SEARCHWORDS_PREFIX):
      raise bigquery_client.BigqueryInvalidQueryError(
          'Cannot decrypt searchwords encryption. Decryption of SEARCHWORDS '
          'is limited to PROBABILISTIC_SEARCHWORDS encryption.',
          None, None, None)
    elif encrypted_name.startswith(util.HOMOMORPHIC_INT_PREFIX):
      queried_values[fields[i]['name']] = (_DecryptValues(
          fields[i]['name'], rows, i, ciphers, schema,
          util.HOMOMORPHIC_INT_PREFIX))
    elif encrypted_name.startswith(util.HOMOMORPHIC_FLOAT_PREFIX):
      queried_values[fields[i]['name']] = (_DecryptValues(
          fields[i]['name'], rows, i, ciphers, schema,
          util.HOMOMORPHIC_FLOAT_PREFIX))
    elif (encrypted_name.startswith(util.UNENCRYPTED_ALIAS_PREFIX) and
          encrypted_name.endswith('_')):
      queried_values[fields[i]['name']] = (_GetUnencryptedValuesWithType(
          rows, i, fields[i]['type']))
    elif encrypted_name.startswith('f') and encrypted_name.endswith('_'):
      # A column named f<N>_ is treated as the N-th entry of
      # aggregation_query_list.
      # NOTE(review): int() raises ValueError when the text between the
      # leading 'f' and trailing '_' is not a number -- confirm such column
      # names cannot reach this branch.
      index = int(fields[i]['name'][1:-1])
      original_fieldname = aggregation_query_list[index]
      original_fieldname = original_fieldname.strip()
      # Drop a trailing 'WITHIN <x>' (case-insensitive) to get the bare
      # aggregation expression.
      if (len(original_fieldname.split(' ')) >= 3 and
          original_fieldname.split(' ')[-2].lower() == 'within'):
        actual_field = original_fieldname.split(' ')[:-2]
        actual_field = ' '.join(actual_field)
      else:
        actual_field = original_fieldname
      if original_fieldname.startswith(util.GROUP_CONCAT_PREFIX):
        # Text after the prefix with the final character removed
        # (presumably the closing parenthesis).
        concat_field = actual_field.split(
            util.GROUP_CONCAT_PREFIX)[1][:-1].strip()
        encrypted_name = concat_field.split('.')[-1]
        if encrypted_name.startswith(util.PROBABILISTIC_PREFIX):
          queried_values[original_fieldname] = (
              _DecryptGroupConcatValues(original_fieldname, rows, i,
                                        ciphers, schema,
                                        util.PROBABILISTIC_PREFIX))
        elif encrypted_name.startswith(util.PSEUDONYM_PREFIX):
          queried_values[original_fieldname] = (
              _DecryptGroupConcatValues(original_fieldname, rows, i,
                                        ciphers, schema,
                                        util.PSEUDONYM_PREFIX))
        elif (encrypted_name.startswith(util.HOMOMORPHIC_INT_PREFIX) or
              encrypted_name.startswith(util.HOMOMORPHIC_FLOAT_PREFIX)):
          raise bigquery_client.BigqueryInvalidQueryError(
              'GROUP_CONCAT only accepts string type.', None, None, None)
        elif encrypted_name.startswith(util.SEARCHWORDS_PREFIX):
          raise bigquery_client.BigqueryInvalidQueryError(
              'Invalid query, cannot recover searchwords encryption.',
              None, None, None)
        else:
          # No recognized prefix: copy the raw values. This appends to the
          # list seeded above, so the stripped name must already be a key.
          for j in xrange(len(rows)):
            queried_values[original_fieldname].append(rows[j][i])
      elif (original_fieldname.startswith('COUNT(') or
            original_fieldname.startswith('AVG(') or
            original_fieldname.startswith('SUM(')):
        queried_values[original_fieldname] = (
            _GetUnencryptedValuesWithType(rows, i, fields[i]['type']))
      elif original_fieldname.startswith('TOP('):
        # First comma-separated argument of TOP(...).
        fieldname = actual_field.split('TOP(')[1][:-1].strip()
        fieldname = fieldname.split(',')[0].strip()
        if fieldname.split('.')[-1].startswith(util.PSEUDONYM_PREFIX):
          queried_values[original_fieldname] = (_DecryptValues(
              fieldname, rows, i, ciphers, schema, util.PSEUDONYM_PREFIX))
        else:
          queried_values[original_fieldname] = (
              _GetUnencryptedValues(original_fieldname, rows, i, schema))
      elif original_fieldname.startswith(util.PAILLIER_SUM_PREFIX):
        # sum_argument is the last dotted component of the first argument;
        # real_fieldname is that same argument with its dotted path kept.
        sum_argument = original_fieldname.split(
            util.PAILLIER_SUM_PREFIX)[1]
        sum_argument = sum_argument.split(',')[0][:-1]
        sum_argument = sum_argument.split('.')[-1]
        real_fieldname = original_fieldname.split(
            util.PAILLIER_SUM_PREFIX)[1]
        real_fieldname = real_fieldname.split(',')[0][:-1]
        if sum_argument.startswith(util.HOMOMORPHIC_INT_PREFIX):
          queried_values[original_fieldname] = (_DecryptValues(
              real_fieldname, rows, i, ciphers, schema,
              util.HOMOMORPHIC_INT_PREFIX))
        elif sum_argument.startswith(util.HOMOMORPHIC_FLOAT_PREFIX):
          queried_values[original_fieldname] = (_DecryptValues(
              real_fieldname, rows, i, ciphers, schema,
              util.HOMOMORPHIC_FLOAT_PREFIX))
      else:
        # Any other aggregation is stored under the column's own name.
        queried_values[fields[i]['name']] = (
            _GetUnencryptedValuesWithType(rows, i, fields[i]['type']))
    else:
      # No recognized prefix or alias pattern.
      queried_values[fields[i]['name']] = (_GetUnencryptedValuesWithType(
          rows, i, fields[i]['type']))

  return queried_values
def _DecryptRows(fields, rows, master_key, table_id, schema, query_list,
                 aggregation_query_list, unencrypted_query_list):
  """Decrypts all values in rows.

  Each column in fields is dispatched on its type and on the prefix of the
  last dotted component of its name, and the values obtained for that column
  are stored in the returned dictionary.

  Note that the 'name' entries of fields are rewritten in place.

  Arguments:
    fields: Column descriptions; each one is indexed by 'name' and 'type'.
    rows: Table values, indexed as rows[row_index][column_index].
    master_key: Key to get ciphers.
    table_id: Used to generate keys.
    schema: Represents information about fields.
    query_list: List of fields that were queried.
    aggregation_query_list: List of aggregations of fields that were queried.
    unencrypted_query_list: List of unencrypted expressions.

  Returns:
    A dictionary that returns for each query, a list of decrypted values.

  Raises:
    bigquery_client.BigqueryInvalidQueryError: User trying to query for a
      SEARCHWORD encrypted field (SEARCHWORD encrypted fields cannot be
      decrypted), or GROUP_CONCAT applied to a homomorphic field.
  """
  # create ciphers for decryption
  prob_cipher = ecrypto.ProbabilisticCipher(
      ecrypto.GenerateProbabilisticCipherKey(master_key, table_id))
  pseudonym_cipher = ecrypto.PseudonymCipher(
      ecrypto.GeneratePseudonymCipherKey(master_key, table_id))
  homomorphic_int_cipher = ecrypto.HomomorphicIntCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))
  homomorphic_float_cipher = ecrypto.HomomorphicFloatCipher(
      ecrypto.GenerateHomomorphicCipherKey(master_key, table_id))

  # Ciphers keyed by column-name prefix; handed to the decrypt helpers.
  ciphers = {
      util.PROBABILISTIC_PREFIX: prob_cipher,
      util.PSEUDONYM_PREFIX: pseudonym_cipher,
      util.HOMOMORPHIC_INT_PREFIX: homomorphic_int_cipher,
      util.HOMOMORPHIC_FLOAT_PREFIX: homomorphic_float_cipher,
  }

  # Seed an empty list for every queried expression. A trailing
  # '<expr> AS <alias>' is keyed by '<expr>' only.
  queried_values = {}
  for query in query_list:
    if len(query.split(' ')) >= 3 and query.split(' ')[-2] == 'AS':
      queried_values[' '.join(query.split(' ')[:-2])] = []
    else:
      queried_values[query] = []
  for query in aggregation_query_list:
    queried_values[query] = []
  # Unencrypted expressions are keyed by a generated alias built from their
  # position in the list.
  for i in xrange(len(unencrypted_query_list)):
    queried_values['%s%d_' % (util.UNENCRYPTED_ALIAS_PREFIX, i)] = []

  for i in xrange(len(fields)):
    # Restore the '.' separators in the column name (stored back into
    # fields) before inspecting it.
    fields[i]['name'] = fields[i]['name'].replace(util.PERIOD_REPLACEMENT,
                                                  '.')
    # Only the last dotted component of the name is checked for a prefix.
    encrypted_name = fields[i]['name'].split('.')[-1]
    if fields[i]['type'] == 'TIMESTAMP':
      queried_values[fields[i]['name']] = _GetTimestampValues(rows, i)
    elif encrypted_name.startswith(util.PROBABILISTIC_PREFIX):
      queried_values[fields[i]['name']] = (_DecryptValues(
          fields[i]['name'], rows, i, ciphers, schema,
          util.PROBABILISTIC_PREFIX))
    elif encrypted_name.startswith(util.PSEUDONYM_PREFIX):
      queried_values[fields[i]['name']] = (_DecryptValues(
          fields[i]['name'], rows, i, ciphers, schema,
          util.PSEUDONYM_PREFIX))
    elif encrypted_name.startswith(util.SEARCHWORDS_PREFIX):
      raise bigquery_client.BigqueryInvalidQueryError(
          'Cannot decrypt searchwords encryption. Decryption of SEARCHWORDS '
          'is limited to PROBABILISTIC_SEARCHWORDS encryption.',
          None, None, None)
    elif encrypted_name.startswith(util.HOMOMORPHIC_INT_PREFIX):
      queried_values[fields[i]['name']] = (_DecryptValues(
          fields[i]['name'], rows, i, ciphers, schema,
          util.HOMOMORPHIC_INT_PREFIX))
    elif encrypted_name.startswith(util.HOMOMORPHIC_FLOAT_PREFIX):
      queried_values[fields[i]['name']] = (_DecryptValues(
          fields[i]['name'], rows, i, ciphers, schema,
          util.HOMOMORPHIC_FLOAT_PREFIX))
    elif (encrypted_name.startswith(util.UNENCRYPTED_ALIAS_PREFIX) and
          encrypted_name.endswith('_')):
      queried_values[fields[i]['name']] = (_GetUnencryptedValuesWithType(
          rows, i, fields[i]['type']))
    elif encrypted_name.startswith('f') and encrypted_name.endswith('_'):
      # A column named f<N>_ is treated as the N-th entry of
      # aggregation_query_list.
      # NOTE(review): int() raises ValueError when the text between the
      # leading 'f' and trailing '_' is not a number -- confirm such column
      # names cannot reach this branch.
      index = int(fields[i]['name'][1:-1])
      original_fieldname = aggregation_query_list[index]
      original_fieldname = original_fieldname.strip()
      # Drop a trailing 'WITHIN <x>' (case-insensitive) to get the bare
      # aggregation expression.
      if (len(original_fieldname.split(' ')) >= 3 and
          original_fieldname.split(' ')[-2].lower() == 'within'):
        actual_field = original_fieldname.split(' ')[:-2]
        actual_field = ' '.join(actual_field)
      else:
        actual_field = original_fieldname
      if original_fieldname.startswith(util.GROUP_CONCAT_PREFIX):
        # Text after the prefix with the final character removed
        # (presumably the closing parenthesis).
        concat_field = actual_field.split(
            util.GROUP_CONCAT_PREFIX)[1][:-1].strip()
        encrypted_name = concat_field.split('.')[-1]
        if encrypted_name.startswith(util.PROBABILISTIC_PREFIX):
          queried_values[original_fieldname] = (
              _DecryptGroupConcatValues(original_fieldname, rows, i,
                                        ciphers, schema,
                                        util.PROBABILISTIC_PREFIX))
        elif encrypted_name.startswith(util.PSEUDONYM_PREFIX):
          queried_values[original_fieldname] = (
              _DecryptGroupConcatValues(original_fieldname, rows, i,
                                        ciphers, schema,
                                        util.PSEUDONYM_PREFIX))
        elif (encrypted_name.startswith(util.HOMOMORPHIC_INT_PREFIX) or
              encrypted_name.startswith(util.HOMOMORPHIC_FLOAT_PREFIX)):
          raise bigquery_client.BigqueryInvalidQueryError(
              'GROUP_CONCAT only accepts string type.', None, None, None)
        elif encrypted_name.startswith(util.SEARCHWORDS_PREFIX):
          raise bigquery_client.BigqueryInvalidQueryError(
              'Invalid query, cannot recover searchwords encryption.',
              None, None, None)
        else:
          # No recognized prefix: copy the raw values. This appends to the
          # list seeded above, so the stripped name must already be a key.
          for j in xrange(len(rows)):
            queried_values[original_fieldname].append(rows[j][i])
      elif (original_fieldname.startswith('COUNT(') or
            original_fieldname.startswith('AVG(') or
            original_fieldname.startswith('SUM(')):
        queried_values[original_fieldname] = (
            _GetUnencryptedValuesWithType(rows, i, fields[i]['type']))
      elif original_fieldname.startswith('TOP('):
        # First comma-separated argument of TOP(...).
        fieldname = actual_field.split('TOP(')[1][:-1].strip()
        fieldname = fieldname.split(',')[0].strip()
        if fieldname.split('.')[-1].startswith(util.PSEUDONYM_PREFIX):
          queried_values[original_fieldname] = (_DecryptValues(
              fieldname, rows, i, ciphers, schema, util.PSEUDONYM_PREFIX))
        else:
          queried_values[original_fieldname] = (
              _GetUnencryptedValues(original_fieldname, rows, i, schema))
      elif original_fieldname.startswith(util.PAILLIER_SUM_PREFIX):
        # sum_argument is the last dotted component of the first argument;
        # real_fieldname is that same argument with its dotted path kept.
        sum_argument = original_fieldname.split(
            util.PAILLIER_SUM_PREFIX)[1]
        sum_argument = sum_argument.split(',')[0][:-1]
        sum_argument = sum_argument.split('.')[-1]
        real_fieldname = original_fieldname.split(
            util.PAILLIER_SUM_PREFIX)[1]
        real_fieldname = real_fieldname.split(',')[0][:-1]
        if sum_argument.startswith(util.HOMOMORPHIC_INT_PREFIX):
          queried_values[original_fieldname] = (_DecryptValues(
              real_fieldname, rows, i, ciphers, schema,
              util.HOMOMORPHIC_INT_PREFIX))
        elif sum_argument.startswith(util.HOMOMORPHIC_FLOAT_PREFIX):
          queried_values[original_fieldname] = (_DecryptValues(
              real_fieldname, rows, i, ciphers, schema,
              util.HOMOMORPHIC_FLOAT_PREFIX))
      else:
        # Any other aggregation is stored under the column's own name.
        queried_values[fields[i]['name']] = (
            _GetUnencryptedValuesWithType(rows, i, fields[i]['type']))
    else:
      # No recognized prefix or alias pattern.
      queried_values[fields[i]['name']] = (_GetUnencryptedValuesWithType(
          rows, i, fields[i]['type']))

  return queried_values