def testHashLongStrings(self):
    # Very, very long strings: a JSON document with two huge quoted values
    # (full of escaped quotes, whitespace and braces) must come back from
    # sanitize() with each quoted value replaced by the hash of its contents.
    fragments_a = [
        ' \\"hello\\" ',
        '\n\n \t\t \n\t \t\n',
        'email: \\"[email protected]\\"',
        '\'\'\'\'',
        "'\\\"'",
        "{inside: \\\"of a string\\\"}",
    ]
    fragments_b = [
        'This is a very string. \n\n \t',
        '\n\t',
        '{\\"name\\"}',
        ' \\ END',
        '\\"string\\"',
        "'string'",
        "\\\"'quotes'\\\"",
        "open quotes: \\\" '",
    ]
    value_a = get_long_string(fragments_a, 5000)
    value_b = get_long_string(fragments_b, 5000)

    # Hash with the same second argument (0) that sanitize() receives below.
    hashed_a = anonymize.hash_string(value_a, 0, True)
    hashed_b = anonymize.hash_string(value_b, 0, True)

    document = '{"key1" : "%s", "key2" : "%s"}' % (value_a, value_b)
    # In the expected output the hashes stand in for the quoted values.
    expected = '{"key1" : %s, "key2" : %s}' % (hashed_a, hashed_b)

    actual = self.s.sanitize(document, 0)
    self.assertEqual(expected, actual)
def testHashLongStrings(self):
    # Very, very long strings: both values of the JSON document are built by
    # get_long_string() with a size argument of 5000, and sanitize() is
    # expected to swap each quoted value for its hash.
    fragment_sets = (
        [
            ' \\"hello\\" ',
            '\n\n \t\t \n\t \t\n',
            'email: \\"[email protected]\\"',
            '\'\'\'\'',
            "'\\\"'",
            "{inside: \\\"of a string\\\"}",
        ],
        [
            'This is a very string. \n\n \t',
            '\n\t',
            '{\\"name\\"}',
            ' \\ END',
            '\\"string\\"',
            "'string'",
            "\\\"'quotes'\\\"",
            "open quotes: \\\" '",
        ],
    )
    # Build both long strings first, then hash both, in that order.
    long_values = [get_long_string(parts, 5000) for parts in fragment_sets]
    hashes = [anonymize.hash_string(value, 0, True) for value in long_values]

    raw_json = '{"key1" : "%s", "key2" : "%s"}' % tuple(long_values)
    wanted_json = '{"key1" : %s, "key2" : %s}' % tuple(hashes)

    sanitized_json = self.s.sanitize(raw_json, 0)
    self.assertEqual(wanted_json, sanitized_json)
def testHashMultiline(self):
    # Short strings: the same kind of awkward fragments as the long-string
    # test (newlines, tabs, escaped quotes), but built with a size argument
    # of 1.
    fragments_a = [
        ' \\"hello\\" ',
        '\n\n \t\t \n\t \t\n',
        'email: \\"[email protected]\\"',
        '\'\'\'\'',
        "'\\\"'",
        "{inside: \\\"of a string\\\"}",
    ]
    fragments_b = [
        'This is not a long string. \n\n \t',
        '\n\t',
        '{\\"name\\"}',
        ' \\ END',
        '\\"string\\"',
        "'string'",
        "\\\"'quotes'\\\"",
        "open quotes: \\\" '",
    ]
    value_a = get_long_string(fragments_a, 1)
    value_b = get_long_string(fragments_b, 1)

    hashed_a = anonymize.hash_string(value_a, 0, True)
    hashed_b = anonymize.hash_string(value_b, 0, True)

    document = '{"key1" : "%s", "key2" : "%s"}' % (value_a, value_b)
    # Each quoted value is expected to be replaced by its hash.
    expected = '{"key1" : %s, "key2" : %s}' % (hashed_a, hashed_b)

    actual = self.s.sanitize(document, 0)
    self.assertEqual(expected, actual)
def infer_salt(self, candidate_hashes, known_collections):
    """Brute-force the salt that was used to hash the collection names.

    This is a ridiculous hack. Let's hope the salt is 0. But even if not...

    Salts 0 through 100000000 (inclusive) are tried in order. For each salt,
    the hash-string form of every known collection name is hashed with
    ``anonymize.hash_string`` and the search stops at the first salt that
    yields a value contained in ``candidate_hashes``.

    Args:
        candidate_hashes: Container of hashes to match against. It is only
            used here for ``len()`` and ``in`` tests.
        known_collections: Plaintext collection names; must support
            ``len()`` and iteration.

    Returns:
        The first salt that makes a known collection name hash to a
        candidate value, or ``None`` if no salt in the range does.
    """
    max_salt = 100000000
    # Explicit integer division so the progress step is the same int on
    # Python 2 and Python 3.
    progress_step = max_salt // 100

    LOG.info("Trying to brute-force the salt 0-%d [numCollections=%d / numHashes=%d]",
             max_salt, len(known_collections), len(candidate_hashes))

    # The col names are hashed with quotes around them. Build a real list:
    # on Python 3 map() returns a one-shot iterator, which would be empty
    # for every salt after the first.
    col_names = [self.get_hash_string(col) for col in known_collections]

    salt = 0
    # With no names to hash no salt can ever match, so the search is skipped.
    while col_names and salt <= max_salt:
        if salt > 0 and salt % progress_step == 0:
            LOG.info("SEARCH: salt=%d [%.1f%%]", salt, (salt / float(max_salt)) * 100)
        for known_col in col_names:
            hashed = anonymize.hash_string(known_col, salt)  # imported from anonymize.py
            if hashed in candidate_hashes:
                LOG.info("SUCCESS! %s hashes to a known value. SALT: %d", known_col, salt)
                return salt
        salt += 1

    if self.debug:
        # Emits a blank line under both the Python 2 print statement and the
        # Python 3 print function (a bare ``print`` is a no-op on Python 3).
        print("")
    LOG.warn("FAIL. The salt value is unknown")
    return None
def infer_salt(self, candidate_hashes, known_collections):
    """Brute-force the salt that was used to hash the collection names.

    This is a ridiculous hack. Let's hope the salt is 0. But even if not...

    Salts 0 through 100000000 (inclusive) are tried in order. For each salt,
    the hash-string form of every known collection name is hashed with
    ``anonymize.hash_string`` and the search stops at the first salt that
    yields a value contained in ``candidate_hashes``.

    Args:
        candidate_hashes: Container of hashes to match against. It is only
            used here for ``len()`` and ``in`` tests.
        known_collections: Plaintext collection names; must support
            ``len()`` and iteration.

    Returns:
        The first salt that makes a known collection name hash to a
        candidate value, or ``None`` if no salt in the range does.
    """
    max_salt = 100000000
    # Explicit integer division so the progress step is the same int on
    # Python 2 and Python 3.
    progress_step = max_salt // 100

    LOG.info("Trying to brute-force the salt 0-%d [numCollections=%d / numHashes=%d]",
             max_salt, len(known_collections), len(candidate_hashes))

    # The col names are hashed with quotes around them. Build a real list:
    # on Python 3 map() returns a one-shot iterator, which would be empty
    # for every salt after the first.
    col_names = [self.get_hash_string(col) for col in known_collections]

    salt = 0
    # With no names to hash no salt can ever match, so the search is skipped.
    while col_names and salt <= max_salt:
        if salt > 0 and salt % progress_step == 0:
            LOG.info("SEARCH: salt=%d [%.1f%%]", salt, (salt / float(max_salt)) * 100)
        for known_col in col_names:
            hashed = anonymize.hash_string(known_col, salt)  # imported from anonymize.py
            if hashed in candidate_hashes:
                LOG.info("SUCCESS! %s hashes to a known value. SALT: %d", known_col, salt)
                return salt
        salt += 1

    if self.debug:
        # Emits a blank line under both the Python 2 print statement and the
        # Python 3 print function (a bare ``print`` is a no-op on Python 3).
        print("")
    LOG.warn("FAIL. The salt value is unknown")
    return None
def testHashStringMany(self):
    # Many strings in one JSON object: every '"key" : "<string>"' entry
    # should come back from sanitize() with the quoted value replaced by
    # its hash.
    # NOTE(review): this Sanitizer is constructed but never used -- the check
    # below goes through self.s. Confirm whether that is intentional.
    unused_sanitizer = anonymize.Sanitizer(None, None, True)

    raw_value = 'string with \\\"escaped quotes\\\"'
    hashed_value = anonymize.hash_string(raw_value, 0, True)

    plain_entry = '"key" : "%s", ' % raw_value
    hashed_entry = '"key" : %s, ' % hashed_value
    document = "{" + get_long_string([plain_entry], 4000) + "}"
    expected = "{" + get_long_string([hashed_entry], 4000) + "}"

    actual = self.s.sanitize(document, 0)
    self.assertEqual(expected, actual)
def testHashMultiline(self):
    # Short strings: values containing newlines, tabs and escaped quotes,
    # built by get_long_string() with a size argument of 1. sanitize() is
    # expected to replace each quoted value with its hash.
    fragment_sets = (
        [
            ' \\"hello\\" ',
            '\n\n \t\t \n\t \t\n',
            'email: \\"[email protected]\\"',
            '\'\'\'\'',
            "'\\\"'",
            "{inside: \\\"of a string\\\"}",
        ],
        [
            'This is not a long string. \n\n \t',
            '\n\t',
            '{\\"name\\"}',
            ' \\ END',
            '\\"string\\"',
            "'string'",
            "\\\"'quotes'\\\"",
            "open quotes: \\\" '",
        ],
    )
    # Build both strings first, then hash both, in that order.
    short_values = [get_long_string(parts, 1) for parts in fragment_sets]
    hashes = [anonymize.hash_string(value, 0, True) for value in short_values]

    raw_json = '{"key1" : "%s", "key2" : "%s"}' % tuple(short_values)
    wanted_json = '{"key1" : %s, "key2" : %s}' % tuple(hashes)

    sanitized_json = self.s.sanitize(raw_json, 0)
    self.assertEqual(wanted_json, sanitized_json)
def postProcess(self):
    """Process the operations to fix the collection names used in aggregate queries.

    Steps:
      1. Give up early (with a warning) when no plaintext collection names
         are known or when ``self.no_salt_search`` is set.
      2. Fetch the candidate hashes and brute-force the hashing salt with
         ``self.infer_salt``; give up with a warning if no salt is found.
      3. Build a mapping from hash to plaintext collection name for that
         salt and hand it to ``self.fix_collection_names``.

    Returns:
        None. The work is done by ``self.fix_collection_names``.
    """
    if not self.known_collections:
        LOG.warn("No plaintext collections were found in operations. Unable to perform post-processing")
        return
    if self.no_salt_search:
        LOG.warn("Skipping post-processing")
        return

    # NOTE(review): all four banner messages are grouped under the debug
    # guard here; they are DEBUG-level either way -- confirm the intended
    # scope of the guard.
    if self.debug:
        LOG.debug("Performing post processing on %s sessions with %d operations",
                  self.getSessionCount(), self.getOpCount())
        LOG.debug("-- Aggregate Collection Names --")
        LOG.debug("Encountered %d collection names in plaintext.", len(self.known_collections))
        LOG.debug(pformat(self.known_collections))

    # Fetch the hashes that the known collection names will be matched against
    candidate_hashes = self.get_candidate_hashes()

    # HACK: Figure out what salt was used so that we can match
    # them with our known collection names
    salt = self.infer_salt(candidate_hashes, self.known_collections)
    if salt is None:
        LOG.warn("Failed to find string hashing salt. Unable to fix aggregate collection names")
        return

    # Now for the given salt value, populate a mapping from
    # hashes to collection names
    LOG.debug("Pre-computing hashes for all known collection names...")
    hashed_collections = {}  # hash --> collection name
    for col_name in self.known_collections:
        # Computed once and reused for both the hash and the debug message;
        # always via self, matching how infer_salt obtains the same value.
        hash_str = self.get_hash_string(col_name)
        hashed = anonymize.hash_string(hash_str, salt)
        hashed_collections[hashed] = col_name
        if self.debug:
            LOG.debug("hash: %s / col_name: %s / hash_str: %s", hashed, col_name, hash_str)
    ## FOR

    # Now use our hash xref to fix the collection names in all aggregate operations
    self.fix_collection_names(hashed_collections)
def postProcess(self):
    """Process the operations to fix the collection names used in aggregate queries.

    Steps:
      1. Give up early (with a warning) when no plaintext collection names
         are known or when ``self.no_salt_search`` is set.
      2. Fetch the candidate hashes and brute-force the hashing salt with
         ``self.infer_salt``; give up with a warning if no salt is found.
      3. Build a mapping from hash to plaintext collection name for that
         salt and hand it to ``self.fix_collection_names``.

    Returns:
        None. The work is done by ``self.fix_collection_names``.
    """
    if not self.known_collections:
        LOG.warn("No plaintext collections were found in operations. Unable to perform post-processing")
        return
    if self.no_salt_search:
        LOG.warn("Skipping post-processing")
        return

    # NOTE(review): all four banner messages are grouped under the debug
    # guard here; they are DEBUG-level either way -- confirm the intended
    # scope of the guard.
    if self.debug:
        LOG.debug("Performing post processing on %s sessions with %d operations",
                  self.getSessionCount(), self.getOpCount())
        LOG.debug("-- Aggregate Collection Names --")
        LOG.debug("Encountered %d collection names in plaintext.", len(self.known_collections))
        LOG.debug(pformat(self.known_collections))

    # Fetch the hashes that the known collection names will be matched against
    candidate_hashes = self.get_candidate_hashes()

    # HACK: Figure out what salt was used so that we can match
    # them with our known collection names
    salt = self.infer_salt(candidate_hashes, self.known_collections)
    if salt is None:
        LOG.warn("Failed to find string hashing salt. Unable to fix aggregate collection names")
        return

    # Now for the given salt value, populate a mapping from
    # hashes to collection names
    LOG.debug("Pre-computing hashes for all known collection names...")
    hashed_collections = {}  # hash --> collection name
    for col_name in self.known_collections:
        # Computed once and reused for both the hash and the debug message;
        # always via self, matching how infer_salt obtains the same value.
        hash_str = self.get_hash_string(col_name)
        hashed = anonymize.hash_string(hash_str, salt)
        hashed_collections[hashed] = col_name
        if self.debug:
            LOG.debug("hash: %s / col_name: %s / hash_str: %s", hashed, col_name, hash_str)
    ## FOR

    # Now use our hash xref to fix the collection names in all aggregate operations
    self.fix_collection_names(hashed_collections)
def testHashStringSimple(self):
    # Simplest case: sanitizing a single double-quoted string should give
    # the hash of the text between the quotes.
    quoted_input = "\"THIS SHOULD BE SIMPLY HASHED\""
    expected = anonymize.hash_string("THIS SHOULD BE SIMPLY HASHED", 0, True)

    actual = self.s.sanitize(quoted_input, 0)
    self.assertEqual(expected, actual)