def testLargeNumberOfBlobs(self):

  def Blobs(prefix):
    for idx in range(1337):
      yield prefix + str(idx).encode("ascii")

  foo_blobs = list(Blobs(b"foo"))
  foo_blob_refs = _BlobRefsFromByteArray(foo_blobs)
  foo_blob_ids = [ref.blob_id for ref in foo_blob_refs]
  foo_hash_id = rdf_objects.SHA256HashID.FromData(b"".join(foo_blobs))
  data_store.BLOBS.WriteBlobs(dict(zip(foo_blob_ids, foo_blobs)))

  bar_blobs = list(Blobs(b"bar"))
  bar_blob_refs = _BlobRefsFromByteArray(bar_blobs)
  bar_blob_ids = [ref.blob_id for ref in bar_blob_refs]
  bar_hash_id = rdf_objects.SHA256HashID.FromData(b"".join(bar_blobs))
  data_store.BLOBS.WriteBlobs(dict(zip(bar_blob_ids, bar_blobs)))

  client_id = self.SetupClient(0)
  foo_path = db.ClientPath.OS(client_id=client_id, components=("foo",))
  bar_path = db.ClientPath.OS(client_id=client_id, components=("bar",))

  # Shrink the read batch size so that 1337 blobs per file force many
  # batched blob reads, exercising the batching logic.
  with mock.patch.object(file_store, "_BLOBS_READ_BATCH_SIZE", 42):
    hash_ids = file_store.AddFilesWithUnknownHashes({
        foo_path: foo_blob_refs,
        bar_path: bar_blob_refs,
    })

  self.assertLen(hash_ids, 2)
  self.assertEqual(hash_ids[foo_path], foo_hash_id)
  self.assertEqual(hash_ids[bar_path], bar_hash_id)
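# The tests in this excerpt call a module-level _BlobRefsFromByteArray helper
# whose definition is not shown. A minimal sketch of what it is assumed to do,
# given how the tests use its result (the exact body is hypothetical): turn an
# ordered list of byte chunks into BlobReference objects with cumulative
# offsets, so that the references describe the chunks laid out back to back.
def _BlobRefsFromByteArray(data_array):
  offset = 0
  blob_refs = []
  for data in data_array:
    blob_id = rdf_objects.BlobID.FromBlobData(data)
    blob_refs.append(
        rdf_objects.BlobReference(
            offset=offset, size=len(data), blob_id=blob_id))
    offset += len(data)
  return blob_refs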
def _WriteFilesContentRel(self, responses):
  """Writes file contents of multiple files to the relational database."""
  client_path_blob_refs = dict()
  client_path_path_info = dict()

  for response in responses:
    path_info = rdf_objects.PathInfo.FromStatEntry(response.stat_entry)

    # Chunks may arrive out of order; sort them by offset before building
    # the blob references.
    chunks = response.transferred_file.chunks
    chunks = sorted(chunks, key=lambda chunk: chunk.offset)

    client_path = db.ClientPath.FromPathInfo(self.client_id, path_info)
    blob_refs = []
    for c in chunks:
      blob_refs.append(
          rdf_objects.BlobReference(
              offset=c.offset,
              size=c.length,
              blob_id=rdf_objects.BlobID.FromBytes(c.digest)))

    client_path_path_info[client_path] = path_info
    client_path_blob_refs[client_path] = blob_refs

  if data_store.RelationalDBReadEnabled() and client_path_blob_refs:
    use_external_stores = self.args.action.download.use_external_stores
    client_path_hash_id = file_store.AddFilesWithUnknownHashes(
        client_path_blob_refs, use_external_stores=use_external_stores)
    for client_path, hash_id in iteritems(client_path_hash_id):
      path_info = client_path_path_info[client_path]
      path_info.hash_entry.sha256 = hash_id.AsBytes()

  path_infos = list(itervalues(client_path_path_info))
  data_store.REL_DB.WritePathInfos(self.client_id, path_infos)
def testSimpleOverlappingBlobIds(self):
  foo_blobs = [b"foo", b"norf", b"quux", b"thud"]
  bar_blobs = [b"bar", b"norf", b"blag", b"thud"]

  foo_blob_refs = _BlobRefsFromByteArray(foo_blobs)
  foo_blob_ids = [ref.blob_id for ref in foo_blob_refs]
  foo_hash_id = rdf_objects.SHA256HashID.FromData(b"".join(foo_blobs))

  bar_blob_refs = _BlobRefsFromByteArray(bar_blobs)
  bar_blob_ids = [ref.blob_id for ref in bar_blob_refs]
  bar_hash_id = rdf_objects.SHA256HashID.FromData(b"".join(bar_blobs))

  data_store.BLOBS.WriteBlobs(dict(zip(foo_blob_ids, foo_blobs)))
  data_store.BLOBS.WriteBlobs(dict(zip(bar_blob_ids, bar_blobs)))

  client_id = self.SetupClient(0)
  foo_path = db.ClientPath.OS(client_id=client_id, components=("foo", "quux"))
  bar_path = db.ClientPath.OS(client_id=client_id, components=("bar", "blag"))

  hash_ids = file_store.AddFilesWithUnknownHashes({
      foo_path: foo_blob_refs,
      bar_path: bar_blob_refs,
  })

  self.assertLen(hash_ids, 2)
  self.assertEqual(hash_ids[foo_path], foo_hash_id)
  self.assertEqual(hash_ids[bar_path], bar_hash_id)
def _WriteFilesContentRel(self, responses):
  """Writes file contents of multiple files to the relational database."""
  client_path_blob_ids = dict()
  client_path_path_info = dict()

  for response in responses:
    path_info = rdf_objects.PathInfo.FromStatEntry(response.stat_entry)

    # Chunks may arrive out of order; sort them by offset so the blob ids
    # are recorded in file order.
    chunks = response.transferred_file.chunks
    chunks = sorted(chunks, key=lambda chunk: chunk.offset)

    client_path = db.ClientPath.FromPathInfo(self.client_id, path_info)
    blob_ids = [rdf_objects.BlobID.FromBytes(c.digest) for c in chunks]

    client_path_blob_ids[client_path] = blob_ids
    client_path_path_info[client_path] = path_info

  if data_store.RelationalDBReadEnabled("filestore"):
    client_path_hash_id = file_store.AddFilesWithUnknownHashes(
        client_path_blob_ids)
    for client_path, hash_id in iteritems(client_path_hash_id):
      client_path_path_info[client_path].hash_entry.sha256 = hash_id.AsBytes()

  path_infos = list(itervalues(client_path_path_info))
  data_store.REL_DB.WritePathInfos(self.client_id, path_infos)
def testFilesWithOneBlobAreStillReadToEnsureBlobExists(self):
  _, long_blob_refs = vfs_test_lib.GenerateBlobRefs(self.blob_size, "cd")
  _, short_blob_refs1 = vfs_test_lib.GenerateBlobRefs(self.blob_size, "a")
  _, short_blob_refs2 = vfs_test_lib.GenerateBlobRefs(self.blob_size, "b")

  path1 = db.ClientPath.OS(self.client_id, ["foo"])
  path2 = db.ClientPath.OS(self.client_id, ["bar"])
  path3 = db.ClientPath.OS(self.client_id, ["baz"])

  # One small file: its blob is still read.
  with mock.patch.object(
      data_store.BLOBS, "ReadBlobs", wraps=data_store.BLOBS.ReadBlobs) as p:
    file_store.AddFileWithUnknownHash(path1, short_blob_refs1)
    p.assert_called_once()

  # Same for multiple small files.
  with mock.patch.object(
      data_store.BLOBS, "ReadBlobs", wraps=data_store.BLOBS.ReadBlobs) as p:
    file_store.AddFilesWithUnknownHashes({
        path1: short_blob_refs1,
        path2: short_blob_refs2
    })
    p.assert_called_once()

  # One large file and two small ones result in a single read for all
  # three blobs.
  with mock.patch.object(
      data_store.BLOBS, "ReadBlobs", wraps=data_store.BLOBS.ReadBlobs) as p:
    file_store.AddFilesWithUnknownHashes({
        path1: short_blob_refs1,
        path2: short_blob_refs2,
        path3: long_blob_refs
    })
    p.assert_called_once()
    self.assertLen(p.call_args[POSITIONAL_ARGS], 1)
    self.assertEmpty(p.call_args[KEYWORD_ARGS])
    self.assertCountEqual(p.call_args[POSITIONAL_ARGS][0], [
        r.blob_id for r in itertools.chain(short_blob_refs1, short_blob_refs2,
                                           long_blob_refs)
    ])
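# POSITIONAL_ARGS and KEYWORD_ARGS are used by the two tests above and below
# but are not defined in this excerpt. mock.call_args is an (args, kwargs)
# pair, so the constants are assumed to be simple indices into it; a plausible
# definition (hypothetical for this excerpt):
POSITIONAL_ARGS = 0
KEYWORD_ARGS = 1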
def testOptimizationForSmallFiles(self):
  _, long_blob_refs = _GenerateBlobRefs(self.blob_size, b"ab")
  _, short_blob_refs1 = _GenerateBlobRefs(self.blob_size, b"a")
  _, short_blob_refs2 = _GenerateBlobRefs(self.blob_size, b"b")

  path1 = db.ClientPath.OS(self.client_id, ["foo"])
  path2 = db.ClientPath.OS(self.client_id, ["bar"])
  path3 = db.ClientPath.OS(self.client_id, ["baz"])

  # One small file, no need to read blobs.
  with mock.patch.object(
      data_store.BLOBS, "ReadBlobs", wraps=data_store.BLOBS.ReadBlobs) as p:
    file_store.AddFileWithUnknownHash(path1, short_blob_refs1)
    p.assert_not_called()

  # Same for multiple small files.
  with mock.patch.object(
      data_store.BLOBS, "ReadBlobs", wraps=data_store.BLOBS.ReadBlobs) as p:
    file_store.AddFilesWithUnknownHashes({
        path1: short_blob_refs1,
        path2: short_blob_refs2
    })
    p.assert_not_called()

  # One large file and two small ones result in a single read for the two
  # blobs of the large file only.
  with mock.patch.object(
      data_store.BLOBS, "ReadBlobs", wraps=data_store.BLOBS.ReadBlobs) as p:
    file_store.AddFilesWithUnknownHashes({
        path1: short_blob_refs1,
        path2: short_blob_refs2,
        path3: long_blob_refs
    })
    p.assert_called_once()
    self.assertLen(p.call_args[POSITIONAL_ARGS], 1)
    self.assertEmpty(p.call_args[KEYWORD_ARGS])
    self.assertCountEqual(p.call_args[POSITIONAL_ARGS][0],
                          [r.blob_id for r in long_blob_refs])
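# _GenerateBlobRefs is another helper assumed by the test above but not shown
# in this excerpt. A hedged, Python 3-style sketch of its apparent semantics:
# for each byte in the pattern, produce one blob consisting of blob_size
# repetitions of that byte, and return the blobs together with matching
# BlobReference objects (the exact contract is an assumption).
def _GenerateBlobRefs(blob_size, pattern):
  blobs = [bytes([char]) * blob_size for char in pattern]
  return blobs, _BlobRefsFromByteArray(blobs)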
def testDoesNotFailForEmptyFiles(self):
  client_id = self.SetupClient(0)

  paths = []
  for idx in range(100):
    components = ("foo", "bar", str(idx))
    paths.append(db.ClientPath.OS(client_id=client_id, components=components))

  hash_ids = file_store.AddFilesWithUnknownHashes(
      {path: [] for path in paths})

  empty_hash_id = rdf_objects.SHA256HashID.FromData(b"")
  for path in paths:
    self.assertEqual(hash_ids[path], empty_hash_id)
def testLargeNumberOfPaths(self):
  client_id = self.SetupClient(0).Basename()

  paths = []
  for idx in range(1337):
    components = ("foo", "bar", str(idx))
    paths.append(db.ClientPath.OS(client_id=client_id, components=components))

  blobs = [b"foo", b"bar", b"baz"]
  blob_ids = list(map(rdf_objects.BlobID.FromBlobData, blobs))
  data_store.BLOBS.WriteBlobs(dict(zip(blob_ids, blobs)))

  hash_ids = file_store.AddFilesWithUnknownHashes(
      {path: blob_ids for path in paths})

  expected_hash_id = rdf_objects.SHA256HashID.FromData(b"foobarbaz")
  for path in paths:
    self.assertEqual(hash_ids[path], expected_hash_id)
def testSimpleMultiplePaths(self):
  foo_blobs = [b"foo", b"norf", b"thud"]
  foo_blob_ids = list(map(rdf_objects.BlobID.FromBlobData, foo_blobs))
  foo_hash_id = rdf_objects.SHA256HashID.FromData(b"".join(foo_blobs))
  data_store.BLOBS.WriteBlobs(dict(zip(foo_blob_ids, foo_blobs)))

  bar_blobs = [b"bar", b"quux", b"blargh"]
  bar_blob_ids = list(map(rdf_objects.BlobID.FromBlobData, bar_blobs))
  bar_hash_id = rdf_objects.SHA256HashID.FromData(b"".join(bar_blobs))
  data_store.BLOBS.WriteBlobs(dict(zip(bar_blob_ids, bar_blobs)))

  client_id = self.SetupClient(0).Basename()
  foo_path = db.ClientPath.OS(client_id=client_id, components=("foo",))
  bar_path = db.ClientPath.OS(client_id=client_id, components=("bar",))

  hash_ids = file_store.AddFilesWithUnknownHashes({
      foo_path: foo_blob_ids,
      bar_path: bar_blob_ids,
  })

  self.assertLen(hash_ids, 2)
  self.assertEqual(hash_ids[foo_path], foo_hash_id)
  self.assertEqual(hash_ids[bar_path], bar_hash_id)
def testDoesNotFailForEmptyDict(self):
  file_store.AddFilesWithUnknownHashes({})