Example #1
    def testMultipleHashBlobReferencesCanBeWrittenAndReadBack(self):
        blob_ref_1 = rdf_objects.BlobReference(offset=0,
                                               size=42,
                                               blob_id=rdf_objects.BlobID(
                                                   b"01234567" * 4))
        blob_ref_2 = rdf_objects.BlobReference(offset=42,
                                               size=42,
                                               blob_id=rdf_objects.BlobID(
                                                   b"01234568" * 4))

        hash_id_1 = rdf_objects.SHA256HashID(b"0a1b2c3d" * 4)
        hash_id_2 = rdf_objects.SHA256HashID(b"0a1b2c3e" * 4)

        data = {
            hash_id_1: [blob_ref_1],
            hash_id_2: [blob_ref_1, blob_ref_2],
        }
        self.db.WriteHashBlobReferences(data)

        results = self.db.ReadHashBlobReferences([hash_id_1, hash_id_2])
        self.assertEqual(results, data)

        results = self.db.ReadHashBlobReferences([hash_id_1])
        self.assertEqual(results, {hash_id_1: data[hash_id_1]})

        results = self.db.ReadHashBlobReferences([hash_id_2])
        self.assertEqual(results, {hash_id_2: data[hash_id_2]})
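
Note: the tests construct SHA256HashID and BlobID from hard-coded 32-byte strings. In real use both would typically wrap an actual SHA-256 digest. A minimal sketch of that, assuming only the constructors already shown above (blob_data is a hypothetical variable; the same digest is reused for both ids purely for illustration):

import hashlib

# Hypothetical raw blob contents; any bytes object will do for this sketch.
blob_data = b"example blob contents"

# Both ids wrap a 32-byte digest, just like the hard-coded
# b"0a1b2c3d" * 4 values used in the tests.
digest = hashlib.sha256(blob_data).digest()
hash_id = rdf_objects.SHA256HashID(digest)
blob_ref = rdf_objects.BlobReference(
    offset=0, size=len(blob_data), blob_id=rdf_objects.BlobID(digest))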
Example #2
  def testCorrectlyHandlesRequestWithOneExistingAndOneMissingHash(self):
    blob_ref = rdf_objects.BlobReference(
        offset=0, size=42, blob_id=rdf_objects.BlobID(b"01234567" * 4))
    hash_id = rdf_objects.SHA256HashID(b"0a1b2c3d" * 4)

    self.db.WriteHashBlobReferences({hash_id: [blob_ref]})

    missing_hash_id = rdf_objects.SHA256HashID(b"00000000" * 4)

    results = self.db.ReadHashBlobReferences([missing_hash_id, hash_id])
    self.assertEqual(results, {
        hash_id: [blob_ref],
        missing_hash_id: None,
    })
Example #3
    def testReportsNonExistingHashesAsNone(self):
        d = self.db

        hash_id = rdf_objects.SHA256HashID(b"0a1b2c3d" * 4)

        results = d.ReadHashBlobReferences([hash_id])
        self.assertEqual(results, {hash_id: None})
Example #4
  def testHashBlobReferenceCanBeWrittenAndReadBack(self):
    blob_ref = rdf_objects.BlobReference(
        offset=0, size=42, blob_id=rdf_objects.BlobID(b"01234567" * 4))
    hash_id = rdf_objects.SHA256HashID(b"0a1b2c3d" * 4)

    data = {hash_id: [blob_ref]}
    self.db.WriteHashBlobReferences(data)

    results = self.db.ReadHashBlobReferences([hash_id])
    self.assertEqual(results, data)
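
Taken together, these tests pin down a simple contract: WriteHashBlobReferences stores a mapping from hash id to a list of blob references, and ReadHashBlobReferences returns that list for every known hash id and None for every unknown one. A minimal in-memory sketch of a store satisfying that contract (an illustration only, not GRR's actual database implementation):

class InMemoryHashBlobStore(object):
  """Toy stand-in for the hash -> blob-reference part of the database API."""

  def __init__(self):
    self._refs = {}

  def WriteHashBlobReferences(self, hash_id_blob_refs):
    # Keep a copy of each list so later mutation by the caller has no effect.
    for hash_id, blob_refs in hash_id_blob_refs.items():
      self._refs[hash_id] = list(blob_refs)

  def ReadHashBlobReferences(self, hash_ids):
    # Unknown hash ids are reported as None, exactly as the tests expect.
    return {hash_id: self._refs.get(hash_id) for hash_id in hash_ids}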
Example #5
    def testWriteHashBlobHandlesLargeAmountsOfData(self):
        hash_id_blob_refs = {}

        for _ in range(50000):
            hash_id = rdf_objects.SHA256HashID(os.urandom(32))

            blob_ref = rdf_objects.BlobReference()
            blob_ref.blob_id = rdf_objects.BlobID(os.urandom(32))
            blob_ref.offset = random.randint(0, 1024 * 1024 * 1024)
            blob_ref.size = random.randint(128, 256)

            hash_id_blob_refs[hash_id] = [blob_ref]

        self.db.WriteHashBlobReferences(hash_id_blob_refs)

        hash_ids = list(hash_id_blob_refs.keys())
        read_hash_id_blob_refs = self.db.ReadHashBlobReferences(hash_ids)
        self.assertEqual(read_hash_id_blob_refs, hash_id_blob_refs)
Example #6
    def _CheckHashesWithFileStore(self):
        """Check all queued up hashes for existence in file store.

        Hashes which do not exist in the file store will be downloaded. This
        function flushes the entire queue (self.state.pending_hashes) in order to
        minimize the round trips to the file store.

        If a file was found in the file store it is not scheduled for collection
        and its PathInfo is written to the datastore pointing to the file store's
        hash. Otherwise, we request the client to hash every block in the file,
        and add it to the file tracking queue (self.state.pending_files).
        """
        if not self.state.pending_hashes:
            return

        # This map represents all the hashes in the pending urns.
        file_hashes = {}

        # Store a mapping of hash to tracker. Keys are hashdigest objects,
        # values are arrays of tracker dicts.
        hash_to_tracker = {}
        for index, tracker in iteritems(self.state.pending_hashes):
            # We might not have gotten this hash yet
            if tracker.get("hash_obj") is None:
                continue

            hash_obj = tracker["hash_obj"]
            digest = hash_obj.sha256
            file_hashes[index] = hash_obj
            hash_to_tracker.setdefault(rdf_objects.SHA256HashID(digest),
                                       []).append(tracker)

        # First we get all the files which are present in the file store.
        files_in_filestore = set()

        statuses = file_store.CheckHashes([
            rdf_objects.SHA256HashID.FromSerializedBytes(ho.sha256.AsBytes())
            for ho in itervalues(file_hashes)
        ])
        for hash_id, status in iteritems(statuses):
            self.HeartBeat()

            if not status:
                continue

            # Since checkhashes only returns one digest per unique hash we need to
            # find any other files pending download with the same hash.
            for tracker in hash_to_tracker[hash_id]:
                self.state.files_skipped += 1
                file_hashes.pop(tracker["index"])
                files_in_filestore.add(hash_id)
                # Remove this tracker from the pending_hashes store since we no longer
                # need to process it.
                self.state.pending_hashes.pop(tracker["index"])

        # Now that the check is done, reset our counter
        self.state.files_hashed_since_check = 0
        # Now copy all existing files to the client aff4 space.
        for hash_id in files_in_filestore:

            for file_tracker in hash_to_tracker.get(hash_id, []):
                stat_entry = file_tracker["stat_entry"]
                path_info = rdf_objects.PathInfo.FromStatEntry(stat_entry)
                path_info.hash_entry = file_tracker["hash_obj"]
                data_store.REL_DB.WritePathInfos(self.client_id, [path_info])

                # Report this hit to the flow's caller.
                self._ReceiveFetchedFile(file_tracker)

        # Now we iterate over all the files which are not in the store and arrange
        # for them to be copied.
        for index in file_hashes:

            # Move the tracker from the pending hashes store to the pending files
            # store - it will now be downloaded.
            file_tracker = self.state.pending_hashes.pop(index)
            self.state.pending_files[index] = file_tracker

            # If we already know how big the file is we use that, otherwise fall back
            # to the size reported by stat.
            if file_tracker["bytes_read"] > 0:
                file_tracker["size_to_download"] = file_tracker["bytes_read"]
            else:
                file_tracker["size_to_download"] = file_tracker[
                    "stat_entry"].st_size

            # We do not have the file here yet - we need to retrieve it.
            expected_number_of_hashes = file_tracker["expected_chunks"] = (
                file_tracker["size_to_download"] // self.CHUNK_SIZE + 1)

            # We just hash ALL the chunks in the file now. NOTE: This maximizes client
            # VFS cache hit rate and is far more efficient than launching multiple
            # GetFile flows.
            self.state.files_to_fetch += 1

            for i in range(expected_number_of_hashes):
                if i == expected_number_of_hashes - 1:
                    # The last chunk is short.
                    length = file_tracker["size_to_download"] % self.CHUNK_SIZE
                else:
                    length = self.CHUNK_SIZE

                self.CallClient(server_stubs.HashBuffer,
                                pathspec=file_tracker["stat_entry"].pathspec,
                                offset=i * self.CHUNK_SIZE,
                                length=length,
                                next_state=compatibility.GetName(
                                    self.CheckHash),
                                request_data=dict(index=index))

        if self.state.files_hashed % 100 == 0:
            self.Log("Hashed %d files, skipped %s already stored.",
                     self.state.files_hashed, self.state.files_skipped)
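
The chunk scheduling at the end of this example is easy to check by hand: expected_chunks is size_to_download // CHUNK_SIZE + 1, every chunk except the last is CHUNK_SIZE bytes long, and the last chunk is the remainder size_to_download % CHUNK_SIZE. A standalone sketch of that arithmetic (CHUNK_SIZE and the file size here are made-up example values):

CHUNK_SIZE = 512 * 1024  # Example chunk size for illustration.
size_to_download = 1300 * 1024  # Hypothetical file size of 1300 KiB.

# Two full chunks plus one short tail chunk for a 1300 KiB file.
expected_chunks = size_to_download // CHUNK_SIZE + 1

for i in range(expected_chunks):
  if i == expected_chunks - 1:
    # The last chunk holds whatever is left over (276 KiB here).
    length = size_to_download % CHUNK_SIZE
  else:
    length = CHUNK_SIZE
  # Each (offset, length) pair corresponds to one HashBuffer client call.
  print(i * CHUNK_SIZE, length)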