def ReadPathInfo(self,
                 client_id,
                 path_type,
                 components,
                 timestamp=None,
                 cursor=None):
  """Retrieves a path info record for a given path."""
  if timestamp is None:
    path_infos = self.ReadPathInfos(client_id, path_type, [components])

    path_info = path_infos[components]
    if path_info is None:
      raise db.UnknownPathError(
          client_id=client_id, path_type=path_type, components=components)

    return path_info

  query = """
  SELECT directory, UNIX_TIMESTAMP(p.timestamp),
         stat_entry, UNIX_TIMESTAMP(last_stat_entry_timestamp),
         hash_entry, UNIX_TIMESTAMP(last_hash_entry_timestamp)
    FROM client_paths as p
    LEFT JOIN (SELECT client_id, path_type, path_id, stat_entry
                 FROM client_path_stat_entries
                WHERE client_id = %(client_id)s
                  AND path_type = %(path_type)s
                  AND path_id = %(path_id)s
                  AND UNIX_TIMESTAMP(timestamp) <= %(timestamp)s
             ORDER BY timestamp DESC
                LIMIT 1) AS s
      ON p.client_id = s.client_id
     AND p.path_type = s.path_type
     AND p.path_id = s.path_id
    LEFT JOIN (SELECT client_id, path_type, path_id, hash_entry
                 FROM client_path_hash_entries
                WHERE client_id = %(client_id)s
                  AND path_type = %(path_type)s
                  AND path_id = %(path_id)s
                  AND UNIX_TIMESTAMP(timestamp) <= %(timestamp)s
             ORDER BY timestamp DESC
                LIMIT 1) AS h
      ON p.client_id = h.client_id
     AND p.path_type = h.path_type
     AND p.path_id = h.path_id
   WHERE p.client_id = %(client_id)s
     AND p.path_type = %(path_type)s
     AND p.path_id = %(path_id)s
  """
  values = {
      "client_id": db_utils.ClientIDToInt(client_id),
      "path_type": int(path_type),
      "path_id": rdf_objects.PathID.FromComponents(components).AsBytes(),
      "timestamp": mysql_utils.RDFDatetimeToTimestamp(timestamp),
  }

  cursor.execute(query, values)
  row = cursor.fetchone()
  if row is None:
    raise db.UnknownPathError(
        client_id=client_id, path_type=path_type, components=components)

  # pyformat: disable
  (directory, timestamp,
   stat_entry_bytes, last_stat_entry_timestamp,
   hash_entry_bytes, last_hash_entry_timestamp) = row
  # pyformat: enable

  if stat_entry_bytes is not None:
    stat_entry = rdf_client_fs.StatEntry.FromSerializedString(stat_entry_bytes)
  else:
    stat_entry = None

  if hash_entry_bytes is not None:
    hash_entry = rdf_crypto.Hash.FromSerializedString(hash_entry_bytes)
  else:
    hash_entry = None

  datetime = mysql_utils.TimestampToRDFDatetime
  return rdf_objects.PathInfo(
      path_type=path_type,
      components=components,
      timestamp=datetime(timestamp),
      last_stat_entry_timestamp=datetime(last_stat_entry_timestamp),
      last_hash_entry_timestamp=datetime(last_hash_entry_timestamp),
      directory=directory,
      stat_entry=stat_entry,
      hash_entry=hash_entry)
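# A hedged usage sketch of ReadPathInfo: `d` stands for an initialized MySQL
# datastore object, and the client id, components and timestamp below are
# purely illustrative, not real fixtures from this codebase.
path_info = d.ReadPathInfo(
    client_id="C.0123456789abcdef",
    path_type=rdf_objects.PathInfo.PathType.OS,
    components=("usr", "bin", "python"))

# With an explicit timestamp, the latest stat and hash entries recorded at or
# before that point are returned instead of the overall latest ones.
old_path_info = d.ReadPathInfo(
    client_id="C.0123456789abcdef",
    path_type=rdf_objects.PathInfo.PathType.OS,
    components=("usr", "bin", "python"),
    timestamp=rdfvalue.RDFDatetime.FromHumanReadable("2019-01-01"))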
def ReadLatestPathInfosWithHashBlobReferences(self,
                                              client_paths,
                                              max_timestamp=None,
                                              cursor=None):
  """Returns PathInfos that have corresponding HashBlobReferences."""
  path_infos = {client_path: None for client_path in client_paths}

  path_id_components = {}
  for client_path in client_paths:
    path_id_components[client_path.path_id] = client_path.components

  params = []
  query = """
  SELECT t.client_id, t.path_type, t.path_id, UNIX_TIMESTAMP(t.timestamp),
         s.stat_entry, h.hash_entry
    FROM (SELECT h.client_id, h.path_type, h.path_id,
                 MAX(h.timestamp) AS timestamp
            FROM client_path_hash_entries AS h
      INNER JOIN hash_blob_references AS b
              ON b.hash_id = h.sha256
           WHERE {conditions}
        GROUP BY client_id, path_type, path_id) AS t
    LEFT JOIN client_path_stat_entries AS s
      ON s.client_id = t.client_id
     AND s.path_type = t.path_type
     AND s.path_id = t.path_id
     AND s.timestamp = t.timestamp
    LEFT JOIN client_path_hash_entries AS h
      ON h.client_id = t.client_id
     AND h.path_type = t.path_type
     AND h.path_id = t.path_id
     AND h.timestamp = t.timestamp
  """

  path_conditions = []
  for client_path in client_paths:
    path_conditions.append("""
    (client_id = %s AND path_type = %s AND path_id = %s)
    """)
    params.append(db_utils.ClientIDToInt(client_path.client_id))
    params.append(int(client_path.path_type))
    params.append(client_path.path_id.AsBytes())

  conditions = " OR ".join(path_conditions)
  if max_timestamp is not None:
    conditions = "({}) AND UNIX_TIMESTAMP(timestamp) <= %s".format(conditions)
    params.append(mysql_utils.RDFDatetimeToTimestamp(max_timestamp))

  cursor.execute(query.format(conditions=conditions), params)
  for row in cursor.fetchall():
    # pyformat: disable
    (client_id, path_type, path_id_bytes, timestamp,
     stat_entry_bytes, hash_entry_bytes) = row
    # pyformat: enable

    path_id = rdf_objects.PathID.FromBytes(path_id_bytes)
    components = path_id_components[path_id]

    if stat_entry_bytes is not None:
      stat_entry = rdf_client_fs.StatEntry.FromSerializedString(
          stat_entry_bytes)
    else:
      stat_entry = None

    # Unlike the stat entry, the hash entry is always present here: rows
    # originate from the hash entries table joined against blob references.
    hash_entry = rdf_crypto.Hash.FromSerializedString(hash_entry_bytes)

    client_path = db.ClientPath(
        client_id=db_utils.IntToClientID(client_id),
        path_type=path_type,
        components=path_id_components[path_id])
    path_info = rdf_objects.PathInfo(
        path_type=path_type,
        components=components,
        stat_entry=stat_entry,
        hash_entry=hash_entry,
        timestamp=mysql_utils.TimestampToRDFDatetime(timestamp))

    path_infos[client_path] = path_info

  return path_infos
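# Illustrative only: the result dict is keyed by the ClientPath objects that
# were passed in, with None for paths that have no hash entry backed by a
# HashBlobReference. `d` and `client_path` are assumed to exist.
results = d.ReadLatestPathInfosWithHashBlobReferences([client_path])
path_info = results[client_path]
if path_info is not None:
  print(path_info.hash_entry.sha256)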
def ReadPathInfos(self, client_id, path_type, components_list, cursor=None):
  """Retrieves path info records for given paths."""
  if not components_list:
    return {}

  path_ids = list(map(rdf_objects.PathID.FromComponents, components_list))
  path_infos = {components: None for components in components_list}

  query = """
  SELECT path, directory, UNIX_TIMESTAMP(client_paths.timestamp),
         stat_entry, UNIX_TIMESTAMP(last_stat_entry_timestamp),
         hash_entry, UNIX_TIMESTAMP(last_hash_entry_timestamp)
    FROM client_paths
    LEFT JOIN client_path_stat_entries ON
        (client_paths.client_id = client_path_stat_entries.client_id AND
         client_paths.path_type = client_path_stat_entries.path_type AND
         client_paths.path_id = client_path_stat_entries.path_id AND
         client_paths.last_stat_entry_timestamp =
             client_path_stat_entries.timestamp)
    LEFT JOIN client_path_hash_entries ON
        (client_paths.client_id = client_path_hash_entries.client_id AND
         client_paths.path_type = client_path_hash_entries.path_type AND
         client_paths.path_id = client_path_hash_entries.path_id AND
         client_paths.last_hash_entry_timestamp =
             client_path_hash_entries.timestamp)
   WHERE client_paths.client_id = %(client_id)s
     AND client_paths.path_type = %(path_type)s
     AND client_paths.path_id IN %(path_ids)s
  """
  values = {
      "client_id": db_utils.ClientIDToInt(client_id),
      "path_type": int(path_type),
      "path_ids": [path_id.AsBytes() for path_id in path_ids]
  }

  cursor.execute(query, values)
  for row in cursor.fetchall():
    # pyformat: disable
    (path, directory, timestamp,
     stat_entry_bytes, last_stat_entry_timestamp,
     hash_entry_bytes, last_hash_entry_timestamp) = row
    # pyformat: enable

    components = mysql_utils.PathToComponents(path)

    if stat_entry_bytes is not None:
      stat_entry = rdf_client_fs.StatEntry.FromSerializedString(
          stat_entry_bytes)
    else:
      stat_entry = None

    if hash_entry_bytes is not None:
      hash_entry = rdf_crypto.Hash.FromSerializedString(hash_entry_bytes)
    else:
      hash_entry = None

    datetime = mysql_utils.TimestampToRDFDatetime
    path_info = rdf_objects.PathInfo(
        path_type=path_type,
        components=components,
        timestamp=datetime(timestamp),
        last_stat_entry_timestamp=datetime(last_stat_entry_timestamp),
        last_hash_entry_timestamp=datetime(last_hash_entry_timestamp),
        directory=directory,
        stat_entry=stat_entry,
        hash_entry=hash_entry)

    path_infos[components] = path_info

  return path_infos
def ReadPathInfos(self, client_id, path_type, components_list, cursor=None):
  """Retrieves path info records for given paths."""
  if not components_list:
    return {}

  path_ids = list(map(rdf_objects.PathID.FromComponents, components_list))
  path_infos = {components: None for components in components_list}

  query = """
  SELECT path, directory, UNIX_TIMESTAMP(client_paths.timestamp),
         stat_entry, UNIX_TIMESTAMP(last_stat_entry_timestamp),
         hash_entry, UNIX_TIMESTAMP(last_hash_entry_timestamp)
    FROM client_paths
    LEFT JOIN client_path_stat_entries ON
        (client_paths.client_id = client_path_stat_entries.client_id AND
         client_paths.path_type = client_path_stat_entries.path_type AND
         client_paths.path_id = client_path_stat_entries.path_id AND
         client_paths.last_stat_entry_timestamp =
             client_path_stat_entries.timestamp)
    LEFT JOIN client_path_hash_entries ON
        (client_paths.client_id = client_path_hash_entries.client_id AND
         client_paths.path_type = client_path_hash_entries.path_type AND
         client_paths.path_id = client_path_hash_entries.path_id AND
         client_paths.last_hash_entry_timestamp =
             client_path_hash_entries.timestamp)
   WHERE client_paths.client_id = %s
     AND client_paths.path_type = %s
     AND client_paths.path_id IN ({})
  """.format(", ".join(["%s"] * len(path_ids)))

  # NOTE: passing a tuple of path ids as a single cursor.execute argument is
  # broken in mysqlclient==1.3.10
  # (see https://github.com/PyMySQL/mysqlclient-python/issues/145), so one
  # placeholder is expanded per path id instead.
  values = [
      db_utils.ClientIDToInt(client_id),
      int(path_type),
  ] + [path_id.AsBytes() for path_id in path_ids]

  cursor.execute(query, values)
  for row in cursor.fetchall():
    # pyformat: disable
    (path, directory, timestamp,
     stat_entry_bytes, last_stat_entry_timestamp,
     hash_entry_bytes, last_hash_entry_timestamp) = row
    # pyformat: enable

    components = mysql_utils.PathToComponents(path)

    if stat_entry_bytes is not None:
      stat_entry = rdf_client_fs.StatEntry.FromSerializedBytes(
          stat_entry_bytes)
    else:
      stat_entry = None

    if hash_entry_bytes is not None:
      hash_entry = rdf_crypto.Hash.FromSerializedBytes(hash_entry_bytes)
    else:
      hash_entry = None

    datetime = mysql_utils.TimestampToRDFDatetime
    path_info = rdf_objects.PathInfo(
        path_type=path_type,
        components=components,
        timestamp=datetime(timestamp),
        last_stat_entry_timestamp=datetime(last_stat_entry_timestamp),
        last_hash_entry_timestamp=datetime(last_hash_entry_timestamp),
        directory=directory,
        stat_entry=stat_entry,
        hash_entry=hash_entry)

    path_infos[components] = path_info

  return path_infos
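# A minimal, self-contained sketch of the placeholder expansion used above:
# instead of binding one list-valued argument to the IN clause, one "%s"
# placeholder is inlined per element and the elements are bound individually.
# The table, column and values below are illustrative, not the real schema.
path_ids = [b"\x00" * 32, b"\x01" * 32, b"\x02" * 32]
placeholders = ", ".join(["%s"] * len(path_ids))
query = ("SELECT path FROM client_paths "
         "WHERE client_id = %s AND path_id IN ({})").format(placeholders)

params = [42] + path_ids  # the client id placeholder is bound first
# cursor.execute(query, params)  # would bind each path id separately
print(query)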
def ReadPathInfosHistories(
    self,
    client_id: Text,
    path_type: rdf_objects.PathInfo.PathType,
    components_list: Iterable[Sequence[Text]],
    cutoff: Optional[rdfvalue.RDFDatetime] = None,
    cursor: Optional[MySQLdb.cursors.Cursor] = None
) -> Dict[Sequence[Text], Sequence[rdf_objects.PathInfo]]:
  """Reads a collection of hash and stat entries for given paths."""
  # MySQL does not handle empty `IN` clauses well, so we guard against that.
  if not components_list:
    return {}

  path_infos = {components: [] for components in components_list}

  path_id_components = {}
  for components in components_list:
    path_id = rdf_objects.PathID.FromComponents(components)
    path_id_components[path_id] = components

  params = {
      "client_id": db_utils.ClientIDToInt(client_id),
      "path_type": int(path_type),
  }
  for path_id in path_id_components:
    params["path_id_%s" % path_id.AsHexString()] = path_id.AsBytes()

  path_id_placeholders = ", ".join([
      "%(path_id_{})s".format(path_id.AsHexString())
      for path_id in path_id_components
  ])

  if cutoff is not None:
    stat_entry_timestamp_condition = """
    AND s.timestamp <= FROM_UNIXTIME(%(cutoff)s)
    """
    hash_entry_timestamp_condition = """
    AND h.timestamp <= FROM_UNIXTIME(%(cutoff)s)
    """
    params["cutoff"] = mysql_utils.RDFDatetimeToTimestamp(cutoff)
  else:
    stat_entry_timestamp_condition = ""
    hash_entry_timestamp_condition = ""

  # MySQL does not support full outer joins, so we emulate them with a union.
  query = """
  SELECT s.path_id, s.stat_entry, UNIX_TIMESTAMP(s.timestamp),
         h.path_id, h.hash_entry, UNIX_TIMESTAMP(h.timestamp)
    FROM client_path_stat_entries AS s
    LEFT JOIN client_path_hash_entries AS h
      ON s.client_id = h.client_id
     AND s.path_type = h.path_type
     AND s.path_id = h.path_id
     AND s.timestamp = h.timestamp
   WHERE s.client_id = %(client_id)s
     AND s.path_type = %(path_type)s
     AND s.path_id IN ({path_id_placeholders})
     {stat_entry_timestamp_condition}
   UNION
  SELECT s.path_id, s.stat_entry, UNIX_TIMESTAMP(s.timestamp),
         h.path_id, h.hash_entry, UNIX_TIMESTAMP(h.timestamp)
    FROM client_path_hash_entries AS h
    LEFT JOIN client_path_stat_entries AS s
      ON h.client_id = s.client_id
     AND h.path_type = s.path_type
     AND h.path_id = s.path_id
     AND h.timestamp = s.timestamp
   WHERE h.client_id = %(client_id)s
     AND h.path_type = %(path_type)s
     AND h.path_id IN ({path_id_placeholders})
     {hash_entry_timestamp_condition}
  """.format(
      stat_entry_timestamp_condition=stat_entry_timestamp_condition,
      hash_entry_timestamp_condition=hash_entry_timestamp_condition,
      path_id_placeholders=path_id_placeholders)

  cursor.execute(query, params)
  for row in cursor.fetchall():
    # pyformat: disable
    (stat_entry_path_id_bytes, stat_entry_bytes, stat_entry_timestamp,
     hash_entry_path_id_bytes, hash_entry_bytes, hash_entry_timestamp) = row
    # pyformat: enable

    path_id_bytes = stat_entry_path_id_bytes or hash_entry_path_id_bytes
    path_id = rdf_objects.PathID.FromSerializedBytes(path_id_bytes)
    components = path_id_components[path_id]

    timestamp = stat_entry_timestamp or hash_entry_timestamp

    if stat_entry_bytes is not None:
      stat_entry = rdf_client_fs.StatEntry.FromSerializedBytes(
          stat_entry_bytes)
    else:
      stat_entry = None

    if hash_entry_bytes is not None:
      hash_entry = rdf_crypto.Hash.FromSerializedBytes(hash_entry_bytes)
    else:
      hash_entry = None

    path_info = rdf_objects.PathInfo(
        path_type=path_type,
        components=components,
        stat_entry=stat_entry,
        hash_entry=hash_entry,
        timestamp=mysql_utils.TimestampToRDFDatetime(timestamp))

    path_infos[components].append(path_info)

  for components in components_list:
    path_infos[components].sort(key=lambda path_info: path_info.timestamp)

  return path_infos
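# The UNION of two LEFT JOINs above emulates a FULL OUTER JOIN, which MySQL
# lacks. A self-contained toy version of the same pattern, using sqlite3
# (whose older versions also lack FULL OUTER JOIN) and simplified tables:
import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript("""
    CREATE TABLE stat_entries (path_id TEXT, ts INTEGER, stat TEXT);
    CREATE TABLE hash_entries (path_id TEXT, ts INTEGER, hash TEXT);
    INSERT INTO stat_entries VALUES ('a', 1, 'stat@1');
    INSERT INTO hash_entries VALUES ('a', 2, 'hash@2');
""")
rows = conn.execute("""
    SELECT s.path_id, s.ts, s.stat, h.hash
      FROM stat_entries AS s
 LEFT JOIN hash_entries AS h ON s.path_id = h.path_id AND s.ts = h.ts
     UNION
    SELECT h.path_id, h.ts, s.stat, h.hash
      FROM hash_entries AS h
 LEFT JOIN stat_entries AS s ON h.path_id = s.path_id AND h.ts = s.ts
""").fetchall()
# One row per (path_id, ts): ('a', 1, 'stat@1', None) and
# ('a', 2, None, 'hash@2'), i.e. rows survive even with only one side present.
print(rows)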
def AddFile(self, fd):
  """Adds a file to the hash file store.

  We take a file in the client space:
    aff4:/C.123123123/fs/os/usr/local/blah

  Hash it, update the hash in the original file if it's different from the
  one calculated on the client, and copy the original AFF4 object to

    aff4:/files/hash/generic/sha256/123123123 (canonical reference)

  We then create symlinks for all other hash types:

    aff4:/files/hash/generic/sha1/345345345
    aff4:/files/hash/generic/md5/456456456
    aff4:/files/hash/pecoff/md5/aaaaaaaa (only for PEs)
    aff4:/files/hash/pecoff/sha1/bbbbbbbb (only for PEs)

  When present in PE files, the signing data (revision, cert_type,
  certificate) is added to the original object.

  This can't be done simply in the FileStore.Write() method with fixed hash
  buffer sizes because the authenticode hashes need to track hashing of
  different-sized regions based on the signature information.

  Args:
    fd: File open for reading.

  Raises:
    IOError: If there was an error writing the file.
  """
  hashes = self._HashFile(fd)

  # The empty file is very common, we don't keep the back references for it
  # in the DB since it just takes up too much space.
  empty_hash = ("e3b0c44298fc1c149afbf4c8996fb924"
                "27ae41e4649b934ca495991b7852b855")
  if hashes.sha256 == empty_hash:
    return

  # Update the hashes field now that we have calculated them all.
  fd.Set(fd.Schema.HASH, hashes)
  fd.Flush()

  if data_store.RelationalDBWriteEnabled():
    client_id, vfs_path = fd.urn.Split(2)
    path_type, components = rdf_objects.ParseCategorizedPath(vfs_path)
    path_info = rdf_objects.PathInfo(
        path_type=path_type, components=components, hash_entry=hashes)
    data_store.REL_DB.WritePathInfos(client_id, [path_info])

  # sha256 is the canonical location.
  canonical_urn = self.PATH.Add("generic/sha256").Add(str(hashes.sha256))
  if not list(aff4.FACTORY.Stat(canonical_urn)):
    aff4.FACTORY.Copy(fd.urn, canonical_urn)
    # Remove the STAT entry, it makes no sense to copy it between clients.
    with aff4.FACTORY.Open(
        canonical_urn, mode="rw", token=self.token) as new_fd:
      new_fd.Set(new_fd.Schema.STAT(None))

  self._AddToIndex(canonical_urn, fd.urn)

  for hash_type, hash_digest in hashes.ListSetFields():
    # Determine fingerprint type.
    hash_type = hash_type.name
    # No need to create a symlink for sha256, it's the canonical location.
    if hash_type == "sha256":
      continue
    hash_digest = str(hash_digest)
    fingerprint_type = "generic"
    if hash_type.startswith("pecoff_"):
      fingerprint_type = "pecoff"
      hash_type = hash_type[len("pecoff_"):]
    if hash_type not in self.HASH_TYPES[fingerprint_type]:
      continue

    file_store_urn = self.PATH.Add(fingerprint_type).Add(hash_type).Add(
        hash_digest)

    with aff4.FACTORY.Create(
        file_store_urn, aff4.AFF4Symlink, token=self.token) as symlink:
      symlink.Set(symlink.Schema.SYMLINK_TARGET, canonical_urn)

  # We do not want to be externally written here.
  return None
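# The "empty file" guard above compares against the SHA-256 digest of zero
# bytes. A quick self-contained check of that constant:
import hashlib

assert hashlib.sha256(b"").hexdigest() == ("e3b0c44298fc1c149afbf4c8996fb924"
                                           "27ae41e4649b934ca495991b7852b855")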
def CreateClientObject(self, vfs_fixture):
  """Make a new client object."""
  # First remove the old fixture just in case it's still there.
  aff4.FACTORY.Delete(self.client_id, token=self.token)

  # Create the fixture at a fixed time.
  with test_lib.FakeTime(self.age):
    for path, (aff4_type, attributes) in vfs_fixture:
      path %= self.args

      if data_store.AFF4Enabled():
        aff4_object = aff4.FACTORY.Create(
            self.client_id.Add(path), aff4_type, mode="rw", token=self.token)

      path_info = None

      if data_store.RelationalDBWriteEnabled():
        data_store.REL_DB.WriteClientMetadata(
            self.client_id.Basename(), fleetspeak_enabled=False)

        components = [component for component in path.split("/") if component]
        if (len(components) > 1 and components[0] == "fs" and
            components[1] in ["os", "tsk"]):
          path_info = rdf_objects.PathInfo()
          if components[1] == "os":
            path_info.path_type = rdf_objects.PathInfo.PathType.OS
          else:
            path_info.path_type = rdf_objects.PathInfo.PathType.TSK
          path_info.components = components[2:]
          if aff4_type in [aff4_grr.VFSFile, aff4_grr.VFSMemoryFile]:
            path_info.directory = False
          elif aff4_type == aff4_standard.VFSDirectory:
            path_info.directory = True
          else:
            raise ValueError("Incorrect AFF4 type: %s" % aff4_type)

      for attribute_name, value in iteritems(attributes):
        attribute = aff4.Attribute.PREDICATES[attribute_name]
        if isinstance(value, (bytes, Text)):
          # Interpolate the value
          value %= self.args

        # Is this supposed to be an RDFValue array?
        if issubclass(attribute.attribute_type, rdf_protodict.RDFValueArray):
          rdfvalue_object = attribute()
          for item in value:
            new_object = rdfvalue_object.rdf_type.FromTextFormat(
                utils.SmartStr(item))
            rdfvalue_object.Append(new_object)

        # It is a text serialized protobuf.
        elif issubclass(attribute.attribute_type, rdf_structs.RDFProtoStruct):
          # Use the alternate constructor - we always write protobufs in
          # textual form:
          rdfvalue_object = attribute.attribute_type.FromTextFormat(
              utils.SmartStr(value))

        elif issubclass(attribute.attribute_type, rdfvalue.RDFInteger):
          rdfvalue_object = attribute(int(value))
        else:
          rdfvalue_object = attribute(value)

        if data_store.AFF4Enabled():
          # If we don't already have a pathspec, try and get one from the
          # stat.
          if aff4_object.Get(aff4_object.Schema.PATHSPEC) is None:
            # If the attribute was a stat, it has a pathspec nested in it.
            # We should add that pathspec as an attribute.
            if attribute.attribute_type == rdf_client_fs.StatEntry:
              stat_object = attribute.attribute_type.FromTextFormat(
                  utils.SmartStr(value))
              if stat_object.pathspec:
                pathspec_attribute = aff4.Attribute(
                    "aff4:pathspec", rdf_paths.PathSpec,
                    "The pathspec used to retrieve "
                    "this object from the client.", "pathspec")
                aff4_object.AddAttribute(pathspec_attribute,
                                         stat_object.pathspec)

        if attribute in ["aff4:content"]:
          if data_store.AFF4Enabled():
            # For AFF4MemoryStreams we need to call Write() instead of
            # directly setting the contents.
            content = rdfvalue_object.AsBytes()
            aff4_object.Write(content)

          if path_info is not None:
            blob_id = rdf_objects.BlobID.FromBlobData(content)
            data_store.BLOBS.WriteBlobs({blob_id: content})
            hash_id = file_store.AddFileWithUnknownHash(
                db.ClientPath.FromPathInfo(self.client_id.Basename(),
                                           path_info), [blob_id])
            path_info.hash_entry.num_bytes = len(content)
            path_info.hash_entry.sha256 = hash_id.AsBytes()
        elif data_store.AFF4Enabled():
          aff4_object.AddAttribute(attribute, rdfvalue_object)

        if (isinstance(rdfvalue_object, rdf_client_fs.StatEntry) and
            rdfvalue_object.pathspec.pathtype != "UNSET"):
          if data_store.RelationalDBWriteEnabled():
            client_id = self.client_id.Basename()
            path_info = rdf_objects.PathInfo.FromStatEntry(rdfvalue_object)
            data_store.REL_DB.WritePathInfos(client_id, [path_info])

      if data_store.AFF4Enabled():
        # Populate the KB from the client attributes.
        if aff4_type == aff4_grr.VFSGRRClient:
          kb = rdf_client.KnowledgeBase()
          artifact.SetCoreGRRKnowledgeBaseValues(kb, aff4_object)
          aff4_object.Set(aff4_object.Schema.KNOWLEDGE_BASE, kb)

        # Make sure we do not actually close the object here - we only want to
        # sync back its attributes, not run any finalization code.
        aff4_object.Flush()
        if aff4_type == aff4_grr.VFSGRRClient:
          index = client_index.CreateClientIndex(token=self.token)
          index.AddClient(aff4_object)

      if path_info is not None:
        data_store.REL_DB.WritePathInfos(
            client_id=self.client_id.Basename(), path_infos=[path_info])
def testGetAncestorsRoot(self):
  path_info = rdf_objects.PathInfo(components=["foo"])

  results = list(path_info.GetAncestors())
  self.assertLen(results, 1)
  self.assertEqual(results[0].components, [])
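# A hedged extension of the root test above for a deeper path. The ordering
# of GetAncestors() is not asserted by the tests shown here, so this sketch
# compares an order-insensitive set; the expected contents are extrapolated
# from the root test, not confirmed by the source.
path_info = rdf_objects.PathInfo(components=["usr", "local", "bin"])
ancestors = {tuple(a.components) for a in path_info.GetAncestors()}
# Presumably the root, ("usr",) and ("usr", "local") - but not the path itself.
assert ancestors == {(), ("usr",), ("usr", "local")}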
def testUpdateFromValidatesType(self):
  with self.assertRaises(TypeError):
    rdf_objects.PathInfo(
        components=["usr", "local", "bin"]).UpdateFrom("/usr/local/bin")
def testValidateDoubleDotComponent(self):
  with self.assertRaisesRegex(ValueError, "Incorrect"):
    rdf_objects.PathInfo(components=["..", "foo", "bar"])
def testGetAncestorsEmpty(self):
  path_info = rdf_objects.PathInfo(components=[], directory=True)
  self.assertEqual(list(path_info.GetAncestors()), [])
def testValidateEmptyComponent(self):
  with self.assertRaisesRegex(ValueError, "Empty"):
    rdf_objects.PathInfo(components=["foo", "", "bar"])
def testWritePathInfosValidatesPathType(self):
  path = ["usr", "local"]
  client_id = "C.bbbbbbbbbbbbbbbb"

  with self.assertRaises(ValueError):
    self.db.WritePathInfos(client_id, [rdf_objects.PathInfo(components=path)])
def Handle(self, args, token=None):
  ValidateVfsPath(args.file_path)

  # Directories are not really "files" so they cannot be stored in the
  # database but they still can be queried so we need to return something.
  # Sometimes they contain a trailing slash so we need to take care of that.
  #
  # TODO(hanuszczak): Require VFS paths to be normalized so that trailing
  # slash is either forbidden or mandatory.
  if args.file_path.endswith("/"):
    args.file_path = args.file_path[:-1]

  if args.file_path in ["fs", "registry", "temp", "fs/os", "fs/tsk"]:
    api_file = ApiFile(
        name=args.file_path,
        path=args.file_path,
        is_directory=True,
        details=_GenerateApiFileDetails([]))
    return ApiGetFileDetailsResult(file=api_file)

  path_type, components = rdf_objects.ParseCategorizedPath(args.file_path)

  # TODO(hanuszczak): The tests passed even without support for timestamp
  # filtering. The test suite should probably be improved in that regard.
  client_id = str(args.client_id)
  path_infos = data_store.REL_DB.ReadPathInfoHistory(client_id, path_type,
                                                     components)
  path_infos.reverse()

  if args.timestamp:
    path_infos = [pi for pi in path_infos if pi.timestamp <= args.timestamp]

  if not path_infos:
    # TODO(user): As soon as we get rid of AFF4 - raise here. At the moment
    # we just return a directory-like stub instead to mimic the AFF4Volume
    # behavior.
    #
    # raise FileNotFoundError("No file matching the path %s at timestamp %s" %
    #                         (args.file_path, args.timestamp))
    pi = rdf_objects.PathInfo(
        path_type=path_type, components=components, directory=True)
    api_file = ApiFile(
        name=components[-1],
        path=args.file_path,
        is_directory=True,
        details=_GenerateApiFileDetails([pi]))
    return ApiGetFileDetailsResult(file=api_file)

  last_path_info = path_infos[0]

  last_collection_pi = file_store.GetLastCollectionPathInfo(
      db.ClientPath.FromPathInfo(client_id, last_path_info),
      max_timestamp=args.timestamp)

  file_obj = ApiFile(
      name=components[-1],
      path=rdf_objects.ToCategorizedPath(path_type, components),
      stat=last_path_info.stat_entry,
      hash=last_path_info.hash_entry,
      details=_GenerateApiFileDetails(path_infos),
      is_directory=stat.S_ISDIR(last_path_info.stat_entry.st_mode),
      age=last_path_info.timestamp,
  )

  if last_collection_pi:
    file_obj.last_collected = last_collection_pi.timestamp
    file_obj.last_collected_size = last_collection_pi.hash_entry.num_bytes

  return ApiGetFileDetailsResult(file=file_obj)
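# An illustrative sketch of the categorized-path parsing used above (assumed
# behavior: the "fs/os" prefix selects PathType.OS and the remainder becomes
# the components; exact return shapes are not confirmed by this excerpt).
path_type, components = rdf_objects.ParseCategorizedPath("fs/os/usr/local/bin")
# path_type == rdf_objects.PathInfo.PathType.OS
# components == ("usr", "local", "bin")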
def ListDescendentPathInfos(self,
                            client_id,
                            path_type,
                            components,
                            timestamp=None,
                            max_depth=None,
                            cursor=None):
  """Lists path info records that correspond to descendants of given path."""
  path_infos = []

  query = ""

  path = mysql_utils.ComponentsToPath(components)
  values = {
      "client_id": db_utils.ClientIDToInt(client_id),
      "path_type": int(path_type),
      "path": db_utils.EscapeWildcards(path),
  }

  query += """
  SELECT path, directory, UNIX_TIMESTAMP(p.timestamp),
         stat_entry, UNIX_TIMESTAMP(last_stat_entry_timestamp),
         hash_entry, UNIX_TIMESTAMP(last_hash_entry_timestamp)
    FROM client_paths AS p
  """

  if timestamp is None:
    query += """
    LEFT JOIN client_path_stat_entries AS s
      ON (p.client_id = s.client_id AND
          p.path_type = s.path_type AND
          p.path_id = s.path_id AND
          p.last_stat_entry_timestamp = s.timestamp)
    LEFT JOIN client_path_hash_entries AS h
      ON (p.client_id = h.client_id AND
          p.path_type = h.path_type AND
          p.path_id = h.path_id AND
          p.last_hash_entry_timestamp = h.timestamp)
    """
    only_explicit = False
  else:
    query += """
    LEFT JOIN (SELECT sr.client_id, sr.path_type, sr.path_id, sr.stat_entry
                 FROM client_path_stat_entries AS sr
           INNER JOIN (SELECT client_id, path_type, path_id,
                              MAX(timestamp) AS max_timestamp
                         FROM client_path_stat_entries
                        WHERE UNIX_TIMESTAMP(timestamp) <= %(timestamp)s
                     GROUP BY client_id, path_type, path_id) AS st
                   ON sr.client_id = st.client_id AND
                      sr.path_type = st.path_type AND
                      sr.path_id = st.path_id AND
                      sr.timestamp = st.max_timestamp) AS s
      ON (p.client_id = s.client_id AND
          p.path_type = s.path_type AND
          p.path_id = s.path_id)
    LEFT JOIN (SELECT hr.client_id, hr.path_type, hr.path_id, hr.hash_entry
                 FROM client_path_hash_entries AS hr
           INNER JOIN (SELECT client_id, path_type, path_id,
                              MAX(timestamp) AS max_timestamp
                         FROM client_path_hash_entries
                        WHERE UNIX_TIMESTAMP(timestamp) <= %(timestamp)s
                     GROUP BY client_id, path_type, path_id) AS ht
                   ON hr.client_id = ht.client_id AND
                      hr.path_type = ht.path_type AND
                      hr.path_id = ht.path_id AND
                      hr.timestamp = ht.max_timestamp) AS h
      ON (p.client_id = h.client_id AND
          p.path_type = h.path_type AND
          p.path_id = h.path_id)
    """
    values["timestamp"] = mysql_utils.RDFDatetimeToTimestamp(timestamp)
    only_explicit = True

  query += """
  WHERE p.client_id = %(client_id)s
    AND p.path_type = %(path_type)s
    AND path LIKE concat(%(path)s, '/%%')
  """

  if max_depth is not None:
    query += """
    AND depth <= %(depth)s
    """
    values["depth"] = len(components) + max_depth

  cursor.execute(query, values)
  for row in cursor.fetchall():
    # pyformat: disable
    (path, directory, timestamp,
     stat_entry_bytes, last_stat_entry_timestamp,
     hash_entry_bytes, last_hash_entry_timestamp) = row
    # pyformat: enable

    components = mysql_utils.PathToComponents(path)

    if stat_entry_bytes is not None:
      stat_entry = rdf_client_fs.StatEntry.FromSerializedString(
          stat_entry_bytes)
    else:
      stat_entry = None

    if hash_entry_bytes is not None:
      hash_entry = rdf_crypto.Hash.FromSerializedString(hash_entry_bytes)
    else:
      hash_entry = None

    datetime = mysql_utils.TimestampToRDFDatetime
    path_info = rdf_objects.PathInfo(
        path_type=path_type,
        components=components,
        timestamp=datetime(timestamp),
        last_stat_entry_timestamp=datetime(last_stat_entry_timestamp),
        last_hash_entry_timestamp=datetime(last_hash_entry_timestamp),
        directory=directory,
        stat_entry=stat_entry,
        hash_entry=hash_entry)

    path_infos.append(path_info)

  path_infos.sort(key=lambda _: tuple(_.components))

  # For a specific timestamp, we return information only about explicit paths
  # (paths that have an associated stat or hash entry, or that have an
  # explicit descendant).
  if not only_explicit:
    return path_infos

  explicit_path_infos = []
  has_explicit_ancestor = set()

  # This list is sorted by path components, so by traversing it in reverse
  # order we make sure that we process deeper paths first.
  for path_info in reversed(path_infos):
    components = tuple(path_info.components)

    if (path_info.HasField("stat_entry") or
        path_info.HasField("hash_entry") or
        components in has_explicit_ancestor):
      explicit_path_infos.append(path_info)
      has_explicit_ancestor.add(components[:-1])

  # Since we collected explicit paths in reverse order, we need to reverse it
  # again to conform to the interface.
  return list(reversed(explicit_path_infos))
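# A self-contained sketch of the "explicit paths" filtering above, operating
# on (components, is_explicit) pairs instead of PathInfo objects. A path is
# kept if it is explicit itself or is an ancestor of an explicit path.
def FilterExplicit(paths):
  result = []
  has_explicit_descendant = set()
  # Sorted by components and reversed, so deeper paths are processed first;
  # keeping a path marks its parent for inclusion on a later iteration.
  for components, is_explicit in sorted(paths, reverse=True):
    if is_explicit or components in has_explicit_descendant:
      result.append(components)
      has_explicit_descendant.add(components[:-1])
  return list(reversed(result))

paths = [
    (("usr",), False),
    (("usr", "bin"), False),
    (("usr", "bin", "ls"), True),
    (("var",), False),
]
# ("var",) is dropped: it is neither explicit nor an ancestor of an explicit
# path. The other three survive.
print(FilterExplicit(paths))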
def testUpdateFromValidatesComponents(self):
  with self.assertRaises(ValueError):
    rdf_objects.PathInfo(components=["usr", "local", "bin"]).UpdateFrom(
        rdf_objects.PathInfo(components=["usr", "local", "bin", "protoc"]))
def ReadPathInfosHistories(self,
                           client_id,
                           path_type,
                           components_list,
                           cursor=None):
  """Reads a collection of hash and stat entries for given paths."""
  # MySQL does not handle empty `IN` clauses well, so we guard against that.
  if not components_list:
    return {}

  path_infos = {components: [] for components in components_list}

  path_id_components = {}
  for components in components_list:
    path_id = rdf_objects.PathID.FromComponents(components)
    path_id_components[path_id] = components

  # MySQL does not support full outer joins, so we emulate them with a union.
  query = """
  SELECT s.path_id, s.stat_entry, UNIX_TIMESTAMP(s.timestamp),
         h.path_id, h.hash_entry, UNIX_TIMESTAMP(h.timestamp)
    FROM client_path_stat_entries AS s
    LEFT JOIN client_path_hash_entries AS h
      ON s.client_id = h.client_id
     AND s.path_type = h.path_type
     AND s.path_id = h.path_id
     AND s.timestamp = h.timestamp
   WHERE s.client_id = %(client_id)s
     AND s.path_type = %(path_type)s
     AND s.path_id IN %(path_ids)s
   UNION
  SELECT s.path_id, s.stat_entry, UNIX_TIMESTAMP(s.timestamp),
         h.path_id, h.hash_entry, UNIX_TIMESTAMP(h.timestamp)
    FROM client_path_hash_entries AS h
    LEFT JOIN client_path_stat_entries AS s
      ON h.client_id = s.client_id
     AND h.path_type = s.path_type
     AND h.path_id = s.path_id
     AND h.timestamp = s.timestamp
   WHERE h.client_id = %(client_id)s
     AND h.path_type = %(path_type)s
     AND h.path_id IN %(path_ids)s
  """
  params = {
      "client_id": db_utils.ClientIDToInt(client_id),
      "path_type": int(path_type),
      "path_ids": [path_id.AsBytes() for path_id in path_id_components]
  }

  cursor.execute(query, params)
  for row in cursor.fetchall():
    # pyformat: disable
    (stat_entry_path_id_bytes, stat_entry_bytes, stat_entry_timestamp,
     hash_entry_path_id_bytes, hash_entry_bytes, hash_entry_timestamp) = row
    # pyformat: enable

    path_id_bytes = stat_entry_path_id_bytes or hash_entry_path_id_bytes
    path_id = rdf_objects.PathID.FromBytes(path_id_bytes)
    components = path_id_components[path_id]

    timestamp = stat_entry_timestamp or hash_entry_timestamp

    if stat_entry_bytes is not None:
      stat_entry = rdf_client_fs.StatEntry.FromSerializedString(
          stat_entry_bytes)
    else:
      stat_entry = None

    if hash_entry_bytes is not None:
      hash_entry = rdf_crypto.Hash.FromSerializedString(hash_entry_bytes)
    else:
      hash_entry = None

    path_info = rdf_objects.PathInfo(
        path_type=path_type,
        components=components,
        stat_entry=stat_entry,
        hash_entry=hash_entry,
        timestamp=mysql_utils.TimestampToRDFDatetime(timestamp))

    path_infos[components].append(path_info)

  for components in components_list:
    path_infos[components].sort(key=lambda path_info: path_info.timestamp)

  return path_infos
def CreateClientObject(self, vfs_fixture):
  """Make a new client object."""

  # Constructing a client snapshot from the legacy fixture is hard, we are
  # using a serialized string instead.
  data_store.REL_DB.WriteClientMetadata(self.client_id,
                                        fleetspeak_enabled=False)

  snapshot = rdf_objects.ClientSnapshot.FromSerializedBytes(
      binascii.unhexlify(SERIALIZED_CLIENT))
  snapshot.client_id = self.client_id
  snapshot.knowledge_base.fqdn = "Host%s" % self.client_id
  # The client version number may affect flow behavior, so it's important to
  # keep it current in order for flow tests to exercise the most recent logic.
  snapshot.startup_info.client_info.client_version = config.CONFIG[
      "Source.version_numeric"]

  data_store.REL_DB.WriteClientSnapshot(snapshot)
  client_index.ClientIndex().AddClient(snapshot)

  for path, (typ, attributes) in vfs_fixture:
    path %= self.args

    path_info = None

    components = [component for component in path.split("/") if component]
    if (len(components) > 1 and components[0] == "fs" and
        components[1] in ["os", "tsk", "ntfs"]):
      path_info = rdf_objects.PathInfo()
      if components[1] == "os":
        path_info.path_type = rdf_objects.PathInfo.PathType.OS
      elif components[1] == "ntfs":
        path_info.path_type = rdf_objects.PathInfo.PathType.NTFS
      else:
        path_info.path_type = rdf_objects.PathInfo.PathType.TSK
      path_info.components = components[2:]
      if typ == "File":
        path_info.directory = False
      elif typ == "Directory":
        path_info.directory = True
      else:
        raise ValueError("Incorrect object type: %s" % typ)

    for attribute_name in attributes:
      if attribute_name not in ["stat", "content"]:
        raise ValueError("Unknown attribute: " + attribute_name)

    stat = attributes.get("stat", None)
    if stat:
      stat_entry = rdf_client_fs.StatEntry.FromTextFormat(stat % self.args)
      if stat_entry.pathspec.pathtype != "UNSET":
        path_info = rdf_objects.PathInfo.FromStatEntry(stat_entry)

    content = attributes.get("content", None)
    if content:
      blob_id = rdf_objects.BlobID.FromBlobData(content)
      data_store.BLOBS.WriteBlobs({blob_id: content})
      blob_ref = rdf_objects.BlobReference(
          offset=0, size=len(content), blob_id=blob_id)
      hash_id = file_store.AddFileWithUnknownHash(
          db.ClientPath.FromPathInfo(self.client_id, path_info), [blob_ref])
      path_info.hash_entry.num_bytes = len(content)
      path_info.hash_entry.sha256 = hash_id.AsBytes()

    if path_info is not None:
      data_store.REL_DB.WritePathInfos(
          client_id=self.client_id, path_infos=[path_info])