def testVariableHuntSchedulesAllFlowsOnStart(self):
  client_ids = self.SetupClients(10)

  hunt_obj = rdf_hunt_objects.Hunt(client_rate=0)
  hunt_obj.args.hunt_type = hunt_obj.args.HuntType.VARIABLE

  for index, pair in enumerate(collection.Batch(client_ids, 2)):
    hunt_obj.args.variable.flow_groups.append(
        rdf_hunt_objects.VariableHuntFlowGroup(
            client_ids=pair,
            flow_name=compatibility.GetName(transfer.GetFile),
            flow_args=transfer.GetFileArgs(
                pathspec=rdf_paths.PathSpec(
                    path="/tmp/evil_%d.txt" % index,
                    pathtype=rdf_paths.PathSpec.PathType.OS,
                ))))

  data_store.REL_DB.WriteHuntObject(hunt_obj)
  hunt.StartHunt(hunt_obj.hunt_id)

  hunt_counters = data_store.REL_DB.ReadHuntCounters(hunt_obj.hunt_id)
  self.assertEqual(hunt_counters.num_clients, 10)

  all_flows = data_store.REL_DB.ReadHuntFlows(hunt_obj.hunt_id, 0,
                                              sys.maxsize)
  self.assertCountEqual(client_ids, [f.client_id for f in all_flows])

  for index, pair in enumerate(collection.Batch(client_ids, 2)):
    for client_id in pair:
      all_flows = data_store.REL_DB.ReadAllFlowObjects(client_id)
      self.assertLen(all_flows, 1)

      self.assertEqual(all_flows[0].flow_class_name,
                       compatibility.GetName(transfer.GetFile))
      self.assertEqual(all_flows[0].args.pathspec.path,
                       "/tmp/evil_%d.txt" % index)

def Start(self): """Retrieve all the clients for the AbstractClientStatsCollectors.""" self.stats = aff4.FACTORY.Create(self.FILESTORE_STATS_URN, aff4_stats.FilestoreStats, mode="w", token=self.token) self._CreateConsumers() hashes = aff4.FACTORY.Open(self.HASH_PATH, token=self.token).ListChildren(limit=10**8) try: for urns in collection.Batch(hashes, self.OPEN_FILES_LIMIT): for fd in aff4.FACTORY.MultiOpen(urns, mode="r", token=self.token, age=aff4.NEWEST_TIME): for consumer in self.consumers: consumer.ProcessFile(fd) self.HeartBeat() finally: for consumer in self.consumers: consumer.Save(self.stats) self.stats.Close()
def Parse(self, cmd, args, stdout, stderr, return_val, knowledge_base):
  """Parses the yum output."""
  _ = stderr, args, knowledge_base  # Unused.
  self.CheckReturn(cmd, return_val)

  # `yum list installed` output is divided into lines. The first line should
  # always be equal to "Installed Packages". The following lines are
  # triplets, but if one of the triplet columns does not fit, the rest of the
  # row is carried over to the next line. Thus, instead of processing the
  # output line by line, we split it into individual items (they cannot
  # contain any spaces) and chunk them into triplets.
  items = stdout.decode("utf-8").split()

  if not (items[0] == "Installed" and items[1] == "Packages"):
    message = ("`yum list installed` output does not start with \"Installed "
               "Packages\"")
    raise AssertionError(message)
  items = items[2:]

  packages = []
  for name_arch, version, source in collection.Batch(items, 3):
    # The architecture is the last dot-separated component; package names may
    # themselves contain dots (e.g. "java-1.8.0-openjdk"), so split from the
    # right.
    name, arch = name_arch.rsplit(".", 1)

    packages.append(
        rdf_client.SoftwarePackage.Installed(
            name=name, publisher=source, version=version, architecture=arch))

  if packages:
    yield rdf_client.SoftwarePackages(packages=packages)

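# A minimal, hedged sketch (not part of the parser above) showing why the
# whitespace-split + Batch(items, 3) approach survives wrapped rows. The
# sample output below is an illustrative assumption; collection.Batch is
# assumed to live at grr_response_core.lib.util.collection and to yield
# successive lists of up to the given size.
from grr_response_core.lib.util import collection

sample = b"""Installed Packages
acl.x86_64
        2.2.51-14.el7
                   @anaconda
zlib.x86_64 1.2.7-18.el7 @anaconda
"""

items = sample.decode("utf-8").split()[2:]
for name_arch, version, source in collection.Batch(items, 3):
  # The wrapped row (acl) and the single-line row (zlib) parse identically.
  print(name_arch, version, source)
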
def Execute(self, thread_count, urns=None):
  """Runs the migration with a given thread count."""
  if urns is None:
    blob_urns = list(aff4.FACTORY.ListChildren("aff4:/blobs"))
  else:
    blob_urns = [rdfvalue.RDFURN(urn) for urn in urns]

  sys.stdout.write("Blobs to migrate: {}\n".format(len(blob_urns)))
  sys.stdout.write("Threads to use: {}\n".format(thread_count))

  self._total_count = len(blob_urns)
  self._migrated_count = 0
  self._start_time = rdfvalue.RDFDatetime.Now()

  batches = collection.Batch(blob_urns, _BLOB_BATCH_SIZE)

  self._Progress()
  tp = pool.ThreadPool(processes=thread_count)
  tp.map(self._MigrateBatch, list(batches))
  self._Progress()

  if self._migrated_count == self._total_count:
    message = "\nMigration has been finished (migrated {} blobs).\n".format(
        self._migrated_count)
    sys.stdout.write(message)
  else:
    message = "Not all blobs have been migrated ({}/{})".format(
        self._migrated_count, self._total_count)
    raise AssertionError(message)

def ProcessClients(self, responses):
  """Does the work."""
  del responses

  end = rdfvalue.RDFDatetime.Now() - db.CLIENT_STATS_RETENTION
  client_urns = export_utils.GetAllClients(token=self.token)

  for batch in collection.Batch(client_urns, 10000):
    with data_store.DB.GetMutationPool() as mutation_pool:
      for client_urn in batch:
        mutation_pool.DeleteAttributes(
            client_urn.Add("stats"), [u"aff4:stats"],
            start=0,
            end=end.AsMicrosecondsSinceEpoch())
    self.HeartBeat()

  if data_store.RelationalDBEnabled():
    total_deleted_count = 0
    for deleted_count in data_store.REL_DB.DeleteOldClientStats(
        yield_after_count=_STATS_DELETION_BATCH_SIZE, retention_time=end):
      self.HeartBeat()
      total_deleted_count += deleted_count
    self.Log("Deleted %d ClientStats that expired before %s",
             total_deleted_count, end)

def ProcessSingleTypeExportedValues(self, original_value_type,
                                    exported_values):
  first_value = next(exported_values, None)
  if not first_value:
    return

  yield self.archive_generator.WriteFileHeader(
      "%s/%s/from_%s.yaml" % (self.path_prefix,
                              first_value.__class__.__name__,
                              original_value_type.__name__))

  serialized_value_bytes = _SerializeToYaml(first_value).encode("utf-8")
  yield self.archive_generator.WriteFileChunk(serialized_value_bytes)
  counter = 1

  for batch in collection.Batch(exported_values, self.ROW_BATCH):
    counter += len(batch)

    buf = io.StringIO()
    for value in batch:
      buf.write("\n")
      buf.write(_SerializeToYaml(value))

    contents = buf.getvalue()
    yield self.archive_generator.WriteFileChunk(contents.encode("utf-8"))

  yield self.archive_generator.WriteFileFooter()

  counts_for_original_type = self.export_counts.setdefault(
      original_value_type.__name__, dict())
  counts_for_original_type[first_value.__class__.__name__] = counter

def AddFileWithUnknownHash(blob_ids):
  """Add a new file consisting of given blob IDs."""
  blob_refs = []
  offset = 0
  sha256 = hashlib.sha256()
  for blob_ids_batch in collection.Batch(blob_ids, _BLOBS_READ_BATCH_SIZE):
    unique_ids = set(blob_ids_batch)
    data = data_store.BLOBS.ReadBlobs(unique_ids)
    for k, v in iteritems(data):
      if v is None:
        raise BlobNotFound("Couldn't find one of referenced blobs: %s" % k)

    for blob_id in blob_ids_batch:
      blob_data = data[blob_id]
      blob_refs.append(
          rdf_objects.BlobReference(
              offset=offset,
              size=len(blob_data),
              blob_id=blob_id,
          ))
      offset += len(blob_data)
      sha256.update(blob_data)

  hash_id = rdf_objects.SHA256HashID.FromBytes(sha256.digest())
  data_store.REL_DB.WriteHashBlobReferences({hash_id: blob_refs})
  return hash_id

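# A hedged usage sketch for AddFileWithUnknownHash. The blob write path is an
# assumption (rdf_objects.BlobID.FromBlobData, rdf_objects.SHA256HashID.FromData
# and data_store.BLOBS.WriteBlobs are assumed APIs); the point is that the
# function batches its blob reads and returns the SHA-256 hash ID of the
# concatenated contents.
chunks = [b"first chunk", b"second chunk"]
blob_ids = [rdf_objects.BlobID.FromBlobData(c) for c in chunks]
data_store.BLOBS.WriteBlobs(dict(zip(blob_ids, chunks)))  # Assumed API.

hash_id = AddFileWithUnknownHash(blob_ids)
# Expected to equal the hash of the whole file (assumed helper).
assert hash_id == rdf_objects.SHA256HashID.FromData(b"".join(chunks))
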
def _GenerateConvertedValues(self, converter, grr_messages):
  """Generates converted values using given converter from given messages.

  Groups values in batches of BATCH_SIZE size and applies the converter to
  each batch.

  Args:
    converter: ExportConverter instance.
    grr_messages: An iterable (a generator is assumed) with GRRMessage values.

  Yields:
    Values generated by the converter.

  Raises:
    ValueError: if any of the GrrMessage objects doesn't have "source" set.
  """
  for batch in collection.Batch(grr_messages, self.BATCH_SIZE):
    metadata_items = self._GetMetadataForClients([gm.source for gm in batch])
    batch_with_metadata = zip(metadata_items, [gm.payload for gm in batch])

    for result in converter.BatchConvert(batch_with_metadata, token=self.token):
      yield result

def MigrateClients(self, client_urns):
  """Migrates entire VFS of given client list to the relational data store."""
  self._start_time = rdfvalue.RDFDatetime.Now()

  self._client_urns_to_migrate = client_urns
  self._client_urns_migrated = []
  self._client_urns_failed = []

  to_migrate_count = len(self._client_urns_to_migrate)
  sys.stdout.write("Clients to migrate: {}\n".format(to_migrate_count))

  batches = collection.Batch(client_urns, self.client_batch_size)
  tp = pool.ThreadPool(processes=self.thread_count)
  tp.map(self.MigrateClientBatch, list(batches))

  migrated_count = len(self._client_urns_migrated)
  sys.stdout.write("Migrated clients: {}\n".format(migrated_count))

  if to_migrate_count == migrated_count:
    sys.stdout.write("All clients migrated successfully!\n")
  else:
    message = "Not all clients have been migrated ({}/{})".format(
        migrated_count, to_migrate_count)
    raise RuntimeError(message)

def ProcessSingleTypeExportedValues(self, original_value_type,
                                    exported_values):
  first_value = next(exported_values, None)
  if not first_value:
    return

  yield self.archive_generator.WriteFileHeader(
      "%s/%s/from_%s.yaml" % (self.path_prefix,
                              first_value.__class__.__name__,
                              original_value_type.__name__))
  yield self.archive_generator.WriteFileChunk(_SerializeToYaml(first_value))
  counter = 1

  for batch in collection.Batch(exported_values, self.ROW_BATCH):
    counter += len(batch)

    # TODO(hanuszczak): YAML is supposed to be a unicode file format, so we
    # should use `StringIO` here instead. However, because PyYAML dumps to
    # `bytes` instead of `unicode`, we have to use `BytesIO`. It should be
    # investigated whether there is a way to adjust the behaviour of PyYAML.
    buf = io.BytesIO()
    for value in batch:
      buf.write(b"\n")
      buf.write(_SerializeToYaml(value))

    yield self.archive_generator.WriteFileChunk(buf.getvalue())

  yield self.archive_generator.WriteFileFooter()

  counts_for_original_type = self.export_counts.setdefault(
      original_value_type.__name__, dict())
  counts_for_original_type[first_value.__class__.__name__] = counter

def _StopLegacy(self, reason=None):
  super(GenericHunt, self).Stop(reason=reason)

  started_flows = grr_collections.RDFUrnCollection(
      self.started_flows_collection_urn)

  num_terminated_flows = 0
  self.Log("Hunt stop. Terminating all the started flows.")

  # Delete the hunt flows' states.
  for flows_batch in collection.Batch(started_flows,
                                      self.__class__.STOP_BATCH_SIZE):
    with queue_manager.QueueManager(token=self.token) as manager:
      manager.MultiDestroyFlowStates(flows_batch)

    with data_store.DB.GetMutationPool() as mutation_pool:
      for f in flows_batch:
        flow.GRRFlow.MarkForTermination(
            f, reason="Parent hunt stopped.", mutation_pool=mutation_pool)

    num_terminated_flows += len(flows_batch)

  # Delete the hunt's requests and responses to ensure no more processing is
  # going to occur.
  with queue_manager.QueueManager(token=self.token) as manager:
    manager.DestroyFlowStates(self.session_id)

  self.Log("%d flows terminated.", num_terminated_flows)

def _GetHWInfos(client_list, batch_size=10000, token=None):
  """Opens the given clients in batches and returns hardware information."""
  # This function returns a dict mapping each client_id to the set of
  # hardware serial numbers reported by that client.
  hw_infos = {}

  logging.info("%d clients to process.", len(client_list))

  c = 0
  for batch in collection.Batch(client_list, batch_size):
    logging.info("Processing batch: %d-%d", c, c + batch_size)
    c += len(batch)

    client_objs = aff4.FACTORY.MultiOpen(
        batch, age=aff4.ALL_TIMES, token=token)

    for client in client_objs:
      hwi = client.GetValuesForAttribute(client.Schema.HARDWARE_INFO)
      hw_infos[client.urn] = set(["%s" % x.serial_number for x in hwi])

  return hw_infos

def ProcessSingleTypeExportedValues(self, original_value_type,
                                    exported_values):
  first_value = next(exported_values, None)
  if not first_value:
    return

  if not isinstance(first_value, rdf_structs.RDFProtoStruct):
    raise ValueError("The SQLite plugin only supports export-protos")

  yield self.archive_generator.WriteFileHeader(
      "%s/%s_from_%s.sql" % (self.path_prefix,
                             first_value.__class__.__name__,
                             original_value_type.__name__))

  table_name = "%s.from_%s" % (first_value.__class__.__name__,
                               original_value_type.__name__)
  schema = self._GetSqliteSchema(first_value.__class__)

  # We will buffer the sql statements into an in-memory sql database before
  # dumping them to the zip archive. We rely on the PySQLite library for
  # string escaping.
  db_connection = sqlite3.connect(":memory:")
  db_cursor = db_connection.cursor()

  yield self.archive_generator.WriteFileChunk(
      "BEGIN TRANSACTION;\n".encode("utf-8"))

  with db_connection:
    buf = io.StringIO()
    buf.write(u"CREATE TABLE \"%s\" (\n  " % table_name)
    column_types = [(k, v.sqlite_type) for k, v in iteritems(schema)]
    buf.write(u",\n  ".join([u"\"%s\" %s" % (k, v) for k, v in column_types]))
    buf.write(u"\n);")
    db_cursor.execute(buf.getvalue())

    chunk = (buf.getvalue() + "\n").encode("utf-8")
    yield self.archive_generator.WriteFileChunk(chunk)

    self._InsertValueIntoDb(table_name, schema, first_value, db_cursor)

  for sql in self._FlushAllRows(db_connection, table_name):
    yield sql

  counter = 1
  for batch in collection.Batch(exported_values, self.ROW_BATCH):
    counter += len(batch)

    with db_connection:
      for value in batch:
        self._InsertValueIntoDb(table_name, schema, value, db_cursor)
    for sql in self._FlushAllRows(db_connection, table_name):
      yield sql

  db_connection.close()
  yield self.archive_generator.WriteFileChunk("COMMIT;\n".encode("utf-8"))
  yield self.archive_generator.WriteFileFooter()

  counts_for_original_type = self.export_counts.setdefault(
      original_value_type.__name__, dict())
  counts_for_original_type[first_value.__class__.__name__] = counter

def StreamFilesChunks(client_paths, max_timestamp=None):
  """Streams contents of given files.

  Args:
    client_paths: db.ClientPath objects describing paths to files.
    max_timestamp: If specified, the last collected version of each file with
      a timestamp equal to or lower than max_timestamp is opened. If not
      specified, the latest version of each file is opened.

  Yields:
    StreamedFileChunk objects for every file read. Chunks are returned
    sequentially; their order corresponds to the client_paths order. Files
    that have no content are simply ignored.
  """
  path_infos_by_cp = (
      data_store.REL_DB.ReadLatestPathInfosWithHashBlobReferences(
          client_paths, max_timestamp=max_timestamp))

  hash_ids_by_cp = {
      cp: rdf_objects.SHA256HashID.FromBytes(pi.hash_entry.sha256.AsBytes())
      for cp, pi in iteritems(path_infos_by_cp)
      if pi
  }

  blob_refs_by_hash_id = data_store.REL_DB.ReadHashBlobReferences(
      hash_ids_by_cp.values())

  all_chunks = []
  for cp in client_paths:
    try:
      hash_id = hash_ids_by_cp[cp]
    except KeyError:
      continue

    try:
      blob_refs = blob_refs_by_hash_id[hash_id]
    except KeyError:
      continue

    num_blobs = len(blob_refs)
    total_size = 0
    for ref in blob_refs:
      total_size += ref.size

    for i, ref in enumerate(blob_refs):
      all_chunks.append((cp, ref.blob_id, i, num_blobs, ref.offset,
                         total_size))

  for batch in collection.Batch(all_chunks, STREAM_CHUNKS_READ_AHEAD):
    blobs = data_store.BLOBS.ReadBlobs(
        [blob_id for cp, blob_id, i, num_blobs, offset, total_size in batch])
    for cp, blob_id, i, num_blobs, offset, total_size in batch:
      yield StreamedFileChunk(cp, blobs[blob_id], i, num_blobs, offset,
                              total_size)

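# A hedged usage sketch for StreamFilesChunks: since chunks for a given path
# are yielded sequentially and in order, a single file's content can be
# reassembled by plain concatenation. `client_path` is a hypothetical
# db.ClientPath instance.
def ReadFileContent(client_path, max_timestamp=None):
  """Returns the full content of one collected file (sketch, see above)."""
  parts = []
  for chunk in StreamFilesChunks([client_path], max_timestamp=max_timestamp):
    parts.append(chunk.data)
  return b"".join(parts)
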
def Generate(self, items, token=None):
  """Generates archive from a given collection.

  Iterates the collection and generates an archive by yielding contents of
  every referenced AFF4Stream.

  Args:
    items: Iterable of rdf_client_fs.StatEntry objects.
    token: User's ACLToken.

  Yields:
    Binary chunks comprising the generated archive.
  """
  del token  # Unused, to be removed with AFF4 code.

  client_ids = set()
  for item_batch in collection.Batch(items, self.BATCH_SIZE):

    client_paths = set()
    for item in item_batch:
      try:
        client_path = flow_export.CollectionItemToClientPath(
            item, self.client_id)
      except flow_export.ItemNotExportableError:
        continue

      if not self.predicate(client_path):
        self.ignored_files.add(client_path)
        self.processed_files.add(client_path)
        continue

      client_ids.add(client_path.client_id)
      client_paths.add(client_path)

    for chunk in file_store.StreamFilesChunks(client_paths):
      self.processed_files.add(chunk.client_path)
      for output in self._WriteFileChunk(chunk=chunk):
        yield output

    self.processed_files |= client_paths - (
        self.ignored_files | self.archived_files)

  if client_ids:
    for client_id, client_info in iteritems(
        data_store.REL_DB.MultiReadClientFullInfo(client_ids)):
      client = api_client.ApiClient().InitFromClientInfo(client_info)
      for chunk in self._GenerateClientInfo(client_id, client):
        yield chunk

  for chunk in self._GenerateDescription():
    yield chunk

  yield self.archive_generator.Close()

def _BatchConvert(self, metadata_value_pairs):
  registry_pairs, file_pairs, match_pairs = self._SeparateTypes(
      metadata_value_pairs)
  for fp_batch in collection.Batch(file_pairs, self._BATCH_SIZE):

    if self.options.export_files_contents:
      pathspec_by_client_path = {}

      for metadata, ff_result in fp_batch:
        # TODO(user): Deprecate client_urn in ExportedMetadata in favor of
        # client_id (to be added).
        client_path = db.ClientPath.FromPathSpec(
            metadata.client_urn.Basename(), ff_result.stat_entry.pathspec)
        pathspec_by_client_path[client_path] = ff_result.stat_entry.pathspec

      data_by_pathspec = {}
      for chunk in file_store.StreamFilesChunks(
          pathspec_by_client_path, max_size=self.MAX_CONTENT_SIZE):
        pathspec = pathspec_by_client_path[chunk.client_path]
        data_by_pathspec.setdefault(pathspec.CollapsePath(),
                                    []).append(chunk.data)

    for metadata, ff_result in fp_batch:
      result = self._CreateExportedFile(metadata, ff_result.stat_entry)

      # FileFinderResult has hashes in its "hash_entry" attribute, which is
      # not passed to the ConvertValuesWithMetadata call. We have to process
      # these explicitly here.
      self.ParseFileHash(ff_result.hash_entry, result)

      if self.options.export_files_contents:
        try:
          data = data_by_pathspec[
              ff_result.stat_entry.pathspec.CollapsePath()]
          result.content = b"".join(data)[:self.MAX_CONTENT_SIZE]
          result.content_sha256 = hashlib.sha256(result.content).hexdigest()
        except KeyError:
          pass

      yield result

  # Now export the registry keys.
  for result in export.ConvertValuesWithMetadata(
      registry_pairs, options=self.options):
    yield result

  # Now export the grep matches.
  for result in export.ConvertValuesWithMetadata(
      match_pairs, options=self.options):
    yield result

def CleanVacuousVersions(clients=None, dry_run=True):
  """A script to remove no-op client versions.

  This script removes a version of a client when it is identical to the
  previous one, in the sense that no versioned attributes were changed since
  the previous client version.

  Args:
    clients: A list of ClientURN; if empty, all clients are cleaned.
    dry_run: Whether this is a dry run.
  """

  if not clients:
    index = client_index.CreateClientIndex()
    clients = index.LookupClients(["."])
  clients.sort()

  with data_store.DB.GetMutationPool() as pool:

    logging.info("checking %d clients", len(clients))
    for batch in collection.Batch(clients, 10000):
      # TODO(amoser): This only works on datastores that use the Bigtable
      # scheme.
      client_infos = data_store.DB.MultiResolvePrefix(
          batch, ["aff4:", "aff4:"], data_store.DB.ALL_TIMESTAMPS)

      for client, type_list in client_infos:
        cleared = 0
        kept = 0
        updates = []
        for a, _, ts in type_list:
          if ts != 0:
            updates.append((ts, a))
        updates = sorted(updates)
        dirty = True
        for ts, a in updates:
          if a == "aff4:type":
            if dirty:
              kept += 1
              dirty = False
            else:
              cleared += 1
              if not dry_run:
                pool.DeleteAttributes(client, ["aff4:type"], start=ts, end=ts)
                if pool.Size() > 1000:
                  pool.Flush()
          else:
            dirty = True
        logging.info("%s: kept %d and cleared %d", client, kept, cleared)

def Run(self):
  self.start = 0
  self.end = int(1e6 * (time.time() - self.MAX_AGE))

  client_urns = export_utils.GetAllClients(token=self.token)

  for batch in collection.Batch(client_urns, 10000):
    with data_store.DB.GetMutationPool() as mutation_pool:
      for client_urn in batch:
        mutation_pool.DeleteAttributes(
            client_urn.Add("stats"), [u"aff4:stats"],
            start=self.start,
            end=self.end)
    self.HeartBeat()

def _IterateAllClients(): """Fetches client data from the relational db.""" all_client_ids = data_store.REL_DB.ReadAllClientIDs() for batch in collection.Batch(all_client_ids, CLIENT_READ_BATCH_SIZE): client_map = data_store.REL_DB.MultiReadClientFullInfo(batch) fs_client_ids = [ cid for (cid, client) in iteritems(client_map) if client.metadata.fleetspeak_enabled ] last_contact_times = _GetLastContactFromFleetspeak(fs_client_ids) for cid, last_contact in iteritems(last_contact_times): client_map[cid].metadata.ping = last_contact for client in itervalues(client_map): yield client
def GetInput(self): """Yield client urns.""" client_list = GetAllClients(token=self.token) logging.debug("Got %d clients", len(client_list)) for client_group in collection.Batch(client_list, self.client_chunksize): for fd in aff4.FACTORY.MultiOpen(client_group, mode="r", aff4_type=aff4_grr.VFSGRRClient, token=self.token): if isinstance(fd, aff4_grr.VFSGRRClient): # Skip if older than max_age oldest_time = (time.time() - self.max_age) * 1e6 if fd.Get(aff4_grr.VFSGRRClient.SchemaCls.PING) >= oldest_time: yield fd
def Start(self): """Retrieve all the clients for the AbstractClientStatsCollectors.""" try: self.stats = {} self.BeginProcessing() processed_count = 0 if data_store.RelationalDBEnabled(): for client_info in _IterateAllClients( recency_window=self.recency_window): self.ProcessClientFullInfo(client_info) processed_count += 1 if processed_count % _CLIENT_READ_BATCH_SIZE == 0: self.Log("Processed %d clients.", processed_count) self.HeartBeat() if processed_count != 0: self.Log("Processed %d clients.", processed_count) else: root_children = aff4.FACTORY.Open( aff4.ROOT_URN, token=self.token).OpenChildren(mode="r") for batch in collection.Batch(root_children, _CLIENT_READ_BATCH_SIZE): for child in batch: if not isinstance(child, aff4_grr.VFSGRRClient): continue last_ping = child.Get(child.Schema.PING) self.ProcessLegacyClient(last_ping, child) processed_count += 1 # This flow is not dead: we don't want to run out of lease time. self.HeartBeat() self.FinishProcessing() for fd in itervalues(self.stats): fd.Close() logging.info("%s: processed %d clients.", self.__class__.__name__, processed_count) except Exception as e: # pylint: disable=broad-except logging.exception("Error while calculating stats: %s", e) raise
def _MultiStream(cls, fds):
  """Effectively streams data from multiple opened BlobImage objects.

  Args:
    fds: A list of opened AFF4Stream (or AFF4Stream descendants) objects.

  Yields:
    Tuples (fd, chunk, exception) where fd is an object from the fds argument
    and chunk is a binary blob of data. If one or more chunks are missing,
    exception is a MissingBlobsError object and chunk is None. _MultiStream
    does its best to skip a file entirely if one of its chunks is missing,
    but in case of very large files it's still possible to yield a truncated
    file.
  """
  broken_fds = set()
  missing_blobs_fd_pairs = []
  for chunk_fd_pairs in collection.Batch(
      cls._GenerateChunkIds(fds), cls.MULTI_STREAM_CHUNKS_READ_AHEAD):
    chunk_fds = list(map(operator.itemgetter(0), chunk_fd_pairs))
    results_map = data_store.BLOBS.ReadBlobs(chunk_fds)

    for chunk_id, fd in chunk_fd_pairs:
      if chunk_id not in results_map or results_map[chunk_id] is None:
        missing_blobs_fd_pairs.append((chunk_id, fd))
        broken_fds.add(fd)

    for chunk, fd in chunk_fd_pairs:
      if fd in broken_fds:
        continue
      yield fd, results_map[chunk], None

  if missing_blobs_fd_pairs:
    missing_blobs_by_fd = {}
    for chunk_id, fd in missing_blobs_fd_pairs:
      missing_blobs_by_fd.setdefault(fd, []).append(chunk_id)

    for fd, missing_blobs in iteritems(missing_blobs_by_fd):
      e = MissingBlobsError(
          "%d missing blobs (multi-stream)" % len(missing_blobs),
          missing_chunks=missing_blobs)
      yield fd, None, e

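# A hedged consumer sketch for _MultiStream. The (fd, chunk, exception)
# triples let a caller separate fully streamed files from broken ones;
# `blob_image_cls` and `opened_fds` are hypothetical stand-ins for a
# BlobImage-like class exposing _MultiStream and its opened instances.
def CollectContents(blob_image_cls, opened_fds):
  contents = {}
  broken = {}
  for fd, chunk, exception in blob_image_cls._MultiStream(opened_fds):
    if exception is not None:
      # This fd had missing blobs; the MissingBlobsError describes the gaps.
      broken[fd] = exception
    else:
      contents.setdefault(fd, []).append(chunk)
  return contents, broken
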
def Generate(
    self, mappings: Iterator[flow_base.ClientPathArchiveMapping]
) -> Iterator[bytes]:
  """Generates archive from a given set of client path mappings.

  Iterates the mappings and generates an archive by yielding contents of
  every referenced file.

  Args:
    mappings: A set of mappings defining the archive structure.

  Yields:
    Chunks of bytes of the generated archive.
  """
  processed_files = {}
  missing_files = set()
  for mappings_batch in collection.Batch(mappings, self.BATCH_SIZE):
    archive_paths_by_id = {}
    for mapping in mappings_batch:
      archive_paths_by_id[mapping.client_path.path_id] = mapping.archive_path

    processed_in_batch = set()
    for chunk in file_store.StreamFilesChunks(
        [m.client_path for m in mappings_batch]):
      processed_in_batch.add(chunk.client_path.path_id)
      processed_files[chunk.client_path.vfs_path] = archive_paths_by_id[
          chunk.client_path.path_id]
      for output in self._WriteFileChunk(chunk, archive_paths_by_id):
        yield output

    for mapping in mappings_batch:
      if mapping.client_path.path_id in processed_in_batch:
        continue
      missing_files.add(mapping.client_path.vfs_path)

  for chunk in self._GenerateDescription(processed_files, missing_files):
    yield chunk

  yield self.archive_generator.Close()

def _IterateAllLegacyClients(token):
  """Fetches client data from the legacy db."""
  root_children = aff4.FACTORY.Open(
      aff4.ROOT_URN, token=token).OpenChildren(mode="r")
  for batch in collection.Batch(root_children, CLIENT_READ_BATCH_SIZE):
    fs_client_map = {}
    non_fs_clients = []
    for child in batch:
      if not isinstance(child, aff4_grr.VFSGRRClient):
        continue
      if child.Get(child.Schema.FLEETSPEAK_ENABLED):
        fs_client_map[child.urn.Basename()] = child
      else:
        non_fs_clients.append(child)
    last_contact_times = _GetLastContactFromFleetspeak(viewkeys(fs_client_map))
    for client in non_fs_clients:
      yield client.Get(client.Schema.PING), client
    for cid, client in iteritems(fs_client_map):
      last_contact = last_contact_times.get(cid,
                                            client.Get(client.Schema.PING))
      yield last_contact, client

def Convert(self, values, start_index=0, end_index=None):
  """Converts given collection to exported values.

  This method uses a threadpool to do the conversion in parallel. It blocks
  for up to one hour until everything is converted.

  Args:
    values: Iterable object with values to convert.
    start_index: Start from this index in the collection.
    end_index: Finish processing at the (end_index - 1)-th element of the
      collection. If None, work until the end of the collection.

  Returns:
    Nothing. ConvertedBatch() should handle the results.
  """
  if not values:
    return

  try:
    total_batch_count = len(values) // self.batch_size
  except TypeError:
    total_batch_count = -1

  pool = ThreadPool.Factory(self.threadpool_prefix, self.threadpool_size)
  val_iterator = itertools.islice(values, start_index, end_index)

  pool.Start()
  try:
    for batch_index, batch in enumerate(
        collection.Batch(val_iterator, self.batch_size)):
      logging.debug("Processing batch %d out of %d", batch_index,
                    total_batch_count)

      pool.AddTask(
          target=self.ConvertBatch,
          args=(batch,),
          name="batch_%d" % batch_index,
          inline=False)
  finally:
    pool.Stop(join_timeout=3600)

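# A hedged sketch of what the slicing and batching above produce: islice
# windows the collection first, then collection.Batch groups the remainder,
# so batch contents are relative to start_index. collection.Batch is assumed
# to yield lists of up to the given size.
values = list(range(10))
val_iterator = itertools.islice(values, 2, None)  # start_index=2
batches = list(collection.Batch(val_iterator, 3))
# batches == [[2, 3, 4], [5, 6, 7], [8, 9]]
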
def Run(self):
  if not fleetspeak_connector.CONN or not fleetspeak_connector.CONN.outgoing:
    # Nothing to do if Fleetspeak is not enabled.
    self.Log("Fleetspeak has not been initialized. Will do nothing.")
    return

  if not data_store.RelationalDBWriteEnabled():
    raise NotImplementedError("Cronjob does not support the legacy datastore.")

  age_threshold = config.CONFIG["Server.fleetspeak_last_ping_threshold"]
  max_last_ping = rdfvalue.RDFDatetime.Now() - age_threshold
  last_pings = data_store.REL_DB.ReadClientLastPings(
      max_last_ping=max_last_ping, fleetspeak_enabled=True)
  num_clients_updated = 0
  batch_size = config.CONFIG["Server.fleetspeak_list_clients_batch_size"]
  for client_ids in collection.Batch(iterkeys(last_pings), batch_size):
    fs_ids = [fleetspeak_utils.GRRIDToFleetspeakID(i) for i in client_ids]
    request_start = rdfvalue.RDFDatetime.Now()
    fs_result = fleetspeak_connector.CONN.outgoing.ListClients(
        admin_pb2.ListClientsRequest(client_ids=fs_ids))
    latency = rdfvalue.RDFDatetime.Now() - request_start
    logging.info("Fleetspeak ListClients() took %s.", latency)
    stats_collector_instance.Get().RecordEvent(
        "fleetspeak_last_ping_latency_millis", latency.milliseconds)

    for fs_client in fs_result.clients:
      grr_id = fleetspeak_utils.FleetspeakIDToGRRID(fs_client.client_id)
      new_last_ping = fleetspeak_utils.TSToRDFDatetime(
          fs_client.last_contact_time)
      if last_pings[grr_id] is None or last_pings[grr_id] < new_last_ping:
        data_store.REL_DB.WriteClientMetadata(
            grr_id, last_ping=new_last_ping)
        num_clients_updated += 1

  self.Log("Updated timestamps for %d clients.", num_clients_updated)

def _IterateAllClients(recency_window=None):
  """Fetches client data from the relational db.

  Args:
    recency_window: An rdfvalue.Duration specifying a window of last-ping
      timestamps to consider. Clients that haven't communicated with GRR
      servers longer than the given period will be skipped. If
      recency_window is None, all clients will be iterated.

  Yields:
    Batches (lists) of ClientFullInfo objects.
  """
  if recency_window is None:
    min_last_ping = None
  else:
    min_last_ping = rdfvalue.RDFDatetime.Now() - recency_window

  client_ids = data_store.REL_DB.ReadAllClientIDs(min_last_ping=min_last_ping)
  for client_id_batch in collection.Batch(client_ids, CLIENT_READ_BATCH_SIZE):
    client_info_dict = data_store.REL_DB.MultiReadClientFullInfo(
        client_id_batch)
    yield list(itervalues(client_info_dict))

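# A hedged usage sketch: unlike the earlier _IterateAllClients variant that
# yields individual clients, this one yields batches (lists) of
# ClientFullInfo objects, so callers iterate twice. The "30d" duration
# literal assumes rdfvalue.Duration accepts such strings.
processed_count = 0
for client_batch in _IterateAllClients(
    recency_window=rdfvalue.Duration("30d")):
  for client_info in client_batch:
    processed_count += 1
logging.info("Processed %d clients.", processed_count)
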
def Handle(self, args, context=None):
  if args.count:
    end = args.offset + args.count
    # Read <count> clients ahead in case some of them fail to open / verify.
    batch_size = end + args.count
  else:
    end = db.MAX_COUNT
    batch_size = end

  keywords = compatibility.ShlexSplit(args.query)
  api_clients = []

  index = client_index.ClientIndex()

  # TODO(amoser): We could move the label verification into the database,
  # making this method more efficient. Label restrictions should be on small
  # subsets though, so this might not be worth it.
  all_client_ids = set()
  for label in self.allow_labels:
    label_filter = ["label:" + label] + keywords
    all_client_ids.update(index.LookupClients(label_filter))

  index = 0
  for cid_batch in collection.Batch(sorted(all_client_ids), batch_size):
    client_infos = data_store.REL_DB.MultiReadClientFullInfo(cid_batch)

    for _, client_info in sorted(client_infos.items()):
      if not self._VerifyLabels(client_info.labels):
        continue
      if index >= args.offset and index < end:
        api_clients.append(ApiClient().InitFromClientInfo(client_info))
      index += 1
      if index >= end:
        UpdateClientsFromFleetspeak(api_clients)
        return ApiSearchClientsResult(items=api_clients)

  UpdateClientsFromFleetspeak(api_clients)
  return ApiSearchClientsResult(items=api_clients)

def ProcessSingleTypeExportedValues(self, original_value_type,
                                    exported_values):
  first_value = next(exported_values, None)
  if not first_value:
    return

  yield self.archive_generator.WriteFileHeader(
      "%s/%s/from_%s.csv" % (self.path_prefix,
                             first_value.__class__.__name__,
                             original_value_type.__name__))

  writer = csv.Writer()
  # Write the CSV header based on the first value's class, then write the
  # first value itself. All other values are guaranteed to have the same
  # class (see the ProcessSingleTypeExportedValues definition).
  writer.WriteRow(self._GetCSVHeader(first_value.__class__))
  writer.WriteRow(self._GetCSVRow(first_value))
  chunk = writer.Content().encode("utf-8")
  yield self.archive_generator.WriteFileChunk(chunk)

  # Counter starts from 1, as 1 value has already been written.
  counter = 1
  for batch in collection.Batch(exported_values, self.ROW_BATCH):
    counter += len(batch)

    writer = csv.Writer()
    for value in batch:
      writer.WriteRow(self._GetCSVRow(value))
    chunk = writer.Content().encode("utf-8")
    yield self.archive_generator.WriteFileChunk(chunk)

  yield self.archive_generator.WriteFileFooter()

  self.export_counts.setdefault(
      original_value_type.__name__,
      dict())[first_value.__class__.__name__] = counter

def Execute(self, thread_count):
  """Runs the migration procedure.

  Args:
    thread_count: A number of threads to execute the migration with.

  Raises:
    AssertionError: If not all clients have been migrated.
    ValueError: If the relational database backend is not available.
  """
  if not data_store.RelationalDBWriteEnabled():
    raise ValueError("No relational database available.")

  sys.stdout.write("Collecting clients...\n")
  client_urns = _GetClientUrns()

  sys.stdout.write("Clients to migrate: {}\n".format(len(client_urns)))
  sys.stdout.write("Threads to use: {}\n".format(thread_count))

  self._total_count = len(client_urns)
  self._migrated_count = 0
  self._start_time = rdfvalue.RDFDatetime.Now()

  batches = collection.Batch(client_urns, _CLIENT_BATCH_SIZE)

  self._Progress()
  tp = pool.ThreadPool(processes=thread_count)
  tp.map(self._MigrateBatch, list(batches))
  self._Progress()

  if self._migrated_count == self._total_count:
    message = "\nMigration has been finished (migrated {} clients).\n".format(
        self._migrated_count)
    sys.stdout.write(message)
  else:
    message = "Not all clients have been migrated ({}/{})".format(
        self._migrated_count, self._total_count)
    raise AssertionError(message)