def _GenerateConvertedValues(self, converter, grr_messages):
  """Generates converted values using given converter from given messages.

  Groups values in batches of BATCH_SIZE size and applies the converter
  to each batch.

  Args:
    converter: ExportConverter instance.
    grr_messages: An iterable (a generator is assumed) with GRRMessage values.

  Yields:
    Values generated by the converter.

  Raises:
    ValueError: if any of the GrrMessage objects doesn't have "source" set.
  """
  for batch in utils.Grouper(grr_messages, self.BATCH_SIZE):
    batch_with_metadata = []
    for grr_message in batch:
      if not grr_message.source:
        raise ValueError("GrrMessage's source can't be empty")

      metadata = self.GetDefaultMetadata()
      metadata.client_urn = grr_message.source
      batch_with_metadata.append((metadata, grr_message.payload))

    for result in converter.BatchConvert(batch_with_metadata,
                                         token=self.token):
      yield result
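Every snippet in this section batches an iterable (often a generator) with utils.Grouper. A minimal, roughly equivalent sketch of that helper, assuming it simply yields lists of up to n items:

def Grouper(iterable, n):
  """Rough sketch (assumption): yields lists of at most n items from iterable."""
  batch = []
  for item in iterable:
    batch.append(item)
    if len(batch) == n:
      yield batch
      batch = []
  # Yield the final, possibly short, batch.
  if batch:
    yield batch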
def _GetHWInfos(client_list, batch_size=10000, token=None):
  """Opens the given clients in batches and returns hardware information."""
  # This function returns a dict mapping each client_id to the set of
  # hardware serial numbers reported by that client.
  hw_infos = {}

  logging.info("%d clients to process.", len(client_list))

  c = 0
  for batch in utils.Grouper(client_list, batch_size):
    logging.info("Processing batch: %d-%d", c, c + batch_size)
    c += len(batch)

    client_objs = aff4.FACTORY.MultiOpen(batch, age=aff4.ALL_TIMES,
                                         token=token)

    for client in client_objs:
      hwi = client.GetValuesForAttribute(client.Schema.HARDWARE_INFO)

      hw_infos[client.urn] = set(["%s" % x.serial_number for x in hwi])

  return hw_infos
def ProcessHuntResults(self, results, freeze_timestamp):
  plugins_exceptions = {}

  hunt_urn = results.Get(results.Schema.RESULTS_SOURCE)
  metadata_urn = hunt_urn.Add("ResultsMetadata")

  batch_size = self.state.args.batch_size or self.DEFAULT_BATCH_SIZE
  batches = utils.Grouper(results.GenerateUncompactedItems(
      max_reversed_results=self.MAX_REVERSED_RESULTS,
      timestamp=freeze_timestamp), batch_size)

  with aff4.FACTORY.Open(
      metadata_urn, mode="rw", token=self.token) as metadata_obj:
    output_plugins = metadata_obj.Get(metadata_obj.Schema.OUTPUT_PLUGINS)
    num_processed = int(metadata_obj.Get(
        metadata_obj.Schema.NUM_PROCESSED_RESULTS))

    used_plugins = []
    for batch_index, batch in enumerate(batches):
      batch = list(batch)
      num_processed += len(batch)

      if not used_plugins:
        for _, (plugin_def, state) in output_plugins.data.iteritems():
          # TODO(user): Remove as soon as migration to new-style
          # output plugins is completed.
          if not hasattr(plugin_def, "GetPluginForState"):
            logging.error("Invalid plugin_def: %s", plugin_def)
            continue
          used_plugins.append((plugin_def,
                               plugin_def.GetPluginForState(state)))

      batch_exceptions = self.ApplyPluginsToBatch(hunt_urn, used_plugins,
                                                  batch, batch_index)
      if batch_exceptions:
        for key, value in batch_exceptions.items():
          plugins_exceptions.setdefault(key, []).append(value)

      self.HeartBeat()

      # If this flow has been running for longer than max_running_time,
      # stop processing.
      if self.CheckIfRunningTooLong():
        self.Log("Running for too long, skipping rest of batches for %s",
                 hunt_urn)
        break

    if not used_plugins:
      logging.debug("Got notification, but no results were processed for %s.",
                    hunt_urn)

    flush_exceptions = self.FlushPlugins(hunt_urn, used_plugins)
    plugins_exceptions.update(flush_exceptions)

    metadata_obj.Set(metadata_obj.Schema.OUTPUT_PLUGINS(output_plugins))
    metadata_obj.Set(metadata_obj.Schema.NUM_PROCESSED_RESULTS(num_processed))

  return plugins_exceptions
def ProcessSingleTypeExportedValues(self, original_value_type,
                                    exported_values):
  first_value = next(exported_values, None)
  if not first_value:
    return

  yield self.archive_generator.WriteFileHeader(
      "%s/%s/from_%s.yaml" % (self.path_prefix,
                              first_value.__class__.__name__,
                              original_value_type.__name__))
  yield self.archive_generator.WriteFileChunk(_SerializeToYaml(first_value))

  counter = 1
  for batch in utils.Grouper(exported_values, self.ROW_BATCH):
    counter += len(batch)

    buf = cStringIO.StringIO()
    for value in batch:
      buf.write("\n")
      buf.write(_SerializeToYaml(value))

    yield self.archive_generator.WriteFileChunk(buf.getvalue())

  yield self.archive_generator.WriteFileFooter()

  counts_for_original_type = self.export_counts.setdefault(
      original_value_type.__name__, dict())
  counts_for_original_type[first_value.__class__.__name__] = counter
def Stop(self, reason=None):
  super(GenericHunt, self).Stop(reason=reason)

  started_flows = grr_collections.RDFUrnCollection(
      self.started_flows_collection_urn)

  num_terminated_flows = 0
  self.Log("Hunt stop. Terminating all the started flows.")

  # Delete hunt flows states.
  for flows_batch in utils.Grouper(started_flows,
                                   self.__class__.STOP_BATCH_SIZE):
    with queue_manager.QueueManager(token=self.token) as manager:
      manager.MultiDestroyFlowStates(flows_batch)

    with data_store.DB.GetMutationPool() as mutation_pool:
      for f in flows_batch:
        flow.GRRFlow.MarkForTermination(
            f, reason="Parent hunt stopped.", mutation_pool=mutation_pool)

    num_terminated_flows += len(flows_batch)

  # Delete hunt's requests and responses to ensure no more
  # processing is going to occur.
  with queue_manager.QueueManager(token=self.token) as manager:
    manager.DestroyFlowStates(self.session_id)

  self.Log("%d flows terminated.", num_terminated_flows)
def DownloadCollectionFiles(self, collection, output_writer, prefix):
  """Download all files from the collection and deduplicate along the way."""
  hashes = set()
  for fd_urn_batch in utils.Grouper(self.ResultsToUrns(collection),
                                    self.BATCH_SIZE):
    self.HeartBeat()

    for fd in aff4.FACTORY.MultiOpen(fd_urn_batch, token=self.token):
      self.state.total_files += 1

      # Any file-like object with data in AFF4 should inherit AFF4Stream.
      if isinstance(fd, aff4.AFF4Stream):
        archive_path = os.path.join(prefix, *fd.urn.Split())
        self.state.archived_files += 1

        sha256_hash = fd.Get(fd.Schema.HASH, rdf_crypto.Hash()).sha256
        content_path = os.path.join(prefix, "hashes", str(sha256_hash))
        if sha256_hash not in hashes:
          # Make sure size of the original file is passed. It's required
          # when output_writer is StreamingTarWriter.
          st = os.stat_result((0644, 0, 0, 0, 0, 0, fd.size, 0, 0, 0))
          output_writer.WriteFromFD(fd, content_path, st=st)
          hashes.add(sha256_hash)
          self.Log("Written contents: " + content_path)

        up_prefix = "../" * len(fd.urn.Split())
        output_writer.WriteSymlink(up_prefix + content_path, archive_path)
        self.Log("Written symlink %s -> %s", archive_path,
                 up_prefix + content_path)
def Start(self):
  inactive_client_ttl = config_lib.CONFIG[
      "DataRetention.inactive_client_ttl"]
  if not inactive_client_ttl:
    self.Log("TTL not set - nothing to do...")
    return

  exception_label = config_lib.CONFIG[
      "DataRetention.inactive_client_ttl_exception_label"]

  index = aff4.FACTORY.Create(client_index.MAIN_INDEX,
                              aff4_type=client_index.ClientIndex,
                              mode="rw",
                              token=self.token)
  client_urns = index.LookupClients(["."])

  deadline = rdfvalue.RDFDatetime().Now() - inactive_client_ttl

  for client_group in utils.Grouper(client_urns, 1000):
    inactive_client_urns = []
    for client in aff4.FACTORY.MultiOpen(client_group, mode="r",
                                         aff4_type=aff4_grr.VFSGRRClient,
                                         token=self.token):
      if exception_label in client.GetLabelsNames():
        continue

      if client.Get(client.Schema.LAST) < deadline:
        inactive_client_urns.append(client.urn)

    aff4.FACTORY.MultiDelete(inactive_client_urns, token=self.token)
    self.HeartBeat()
def Start(self):
  tmp_ttl = config_lib.CONFIG["DataRetention.tmp_ttl"]
  if not tmp_ttl:
    self.Log("TTL not set - nothing to do...")
    return

  exception_label = config_lib.CONFIG[
      "DataRetention.tmp_ttl_exception_label"]

  tmp_root = aff4.FACTORY.Open("aff4:/tmp", mode="r", token=self.token)
  tmp_urns = list(tmp_root.ListChildren())

  deadline = rdfvalue.RDFDatetime().Now() - tmp_ttl

  for tmp_group in utils.Grouper(tmp_urns, 10000):
    expired_tmp_urns = []
    for tmp_obj in aff4.FACTORY.MultiOpen(tmp_group, mode="r",
                                          token=self.token):
      if exception_label in tmp_obj.GetLabelsNames():
        continue

      if tmp_obj.Get(tmp_obj.Schema.LAST) < deadline:
        expired_tmp_urns.append(tmp_obj.urn)

    aff4.FACTORY.MultiDelete(expired_tmp_urns, token=self.token)
    self.HeartBeat()
def _GenerateConvertedValues(self, converter, grr_messages):
  """Generates converted values using given converter from given messages.

  Groups values in batches of BATCH_SIZE size and applies the converter
  to each batch.

  Args:
    converter: ExportConverter instance.
    grr_messages: An iterable (a generator is assumed) with GRRMessage values.

  Yields:
    Values generated by the converter.

  Raises:
    ValueError: if any of the GrrMessage objects doesn't have "source" set.
  """
  for batch in utils.Grouper(grr_messages, self.BATCH_SIZE):
    metadata_items = self._GetMetadataForClients(
        [gm.source for gm in batch])
    batch_with_metadata = zip(metadata_items, [gm.payload for gm in batch])

    for result in converter.BatchConvert(batch_with_metadata,
                                         token=self.token):
      yield result
def Start(self): """Retrieve all the clients for the AbstractClientStatsCollectors.""" self.stats = aff4.FACTORY.Create( self.FILESTORE_STATS_URN, aff4_stats.FilestoreStats, mode="w", token=self.token) self._CreateConsumers() hashes = aff4.FACTORY.Open( self.HASH_PATH, token=self.token).ListChildren(limit=10**8) try: for urns in utils.Grouper(hashes, self.OPEN_FILES_LIMIT): for fd in aff4.FACTORY.MultiOpen( urns, mode="r", token=self.token, age=aff4.NEWEST_TIME): for consumer in self.consumers: consumer.ProcessFile(fd) self.HeartBeat() finally: for consumer in self.consumers: consumer.Save(self.stats) self.stats.Close()
def ProcessOneHunt(self, exceptions_by_hunt):
  """Reads results for one hunt and processes them."""
  hunt_results_urn, results = (
      hunts_results.HuntResultQueue.ClaimNotificationsForCollection(
          start_time=self.args.start_processing_time,
          token=self.token,
          lease_time=self.lifetime))
  logging.debug("Found %d results for hunt %s", len(results),
                hunt_results_urn)
  if not results:
    return 0

  hunt_urn = rdfvalue.RDFURN(hunt_results_urn.Dirname())
  batch_size = self.args.batch_size or self.DEFAULT_BATCH_SIZE
  metadata_urn = hunt_urn.Add("ResultsMetadata")
  exceptions_by_plugin = {}
  num_processed_for_hunt = 0
  collection_obj = implementation.GRRHunt.ResultCollectionForHID(
      hunt_urn, token=self.token)
  try:
    with aff4.FACTORY.OpenWithLock(
        metadata_urn, lease_time=600, token=self.token) as metadata_obj:
      all_plugins, used_plugins = self.LoadPlugins(metadata_obj)
      num_processed = int(
          metadata_obj.Get(metadata_obj.Schema.NUM_PROCESSED_RESULTS))
      for batch in utils.Grouper(results, batch_size):
        results = list(
            collection_obj.MultiResolve(
                [(ts, suffix) for (_, ts, suffix) in batch]))
        self.RunPlugins(hunt_urn, used_plugins, results,
                        exceptions_by_plugin)
        hunts_results.HuntResultQueue.DeleteNotifications(
            [record_id for (record_id, _, _) in batch], token=self.token)
        num_processed += len(batch)
        num_processed_for_hunt += len(batch)
        self.HeartBeat()
        metadata_obj.Set(
            metadata_obj.Schema.NUM_PROCESSED_RESULTS(num_processed))
        metadata_obj.UpdateLease(600)
        if self.CheckIfRunningTooLong():
          logging.warning("Run too long, stopping.")
          break

      metadata_obj.Set(metadata_obj.Schema.OUTPUT_PLUGINS(all_plugins))
      metadata_obj.Set(
          metadata_obj.Schema.NUM_PROCESSED_RESULTS(num_processed))
  except aff4.LockError:
    logging.warn("ProcessHuntResultCollectionsCronFlow: "
                 "Could not get lock on hunt metadata %s.", metadata_urn)
    return 0

  if exceptions_by_plugin:
    for plugin, exceptions in exceptions_by_plugin.items():
      exceptions_by_hunt.setdefault(hunt_urn, {}).setdefault(
          plugin, []).extend(exceptions)

  logging.debug("Processed %d results.", num_processed_for_hunt)
  return len(results)
def Convert(self, metadata, collection, token=None):
  if not collection:
    return

  for batch in utils.Grouper(collection, self.BATCH_SIZE):
    converted_batch = ConvertValues(metadata, batch, token=token,
                                    options=self.options)
    for v in converted_batch:
      yield v
def GetClientStates(self, client_list, client_chunk=50):
  """Take in a client list and return dicts with their age and hostname."""
  for client_group in utils.Grouper(client_list, client_chunk):
    for fd in aff4.FACTORY.MultiOpen(client_group, mode="r",
                                     aff4_type="VFSGRRClient",
                                     token=self.token):
      result = {}
      result["age"] = fd.Get(fd.Schema.PING)
      result["hostname"] = fd.Get(fd.Schema.HOSTNAME)
      yield (fd.urn, result)
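A hypothetical usage sketch of the generator above; `report` and `client_urns` are illustrative names for an object exposing GetClientStates and a list of client URNs:

# Assumption: PING and HOSTNAME values render sensibly via %s.
for urn, info in report.GetClientStates(client_urns, client_chunk=100):
  print("%s: hostname=%s, last ping=%s" % (urn, info["hostname"], info["age"]))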
def Generate(self, collection, token=None):
  """Generates archive from a given collection.

  Iterates the collection and generates an archive by yielding contents
  of every referenced AFF4Stream.

  Args:
    collection: Iterable with items that point to aff4 paths.
    token: User's ACLToken.

  Yields:
    Binary chunks comprising the generated archive.
  """
  hashes = set()
  for fd_urn_batch in utils.Grouper(self._ItemsToUrns(collection),
                                    self.BATCH_SIZE):
    for fd in aff4.FACTORY.MultiOpen(fd_urn_batch, token=token):
      self.total_files += 1

      # Any file-like object with data in AFF4 should inherit AFF4Stream.
      if isinstance(fd, aff4.AFF4Stream):
        archive_path = os.path.join(self.prefix, *fd.urn.Split())

        sha256_hash = fd.Get(fd.Schema.HASH, rdf_crypto.Hash()).sha256
        if not sha256_hash:
          continue
        self.archived_files += 1

        content_path = os.path.join(self.prefix, "hashes", str(sha256_hash))
        if sha256_hash not in hashes:
          # Make sure size of the original file is passed. It's required
          # when output_writer is StreamingTarWriter.
          st = os.stat_result((0644, 0, 0, 0, 0, 0, fd.size, 0, 0, 0))
          try:
            for chunk in self.archive_generator.WriteFromFD(
                fd, content_path, st=st):
              yield chunk
            hashes.add(sha256_hash)
          except Exception:  # pylint: disable=broad-except
            self.failed_files += 1
            continue

        up_prefix = "../" * len(fd.urn.Split())
        yield self.archive_generator.WriteSymlink(up_prefix + content_path,
                                                  archive_path)

  for chunk in self._WriteDescription():
    yield chunk

  yield self.archive_generator.Close()
def ProcessSingleTypeExportedValues(self, original_value_type,
                                    exported_values):
  first_value = next(exported_values, None)
  if not first_value:
    return

  if not isinstance(first_value, rdf_structs.RDFProtoStruct):
    raise ValueError("The SQLite plugin only supports export-protos")

  yield self.archive_generator.WriteFileHeader(
      "%s/%s_from_%s.sql" % (self.path_prefix,
                             first_value.__class__.__name__,
                             original_value_type.__name__))

  table_name = "%s.from_%s" % (first_value.__class__.__name__,
                               original_value_type.__name__)
  schema = self._GetSqliteSchema(first_value.__class__)

  # We will buffer the sql statements into an in-memory sql database before
  # dumping them to the zip archive. We rely on the PySQLite library for
  # string escaping.
  db_connection = sqlite3.connect(":memory:")
  db_cursor = db_connection.cursor()

  yield self.archive_generator.WriteFileChunk("BEGIN TRANSACTION;\n")

  with db_connection:
    buf = cStringIO.StringIO()
    buf.write("CREATE TABLE \"%s\" (\n " % table_name)
    column_types = [(k, v.sqlite_type) for k, v in schema.items()]
    buf.write(",\n ".join(["\"%s\" %s" % (k, v) for k, v in column_types]))
    buf.write("\n);")
    db_cursor.execute(buf.getvalue())

    yield self.archive_generator.WriteFileChunk(buf.getvalue() + "\n")
    self._InsertValueIntoDb(table_name, schema, first_value, db_cursor)

  for sql in self._FlushAllRows(db_connection, table_name):
    yield sql

  counter = 1
  for batch in utils.Grouper(exported_values, self.ROW_BATCH):
    counter += len(batch)

    with db_connection:
      for value in batch:
        self._InsertValueIntoDb(table_name, schema, value, db_cursor)
    for sql in self._FlushAllRows(db_connection, table_name):
      yield sql

  db_connection.close()
  yield self.archive_generator.WriteFileChunk("COMMIT;\n")
  yield self.archive_generator.WriteFileFooter()

  counts_for_original_type = self.export_counts.setdefault(
      original_value_type.__name__, dict())
  counts_for_original_type[first_value.__class__.__name__] = counter
def ProcessOneHunt(self, exceptions_by_hunt):
  """Reads results for one hunt and processes them."""
  hunt_results_urn, results = (
      hunts_results.HuntResultQueue.ClaimNotificationsForCollection(
          token=self.token, lease_time=self.lifetime))
  if not results:
    return 0

  hunt_urn = rdfvalue.RDFURN(hunt_results_urn.Dirname())
  batch_size = self.state.args.batch_size or self.DEFAULT_BATCH_SIZE
  metadata_urn = hunt_urn.Add("ResultsMetadata")
  exceptions_by_plugin = {}
  with aff4.FACTORY.OpenWithLock(hunt_results_urn,
                                 aff4_type="HuntResultCollection",
                                 lease_time=600,
                                 token=self.token) as collection_obj:
    with aff4.FACTORY.OpenWithLock(metadata_urn, lease_time=600,
                                   token=self.token) as metadata_obj:
      all_plugins, used_plugins = self.LoadPlugins(metadata_obj)
      num_processed = int(
          metadata_obj.Get(metadata_obj.Schema.NUM_PROCESSED_RESULTS))
      for batch in utils.Grouper(results, batch_size):
        results = list(
            collection_obj.MultiResolve(
                [(ts, suffix) for (_, ts, suffix) in batch]))
        self.RunPlugins(hunt_urn, used_plugins, results,
                        exceptions_by_plugin)
        hunts_results.HuntResultQueue.DeleteNotifications(
            [record_id for (record_id, _, _) in batch], token=self.token)
        num_processed += len(batch)
        self.HeartBeat()
        collection_obj.UpdateLease(600)
        metadata_obj.Set(
            metadata_obj.Schema.NUM_PROCESSED_RESULTS(num_processed))
        metadata_obj.UpdateLease(600)
        if self.CheckIfRunningTooLong():
          break

      self.FlushPlugins(hunt_urn, used_plugins, exceptions_by_plugin)
      metadata_obj.Set(metadata_obj.Schema.OUTPUT_PLUGINS(all_plugins))
      metadata_obj.Set(
          metadata_obj.Schema.NUM_PROCESSED_RESULTS(num_processed))

  if exceptions_by_plugin:
    for plugin, exceptions in exceptions_by_plugin.items():
      exceptions_by_hunt.setdefault(hunt_urn, {}).setdefault(
          plugin, []).extend(exceptions)

  return len(results)
def ProcessHuntResults(self, results):
  plugins_exceptions = None

  hunt_urn = results.Get(results.Schema.RESULTS_SOURCE)
  metadata_urn = hunt_urn.Add("ResultsMetadata")

  batch_size = self.state.args.batch_size or self.DEFAULT_BATCH_SIZE
  batches = utils.Grouper(
      results.GenerateUncompactedItems(
          max_reversed_results=self.MAX_REVERSED_RESULTS), batch_size)

  with aff4.FACTORY.Open(metadata_urn, mode="rw",
                         token=self.token) as metadata_obj:
    output_plugins = metadata_obj.Get(metadata_obj.Schema.OUTPUT_PLUGINS)
    num_processed = int(
        metadata_obj.Get(metadata_obj.Schema.NUM_PROCESSED_RESULTS))

    used_plugins = {}
    for batch_index, batch in enumerate(batches):
      batch = list(batch)
      num_processed += len(batch)

      if not used_plugins:
        for plugin_name, (plugin_def,
                          state) in output_plugins.data.iteritems():
          used_plugins[plugin_name] = plugin_def.GetPluginForState(state)

      plugins_exceptions = self.ApplyPluginsToBatch(hunt_urn, used_plugins,
                                                    batch, batch_index)

      self.HeartBeat()

      # If this flow has been running for longer than max_running_time,
      # stop processing.
      if self.CheckIfRunningTooLong():
        self.Log("Running for too long, skipping rest of batches for %s",
                 hunt_urn)
        break

    flush_exceptions = self.FlushPlugins(hunt_urn, used_plugins)
    plugins_exceptions.update(flush_exceptions)

    metadata_obj.Set(metadata_obj.Schema.OUTPUT_PLUGINS(output_plugins))
    metadata_obj.Set(metadata_obj.Schema.NUM_PROCESSED_RESULTS(num_processed))

  return plugins_exceptions
def GetInput(self):
  """Yield client urns."""
  client_list = GetAllClients(token=self.token)
  logging.debug("Got %d clients", len(client_list))

  for client_group in utils.Grouper(client_list, self.client_chunksize):
    for fd in aff4.FACTORY.MultiOpen(client_group, mode="r",
                                     aff4_type="VFSGRRClient",
                                     token=self.token):
      if isinstance(fd, aff4_grr.VFSGRRClient):
        # Skip if older than max_age
        oldest_time = (time.time() - self.max_age) * 1e6
        if fd.Get(aff4.VFSGRRClient.SchemaCls.PING) >= oldest_time:
          yield fd
def CleanVacuousVersions(clients=None, dry_run=True):
  """A script to remove no-op client versions.

  This script removes a client version when it is identical to the previous
  one, in the sense that no versioned attributes were changed since the
  previous client version.

  Args:
    clients: A list of ClientURN; if empty, cleans all clients.
    dry_run: Whether this is a dry run.
  """
  if not clients:
    index = client_index.CreateClientIndex()
    clients = index.LookupClients(["."])
  clients.sort()

  with data_store.DB.GetMutationPool() as pool:
    logging.info("checking %d clients", len(clients))
    for batch in utils.Grouper(clients, 10000):
      # TODO(amoser): This only works on datastores that use the Bigtable
      # scheme.
      client_infos = data_store.DB.MultiResolvePrefix(
          batch, ["aff4:", "aff4:"], data_store.DB.ALL_TIMESTAMPS)

      for client, type_list in client_infos:
        cleared = 0
        kept = 0
        updates = []
        for a, _, ts in type_list:
          if ts != 0:
            updates.append((ts, a))
        updates = sorted(updates)
        dirty = True
        for ts, a in updates:
          if a == "aff4:type":
            if dirty:
              kept += 1
              dirty = False
            else:
              cleared += 1
              if not dry_run:
                pool.DeleteAttributes(client, ["aff4:type"],
                                      start=ts, end=ts)
                if pool.Size() > 1000:
                  pool.Flush()
          else:
            dirty = True
        logging.info("%s: kept %d and cleared %d", client, kept, cleared)
def _ProcessValuesWithOutputPlugin(self, values, output_plugin, args):
  """Processes given values with given output plugin."""
  checkpoints = utils.Grouper(values, args.checkpoint_every)
  for index, checkpoint in enumerate(checkpoints):
    logging.info("Starting checkpoint %d.", index)

    batch_converter = HuntOutputPluginBatchConverter(
        batch_size=args.batch,
        threadpool_size=args.threads,
        output_plugin=output_plugin)
    batch_converter.Convert(checkpoint)

    logging.info("Checkpointing (checkpoint %d)...", index)
    output_plugin.Flush()
    logging.info("Checkpoint %d done.", index)
def IterateAllClientSnapshots(self, batch_size=50000):
  """Iterates over all available clients and yields client snapshot objects.

  Args:
    batch_size: Always reads <batch_size> snapshots at a time.

  Yields:
    An rdfvalues.objects.ClientSnapshot object for each client in the db.
  """
  all_client_ids = self.ReadAllClientIDs()

  for batch in utils.Grouper(all_client_ids, batch_size):
    res = self.MultiReadClientSnapshot(batch)
    for snapshot in res.values():
      if snapshot:
        yield snapshot
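A hypothetical usage sketch of the iterator above, assuming `db` exposes IterateAllClientSnapshots and that each snapshot carries a knowledge_base.os field:

os_counts = {}
for snapshot in db.IterateAllClientSnapshots(batch_size=10000):
  # knowledge_base.os is an assumption about the snapshot schema.
  os_name = snapshot.knowledge_base.os or "unknown"
  os_counts[os_name] = os_counts.get(os_name, 0) + 1
print(os_counts)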
def ProcessClients(self, unused_responses):
  """Does the work."""
  self.start = 0
  self.end = int(1e6 * (time.time() - self.MAX_AGE))

  client_urns = export_utils.GetAllClients(token=self.token)

  for batch in utils.Grouper(client_urns, 10000):
    with data_store.DB.GetMutationPool() as mutation_pool:
      for client_urn in batch:
        mutation_pool.DeleteAttributes(client_urn.Add("stats"),
                                       [u"aff4:stats"],
                                       start=self.start,
                                       end=self.end)
    self.HeartBeat()
def IterateAllClientsFullInfo(self, batch_size=50000, min_last_ping=None):
  """Iterates over all available clients and yields full info protobufs.

  Args:
    batch_size: Always reads <batch_size> client full infos at a time.
    min_last_ping: If not None, only clients with a last ping time greater
      than min_last_ping will be returned.

  Yields:
    An rdfvalues.objects.ClientFullInfo object for each client in the db.
  """
  all_client_ids = self.ReadAllClientIDs()

  for batch in utils.Grouper(all_client_ids, batch_size):
    res = self.MultiReadClientFullInfo(batch, min_last_ping=min_last_ping)
    for full_info in res.values():
      yield full_info
def _MultiStream(cls, fds):
  """Effectively streams data from multiple opened BlobImage objects.

  Args:
    fds: A list of opened AFF4Stream (or AFF4Stream descendants) objects.

  Yields:
    Tuples (fd, chunk, exception) where fd is an object from the fds
    argument and chunk is a binary blob of data. If one or more chunks are
    missing, exception is a MissingBlobsError object and chunk is None.
    _MultiStream does its best to skip the file entirely if one of its
    chunks is missing, but in case of very large files it's still possible
    to yield a truncated file.
  """
  broken_fds = set()
  missing_blobs_fd_pairs = []
  for chunk_fd_pairs in utils.Grouper(
      cls._GenerateChunkIds(fds), cls.MULTI_STREAM_CHUNKS_READ_AHEAD):
    results_map = data_store.DB.ReadBlobs(dict(chunk_fd_pairs).keys(),
                                          token=fds[0].token)

    for chunk_id, fd in chunk_fd_pairs:
      if chunk_id not in results_map or results_map[chunk_id] is None:
        missing_blobs_fd_pairs.append((chunk_id, fd))
        broken_fds.add(fd)

    for chunk, fd in chunk_fd_pairs:
      if fd in broken_fds:
        continue

      yield fd, results_map[chunk], None

  if missing_blobs_fd_pairs:
    missing_blobs_by_fd = {}
    for chunk_id, fd in missing_blobs_fd_pairs:
      missing_blobs_by_fd.setdefault(fd, []).append(chunk_id)

    for fd, missing_blobs in missing_blobs_by_fd.iteritems():
      e = MissingBlobsError(
          "%d missing blobs (multi-stream)" % len(missing_blobs),
          missing_chunks=missing_blobs)
      yield fd, None, e
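An illustrative consumer sketch, modeled on the archive generator later in this section; it assumes `fds` is a list of opened stream objects and that MultiStream dispatches to the method above, and `process_chunk` is a hypothetical callback:

for fd, chunk, exception in aff4.AFF4Stream.MultiStream(fds):
  if exception:
    # Blobs for this fd are missing; skip the file rather than emit
    # truncated data.
    logging.warning("Skipping %s: %s", fd.urn, exception)
    continue
  process_chunk(fd, chunk)  # Hypothetical per-chunk handler.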
class SuspendableListDirectory(actions.SuspendableAction):
  """Lists a directory as a suspendable client action."""
  in_rdfvalue = rdf_client.ListDirRequest
  out_rdfvalues = [rdf_client.StatEntry]

  def Iterate(self):
    try:
      fd = vfs.VFSOpen(self.request.pathspec,
                       progress_callback=self.Progress)
    except (IOError, OSError), e:
      self.SetStatus(rdf_flows.GrrStatus.ReturnedStatus.IOERROR, e)
      return

    length = self.request.iterator.number
    for group in utils.Grouper(fd.ListFiles(), length):
      for response in group:
        self.SendReply(response)

      self.Suspend()
def Convert(self, values, start_index=0, end_index=None):
  """Converts given collection to exported values.

  This method uses a threadpool to do the conversion in parallel. It
  blocks until everything is converted.

  Args:
    values: Iterable object with values to convert.
    start_index: Start from this index in the collection.
    end_index: Finish processing on the (index - 1) element of the
      collection. If None, work till the end of the collection.

  Returns:
    Nothing. ConvertedBatch() should handle the results.
  """
  if not values:
    return

  try:
    total_batch_count = len(values) / self.batch_size
  except TypeError:
    total_batch_count = -1

  pool = ThreadPool.Factory(self.threadpool_prefix, self.threadpool_size)
  val_iterator = itertools.islice(values, start_index, end_index)

  pool.Start()
  try:
    for batch_index, batch in enumerate(
        utils.Grouper(val_iterator, self.batch_size)):
      logging.debug("Processing batch %d out of %d", batch_index,
                    total_batch_count)

      pool.AddTask(
          target=self.ConvertBatch,
          args=(batch,),
          name="batch_%d" % batch_index,
          inline=False)
  finally:
    pool.Stop()
def Execute(self, thread_count):
  """Runs the migration procedure.

  Args:
    thread_count: A number of threads to execute the migration with.

  Raises:
    AssertionError: If not all clients have been migrated.
    ValueError: If the relational database backend is not available.
  """
  if not data_store.RelationalDBWriteEnabled():
    raise ValueError("No relational database available.")

  sys.stdout.write("Collecting clients...\n")
  client_urns = _GetClientUrns()

  sys.stdout.write("Clients to migrate: {}\n".format(len(client_urns)))
  sys.stdout.write("Threads to use: {}\n".format(thread_count))

  self._total_count = len(client_urns)
  self._migrated_count = 0
  self._start_time = rdfvalue.RDFDatetime.Now()

  batches = utils.Grouper(client_urns, _CLIENT_BATCH_SIZE)

  self._Progress()
  tp = pool.ThreadPool(processes=thread_count)
  tp.map(self._MigrateBatch, list(batches))
  self._Progress()

  if self._migrated_count == self._total_count:
    message = "\nMigration has been finished (migrated {} clients).\n".format(
        self._migrated_count)
    sys.stdout.write(message)
  else:
    message = "Not all clients have been migrated ({}/{})".format(
        self._migrated_count, self._total_count)
    raise AssertionError(message)
def ProcessSingleTypeExportedValues(self, original_value_type,
                                    exported_values):
  first_value = next(exported_values, None)
  if not first_value:
    return

  yield self.archive_generator.WriteFileHeader(
      "%s/%s/from_%s.csv" % (self.path_prefix,
                             first_value.__class__.__name__,
                             original_value_type.__name__))

  buf = cStringIO.StringIO()
  writer = csv.writer(buf)
  # Write the CSV header based on the first value's class, then write the
  # first value itself. All other values are guaranteed to have the same
  # class (see ProcessSingleTypeExportedValues definition).
  writer.writerow(self._GetCSVHeader(first_value.__class__))
  writer.writerow(self._GetCSVRow(first_value))
  yield self.archive_generator.WriteFileChunk(buf.getvalue())

  # Counter starts from 1, as 1 value has already been written.
  counter = 1
  for batch in utils.Grouper(exported_values, self.ROW_BATCH):
    counter += len(batch)

    buf = cStringIO.StringIO()
    writer = csv.writer(buf)
    for value in batch:
      writer.writerow(self._GetCSVRow(value))

    yield self.archive_generator.WriteFileChunk(buf.getvalue())

  yield self.archive_generator.WriteFileFooter()
  self.export_counts.setdefault(
      original_value_type.__name__,
      dict())[first_value.__class__.__name__] = counter
def DeprecatedProcessHunt(self, session_id):
  metadata_urn = session_id.Add("ResultsMetadata")
  last_exception = None

  with aff4.FACTORY.Open(metadata_urn, mode="rw",
                         token=self.token) as metadata_obj:
    output_plugins = metadata_obj.Get(metadata_obj.Schema.OUTPUT_PLUGINS)
    num_processed = int(
        metadata_obj.Get(metadata_obj.Schema.NUM_PROCESSED_RESULTS))
    raw_offset = int(
        metadata_obj.Get(
            metadata_obj.Schema.DEPRECATED_COLLECTION_RAW_OFFSET))
    results = aff4.FACTORY.Open(session_id.Add("Results"), mode="r",
                                token=self.token)

    batch_size = self.state.args.batch_size or self.DEFAULT_BATCH_SIZE
    batches = utils.Grouper(results.GenerateItems(offset=raw_offset),
                            batch_size)

    used_plugins = {}
    for batch_index, batch in enumerate(batches):
      if not used_plugins:
        for plugin_name, (plugin_def,
                          state) in output_plugins.data.iteritems():
          used_plugins[plugin_name] = plugin_def.GetPluginForState(state)

      # If this flow has been running for longer than max_running_time,
      # stop processing.
      if self.state.args.max_running_time:
        elapsed = (rdfvalue.RDFDatetime().Now().AsSecondsFromEpoch() -
                   self.start_time.AsSecondsFromEpoch())
        if elapsed > self.state.args.max_running_time:
          self.Log("Running for too long, skipping rest of batches for %s.",
                   session_id)
          break

      batch = list(batch)
      num_processed += len(batch)

      for plugin_name, plugin in used_plugins.iteritems():
        logging.debug("Processing hunt %s with %s, batch %d", session_id,
                      plugin_name, batch_index)

        try:
          plugin.ProcessResponses(batch)
        except Exception as e:  # pylint: disable=broad-except
          logging.exception("Error processing hunt results: hunt %s, "
                            "plugin %s, batch %d", session_id, plugin_name,
                            batch_index)
          self.Log("Error processing hunt results (hunt %s, "
                   "plugin %s, batch %d): %s" %
                   (session_id, plugin_name, batch_index, e))
          last_exception = e

      self.HeartBeat()

    for plugin in used_plugins.itervalues():
      try:
        plugin.Flush()
      except Exception as e:  # pylint: disable=broad-except
        logging.exception("Error flushing hunt results: hunt %s, "
                          "plugin %s", session_id, str(plugin))
        self.Log("Error processing hunt results (hunt %s, "
                 "plugin %s): %s" % (session_id, str(plugin), e))
        last_exception = e

    metadata_obj.Set(metadata_obj.Schema.OUTPUT_PLUGINS(output_plugins))
    metadata_obj.Set(
        metadata_obj.Schema.NUM_PROCESSED_RESULTS(num_processed))
    metadata_obj.Set(
        metadata_obj.Schema.DEPRECATED_COLLECTION_RAW_OFFSET(
            results.deprecated_current_offset))

  # TODO(user): throw proper exception which will contain all the
  # exceptions that were raised while processing this hunt.
  if last_exception:
    raise last_exception  # pylint: disable=raising-bad-type
def Generate(self, collection, token=None):
  """Generates archive from a given collection.

  Iterates the collection and generates an archive by yielding contents
  of every referenced AFF4Stream.

  Args:
    collection: Iterable with items that point to aff4 paths.
    token: User's ACLToken.

  Yields:
    Binary chunks comprising the generated archive.
  """
  hashes = set()
  for fd_urn_batch in utils.Grouper(self._ItemsToUrns(collection),
                                    self.BATCH_SIZE):
    fds_to_write = {}
    for fd in aff4.FACTORY.MultiOpen(fd_urn_batch, token=token):
      self.total_files += 1

      if not self.predicate(fd):
        self.ignored_files.append(utils.SmartUnicode(fd.urn))
        continue

      # Any file-like object with data in AFF4 should inherit AFF4Stream.
      if isinstance(fd, aff4.AFF4Stream):
        archive_path = os.path.join(self.prefix, *fd.urn.Split())

        sha256_hash = fd.Get(fd.Schema.HASH, rdf_crypto.Hash()).sha256
        if not sha256_hash:
          continue
        self.archived_files += 1

        content_path = os.path.join(self.prefix, "hashes", str(sha256_hash))
        if sha256_hash not in hashes:
          # Make sure size of the original file is passed. It's required
          # when output_writer is StreamingTarWriter.
          st = os.stat_result((0644, 0, 0, 0, 0, 0, fd.size, 0, 0, 0))
          fds_to_write[fd] = (content_path, st)
          hashes.add(sha256_hash)

        up_prefix = "../" * len(fd.urn.Split())
        yield self.archive_generator.WriteSymlink(up_prefix + content_path,
                                                  archive_path)

    if fds_to_write:
      prev_fd = None
      for fd, chunk, exception in aff4.AFF4Stream.MultiStream(fds_to_write):
        if exception:
          logging.exception(exception)

          self.archived_files -= 1
          self.failed_files.append(utils.SmartUnicode(fd.urn))
          continue

        if prev_fd != fd:
          if prev_fd:
            yield self.archive_generator.WriteFileFooter()
          prev_fd = fd

          content_path, st = fds_to_write[fd]
          yield self.archive_generator.WriteFileHeader(content_path, st=st)

        yield self.archive_generator.WriteFileChunk(chunk)

      if self.archive_generator.is_file_write_in_progress:
        yield self.archive_generator.WriteFileFooter()

  for chunk in self._WriteDescription():
    yield chunk

  yield self.archive_generator.Close()