Example #1
    def _GenerateConvertedValues(self, converter, grr_messages):
        """Generates converted values using given converter from given messages.

    Groups values in batches of BATCH_SIZE size and applies the converter
    to each batch.

    Args:
      converter: ExportConverter instance.
      grr_messages: An iterable (a generator is assumed) with GRRMessage values.

    Yields:
      Values generated by the converter.

    Raises:
      ValueError: if any of the GrrMessage objects doesn't have "source" set.
    """
        for batch in utils.Grouper(grr_messages, self.BATCH_SIZE):
            batch_with_metadata = []
            for grr_message in batch:
                if not grr_message.source:
                    raise ValueError("GrrMessage's source can't be empty")

                metadata = self.GetDefaultMetadata()
                metadata.client_urn = grr_message.source
                batch_with_metadata.append((metadata, grr_message.payload))

            for result in converter.BatchConvert(batch_with_metadata,
                                                 token=self.token):
                yield result
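All of the examples on this page rely on utils.Grouper(iterable, size), which, judging from how it is used here, slices an arbitrary iterable (lists as well as generators) into consecutive batches of at most size items, with a shorter final batch. A minimal sketch of that assumed behavior, not GRR's actual implementation, could look like this:

import itertools


def grouper(iterable, size):
    """Yield consecutive lists of at most `size` items from `iterable`.

    A sketch of the behavior the examples assume from utils.Grouper; it works
    for generators as well as lists, and the last batch may be shorter.
    """
    iterator = iter(iterable)
    while True:
        batch = list(itertools.islice(iterator, size))
        if not batch:
            return
        yield batch


# Example: grouper(range(8), 3) yields [0, 1, 2], [3, 4, 5], [6, 7].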
Example #2
def _GetHWInfos(client_list, batch_size=10000, token=None):
    """Opens the given clients in batches and returns hardware information."""

    # This function returns a dict mapping each client_id to the set of
    # hardware serial numbers reported by that client.
    hw_infos = {}

    logging.info("%d clients to process.", len(client_list))

    c = 0

    for batch in utils.Grouper(client_list, batch_size):
        logging.info("Processing batch: %d-%d", c, c + batch_size)
        c += len(batch)

        client_objs = aff4.FACTORY.MultiOpen(batch,
                                             age=aff4.ALL_TIMES,
                                             token=token)

        for client in client_objs:
            hwi = client.GetValuesForAttribute(client.Schema.HARDWARE_INFO)

            hw_infos[client.urn] = set(["%s" % x.serial_number for x in hwi])

    return hw_infos
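The returned hw_infos dict maps each client URN to the set of serial numbers that client reported over time. A typical follow-up step, sketched here as a hypothetical helper that is not part of the original code, is to invert that mapping and find serial numbers shared by more than one client:

def FindSharedSerials(hw_infos):
    """Hypothetical helper: map serial number -> set of clients reporting it."""
    clients_by_serial = {}
    for client_urn, serials in hw_infos.items():
        for serial in serials:
            clients_by_serial.setdefault(serial, set()).add(client_urn)
    # Keep only serial numbers reported by more than one client.
    return {serial: clients
            for serial, clients in clients_by_serial.items()
            if len(clients) > 1}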
Example #3
  def ProcessHuntResults(self, results, freeze_timestamp):
    plugins_exceptions = {}

    hunt_urn = results.Get(results.Schema.RESULTS_SOURCE)
    metadata_urn = hunt_urn.Add("ResultsMetadata")

    batch_size = self.state.args.batch_size or self.DEFAULT_BATCH_SIZE
    batches = utils.Grouper(results.GenerateUncompactedItems(
        max_reversed_results=self.MAX_REVERSED_RESULTS,
        timestamp=freeze_timestamp), batch_size)

    with aff4.FACTORY.Open(
        metadata_urn, mode="rw", token=self.token) as metadata_obj:

      output_plugins = metadata_obj.Get(metadata_obj.Schema.OUTPUT_PLUGINS)
      num_processed = int(metadata_obj.Get(
          metadata_obj.Schema.NUM_PROCESSED_RESULTS))

      used_plugins = []
      for batch_index, batch in enumerate(batches):
        batch = list(batch)
        num_processed += len(batch)

        if not used_plugins:
          for _, (plugin_def, state) in output_plugins.data.iteritems():
            # TODO(user): Remove as soon as migration to new-style
            # output plugins is completed.
            if not hasattr(plugin_def, "GetPluginForState"):
              logging.error("Invalid plugin_def: %s", plugin_def)
              continue

            used_plugins.append((plugin_def,
                                 plugin_def.GetPluginForState(state)))

        batch_exceptions = self.ApplyPluginsToBatch(hunt_urn, used_plugins,
                                                    batch, batch_index)
        if batch_exceptions:
          for key, value in batch_exceptions.items():
            plugins_exceptions.setdefault(key, []).append(value)

        self.HeartBeat()

        # If this flow has been running for longer than max_running_time,
        # stop processing.
        if self.CheckIfRunningTooLong():
          self.Log("Running for too long, skipping rest of batches for %s",
                   hunt_urn)
          break

      if not used_plugins:
        logging.debug("Got notification, but no results were processed for %s.",
                      hunt_urn)

      flush_exceptions = self.FlushPlugins(hunt_urn, used_plugins)
      plugins_exceptions.update(flush_exceptions)

      metadata_obj.Set(metadata_obj.Schema.OUTPUT_PLUGINS(output_plugins))
      metadata_obj.Set(metadata_obj.Schema.NUM_PROCESSED_RESULTS(num_processed))

      return plugins_exceptions
Example #4
  def ProcessSingleTypeExportedValues(self, original_value_type,
                                      exported_values):
    first_value = next(exported_values, None)
    if not first_value:
      return

    yield self.archive_generator.WriteFileHeader(
        "%s/%s/from_%s.yaml" % (self.path_prefix,
                                first_value.__class__.__name__,
                                original_value_type.__name__))
    yield self.archive_generator.WriteFileChunk(_SerializeToYaml(first_value))
    counter = 1
    for batch in utils.Grouper(exported_values, self.ROW_BATCH):
      counter += len(batch)
      buf = cStringIO.StringIO()
      for value in batch:
        buf.write("\n")
        buf.write(_SerializeToYaml(value))

      yield self.archive_generator.WriteFileChunk(buf.getvalue())
    yield self.archive_generator.WriteFileFooter()

    counts_for_original_type = self.export_counts.setdefault(
        original_value_type.__name__, dict())
    counts_for_original_type[first_value.__class__.__name__] = counter
Example #5
    def Stop(self, reason=None):
        super(GenericHunt, self).Stop(reason=reason)

        started_flows = grr_collections.RDFUrnCollection(
            self.started_flows_collection_urn)

        num_terminated_flows = 0
        self.Log("Hunt stop. Terminating all the started flows.")

        # Delete hunt flows states.
        for flows_batch in utils.Grouper(started_flows,
                                         self.__class__.STOP_BATCH_SIZE):
            with queue_manager.QueueManager(token=self.token) as manager:
                manager.MultiDestroyFlowStates(flows_batch)

            with data_store.DB.GetMutationPool() as mutation_pool:
                for f in flows_batch:
                    flow.GRRFlow.MarkForTermination(
                        f,
                        reason="Parent hunt stopped.",
                        mutation_pool=mutation_pool)

            num_terminated_flows += len(flows_batch)

        # Delete hunt's requests and responses to ensure no more
        # processing is going to occur.
        with queue_manager.QueueManager(token=self.token) as manager:
            manager.DestroyFlowStates(self.session_id)

        self.Log("%d flows terminated.", num_terminated_flows)
Example #6
  def DownloadCollectionFiles(self, collection, output_writer, prefix):
    """Download all files from the collection and deduplicate along the way."""

    hashes = set()
    for fd_urn_batch in utils.Grouper(self.ResultsToUrns(collection),
                                      self.BATCH_SIZE):
      self.HeartBeat()

      for fd in aff4.FACTORY.MultiOpen(fd_urn_batch, token=self.token):
        self.state.total_files += 1

        # Any file-like object with data in AFF4 should inherit AFF4Stream.
        if isinstance(fd, aff4.AFF4Stream):
          archive_path = os.path.join(prefix, *fd.urn.Split())
          self.state.archived_files += 1

          sha256_hash = fd.Get(fd.Schema.HASH, rdf_crypto.Hash()).sha256
          content_path = os.path.join(prefix, "hashes", str(sha256_hash))
          if sha256_hash not in hashes:
            # Make sure the size of the original file is passed. It's required
            # when output_writer is a StreamingTarWriter.
            st = os.stat_result((0644, 0, 0, 0, 0, 0, fd.size, 0, 0, 0))
            output_writer.WriteFromFD(fd, content_path, st=st)
            hashes.add(sha256_hash)
            self.Log("Written contents: " + content_path)

          up_prefix = "../" * len(fd.urn.Split())
          output_writer.WriteSymlink(up_prefix + content_path, archive_path)
          self.Log("Written symlink %s -> %s", archive_path,
                   up_prefix + content_path)
Example #7
  def Start(self):
    inactive_client_ttl = config_lib.CONFIG["DataRetention.inactive_client_ttl"]
    if not inactive_client_ttl:
      self.Log("TTL not set - nothing to do...")
      return

    exception_label = config_lib.CONFIG[
        "DataRetention.inactive_client_ttl_exception_label"]

    index = aff4.FACTORY.Create(client_index.MAIN_INDEX,
                                aff4_type=client_index.ClientIndex,
                                mode="rw",
                                token=self.token)

    client_urns = index.LookupClients(["."])

    deadline = rdfvalue.RDFDatetime().Now() - inactive_client_ttl

    for client_group in utils.Grouper(client_urns, 1000):
      inactive_client_urns = []
      for client in aff4.FACTORY.MultiOpen(client_group, mode="r",
                                           aff4_type=aff4_grr.VFSGRRClient,
                                           token=self.token):
        if exception_label in client.GetLabelsNames():
          continue

        if client.Get(client.Schema.LAST) < deadline:
          inactive_client_urns.append(client.urn)

      aff4.FACTORY.MultiDelete(inactive_client_urns, token=self.token)
      self.HeartBeat()
Example #8
  def Start(self):
    tmp_ttl = config_lib.CONFIG["DataRetention.tmp_ttl"]
    if not tmp_ttl:
      self.Log("TTL not set - nothing to do...")
      return

    exception_label = config_lib.CONFIG[
        "DataRetention.tmp_ttl_exception_label"]

    tmp_root = aff4.FACTORY.Open("aff4:/tmp", mode="r", token=self.token)
    tmp_urns = list(tmp_root.ListChildren())

    deadline = rdfvalue.RDFDatetime().Now() - tmp_ttl

    for tmp_group in utils.Grouper(tmp_urns, 10000):
      expired_tmp_urns = []
      for tmp_obj in aff4.FACTORY.MultiOpen(tmp_group, mode="r",
                                            token=self.token):
        if exception_label in tmp_obj.GetLabelsNames():
          continue

        if tmp_obj.Get(tmp_obj.Schema.LAST) < deadline:
          expired_tmp_urns.append(tmp_obj.urn)

      aff4.FACTORY.MultiDelete(expired_tmp_urns, token=self.token)
      self.HeartBeat()
Example #9
    def _GenerateConvertedValues(self, converter, grr_messages):
        """Generates converted values using given converter from given messages.

    Groups values in batches of BATCH_SIZE size and applies the converter
    to each batch.

    Args:
      converter: ExportConverter instance.
      grr_messages: An iterable (a generator is assumed) with GRRMessage values.

    Yields:
      Values generated by the converter.

    Raises:
      ValueError: if any of the GrrMessage objects doesn't have "source" set.
    """
        for batch in utils.Grouper(grr_messages, self.BATCH_SIZE):
            metadata_items = self._GetMetadataForClients(
                [gm.source for gm in batch])
            batch_with_metadata = zip(metadata_items,
                                      [gm.payload for gm in batch])

            for result in converter.BatchConvert(batch_with_metadata,
                                                 token=self.token):
                yield result
Example #10
  def Start(self):
    """Retrieve all the clients for the AbstractClientStatsCollectors."""
    self.stats = aff4.FACTORY.Create(
        self.FILESTORE_STATS_URN,
        aff4_stats.FilestoreStats,
        mode="w",
        token=self.token)

    self._CreateConsumers()
    hashes = aff4.FACTORY.Open(
        self.HASH_PATH, token=self.token).ListChildren(limit=10**8)

    try:
      for urns in utils.Grouper(hashes, self.OPEN_FILES_LIMIT):
        for fd in aff4.FACTORY.MultiOpen(
            urns, mode="r", token=self.token, age=aff4.NEWEST_TIME):

          for consumer in self.consumers:
            consumer.ProcessFile(fd)
        self.HeartBeat()

    finally:
      for consumer in self.consumers:
        consumer.Save(self.stats)
      self.stats.Close()
Example #11
  def ProcessOneHunt(self, exceptions_by_hunt):
    """Reads results for one hunt and process them."""
    hunt_results_urn, results = (
        hunts_results.HuntResultQueue.ClaimNotificationsForCollection(
            start_time=self.args.start_processing_time,
            token=self.token,
            lease_time=self.lifetime))
    logging.debug("Found %d results for hunt %s", len(results),
                  hunt_results_urn)
    if not results:
      return 0

    hunt_urn = rdfvalue.RDFURN(hunt_results_urn.Dirname())
    batch_size = self.args.batch_size or self.DEFAULT_BATCH_SIZE
    metadata_urn = hunt_urn.Add("ResultsMetadata")
    exceptions_by_plugin = {}
    num_processed_for_hunt = 0
    collection_obj = implementation.GRRHunt.ResultCollectionForHID(
        hunt_urn, token=self.token)
    try:
      with aff4.FACTORY.OpenWithLock(
          metadata_urn, lease_time=600, token=self.token) as metadata_obj:
        all_plugins, used_plugins = self.LoadPlugins(metadata_obj)
        num_processed = int(
            metadata_obj.Get(metadata_obj.Schema.NUM_PROCESSED_RESULTS))
        for batch in utils.Grouper(results, batch_size):
          results = list(
              collection_obj.MultiResolve([(ts, suffix)
                                           for (_, ts, suffix) in batch]))
          self.RunPlugins(hunt_urn, used_plugins, results, exceptions_by_plugin)

          hunts_results.HuntResultQueue.DeleteNotifications(
              [record_id for (record_id, _, _) in batch], token=self.token)
          num_processed += len(batch)
          num_processed_for_hunt += len(batch)
          self.HeartBeat()
          metadata_obj.Set(
              metadata_obj.Schema.NUM_PROCESSED_RESULTS(num_processed))
          metadata_obj.UpdateLease(600)
          if self.CheckIfRunningTooLong():
            logging.warning("Run too long, stopping.")
            break

        metadata_obj.Set(metadata_obj.Schema.OUTPUT_PLUGINS(all_plugins))
        metadata_obj.Set(
            metadata_obj.Schema.NUM_PROCESSED_RESULTS(num_processed))
    except aff4.LockError:
      logging.warn("ProcessHuntResultCollectionsCronFlow: "
                   "Could not get lock on hunt metadata %s.", metadata_urn)
      return 0

    if exceptions_by_plugin:
      for plugin, exceptions in exceptions_by_plugin.items():
        exceptions_by_hunt.setdefault(hunt_urn, {}).setdefault(
            plugin, []).extend(exceptions)

    logging.debug("Processed %d results.", num_processed_for_hunt)
    return len(results)
Example #12
  def Convert(self, metadata, collection, token=None):
    if not collection:
      return

    for batch in utils.Grouper(collection, self.BATCH_SIZE):
      converted_batch = ConvertValues(metadata, batch, token=token,
                                      options=self.options)
      for v in converted_batch:
        yield v
Example #13
 def GetClientStates(self, client_list, client_chunk=50):
   """Take in a client list and return dicts with their age and hostname."""
   for client_group in utils.Grouper(client_list, client_chunk):
     for fd in aff4.FACTORY.MultiOpen(client_group, mode="r",
                                      aff4_type="VFSGRRClient",
                                      token=self.token):
       result = {}
       result["age"] = fd.Get(fd.Schema.PING)
       result["hostname"] = fd.Get(fd.Schema.HOSTNAME)
       yield (fd.urn, result)
Example #14
    def Generate(self, collection, token=None):
        """Generates archive from a given collection.

    Iterates the collection and generates an archive by yielding contents
    of every referenced AFF4Stream.

    Args:
      collection: Iterable with items that point to aff4 paths.
      token: User's ACLToken.

    Yields:
      Binary chunks comprising the generated archive.
    """
        hashes = set()
        for fd_urn_batch in utils.Grouper(self._ItemsToUrns(collection),
                                          self.BATCH_SIZE):

            for fd in aff4.FACTORY.MultiOpen(fd_urn_batch, token=token):
                self.total_files += 1

                # Any file-like object with data in AFF4 should inherit AFF4Stream.
                if isinstance(fd, aff4.AFF4Stream):
                    archive_path = os.path.join(self.prefix, *fd.urn.Split())

                    sha256_hash = fd.Get(fd.Schema.HASH,
                                         rdf_crypto.Hash()).sha256
                    if not sha256_hash:
                        continue
                    self.archived_files += 1

                    content_path = os.path.join(self.prefix, "hashes",
                                                str(sha256_hash))
                    if sha256_hash not in hashes:
                        # Make sure the size of the original file is passed. It's
                        # required when the underlying writer is a StreamingTarWriter.
                        st = os.stat_result(
                            (0644, 0, 0, 0, 0, 0, fd.size, 0, 0, 0))
                        try:
                            for chunk in self.archive_generator.WriteFromFD(
                                    fd, content_path, st=st):
                                yield chunk

                            hashes.add(sha256_hash)
                        except Exception:  # pylint: disable=broad-except
                            self.failed_files += 1
                            continue

                    up_prefix = "../" * len(fd.urn.Split())
                    yield self.archive_generator.WriteSymlink(
                        up_prefix + content_path, archive_path)

        for chunk in self._WriteDescription():
            yield chunk

        yield self.archive_generator.Close()
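Because Generate() yields the archive as a stream of binary chunks, a caller can write the result straight to disk (or into an HTTP response) without ever buffering the whole archive in memory. A minimal consumption sketch, assuming generator is an instance of the class above and that collection and token come from the surrounding code:

# Hypothetical usage: stream the generated archive into a local file
# (the appropriate extension depends on the underlying archive writer).
with open("results_archive", "wb") as out:
    for chunk in generator.Generate(collection, token=token):
        out.write(chunk)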
Example #15
    def ProcessSingleTypeExportedValues(self, original_value_type,
                                        exported_values):
        first_value = next(exported_values, None)
        if not first_value:
            return

        if not isinstance(first_value, rdf_structs.RDFProtoStruct):
            raise ValueError("The SQLite plugin only supports export-protos")
        yield self.archive_generator.WriteFileHeader(
            "%s/%s_from_%s.sql" %
            (self.path_prefix, first_value.__class__.__name__,
             original_value_type.__name__))
        table_name = "%s.from_%s" % (first_value.__class__.__name__,
                                     original_value_type.__name__)
        schema = self._GetSqliteSchema(first_value.__class__)

        # We buffer the SQL statements in an in-memory SQLite database before
        # dumping them to the zip archive. We rely on the PySQLite library for
        # string escaping.
        db_connection = sqlite3.connect(":memory:")
        db_cursor = db_connection.cursor()

        yield self.archive_generator.WriteFileChunk("BEGIN TRANSACTION;\n")
        with db_connection:
            buf = cStringIO.StringIO()
            buf.write("CREATE TABLE \"%s\" (\n  " % table_name)
            column_types = [(k, v.sqlite_type) for k, v in schema.items()]
            buf.write(",\n  ".join(
                ["\"%s\" %s" % (k, v) for k, v in column_types]))
            buf.write("\n);")
            db_cursor.execute(buf.getvalue())
            yield self.archive_generator.WriteFileChunk(buf.getvalue() + "\n")
            self._InsertValueIntoDb(table_name, schema, first_value, db_cursor)

        for sql in self._FlushAllRows(db_connection, table_name):
            yield sql
        counter = 1
        for batch in utils.Grouper(exported_values, self.ROW_BATCH):
            counter += len(batch)
            with db_connection:
                for value in batch:
                    self._InsertValueIntoDb(table_name, schema, value,
                                            db_cursor)
            for sql in self._FlushAllRows(db_connection, table_name):
                yield sql

        db_connection.close()
        yield self.archive_generator.WriteFileChunk("COMMIT;\n")
        yield self.archive_generator.WriteFileFooter()

        counts_for_original_type = self.export_counts.setdefault(
            original_value_type.__name__, dict())
        counts_for_original_type[first_value.__class__.__name__] = counter
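The plugin writes plain SQL text into the archive: a BEGIN TRANSACTION, the CREATE TABLE statement, the flushed rows, and a final COMMIT. The resulting .sql file can therefore be replayed into a fresh SQLite database for ad-hoc querying. A sketch of that post-processing step, assuming the file has been extracted from the archive as dump.sql:

import sqlite3

# Hypothetical post-processing: rebuild a queryable database from the dump.
with open("dump.sql") as f:
    sql_dump = f.read()

connection = sqlite3.connect("exported.db")
connection.executescript(sql_dump)  # Runs BEGIN / CREATE TABLE / INSERTs / COMMIT.
connection.close()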
Example #16
    def ProcessOneHunt(self, exceptions_by_hunt):
        """Reads results for one hunt and process them."""
        hunt_results_urn, results = (
            hunts_results.HuntResultQueue.ClaimNotificationsForCollection(
                token=self.token, lease_time=self.lifetime))
        if not results:
            return 0

        hunt_urn = rdfvalue.RDFURN(hunt_results_urn.Dirname())
        batch_size = self.state.args.batch_size or self.DEFAULT_BATCH_SIZE
        metadata_urn = hunt_urn.Add("ResultsMetadata")
        exceptions_by_plugin = {}
        with aff4.FACTORY.OpenWithLock(hunt_results_urn,
                                       aff4_type="HuntResultCollection",
                                       lease_time=600,
                                       token=self.token) as collection_obj:
            with aff4.FACTORY.OpenWithLock(metadata_urn,
                                           lease_time=600,
                                           token=self.token) as metadata_obj:
                all_plugins, used_plugins = self.LoadPlugins(metadata_obj)
                num_processed = int(
                    metadata_obj.Get(
                        metadata_obj.Schema.NUM_PROCESSED_RESULTS))
                for batch in utils.Grouper(results, batch_size):
                    results = list(
                        collection_obj.MultiResolve([
                            (ts, suffix) for (_, ts, suffix) in batch
                        ]))
                    self.RunPlugins(hunt_urn, used_plugins, results,
                                    exceptions_by_plugin)

                    hunts_results.HuntResultQueue.DeleteNotifications(
                        [record_id for (record_id, _, _) in batch],
                        token=self.token)
                    num_processed += len(batch)
                    self.HeartBeat()
                    collection_obj.UpdateLease(600)
                    metadata_obj.Set(
                        metadata_obj.Schema.NUM_PROCESSED_RESULTS(
                            num_processed))
                    metadata_obj.UpdateLease(600)
                    if self.CheckIfRunningTooLong():
                        break
                self.FlushPlugins(hunt_urn, used_plugins, exceptions_by_plugin)
                metadata_obj.Set(
                    metadata_obj.Schema.OUTPUT_PLUGINS(all_plugins))
                metadata_obj.Set(
                    metadata_obj.Schema.NUM_PROCESSED_RESULTS(num_processed))
        if exceptions_by_plugin:
            for plugin, exceptions in exceptions_by_plugin.items():
                exceptions_by_hunt.setdefault(hunt_urn, {}).setdefault(
                    plugin, []).extend(exceptions)
        return len(results)
Example #17
    def ProcessHuntResults(self, results):
        plugins_exceptions = None

        hunt_urn = results.Get(results.Schema.RESULTS_SOURCE)
        metadata_urn = hunt_urn.Add("ResultsMetadata")

        batch_size = self.state.args.batch_size or self.DEFAULT_BATCH_SIZE
        batches = utils.Grouper(
            results.GenerateUncompactedItems(
                max_reversed_results=self.MAX_REVERSED_RESULTS), batch_size)

        with aff4.FACTORY.Open(metadata_urn, mode="rw",
                               token=self.token) as metadata_obj:

            output_plugins = metadata_obj.Get(
                metadata_obj.Schema.OUTPUT_PLUGINS)
            num_processed = int(
                metadata_obj.Get(metadata_obj.Schema.NUM_PROCESSED_RESULTS))

            used_plugins = {}
            for batch_index, batch in enumerate(batches):
                batch = list(batch)
                num_processed += len(batch)

                if not used_plugins:
                    for plugin_name, (
                            plugin_def,
                            state) in output_plugins.data.iteritems():
                        used_plugins[
                            plugin_name] = plugin_def.GetPluginForState(state)

                plugins_exceptions = self.ApplyPluginsToBatch(
                    hunt_urn, used_plugins, batch, batch_index)
                self.HeartBeat()

                # If this flow has been running for longer than
                # max_running_time, stop processing.
                if self.CheckIfRunningTooLong():
                    self.Log(
                        "Running for too long, skipping rest of batches for %s",
                        hunt_urn)
                    break

            flush_exceptions = self.FlushPlugins(hunt_urn, used_plugins)
            plugins_exceptions.update(flush_exceptions)

            metadata_obj.Set(
                metadata_obj.Schema.OUTPUT_PLUGINS(output_plugins))
            metadata_obj.Set(
                metadata_obj.Schema.NUM_PROCESSED_RESULTS(num_processed))

            return plugins_exceptions
Example #18
 def GetInput(self):
   """Yield client urns."""
   client_list = GetAllClients(token=self.token)
   logging.debug("Got %d clients", len(client_list))
   for client_group in utils.Grouper(client_list, self.client_chunksize):
     for fd in aff4.FACTORY.MultiOpen(client_group, mode="r",
                                      aff4_type="VFSGRRClient",
                                      token=self.token):
       if isinstance(fd, aff4_grr.VFSGRRClient):
         # Skip if older than max_age
         oldest_time = (time.time() - self.max_age) * 1e6
       if fd.Get(aff4.VFSGRRClient.SchemaCls.PING) >= oldest_time:
         yield fd
Example #19
def CleanVacuousVersions(clients=None, dry_run=True):
    """A script to remove no-op client versions.

    This script removes a version of a client when it is identical to the
    previous one, in the sense that no versioned attributes were changed since
    that previous client version.

    Args:
      clients: A list of ClientURN; if empty, cleans all clients.
      dry_run: Whether this is a dry run.
    """

    if not clients:
        index = client_index.CreateClientIndex()
        clients = index.LookupClients(["."])
    clients.sort()
    with data_store.DB.GetMutationPool() as pool:

        logging.info("checking %d clients", len(clients))
        for batch in utils.Grouper(clients, 10000):
            # TODO(amoser): This only works on datastores that use the Bigtable
            # scheme.
            client_infos = data_store.DB.MultiResolvePrefix(
                batch, ["aff4:", "aff4:"], data_store.DB.ALL_TIMESTAMPS)

            for client, type_list in client_infos:
                cleared = 0
                kept = 0
                updates = []
                for a, _, ts in type_list:
                    if ts != 0:
                        updates.append((ts, a))
                updates = sorted(updates)
                dirty = True
                for ts, a in updates:
                    if a == "aff4:type":
                        if dirty:
                            kept += 1
                            dirty = False
                        else:
                            cleared += 1
                            if not dry_run:
                                pool.DeleteAttributes(client, ["aff4:type"],
                                                      start=ts,
                                                      end=ts)
                                if pool.Size() > 1000:
                                    pool.Flush()
                    else:
                        dirty = True
                logging.info("%s: kept %d and cleared %d", client, kept,
                             cleared)
Example #20
  def _ProcessValuesWithOutputPlugin(self, values, output_plugin, args):
    """Processes given values with given output plugin."""

    checkpoints = utils.Grouper(values, args.checkpoint_every)
    for index, checkpoint in enumerate(checkpoints):
      logging.info("Starting checkpoint %d.", index)
      batch_converter = HuntOutputPluginBatchConverter(
          batch_size=args.batch, threadpool_size=args.threads,
          output_plugin=output_plugin)
      batch_converter.Convert(checkpoint)

      logging.info("Checkpointing (checkpoint %d)...", index)
      output_plugin.Flush()
      logging.info("Checkpoint %d done.", index)
Example #21
    def IterateAllClientSnapshots(self, batch_size=50000):
        """Iterates over all available clients and yields client snapshot objects.

        Args:
          batch_size: Always reads <batch_size> snapshots at a time.
        Yields:
          An rdfvalues.objects.ClientSnapshot object for each client in the db.
        """
        all_client_ids = self.ReadAllClientIDs()

        for batch in utils.Grouper(all_client_ids, batch_size):
            res = self.MultiReadClientSnapshot(batch)
            for snapshot in res.values():
                if snapshot:
                    yield snapshot
Example #22
    def ProcessClients(self, unused_responses):
        """Does the work."""
        self.start = 0
        self.end = int(1e6 * (time.time() - self.MAX_AGE))

        client_urns = export_utils.GetAllClients(token=self.token)

        for batch in utils.Grouper(client_urns, 10000):
            with data_store.DB.GetMutationPool() as mutation_pool:
                for client_urn in batch:
                    mutation_pool.DeleteAttributes(client_urn.Add("stats"),
                                                   [u"aff4:stats"],
                                                   start=self.start,
                                                   end=self.end)
            self.HeartBeat()
Example #23
  def IterateAllClientsFullInfo(self, batch_size=50000, min_last_ping=None):
    """Iterates over all available clients and yields full info protobufs.

    Args:
      batch_size: Always reads <batch_size> client full infos at a time.
      min_last_ping: If not None, only clients whose last ping time is more
                     recent than min_last_ping will be returned.
    Yields:
      An rdfvalues.objects.ClientFullInfo object for each client in the db.
    """
    all_client_ids = self.ReadAllClientIDs()

    for batch in utils.Grouper(all_client_ids, batch_size):
      res = self.MultiReadClientFullInfo(batch, min_last_ping=min_last_ping)
      for full_info in res.values():
        yield full_info
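A possible usage sketch for the min_last_ping filter, hypothetical and assuming that rdfvalue.RDFDatetime and rdfvalue.Duration behave as in the other examples on this page: count the clients seen in the last 30 days without holding them all in memory.

# Hypothetical usage: db exposes IterateAllClientsFullInfo as defined above.
cutoff = rdfvalue.RDFDatetime.Now() - rdfvalue.Duration("30d")
recent = sum(1 for _ in db.IterateAllClientsFullInfo(min_last_ping=cutoff))
logging.info("%d clients seen in the last 30 days.", recent)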
Example #24
    def _MultiStream(cls, fds):
        """Effectively streams data from multiple opened BlobImage objects.

        Args:
          fds: A list of opened AFF4Stream (or AFF4Stream descendants) objects.

        Yields:
          Tuples (fd, chunk, exception) where fd is an object from the fds
          argument and chunk is a binary blob of data.

          If one or more chunks are missing, exception is a MissingBlobsError
          object and chunk is None. _MultiStream does its best to skip the file
          entirely if one of its chunks is missing, but in case of very large
          files it's still possible to yield a truncated file.
        """

        broken_fds = set()
        missing_blobs_fd_pairs = []
        for chunk_fd_pairs in utils.Grouper(
                cls._GenerateChunkIds(fds),
                cls.MULTI_STREAM_CHUNKS_READ_AHEAD):
            results_map = data_store.DB.ReadBlobs(dict(chunk_fd_pairs).keys(),
                                                  token=fds[0].token)

            for chunk_id, fd in chunk_fd_pairs:
                if chunk_id not in results_map or results_map[chunk_id] is None:
                    missing_blobs_fd_pairs.append((chunk_id, fd))
                    broken_fds.add(fd)

            for chunk, fd in chunk_fd_pairs:
                if fd in broken_fds:
                    continue

                yield fd, results_map[chunk], None

        if missing_blobs_fd_pairs:
            missing_blobs_by_fd = {}
            for chunk_id, fd in missing_blobs_fd_pairs:
                missing_blobs_by_fd.setdefault(fd, []).append(chunk_id)

            for fd, missing_blobs in missing_blobs_by_fd.iteritems():
                e = MissingBlobsError("%d missing blobs (multi-stream)" %
                                      len(missing_blobs),
                                      missing_chunks=missing_blobs)
                yield fd, None, e
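Note that the method yields (fd, chunk, exception) tuples, with a single (fd, None, MissingBlobsError) entry per broken file at the end. A consumer sketch, assuming this is the method backing the aff4.AFF4Stream.MultiStream call used in the archive-generation example further down, and that output stands for any writable destination:

# Hypothetical consumer: write readable files, log the broken ones.
for fd, chunk, exception in aff4.AFF4Stream.MultiStream(fds):
    if exception:
        logging.error("Missing blobs for %s: %s", fd.urn, exception)
        continue
    output.write(chunk)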
Example #25
class SuspendableListDirectory(actions.SuspendableAction):
  """Lists a directory as a suspendable client action."""
  in_rdfvalue = rdf_client.ListDirRequest
  out_rdfvalues = [rdf_client.StatEntry]

  def Iterate(self):
    try:
      fd = vfs.VFSOpen(self.request.pathspec, progress_callback=self.Progress)
    except (IOError, OSError), e:
      self.SetStatus(rdf_flows.GrrStatus.ReturnedStatus.IOERROR, e)
      return

    length = self.request.iterator.number
    for group in utils.Grouper(fd.ListFiles(), length):
      for response in group:
        self.SendReply(response)

      self.Suspend()
Example #26
  def Convert(self, values, start_index=0, end_index=None):
    """Converts given collection to exported values.

    This method uses a threadpool to do the conversion in parallel. It
    blocks until everything is converted.

    Args:
      values: Iterable object with values to convert.
      start_index: Start from this index in the collection.
      end_index: Finish processing at the (end_index - 1) element of the
                 collection. If None, work until the end of the collection.

    Returns:
      Nothing. ConvertBatch() should handle the results.
    """
    if not values:
      return

    try:
      total_batch_count = len(values) / self.batch_size
    except TypeError:
      total_batch_count = -1

    pool = ThreadPool.Factory(self.threadpool_prefix, self.threadpool_size)
    val_iterator = itertools.islice(values, start_index, end_index)

    pool.Start()
    try:
      for batch_index, batch in enumerate(
          utils.Grouper(val_iterator, self.batch_size)):
        logging.debug("Processing batch %d out of %d", batch_index,
                      total_batch_count)

        pool.AddTask(
            target=self.ConvertBatch,
            args=(batch,),
            name="batch_%d" % batch_index,
            inline=False)

    finally:
      pool.Stop()
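The batch-plus-worker-pool pattern used here can be reproduced with just the standard library. The sketch below is only an illustration of the same idea, not GRR's ThreadPool API: batches are sliced lazily from the input while a thread pool converts them in parallel.

import itertools
from multiprocessing.dummy import Pool  # thread-based pool from the stdlib


def ConvertInBatches(values, convert_batch, batch_size=1000, threads=8):
    """Illustration only: run convert_batch over fixed-size batches in parallel."""
    iterator = iter(values)

    def Batches():
        while True:
            batch = list(itertools.islice(iterator, batch_size))
            if not batch:
                return
            yield batch

    pool = Pool(threads)
    try:
        # imap pulls batches lazily, roughly like AddTask in the loop above.
        for _ in pool.imap(convert_batch, Batches()):
            pass
    finally:
        pool.close()
        pool.join()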
Example #27
  def Execute(self, thread_count):
    """Runs the migration procedure.

    Args:
      thread_count: A number of threads to execute the migration with.

    Raises:
      AssertionError: If not all clients have been migrated.
      ValueError: If the relational database backend is not available.
    """
    if not data_store.RelationalDBWriteEnabled():
      raise ValueError("No relational database available.")

    sys.stdout.write("Collecting clients...\n")
    client_urns = _GetClientUrns()

    sys.stdout.write("Clients to migrate: {}\n".format(len(client_urns)))
    sys.stdout.write("Threads to use: {}\n".format(thread_count))

    self._total_count = len(client_urns)
    self._migrated_count = 0
    self._start_time = rdfvalue.RDFDatetime.Now()

    batches = utils.Grouper(client_urns, _CLIENT_BATCH_SIZE)

    self._Progress()
    tp = pool.ThreadPool(processes=thread_count)
    tp.map(self._MigrateBatch, list(batches))
    self._Progress()

    if self._migrated_count == self._total_count:
      message = "\nMigration has been finished (migrated {} clients).\n".format(
          self._migrated_count)
      sys.stdout.write(message)
    else:
      message = "Not all clients have been migrated ({}/{})".format(
          self._migrated_count, self._total_count)
      raise AssertionError(message)
Example #28
    def ProcessSingleTypeExportedValues(self, original_value_type,
                                        exported_values):
        first_value = next(exported_values, None)
        if not first_value:
            return

        yield self.archive_generator.WriteFileHeader(
            "%s/%s/from_%s.csv" %
            (self.path_prefix, first_value.__class__.__name__,
             original_value_type.__name__))

        buf = cStringIO.StringIO()
        writer = csv.writer(buf)
        # Write the CSV header based on first value class and write
        # the first value itself. All other values are guaranteed
        # to have the same class (see ProcessSingleTypeExportedValues definition).
        writer.writerow(self._GetCSVHeader(first_value.__class__))
        writer.writerow(self._GetCSVRow(first_value))
        yield self.archive_generator.WriteFileChunk(buf.getvalue())

        # Counter starts from 1, as 1 value has already been written.
        counter = 1
        for batch in utils.Grouper(exported_values, self.ROW_BATCH):
            counter += len(batch)

            buf = cStringIO.StringIO()
            writer = csv.writer(buf)
            for value in batch:
                writer.writerow(self._GetCSVRow(value))

            yield self.archive_generator.WriteFileChunk(buf.getvalue())

        yield self.archive_generator.WriteFileFooter()

        self.export_counts.setdefault(
            original_value_type.__name__,
            dict())[first_value.__class__.__name__] = counter
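Since every emitted file starts with a header row derived from the value class, the exported CSVs can be read back with the standard csv module. A small sketch, assuming one file has been extracted from the archive as exported.csv:

import csv

# Hypothetical post-processing of a single exported CSV file.
with open("exported.csv") as f:
    for row in csv.DictReader(f):
        # Each row is a dict keyed by the column names from the header row.
        print(row)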
Example #29
    def DeprecatedProcessHunt(self, session_id):
        metadata_urn = session_id.Add("ResultsMetadata")
        last_exception = None

        with aff4.FACTORY.Open(metadata_urn, mode="rw",
                               token=self.token) as metadata_obj:

            output_plugins = metadata_obj.Get(
                metadata_obj.Schema.OUTPUT_PLUGINS)
            num_processed = int(
                metadata_obj.Get(metadata_obj.Schema.NUM_PROCESSED_RESULTS))
            raw_offset = int(
                metadata_obj.Get(
                    metadata_obj.Schema.DEPRECATED_COLLECTION_RAW_OFFSET))
            results = aff4.FACTORY.Open(session_id.Add("Results"),
                                        mode="r",
                                        token=self.token)

            batch_size = self.state.args.batch_size or self.DEFAULT_BATCH_SIZE
            batches = utils.Grouper(results.GenerateItems(offset=raw_offset),
                                    batch_size)

            used_plugins = {}
            for batch_index, batch in enumerate(batches):
                if not used_plugins:
                    for plugin_name, (
                            plugin_def,
                            state) in output_plugins.data.iteritems():
                        used_plugins[
                            plugin_name] = plugin_def.GetPluginForState(state)

                # If this flow has been running for longer than
                # max_running_time, stop processing.
                if self.state.args.max_running_time:
                    elapsed = (
                        rdfvalue.RDFDatetime().Now().AsSecondsFromEpoch() -
                        self.start_time.AsSecondsFromEpoch())
                    if elapsed > self.state.args.max_running_time:
                        self.Log(
                            "Running for too long, skipping rest of batches for %s.",
                            session_id)
                        break

                batch = list(batch)
                num_processed += len(batch)

                for plugin_name, plugin in used_plugins.iteritems():
                    logging.debug("Processing hunt %s with %s, batch %d",
                                  session_id, plugin_name, batch_index)

                    try:
                        plugin.ProcessResponses(batch)
                    except Exception as e:  # pylint: disable=broad-except
                        logging.exception(
                            "Error processing hunt results: hunt %s, "
                            "plugin %s, batch %d", session_id, plugin_name,
                            batch_index)
                        self.Log("Error processing hunt results (hunt %s, "
                                 "plugin %s, batch %d): %s" %
                                 (session_id, plugin_name, batch_index, e))
                        last_exception = e
                self.HeartBeat()

            for plugin in used_plugins.itervalues():
                try:
                    plugin.Flush()
                except Exception as e:  # pylint: disable=broad-except
                    logging.exception(
                        "Error flushing hunt results: hunt %s, "
                        "plugin %s", session_id, str(plugin))
                    self.Log("Error processing hunt results (hunt %s, "
                             "plugin %s): %s" % (session_id, str(plugin), e))
                    last_exception = e

            metadata_obj.Set(
                metadata_obj.Schema.OUTPUT_PLUGINS(output_plugins))
            metadata_obj.Set(
                metadata_obj.Schema.NUM_PROCESSED_RESULTS(num_processed))
            metadata_obj.Set(
                metadata_obj.Schema.DEPRECATED_COLLECTION_RAW_OFFSET(
                    results.deprecated_current_offset))

            # TODO(user): throw proper exception which will contain all the
            # exceptions that were raised while processing this hunt.
            if last_exception:
                raise last_exception  # pylint: disable=raising-bad-type
Example #30
    def Generate(self, collection, token=None):
        """Generates archive from a given collection.

    Iterates the collection and generates an archive by yielding contents
    of every referenced AFF4Stream.

    Args:
      collection: Iterable with items that point to aff4 paths.
      token: User's ACLToken.

    Yields:
      Binary chunks comprising the generated archive.
    """
        hashes = set()
        for fd_urn_batch in utils.Grouper(self._ItemsToUrns(collection),
                                          self.BATCH_SIZE):

            fds_to_write = {}
            for fd in aff4.FACTORY.MultiOpen(fd_urn_batch, token=token):
                self.total_files += 1

                if not self.predicate(fd):
                    self.ignored_files.append(utils.SmartUnicode(fd.urn))
                    continue

                # Any file-like object with data in AFF4 should inherit AFF4Stream.
                if isinstance(fd, aff4.AFF4Stream):
                    archive_path = os.path.join(self.prefix, *fd.urn.Split())

                    sha256_hash = fd.Get(fd.Schema.HASH,
                                         rdf_crypto.Hash()).sha256
                    if not sha256_hash:
                        continue
                    self.archived_files += 1

                    content_path = os.path.join(self.prefix, "hashes",
                                                str(sha256_hash))
                    if sha256_hash not in hashes:
                        # Make sure the size of the original file is passed. It's
                        # required when the underlying writer is a StreamingTarWriter.
                        st = os.stat_result(
                            (0644, 0, 0, 0, 0, 0, fd.size, 0, 0, 0))
                        fds_to_write[fd] = (content_path, st)
                        hashes.add(sha256_hash)

                    up_prefix = "../" * len(fd.urn.Split())
                    yield self.archive_generator.WriteSymlink(
                        up_prefix + content_path, archive_path)

            if fds_to_write:
                prev_fd = None
                for fd, chunk, exception in aff4.AFF4Stream.MultiStream(
                        fds_to_write):
                    if exception:
                        logging.exception(exception)

                        self.archived_files -= 1
                        self.failed_files.append(utils.SmartUnicode(fd.urn))
                        continue

                    if prev_fd != fd:
                        if prev_fd:
                            yield self.archive_generator.WriteFileFooter()
                        prev_fd = fd

                        content_path, st = fds_to_write[fd]
                        yield self.archive_generator.WriteFileHeader(
                            content_path, st=st)

                    yield self.archive_generator.WriteFileChunk(chunk)

                if self.archive_generator.is_file_write_in_progress:
                    yield self.archive_generator.WriteFileFooter()

        for chunk in self._WriteDescription():
            yield chunk

        yield self.archive_generator.Close()
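For reference, the layout this generator produces inside the archive: every unique file body is stored once under a hashes directory, and each original AFF4 path becomes a relative symlink pointing back at it (plus whatever _WriteDescription() appends at the end), roughly:

<prefix>/hashes/<sha256>              the file contents, written once per unique hash
<prefix>/<client URN components>      a relative symlink, "../" * depth + "hashes/<sha256>"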