Example #1
    def testBigQueryPluginWithEarlyFlush(self):
        responses = []
        for i in range(10):
            responses.append(
                rdf_client_fs.StatEntry(
                    pathspec=rdf_paths.PathSpec(path="/foo/bar/%d" % i,
                                                pathtype="OS"),
                    st_mode=33184,  # octal = 100640 => u=rw,g=r,o= => -rw-r-----
                    st_ino=1063090,
                    st_dev=64512,
                    st_nlink=1 + i,
                    st_uid=139592,
                    st_gid=5000,
                    st_size=0,
                    st_atime=1336469177,
                    st_mtime=1336129892,
                    st_ctime=1336129892,
                    st_btime=1338111338))

        sizes = [37, 687, 722, 755, 788, 821, 684, 719, 752, 785]

        def GetSize(unused_path):
            return sizes.pop(0)

        # Force an early flush. Gzip output is non-deterministic because our
        # metadata is a dict with unpredictable ordering, so we script the
        # reported file sizes so that exactly one flush happens during
        # processing.
        with test_lib.ConfigOverrider({"BigQuery.max_file_post_size": 800}):
            with utils.Stubber(os.path, "getsize", GetSize):
                output = self.ProcessResponses(
                    plugin_args=bigquery_plugin.BigQueryOutputPluginArgs(),
                    responses=responses)

        self.assertLen(output, 2)
        # Check that the output is still consistent.
        actual_fds = []

        for _, stream, _, _ in output:
            actual_fds.append(gzip.GzipFile(None, "r", 9, stream))

        # Compare to our stored data.
        # TODO(user): there needs to be a better way to generate these files on
        # change than breaking into the debugger.
        expected_fd = open(
            os.path.join(config.CONFIG["Test.data_dir"], "bigquery",
                         "ExportedFile.jsonlines"), "rb")

        # Check that the same entries we expect are spread across the two files.
        counter = 0
        for actual_fd in actual_fds:
            for actual, expected in zip(actual_fd, expected_fd):
                actual = actual.decode("utf-8")
                expected = expected.decode("utf-8")
                self.assertEqual(json.Parse(actual), json.Parse(expected))
                counter += 1

        self.assertEqual(counter, 10)
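
The early flush in this test hinges on stubbing os.path.getsize so the plugin believes its temporary output file has crossed BigQuery.max_file_post_size. Below is a minimal, standalone sketch of the same stubbing idea using only the standard library (unittest.mock.patch.object in place of GRR's utils.Stubber); the scripted sizes and the path are made up for illustration.

import os
from unittest import mock

scripted_sizes = [37, 800]

def fake_getsize(unused_path):
    # Report the next scripted size, regardless of the real file.
    return scripted_sizes.pop(0)

with mock.patch.object(os.path, "getsize", fake_getsize):
    print(os.path.getsize("/tmp/anything"))  # 37
    print(os.path.getsize("/tmp/anything"))  # 800 -> would trigger a flush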
Example #2
    def testBigQueryPluginWithValuesOfSameType(self):
        responses = []
        for i in range(10):
            responses.append(
                rdf_client_fs.StatEntry(
                    pathspec=rdf_paths.PathSpec(path="/foo/bar/%d" % i,
                                                pathtype="OS"),
                    st_mode=33184,  # octal = 100640 => u=rw,g=r,o= => -rw-r-----
                    st_ino=1063090,
                    st_dev=64512,
                    st_nlink=1 + i,
                    st_uid=139592,
                    st_gid=5000,
                    st_size=0,
                    st_atime=1336469177,
                    st_mtime=1336129892,
                    st_ctime=1336129892,
                    st_btime=1338111338))

        output = self.ProcessResponses(
            plugin_args=bigquery_plugin.BigQueryOutputPluginArgs(),
            responses=responses)

        self.assertLen(output, 1)
        _, stream, schema, job_id = output[0]

        self.assertEqual(job_id,
                         "C-1000000000000000_Results_ExportedFile_1445995873")

        self.CompareSchemaToKnownGood(schema)

        actual_fd = gzip.GzipFile(
            None, "r",
            bigquery_plugin.BigQueryOutputPlugin.GZIP_COMPRESSION_LEVEL,
            stream)

        # Compare to our stored data.
        expected_fd = open(
            os.path.join(config.CONFIG["Test.data_dir"], "bigquery",
                         "ExportedFile.jsonlines"), "rb")

        # BigQuery expects a newline-separated list of JSON dicts, but that is
        # not valid JSON as a whole, so we can't just load the entire file and
        # compare it in one go.
        counter = 0
        for actual, expected in zip(actual_fd, expected_fd):
            actual = actual.decode("utf-8")
            expected = expected.decode("utf-8")
            self.assertEqual(json.Parse(actual), json.Parse(expected))
            counter += 1

        self.assertEqual(counter, 10)
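
The comparison loop above walks both streams line by line because the output is JSON Lines, not a single JSON document. A minimal sketch of that comparison with the standard json module (the tests use GRR's json.Parse wrapper instead), on made-up data:

import json

actual_lines = [b'{"pid": 42}\n', b'{"pid": 43}\n']
expected_lines = [b'{"pid": 42}\n', b'{"pid": 43}\n']

for actual, expected in zip(actual_lines, expected_lines):
    # Every line is an independent JSON document; parse them one at a time
    # instead of loading the whole stream as a single JSON value.
    assert json.loads(actual) == json.loads(expected)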
Example #3
    def testMissingTimestampSerialization(self):
        response = rdf_client_fs.StatEntry()
        response.pathspec.pathtype = rdf_paths.PathSpec.PathType.OS
        response.pathspec.path = "/foo/bar"
        response.st_mtime = None

        args = bigquery_plugin.BigQueryOutputPluginArgs()

        output = self.ProcessResponses(plugin_args=args, responses=[response])
        self.assertLen(output, 1)

        _, filedesc, _, _ = output[0]
        with gzip.GzipFile(mode="r", fileobj=filedesc) as filedesc:
            content = json.Parse(filedesc.read().decode("utf-8"))

        self.assertIsNone(content["st_mtime"])
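
The gzip.GzipFile(mode="r", fileobj=...) pattern used here reads the plugin's in-memory output stream directly. A minimal round-trip sketch of that pattern (independent of GRR), which also shows that an unset timestamp serializes to JSON null:

import gzip
import io
import json

buf = io.BytesIO()
with gzip.GzipFile(mode="w", fileobj=buf) as out:
    out.write(json.dumps({"st_mtime": None}).encode("utf-8"))

buf.seek(0)
with gzip.GzipFile(mode="r", fileobj=buf) as inp:
    content = json.loads(inp.read().decode("utf-8"))

assert content["st_mtime"] is None  # null in the JSON output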
Example #4
    def testBinaryDataExportDisabled(self):
        response = rdf_client_fs.BlobImageChunkDescriptor()
        response.digest = b"\x00\xff\x00\xff\x00"

        args = bigquery_plugin.BigQueryOutputPluginArgs()
        args.base64_bytes_export = False

        output = self.ProcessResponses(plugin_args=args, responses=[response])

        self.assertLen(output, 1)
        _, filedesc, _, _ = output[0]

        with gzip.GzipFile(mode="r", fileobj=filedesc) as filedesc:
            content = json.Parse(filedesc.read().decode("utf-8"))

        self.assertNotIn("digest", content)
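
The base64_bytes_export flag exists because raw bytes are not representable in JSON: an exporter must either drop binary fields (as asserted above) or base64-encode them. A minimal sketch of the encoding side, assuming nothing about the plugin's actual field handling:

import base64
import json

digest = b"\x00\xff\x00\xff\x00"

# json.dumps({"digest": digest}) would raise TypeError, so encode first.
encoded = base64.b64encode(digest).decode("ascii")
print(json.dumps({"digest": encoded}))  # {"digest": "AP8A/wA="}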
Example #5
    def testBigQueryPluginWithValuesOfMultipleTypes(self):
        output = self.ProcessResponses(
            plugin_args=bigquery_plugin.BigQueryOutputPluginArgs(),
            responses=[
                rdf_client_fs.StatEntry(pathspec=rdf_paths.PathSpec(
                    path="/中国新闻网新闻中", pathtype="OS")),
                rdf_client.Process(pid=42)
            ],
            process_responses_separately=True)

        # There should be two separate output streams, one for each type.
        self.assertLen(output, 2)

        for name, stream, _, job_id in output:
            self.assertIn(job_id, [
                "C-1000000000000000_Results_ExportedFile_1445995873",
                "C-1000000000000000_Results_ExportedProcess_1445995873"
            ])
            self._parseOutput(name, stream)
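
The two job ids reflect the plugin writing one stream per exported type. A minimal sketch of that grouping idea, using plain dicts rather than the plugin's real data structures (illustration only, not the actual implementation):

import collections

responses = [
    {"type": "ExportedFile", "urn": "/fs/os/foo"},
    {"type": "ExportedProcess", "pid": 42},
]

streams = collections.defaultdict(list)
for response in responses:
    # One output stream per exported type name.
    streams[response["type"]].append(response)

assert sorted(streams) == ["ExportedFile", "ExportedProcess"]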
Example #6
  def testBigQueryPluginFallbackToAFF4(self):
    plugin_args = bigquery_plugin.BigQueryOutputPluginArgs()
    responses = [
        rdf_client.StatEntry(
            pathspec=rdf_paths.PathSpec(path="/中国新闻网新闻中", pathtype="OS")),
        rdf_client.Process(pid=42),
        rdf_client.Process(pid=43),
        rdf_client.SoftwarePackage(name="test.deb")
    ]

    plugin = bigquery_plugin.BigQueryOutputPlugin(
        source_urn=self.results_urn,
        output_base_urn=self.base_urn,
        args=plugin_args,
        token=self.token)

    plugin.InitializeState()

    messages = []
    for response in responses:
      messages.append(
          rdf_flows.GrrMessage(source=self.client_id, payload=response))

    with test_lib.FakeTime(1445995873):
      with mock.patch.object(bigquery, "GetBigQueryClient") as mock_bigquery:
        mock_bigquery.return_value.configure_mock(**{
            "InsertData.side_effect": bigquery.BigQueryJobUploadError()
        })
        with test_lib.ConfigOverrider({"BigQuery.max_upload_failures": 2}):
          for message in messages:
            plugin.ProcessResponses([message])
          plugin.Flush()

          # We have 3 output types but a limit of 2 upload failures, so we
          # shouldn't try the third one.
          self.assertEqual(mock_bigquery.return_value.InsertData.call_count, 2)

    # We should have written a data file and a schema file for each type.
    for output_name in [
        "ExportedFile", "ExportedProcess", "AutoExportedSoftwarePackage"
    ]:
      schema_fd = aff4.FACTORY.Open(
          self.base_urn.Add(
              "C-1000000000000000_Results_%s_1445995873.schema" % output_name),
          token=self.token)
      data_fd = aff4.FACTORY.Open(
          self.base_urn.Add(
              "C-1000000000000000_Results_%s_1445995873.data" % output_name),
          token=self.token)
      actual_fd = gzip.GzipFile(None, "r", 9, data_fd)

      if output_name == "ExportedFile":
        self.CompareSchemaToKnownGood(json.load(schema_fd))
        self.assertEqual(
            json.load(actual_fd)["urn"], self.client_id.Add("/fs/os/中国新闻网新闻中"))
      elif output_name == "ExportedProcess":
        self.assertEqual(json.load(schema_fd)[1]["name"], "pid")
        expected_pids = ["42", "43"]
        for i, line in enumerate(actual_fd):
          self.assertEqual(json.loads(line)["pid"], expected_pids[i])
      else:
        self.assertEqual(json.load(schema_fd)[1]["name"], "name")
        self.assertEqual(json.load(actual_fd)["name"], "test.deb")

    # Process the same messages to make sure we're re-using the filehandles.
    with test_lib.FakeTime(1445995878):
      with mock.patch.object(bigquery, "GetBigQueryClient") as mock_bigquery:
        mock_bigquery.return_value.configure_mock(**{
            "InsertData.side_effect": bigquery.BigQueryJobUploadError()
        })
        with test_lib.ConfigOverrider({"BigQuery.max_upload_failures": 2}):
          for message in messages:
            plugin.ProcessResponses([message])
          plugin.Flush()

          # We shouldn't call InsertData at all because we have already
          # reached the maximum number of upload failures.
          self.assertEqual(mock_bigquery.return_value.InsertData.call_count, 0)

    expected_line_counts = {
        "ExportedFile": 2,
        "ExportedProcess": 4,
        "AutoExportedSoftwarePackage": 2
    }
    for output_name in [
        "ExportedFile", "ExportedProcess", "AutoExportedSoftwarePackage"
    ]:
      data_fd = aff4.FACTORY.Open(
          self.base_urn.Add(
              "C-1000000000000000_Results_%s_1445995873.data" % output_name),
          token=self.token)
      actual_fd = gzip.GzipFile(None, "r", 9, data_fd)
      self.assertEqual(
          sum(1 for line in actual_fd), expected_line_counts[output_name])
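
The mocking pattern in this test patches the client factory and configures the returned client so that InsertData always raises. A minimal sketch of that configure_mock/side_effect pattern with a stand-in exception class (FakeUploadError is illustrative; GRR raises bigquery.BigQueryJobUploadError):

from unittest import mock

class FakeUploadError(Exception):
    """Stand-in for bigquery.BigQueryJobUploadError."""

mock_factory = mock.MagicMock()
mock_factory.return_value.configure_mock(
    **{"InsertData.side_effect": FakeUploadError()})

client = mock_factory()  # what the plugin would get from the factory
try:
    client.InsertData("some_table", "some_file")
except FakeUploadError:
    pass

assert mock_factory.return_value.InsertData.call_count == 1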