def testBigQueryPluginWithEarlyFlush(self):
  responses = []
  for i in range(10):
    responses.append(
        rdf_client.StatEntry(
            pathspec=rdf_paths.PathSpec(
                path="/foo/bar/%d" % i, pathtype="OS"),
            st_mode=33184,  # octal = 100640 => u=rw,g=r,o= => -rw-r-----
            st_ino=1063090,
            st_dev=64512L,
            st_nlink=1 + i,
            st_uid=139592,
            st_gid=5000,
            st_size=0,
            st_atime=1336469177,
            st_mtime=1336129892,
            st_ctime=1336129892))

  sizes = [37, 687, 722, 755, 788, 821, 684, 719, 752, 785]

  def GetSize(unused_path):
    return sizes.pop(0)

  # Force an early flush. Gzip is non-deterministic since our metadata is a
  # dict with unpredictable order, so we make up the file sizes such that
  # there is exactly one flush during processing.
  with test_lib.ConfigOverrider({"BigQuery.max_file_post_size": 800}):
    with utils.Stubber(os.path, "getsize", GetSize):
      output = self.ProcessResponses(
          plugin_args=bigquery_plugin.BigQueryOutputPluginArgs(),
          responses=responses)

  self.assertEqual(len(output), 2)

  # Check that the output is still consistent.
  actual_fds = []
  for _, stream, _, _ in output:
    actual_fds.append(gzip.GzipFile(None, "r", 9, stream))

  # Compare to our stored data.
  # TODO(user): there needs to be a better way to generate these files on
  # change than breaking into the debugger.
  expected_fd = open(
      os.path.join(config.CONFIG["Test.data_dir"], "bigquery",
                   "ExportedFile.json"), "rb")

  # Check that the same entries we expect are spread across the two files.
  counter = 0
  for actual_fd in actual_fds:
    for actual, expected in zip(actual_fd, expected_fd):
      self.assertEqual(json.loads(actual), json.loads(expected))
      counter += 1

  self.assertEqual(counter, 10)
def testBigQueryPluginWithValuesOfSameType(self):
  responses = []
  for i in range(10):
    responses.append(
        rdf_client.StatEntry(
            pathspec=rdf_paths.PathSpec(
                path="/foo/bar/%d" % i, pathtype="OS"),
            st_mode=33184,  # octal = 100640 => u=rw,g=r,o= => -rw-r-----
            st_ino=1063090,
            st_dev=64512L,
            st_nlink=1 + i,
            st_uid=139592,
            st_gid=5000,
            st_size=0,
            st_atime=1336469177,
            st_mtime=1336129892,
            st_ctime=1336129892))

  output = self.ProcessResponses(
      plugin_args=bigquery_plugin.BigQueryOutputPluginArgs(),
      responses=responses)

  self.assertEqual(len(output), 1)
  _, stream, schema, job_id = output[0]

  self.assertEqual(job_id,
                   "C-1000000000000000_Results_ExportedFile_1445995873")
  self.CompareSchemaToKnownGood(schema)

  actual_fd = gzip.GzipFile(
      None, "r", bigquery_plugin.BigQueryOutputPlugin.GZIP_COMPRESSION_LEVEL,
      stream)

  # Compare to our stored data.
  expected_fd = open(
      os.path.join(config.CONFIG["Test.data_dir"], "bigquery",
                   "ExportedFile.json"), "rb")

  # BigQuery expects a newline-separated list of JSON dicts, but this isn't
  # valid JSON so we can't just load the whole thing and compare.
  counter = 0
  for actual, expected in zip(actual_fd, expected_fd):
    self.assertEqual(json.loads(actual), json.loads(expected))
    counter += 1

  self.assertEqual(counter, 10)
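# The comparison loop above reads line by line because the export format is
# newline-delimited JSON (one object per line), which cannot be parsed as a
# single JSON document. Below is a minimal sketch of that reading pattern,
# assuming a gzip-compressed stream like the ones returned by
# ProcessResponses; _ReadNewlineDelimitedJson is a hypothetical helper, not
# part of the plugin or the test base class.
def _ReadNewlineDelimitedJson(stream):
  """Returns the parsed rows from a gzip-compressed NDJSON stream."""
  fd = gzip.GzipFile(None, "r", 9, stream)
  # Iterating a GzipFile yields decompressed lines; each line is one record.
  return [json.loads(line) for line in fd]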
def testBigQueryPluginWithValuesOfMultipleTypes(self):
  output = self.ProcessResponses(
      plugin_args=bigquery_plugin.BigQueryOutputPluginArgs(),
      responses=[
          rdf_client.StatEntry(
              pathspec=rdf_paths.PathSpec(
                  path="/中国新闻网新闻中", pathtype="OS")),
          rdf_client.Process(pid=42)
      ],
      process_responses_separately=True)

  # Should have two separate output streams for the two types.
  self.assertEqual(len(output), 2)

  for name, stream, _, job_id in output:
    self.assertTrue(job_id in [
        "C-1000000000000000_Results_ExportedFile_1445995873",
        "C-1000000000000000_Results_ExportedProcess_1445995873"
    ])
    self._parseOutput(name, stream)
def testBigQueryPluginFallbackToAFF4(self):
  plugin_args = bigquery_plugin.BigQueryOutputPluginArgs()
  responses = [
      rdf_client.StatEntry(
          pathspec=rdf_paths.PathSpec(path="/中国新闻网新闻中", pathtype="OS")),
      rdf_client.Process(pid=42),
      rdf_client.Process(pid=43),
      rdf_client.SoftwarePackage(name="test.deb")
  ]

  plugin = bigquery_plugin.BigQueryOutputPlugin(
      source_urn=self.results_urn,
      output_base_urn=self.base_urn,
      args=plugin_args,
      token=self.token)
  plugin.InitializeState()

  messages = []
  for response in responses:
    messages.append(
        rdf_flows.GrrMessage(source=self.client_id, payload=response))

  with test_lib.FakeTime(1445995873):
    with mock.patch.object(bigquery, "GetBigQueryClient") as mock_bigquery:
      mock_bigquery.return_value.configure_mock(
          **{"InsertData.side_effect": bigquery.BigQueryJobUploadError()})
      with test_lib.ConfigOverrider({"BigQuery.max_upload_failures": 2}):
        for message in messages:
          plugin.ProcessResponses([message])
        plugin.Flush()

        # We have 3 output types but a limit of 2 upload failures, so we
        # shouldn't try the third one.
        self.assertEqual(mock_bigquery.return_value.InsertData.call_count, 2)

  # We should have written a data file and a schema file for each type.
  for output_name in [
      "ExportedFile", "ExportedProcess", "AutoExportedSoftwarePackage"
  ]:
    schema_fd = aff4.FACTORY.Open(
        self.base_urn.Add(
            "C-1000000000000000_Results_%s_1445995873.schema" % output_name),
        token=self.token)
    data_fd = aff4.FACTORY.Open(
        self.base_urn.Add(
            "C-1000000000000000_Results_%s_1445995873.data" % output_name),
        token=self.token)
    actual_fd = gzip.GzipFile(None, "r", 9, data_fd)

    if output_name == "ExportedFile":
      self.CompareSchemaToKnownGood(json.load(schema_fd))
      self.assertEqual(
          json.load(actual_fd)["urn"],
          self.client_id.Add("/fs/os/中国新闻网新闻中"))
    elif output_name == "ExportedProcess":
      self.assertEqual(json.load(schema_fd)[1]["name"], "pid")
      expected_pids = ["42", "43"]
      for i, line in enumerate(actual_fd):
        self.assertEqual(json.loads(line)["pid"], expected_pids[i])
    else:
      self.assertEqual(json.load(schema_fd)[1]["name"], "name")
      self.assertEqual(json.load(actual_fd)["name"], "test.deb")

  # Process the same messages to make sure we're re-using the file handles.
  with test_lib.FakeTime(1445995878):
    with mock.patch.object(bigquery, "GetBigQueryClient") as mock_bigquery:
      mock_bigquery.return_value.configure_mock(
          **{"InsertData.side_effect": bigquery.BigQueryJobUploadError()})
      with test_lib.ConfigOverrider({"BigQuery.max_upload_failures": 2}):
        for message in messages:
          plugin.ProcessResponses([message])
        plugin.Flush()

        # We shouldn't call InsertData at all because we have already passed
        # the maximum number of failures.
        self.assertEqual(mock_bigquery.return_value.InsertData.call_count, 0)

  expected_line_counts = {
      "ExportedFile": 2,
      "ExportedProcess": 4,
      "AutoExportedSoftwarePackage": 2
  }
  for output_name in [
      "ExportedFile", "ExportedProcess", "AutoExportedSoftwarePackage"
  ]:
    data_fd = aff4.FACTORY.Open(
        self.base_urn.Add(
            "C-1000000000000000_Results_%s_1445995873.data" % output_name),
        token=self.token)
    actual_fd = gzip.GzipFile(None, "r", 9, data_fd)
    self.assertEqual(
        sum(1 for line in actual_fd), expected_line_counts[output_name])
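# For reference, the AFF4 fallback exercised above leaves two artifacts per
# output type: a ".schema" file holding a JSON list of field descriptors and
# a gzip-compressed ".data" file holding newline-delimited JSON rows. A
# minimal sketch of reading one data file back, assuming the same URN layout
# the assertions use; _ReadFallbackData is a hypothetical helper, not part of
# the plugin or the test base class.
def _ReadFallbackData(base_urn, output_name, token):
  """Returns the exported rows written by the AFF4 fallback path."""
  data_fd = aff4.FACTORY.Open(
      base_urn.Add(
          "C-1000000000000000_Results_%s_1445995873.data" % output_name),
      token=token)
  # The data stream is gzip-compressed NDJSON: one JSON object per line.
  fd = gzip.GzipFile(None, "r", 9, data_fd)
  return [json.loads(line) for line in fd]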