def test_v4execschema():
    """List heka partitions with a submissionDate schema and check the names found.

    The same three file names are expected from each of the three buckets;
    for the listings made with a prefix, the names appear underneath that
    prefix.
    """
    schema = TelemetrySchema({
        "version": 2,
        "dimensions": [{
            "field_name": "submissionDate",
            "allowed_values": {"max": "20150901"},
        }],
    })
    file_names = (
        "20150901/20150901221519.541_ip-172-31-16-184",
        "20150901/20150901223019.579_ip-172-31-16-184",
        "20150901/20150901224519.623_ip-172-31-16-184",
    )

    def names_in(bucket, **listing_options):
        # Gather the name of every entry the partition listing yields.
        listing = s3util.list_heka_partitions(bucket, schema=schema,
                                              **listing_options)
        return set(entry.name for entry in listing)

    def check(names, leading):
        # Exactly three entries, each expected name present under `leading`.
        assert len(names) == 3
        for file_name in file_names:
            assert leading + file_name in names

    check(names_in(v4execbucket), "")

    # Test with a prefix:
    check(names_in(v4prefixbucket, prefix="telemetry-executive-summary-2"),
          "telemetry-executive-summary-2/")

    # Test with a bunch of prefixes:
    check(names_in(multiprefixbucket, prefix="a/b/c/d"), "a/b/c/d/")
def test_v4execschema():
    """Verify the names returned by list_heka_partitions for three bucket layouts.

    One bucket is listed without a prefix, one with a single-level prefix
    and one with a multi-level prefix; each listing must contain exactly
    the three expected names.
    """
    date_dimension = {
        "field_name": "submissionDate",
        "allowed_values": {"max": "20150901"},
    }
    schema = TelemetrySchema({"version": 2, "dimensions": [date_dimension]})

    plain = set(
        p.name for p in s3util.list_heka_partitions(v4execbucket, schema=schema))
    assert len(plain) == 3
    for wanted in (
            "20150901/20150901221519.541_ip-172-31-16-184",
            "20150901/20150901223019.579_ip-172-31-16-184",
            "20150901/20150901224519.623_ip-172-31-16-184"):
        assert wanted in plain

    # Test with a prefix:
    prefixed = set(
        p.name for p in s3util.list_heka_partitions(
            v4prefixbucket,
            prefix="telemetry-executive-summary-2",
            schema=schema))
    assert len(prefixed) == 3
    for wanted in (
            "telemetry-executive-summary-2/20150901/20150901221519.541_ip-172-31-16-184",
            "telemetry-executive-summary-2/20150901/20150901223019.579_ip-172-31-16-184",
            "telemetry-executive-summary-2/20150901/20150901224519.623_ip-172-31-16-184"):
        assert wanted in prefixed

    # Test with a bunch of prefixes:
    nested = set(
        p.name for p in s3util.list_heka_partitions(
            multiprefixbucket, prefix="a/b/c/d", schema=schema))
    assert len(nested) == 3
    for wanted in (
            "a/b/c/d/20150901/20150901221519.541_ip-172-31-16-184",
            "a/b/c/d/20150901/20150901223019.579_ip-172-31-16-184",
            "a/b/c/d/20150901/20150901224519.623_ip-172-31-16-184"):
        assert wanted in nested
def test_v4schema():
    """List v4bucket through a fully specified schema and check the three names found."""
    # (field_name, allowed_values) pairs, in schema order.
    # NOTE(review): docType is given as "saved-session" while the expected
    # keys contain "saved_session" -- presumably normalised somewhere in the
    # schema/listing code; confirm.
    field_filters = [
        ("submissionDate", "20150903"),
        ("sourceName", "*"),
        ("sourceVersion", "4"),
        ("docType", ["saved-session"]),
        ("appName", ["Firefox"]),
        ("appUpdateChannel", ["release"]),
        ("appVersion", "24.0"),
        ("appBuildId", "20130910160258"),
    ]
    schema = TelemetrySchema({
        "version": 2,
        "dimensions": [
            {"field_name": field, "allowed_values": allowed}
            for field, allowed in field_filters
        ],
    })

    names = set(entry.name
                for entry in s3util.list_heka_partitions(v4bucket, schema=schema))

    assert len(names) == 3
    # All expected files live in the same partition directory.
    directory = "20150903/telemetry/4/saved_session/Firefox/release/24.0/20130910160258/"
    for file_name in ("20150903051633.482_ip-172-31-16-184",
                      "20150903051644.482_ip-172-31-16-184",
                      "20150903051655.482_ip-172-31-16-184"):
        assert directory + file_name in names
def test_v4schema():
    """Check that listing v4bucket with an eight-dimension schema yields three known names."""
    dimensions = [
        {"field_name": "submissionDate", "allowed_values": "20150903"},
        {"field_name": "sourceName", "allowed_values": "*"},
        {"field_name": "sourceVersion", "allowed_values": "4"},
        {"field_name": "docType", "allowed_values": ["saved-session"]},
        {"field_name": "appName", "allowed_values": ["Firefox"]},
        {"field_name": "appUpdateChannel", "allowed_values": ["release"]},
        {"field_name": "appVersion", "allowed_values": "24.0"},
        {"field_name": "appBuildId", "allowed_values": "20130910160258"},
    ]
    schema = TelemetrySchema({"version": 2, "dimensions": dimensions})

    listed = set()
    for partition in s3util.list_heka_partitions(v4bucket, schema=schema):
        listed.add(partition.name)

    expected_names = (
        "20150903/telemetry/4/saved_session/Firefox/release/24.0/20130910160258/20150903051633.482_ip-172-31-16-184",
        "20150903/telemetry/4/saved_session/Firefox/release/24.0/20130910160258/20150903051644.482_ip-172-31-16-184",
        "20150903/telemetry/4/saved_session/Firefox/release/24.0/20130910160258/20150903051655.482_ip-172-31-16-184",
    )
    assert len(listed) == 3
    for expected_name in expected_names:
        assert expected_name in listed
def get_filtered_files_s3(self):
    """Yield the S3 entries selected by ``self._input_filter``.

    This is a generator.  When ``self._local_only`` is true it yields
    nothing and never touches S3.  Otherwise it opens an ``S3Connection``
    (using ``self._aws_key`` / ``self._aws_secret_key`` when a key is
    configured, a default ``S3Connection()`` otherwise), looks up
    ``self._bucket_name`` and yields every item produced by
    ``s3util.list_heka_partitions`` for that bucket.

    Progress is printed for the first item and for every 1000th item; a
    summary with the elapsed time is printed once the listing has been
    consumed completely.

    The connection is closed in a ``finally`` clause, so it is released
    even if the bucket lookup or the listing raises, or if the caller
    stops iterating early and the generator is closed.
    """
    if self._local_only:
        return

    # Single-argument print(...) produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("Fetching file list from S3...")
    # Plain boto should be fast enough to list bucket contents.
    if self._aws_key is not None:
        conn = S3Connection(self._aws_key, self._aws_secret_key)
    else:
        conn = S3Connection()

    try:
        bucket = conn.get_bucket(self._bucket_name)
        start = datetime.now()
        count = 0
        # Filter input files by partition. If the filter is reasonably
        # selective, this can be much faster than listing all files in the
        # bucket.
        for f in s3util.list_heka_partitions(bucket, schema=self._input_filter):
            count += 1
            if count == 1 or count % 1000 == 0:
                print("Listed %s so far" % count)
            yield f
    finally:
        # Runs on normal exhaustion, on errors and on generator close.
        conn.close()

    duration = timer.delta_sec(start)
    print("Listed %s files in %s seconds" % (count, duration))
def get_filtered_files_s3(self):
    """Generate the files in the configured S3 bucket that match the input filter.

    Nothing is yielded (and no connection is made) when
    ``self._local_only`` is true.  Otherwise the bucket named by
    ``self._bucket_name`` is listed through
    ``s3util.list_heka_partitions`` with ``self._input_filter`` as the
    schema, and each listed item is yielded as it arrives.

    Side effects: prints a start message, a progress line for item 1 and
    every 1000th item, and -- after the listing is exhausted -- a summary
    line with the item count and elapsed seconds.

    The S3 connection is always closed, including when an exception
    propagates or the consumer abandons the generator part-way through.
    """
    if not self._local_only:
        # print(...) with one pre-formatted argument emits the same text
        # whether print is a statement (Python 2) or a function (Python 3).
        print("Fetching file list from S3...")

        # Plain boto should be fast enough to list bucket contents.
        if self._aws_key is None:
            connection = S3Connection()
        else:
            connection = S3Connection(self._aws_key, self._aws_secret_key)

        try:
            bucket = connection.get_bucket(self._bucket_name)
            started_at = datetime.now()
            listed = 0
            # Filter input files by partition. If the filter is reasonably
            # selective, this can be much faster than listing all files in
            # the bucket.
            partitions = s3util.list_heka_partitions(
                bucket, schema=self._input_filter)
            for item in partitions:
                listed += 1
                if listed == 1 or listed % 1000 == 0:
                    print("Listed %s so far" % listed)
                yield item
        finally:
            # Release the connection on every exit path of the generator.
            connection.close()

        elapsed = timer.delta_sec(started_at)
        print("Listed %s files in %s seconds" % (listed, elapsed))
def _list_s3_filenames(bucket, prefix, schema):
    """Return a list with the ``name`` of every item listed for ``bucket``.

    ``bucket``, ``prefix`` and ``schema`` are passed straight through to
    ``s3u.list_heka_partitions``.
    """
    filenames = []
    for entry in s3u.list_heka_partitions(bucket, prefix, schema=schema):
        filenames.append(entry.name)
    return filenames
def _list_s3_filenames(bucket, prefix, schema):
    """List the heka partitions of ``bucket`` and return their names as a list.

    The arguments are forwarded unchanged to ``s3u.list_heka_partitions``
    (``prefix`` positionally, ``schema`` by keyword).
    """
    partitions = s3u.list_heka_partitions(bucket, prefix, schema=schema)
    return [partition.name for partition in partitions]