def main(args):
  read_client = KronosClient(args.read_url, namespace=args.read_namespace)
  write_client = KronosClient(args.write_url, namespace=args.write_namespace,
                              blocking=False)
  start_time = time.time()
  time_step = timedelta(seconds=args.copy_period_seconds)
  for stream in args.stream_file:
    stream = stream.rstrip()
    print 'Starting stream', stream, time.time() - start_time
    start = args.start
    # Keep track of the last ID we read, so we re-run queries from there.
    last_read_id = None
    while start <= args.end:
      print '...start is', start, time.time() - start_time
      end = min(args.end, start + time_step)
      if last_read_id is None:
        read_stream = read_client.get(stream, start, end)
      else:
        read_stream = read_client.get(stream, None, end,
                                      start_id=last_read_id)
      for event in read_stream:
        if event[ID_FIELD] != last_read_id:
          last_read_id = event[ID_FIELD]
          del event[ID_FIELD]
          write_client.put({stream: [event]})
      start += time_step
    write_client.flush()
    print 'Completed stream', stream, time.time() - start_time
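The copy routine above only consumes an `args` object. The following is a minimal sketch of how it could be wired up with argparse; the flag names and date parsing are assumptions inferred from the attributes the function reads, not the actual script's CLI.

import argparse
from datetime import datetime

def parse_args():
  # Hypothetical CLI wiring: every dest matches an attribute used by main().
  parser = argparse.ArgumentParser(
    description='Copy streams from one Kronos server/namespace to another.')
  parser.add_argument('--read-url', required=True)
  parser.add_argument('--write-url', required=True)
  parser.add_argument('--read-namespace')
  parser.add_argument('--write-namespace')
  parser.add_argument('--stream-file', type=argparse.FileType('r'),
                      required=True,
                      help='File with one stream name per line.')
  parser.add_argument('--start', required=True,
                      type=lambda s: datetime.strptime(s, '%Y-%m-%d'))
  parser.add_argument('--end', required=True,
                      type=lambda s: datetime.strptime(s, '%Y-%m-%d'))
  parser.add_argument('--copy-period-seconds', type=int, default=3600)
  return parser.parse_args()

if __name__ == '__main__':
  main(parse_args())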
def main(args):
  client = KronosClient(args.kronos_url)
  results = client.get(args.stream, args.start, args.end,
                       namespace=args.namespace)
  if args.display == 'print':
    if args.type == 'json':
      events = []
      for event in results:
        events.append(event)
      print json.dumps(events)
    elif args.type == 'one-per-line':
      for event in results:
        print event
  elif args.display == 'csv':
    writer = csv.DictWriter(sys.stdout, args.fields)
    if not args.remove_header:
      writer.writeheader()
    for event in results:
      row_values = {}
      for field in args.fields:
        field_value = get_property(event, field)
        row_values[field] = (field_value.encode('utf-8')
                             if isinstance(field_value, unicode)
                             else field_value)
      writer.writerow(row_values)
  elif args.display == 'aggregate':
    aggregates = aggregate_stream(results, AGGREGATORS[args.aggregator],
                                  args.field, args.time_bucket_width)
    print 'Bucket, Aggregate'
    for bucket, aggregate in aggregates:
      print '%s, %s' % (datetime.fromtimestamp(bucket), aggregate)
  else:
    raise Exception('Invalid display option {}'.format(args.display))
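The CSV branch above relies on a `get_property` helper that isn't shown. A plausible implementation, offered only as an assumption consistent with how it is called, resolves dot-separated field paths against nested event dictionaries:

def get_property(event, field):
  # Assumed helper: walk a dotted path like 'user.id' through nested dicts,
  # returning None if any component is missing.
  value = event
  for part in field.split('.'):
    if not isinstance(value, dict):
      return None
    value = value.get(part)
    if value is None:
      return None
  return value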
def execute_kronos_stream(self, node):
  from pykronos import KronosClient
  client = KronosClient(node.host, blocking=True)
  return client.get(node.stream, node.start_time, node.end_time,
                    namespace=node.namespace)
def execute_kronos_stream(self, node):
  # TODO(usmanm): Read time slices of events in parallel from worker nodes.
  from pykronos import KronosClient
  client = KronosClient(node.host, blocking=True)
  events = client.get(node.stream, node.start_time, node.end_time,
                      namespace=node.namespace)
  return self.context.parallelize(events)
def get_events(i):
  client = KronosClient(node._host, blocking=True)
  start_time = node.start_time + (i * delta)
  if i == executor.parallelism - 1:
    end_time = node.end_time
  else:
    end_time = start_time + delta - 1
  return list(client.get(node.stream, start_time, end_time,
                         namespace=node.namespace))
def get_events(i):
  from pykronos import KronosClient
  client = KronosClient(node.host, blocking=True)
  start_time = node.start_time + (i * delta)
  if i == self.parallelism - 1:
    end_time = node.end_time
  else:
    end_time = start_time + delta - 1
  return list(client.get(node.stream, start_time, end_time,
                         namespace=node.namespace))
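Both `get_events` closures above slice a stream read by time, but `delta` and the dispatch over workers are defined in enclosing code that isn't shown. The sketch below only illustrates that surrounding contract under stated assumptions: `parallelism` stands in for `executor.parallelism` / `self.parallelism`, time values are integer Kronos timestamps, and the real executor presumably dispatches the closures to worker processes instead of looping serially.

# Sketch only: slice the node's time range evenly and collect the per-slice
# reads. In the actual executor, get_events(i) would run on remote workers.
delta = (node.end_time - node.start_time) // parallelism
events = []
for i in range(parallelism):
  events.extend(get_events(i))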
def main(args):
  client = KronosClient(args.kronos_url)
  headers = [
    'stream',
    'total_events',
    'events_per_day',
    'events_per_sec',
    'payload_total_bytes',
    'payload_avg_bytes',
    'payload_med_bytes',
    'payload_95_bytes',
    'payload_99_bytes',
    'schema',
  ]
  if args.csv:
    csv_file = open(args.csv, 'w')
    writer = csv.DictWriter(csv_file, headers)
    writer.writeheader()
  else:
    print '-' * 79
  for stream in client.get_streams():
    total_events = 0
    payloads = []
    for event in client.get(stream, args.start, args.end):
      payloads.append(len(ujson.dumps(event)))
      total_events += 1
    if total_events == 0:
      indent('%s has no events' % stream, 2)
      print '-' * 79
      continue
    timeframe_sec = (args.end - args.start).total_seconds()
    schema = client.infer_schema(stream)['schema']
    context = dict(zip(headers, [
      stream,
      total_events,
      (float(total_events) / timeframe_sec) * 60 * 60 * 24,
      float(total_events) / timeframe_sec,
      np.sum(payloads),
      np.mean(payloads),
      np.median(payloads),
      np.percentile(payloads, 95),
      np.percentile(payloads, 99),
      schema,
    ]))
    if args.csv:
      writer.writerow(context)
    else:
      indent(output % context, 2)
      print '-' * 79
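The stats script above uses an `indent` helper and a module-level `output` format string that aren't included in the snippet. Hypothetical versions, consistent only with how they are called, might look like the following; treat them as illustrations of the expected shapes rather than the script's real definitions.

# Assumed template: one "<header>: %(header)s" line per stats column, so that
# `output % context` renders a readable per-stream report.
output = '\n'.join('%s: %%(%s)s' % (h, h) for h in [
  'stream', 'total_events', 'events_per_day', 'events_per_sec',
  'payload_total_bytes', 'payload_avg_bytes', 'payload_med_bytes',
  'payload_95_bytes', 'payload_99_bytes', 'schema'])

def indent(text, spaces):
  # Assumed helper: print each line of `text` indented by `spaces` spaces.
  for line in text.split('\n'):
    print ' ' * spaces + line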
def execute(self, node, executor):
  client = KronosClient(node._host, blocking=True)
  return client.get(node.stream, node.start_time, node.end_time,
                    namespace=node.namespace)
def main(args):
  client1 = KronosClient(args.kronos_url1, namespace=args.namespace1)
  client2 = KronosClient(args.kronos_url2, namespace=args.namespace2)
  if args.streams_file:
    streams = map(lambda s: (s, s),  # Use same stream name for both.
                  filter(lambda s: len(s),
                         open(args.streams_file).read().split('\n')))
  else:
    streams = [(args.stream1, args.stream2)]
  for stream1_name, stream2_name in streams:
    if args.num_samples:
      samples = []
      for _ in xrange(args.num_samples):
        start = random.randint(args.start, args.end - args.sample_interval)
        samples.append((start, start + args.sample_interval))
    else:
      samples = [(args.start, args.end)]
    total_stream1 = 0
    extra_stream1 = 0
    total_stream2 = 0
    extra_stream2 = 0
    for start, end in samples:
      stream1 = client1.get(stream1_name, start, end)
      stream2 = client2.get(stream2_name, start, end)
      # Sorting of events with the same timestamp may vary across backends,
      # hence we can't do a simple loop comparison. We need to aggregate all
      # events with the same timestamp from both streams and then compare the
      # two sets.
      stream1_hashes = set()
      stream2_hashes = set()
      current_timestamp = None
      while True:
        event1 = get_next(stream1)
        event2 = get_next(stream2)
        # Are both streams exhausted?
        if not (event1 or event2):
          break
        # Pick the smaller timestamp from the two events.
        min_timestamp = min(event1.get(TIMESTAMP_FIELD, sys.maxint),
                            event2.get(TIMESTAMP_FIELD, sys.maxint))
        if current_timestamp is None:
          current_timestamp = min_timestamp
        # If min_timestamp is greater than current_timestamp, then aggregate
        # stats for current_timestamp and roll over.
        if min_timestamp > current_timestamp:
          total_stream1 += len(stream1_hashes)
          total_stream2 += len(stream2_hashes)
          extra_stream1 += len(stream1_hashes - stream2_hashes)
          extra_stream2 += len(stream2_hashes - stream1_hashes)
          stream1_hashes.clear()
          stream2_hashes.clear()
          current_timestamp = min_timestamp
        if event1:
          assert event1[TIMESTAMP_FIELD] >= current_timestamp
          if event1[TIMESTAMP_FIELD] == current_timestamp:
            del event1[ID_FIELD]
            stream1_hashes.add(
              hashlib.sha224(json.dumps(event1, sort_keys=True)).hexdigest())
          else:
            stream1 = push_back(event1, stream1)
        if event2:
          assert event2[TIMESTAMP_FIELD] >= current_timestamp
          if event2[TIMESTAMP_FIELD] == current_timestamp:
            del event2[ID_FIELD]
            stream2_hashes.add(
              hashlib.sha224(json.dumps(event2, sort_keys=True)).hexdigest())
          else:
            stream2 = push_back(event2, stream2)
      # Tally the final timestamp group once both streams are exhausted;
      # without this the last group of events is never counted.
      total_stream1 += len(stream1_hashes)
      total_stream2 += len(stream2_hashes)
      extra_stream1 += len(stream1_hashes - stream2_hashes)
      extra_stream2 += len(stream2_hashes - stream1_hashes)
    print 'Diff: [%s/%s], [%s/%s]' % (args.namespace1, stream1_name,
                                      args.namespace2, stream2_name)
    print '< total: %d' % total_stream1
    print '> total: %d' % total_stream2
    print '< extra: %d' % extra_stream1
    print '> extra: %d' % extra_stream2
    print
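The diff loop above depends on two helpers, `get_next` and `push_back`, that aren't shown. These are assumed implementations consistent with how they are called: `get_next` returns the next event or an empty dict once the iterator is exhausted (so the `.get(TIMESTAMP_FIELD, ...)` call stays safe), and `push_back` re-prepends an event that was read one step too far.

import itertools

def get_next(stream):
  # Assumed helper: advance the event iterator, yielding {} when exhausted.
  try:
    return stream.next()
  except StopIteration:
    return {}

def push_back(event, stream):
  # Assumed helper: return an iterator that yields `event` before the rest
  # of `stream`.
  return itertools.chain([event], stream)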
class QueryCacheTest(unittest.TestCase):
  def setUp(self):
    self.client = KronosClient('http://localhost:9191/',
                               blocking=False,
                               sleep_block=0.2)
    self.total_events = 500
    self.computed_namespace = 'computed'
    self.increment = timedelta(minutes=1)
    self.start_time = datetime(2014, 6, 4, 22)
    self.bucket_width = timedelta(minutes=20)

  def compute_cache_test(function):
    """A wrapper that sets up a stream with test data.

    The stream takes the name of the function being run, and contains
    `self.total_events` events. The events are each one `self.increment`
    apart.
    """
    @functools.wraps(function)
    def wrapper(self):
      self.stream = 'ComputeCacheTest_%s' % (function.__name__)
      for i in xrange(self.total_events):
        self.client.put(
          {self.stream: [{TIMESTAMP_FIELD: self.start_time +
                                           (self.increment * i),
                          'a': i % 5,
                          'b': i}]})
      self.client.flush()
      function(self)
    return wrapper

  def filter_and_sum(self, start_time, end_time):
    """Bin `self.stream` into buckets, returning the sum of `b` when `a` == 2.

    For all events between `start_time` and `end_time`, create an event for
    every 20-minute interval of events that contains the sum of `b` when
    `a` == 2.
    """
    events = self.client.get(self.stream, start_time, end_time)
    counts = defaultdict(int)
    grouping_minutes = timedelta_to_kronos_time(self.bucket_width)
    for event in events:
      if event['a'] == 2:
        counts[event['@time'] -
               (event['@time'] % grouping_minutes)] += event['b']
    for group_time in sorted(counts.iterkeys()):
      yield {'@time': group_time, 'b_sum': counts[group_time]}

  def verify_results(self, result_func, cache, expected_results,
                     expected_computations):
    with patch.object(cache, '_compute_bucket',
                      wraps=cache._compute_bucket) as mock_method:
      results = result_func()
      self.assertEqual(mock_method.call_count, expected_computations)
    self.assertEqual(len(results), expected_results)
    result_time = self.start_time
    for idx, result in enumerate(results):
      self.assertEqual(result[TIMESTAMP_FIELD],
                       datetime_to_kronos_time(result_time))
      self.assertEqual(
        result['b_sum'],
        sum([2, 7, 12, 17] +
            [idx * 4 * (self.bucket_width.total_seconds() / 60)]))
      result_time += self.bucket_width

  def test_cache_exceptions(self):
    # Bucket width shouldn't be more granular than 1 second.
    def bad_bucket_width():
      return QueryCache(self.client, self.filter_and_sum,
                        self.bucket_width + timedelta(milliseconds=1),
                        self.computed_namespace)
    self.assertRaises(ValueError, bad_bucket_width)

    # start_time and end_time should align to bucket_width boundaries.
    cache = QueryCache(self.client, self.filter_and_sum, self.bucket_width,
                       self.computed_namespace)
    start_time = self.start_time - (self.bucket_width * 3)
    end_time = self.start_time + (self.total_events * self.increment) + (
      self.bucket_width * 3)

    def bad_start_boundary():
      return list(
        cache.retrieve_interval(start_time + timedelta(minutes=1), end_time))
    self.assertRaises(ValueError, bad_start_boundary)

  @compute_cache_test
  def test_cache_layer(self):
    cache = QueryCache(self.client, self.filter_and_sum, self.bucket_width,
                       self.computed_namespace)
    start_time = self.start_time - (self.bucket_width * 3)
    end_time = self.start_time + (self.total_events * self.increment) + (
      self.bucket_width * 3)
    untrusted_time = self.start_time + (
      timedelta(minutes=(self.total_events / 2) - 25))

    # Verify all results were computed correctly.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(
        start_time, end_time, untrusted_time)),
      cache, 25, 31)

    # Verify only trusted results are cached.
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 11, 0)

    # Running the same operations twice should result in the same results as
    # before.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(
        start_time, end_time, untrusted_time)),
      cache, 25, 17)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 11, 0)

    # Expanding the time range without caching should also result in the same
    # results.
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time - self.bucket_width,
                                           end_time + self.bucket_width)),
      cache, 11, 0)

    # But specifying compute_missing should get all results for the time
    # range.
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time - self.bucket_width,
                                           end_time + self.bucket_width,
                                           compute_missing=True)),
      cache, 25, 19)

    # Overlapping time queries should result in the same results as before,
    # and benefit from the cache.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(
        start_time - self.bucket_width, end_time + self.bucket_width,
        untrusted_time)),
      cache, 25, 19)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 11, 0)

    # Increasing the trusted time should increase the cached results.
    untrusted_time = untrusted_time + timedelta(minutes=40)
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(
        start_time, end_time, untrusted_time)),
      cache, 25, 17)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)

    # Decreasing trusted time shouldn't remove results.
    untrusted_time = untrusted_time - timedelta(minutes=40)
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(
        start_time, end_time, untrusted_time)),
      cache, 25, 15)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)

    # If there are two cached entries, that cached time should no longer be
    # returned.
    results = list(cache.retrieve_interval(start_time, end_time))
    duplicate_result = dict(results[10])
    duplicate_result['b_sum'] = 0
    self.client.put({cache._scratch_stream: [duplicate_result]},
                    namespace=cache._scratch_namespace)
    self.client.flush()
    safe_results = list(cache.retrieve_interval(start_time, end_time))
    self.assertEqual(results[:10] + results[11:], safe_results)

    # Rerunning the cache/computation should re-cache the corrupted element.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(
        start_time, end_time, untrusted_time)),
      cache, 25, 16)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)

    # Forcing computation should generate the same result set.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(
        start_time, end_time, untrusted_time, force_recompute=True)),
      cache, 25, 31)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)
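A condensed usage sketch of the caching pattern exercised by the test above. The import path, URL, namespace, and query function are placeholders/assumptions; the QueryCache calls themselves mirror those in test_cache_layer.

from datetime import datetime, timedelta
from pykronos import KronosClient
from pykronos.utils.cache import QueryCache  # Import path is an assumption.

client = KronosClient('http://localhost:9191/', blocking=False)

def bucketed_query(start_time, end_time):
  # Placeholder for a function like filter_and_sum above: read events between
  # start_time and end_time and yield one aggregate event per bucket.
  return iter([])

bucket_width = timedelta(minutes=20)
cache = QueryCache(client, bucketed_query, bucket_width, 'computed')

# start/end must align to bucket_width boundaries; buckets newer than
# untrusted_time are recomputed rather than served from the cache.
start = datetime(2014, 6, 4, 22)
end = start + bucket_width * 3
untrusted_time = start + bucket_width
fresh = list(cache.compute_and_cache_missing_buckets(start, end,
                                                     untrusted_time))
cached = list(cache.retrieve_interval(start, end))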