def main(args):
  read_client = KronosClient(args.read_url, namespace=args.read_namespace)
  write_client = KronosClient(args.write_url, namespace=args.write_namespace,
                              blocking=False)
  start_time = time.time()
  time_step = timedelta(seconds=args.copy_period_seconds)
  for stream in args.stream_file:
    stream = stream.rstrip()
    print 'Starting stream', stream, time.time() - start_time
    start = args.start
    # Keep track of the last ID we read, so we re-run queries from there.
    last_read_id = None
    while start <= args.end:
      print '...start is', start, time.time() - start_time
      end = min(args.end, start + time_step)
      if last_read_id is None:
        read_stream = read_client.get(stream, start, end)
      else:
        read_stream = read_client.get(stream, None, end,
                                      start_id=last_read_id)
      for event in read_stream:
        if event[ID_FIELD] != last_read_id:
          last_read_id = event[ID_FIELD]
          del event[ID_FIELD]
          write_client.put({stream: [event]})
      start += time_step
      write_client.flush()
    print 'Completed stream', stream, time.time() - start_time

def main(args):
  client = KronosClient(args.kronos_url)
  results = client.get(args.stream, args.start, args.end,
                       namespace=args.namespace)
  if args.display == 'print':
    if args.type == 'json':
      events = []
      for event in results:
        events.append(event)
      print json.dumps(events)
    elif args.type == 'one-per-line':
      for event in results:
        print event
  elif args.display == 'csv':
    writer = csv.DictWriter(sys.stdout, args.fields)
    if not args.remove_header:
      writer.writeheader()
    for event in results:
      row_values = {}
      for field in args.fields:
        field_value = get_property(event, field)
        row_values[field] = (field_value.encode('utf-8')
                             if isinstance(field_value, unicode)
                             else field_value)
      writer.writerow(row_values)
  elif args.display == 'aggregate':
    aggregates = aggregate_stream(results, AGGREGATORS[args.aggregator],
                                  args.field, args.time_bucket_width)
    print 'Bucket, Aggregate'
    for bucket, aggregate in aggregates:
      print '%s, %s' % (datetime.fromtimestamp(bucket), aggregate)
  else:
    raise Exception('Invalid display option {}'.format(args.display))

def streams():
  kc = KronosClient(current_app.config['KRONOS_URL'],
                    namespace=current_app.config['KRONOS_NAMESPACE'])
  kstreams = kc.get_streams(namespace=current_app.config['KRONOS_NAMESPACE'])
  kstreams = sorted(kstreams)
  return {
    'streams': kstreams,
  }

def execute_kronos_stream(self, node):
  from pykronos import KronosClient
  client = KronosClient(node.host, blocking=True)
  return client.get(node.stream, node.start_time, node.end_time,
                    namespace=node.namespace)

def streams():
  client = KronosClient(app.config['KRONOS_URL'],
                        namespace=app.config['KRONOS_NAMESPACE'])
  kronos_streams = client.get_streams(namespace=app.config['KRONOS_NAMESPACE'])
  kronos_streams = sorted(kronos_streams)
  return {
    'streams': kronos_streams,
  }

def execute_kronos_stream(self, node):
  # TODO(usmanm): Read time slices of events in parallel from worker nodes.
  from pykronos import KronosClient
  client = KronosClient(node.host, blocking=True)
  events = client.get(node.stream, node.start_time, node.end_time,
                      namespace=node.namespace)
  return self.context.parallelize(events)

def get_events(i):
  client = KronosClient(node._host, blocking=True)
  start_time = node.start_time + (i * delta)
  if i == executor.parallelism - 1:
    end_time = node.end_time
  else:
    end_time = start_time + delta - 1
  return list(client.get(node.stream, start_time, end_time,
                         namespace=node.namespace))

def load_test_data(args):
  donations = ZipFile(StringIO(urllib2.urlopen(DONATIONS_FILE_URL).read()))
  donations = StringIO(donations.read('%s.csv' % DONATIONS_FILE_NAME))
  events = []
  rows = csv.DictReader(donations)
  for row in rows:
    row[TIMESTAMP_FIELD] = parse(row['contb_receipt_dt'])
    events.append(row)
  kc = KronosClient(args.kronos_url)
  kc.put({'donations': events})

def get_events(i):
  from pykronos import KronosClient
  client = KronosClient(node.host, blocking=True)
  start_time = node.start_time + (i * delta)
  if i == self.parallelism - 1:
    end_time = node.end_time
  else:
    end_time = start_time + delta - 1
  return list(client.get(node.stream, start_time, end_time,
                         namespace=node.namespace))

def main(args):
  client = KronosClient(args.kronos_url)
  headers = [
    'stream',
    'total_events',
    'events_per_day',
    'events_per_sec',
    'payload_total_bytes',
    'payload_avg_bytes',
    'payload_med_bytes',
    'payload_95_bytes',
    'payload_99_bytes',
    'schema',
  ]
  if args.csv:
    csv_file = open(args.csv, 'w')
    writer = csv.DictWriter(csv_file, headers)
    writer.writeheader()
  else:
    print '-' * 79
  for stream in client.get_streams():
    total_events = 0
    payloads = []
    for event in client.get(stream, args.start, args.end):
      payloads.append(len(ujson.dumps(event)))
      total_events += 1
    if total_events == 0:
      indent('%s has no events' % stream, 2)
      print '-' * 79
      continue
    timeframe_sec = (args.end - args.start).total_seconds()
    schema = client.infer_schema(stream)['schema']
    context = dict(zip(headers, [
      stream,
      total_events,
      (float(total_events) / timeframe_sec) * 60 * 60 * 24,
      float(total_events) / timeframe_sec,
      np.sum(payloads),
      np.mean(payloads),
      np.median(payloads),
      np.percentile(payloads, 95),
      np.percentile(payloads, 99),
      schema,
    ]))
    if args.csv:
      writer.writerow(context)
    else:
      indent(output % context, 2)
      print '-' * 79

def _run_query(self, start_time, end_time, unique_id=None):
  """Executes a Python query string and returns events.

  Acts as a wrapper around exec that injects necessary local variables
  into the scope of the user-provided query blob.

  :param start_time: Python datetime to be injected into query
  :param end_time: Python datetime to be injected into query
  :param unique_id: An unused flag that allows the scheduler to hash this
                    function uniquely based on its args when it passes through
  """
  client = KronosClient(self._app.config['KRONOS_URL'],
                        namespace=self._app.config['KRONOS_NAMESPACE'],
                        blocking=False,
                        sleep_block=0.2)
  locals_dict = {
    'kronos_client': client,
    'events': [],
    'start_time': start_time,
    'end_time': end_time,
  }
  try:
    exec self._query in {}, locals_dict  # No globals.
  except:
    _, exception, tb = sys.exc_info()
    raise PyCodeError(exception, traceback.format_tb(tb))
  events = sorted(locals_dict.get('events', []),
                  key=lambda event: event['@time'])
  return events

def _run_query(self, start_time, end_time, unique_id=None):
  """Executes a Python query string and returns events.

  Acts as a wrapper around exec that injects necessary local variables
  into the scope of the user-provided query blob.

  :param start_time: Python datetime to be injected into query
  :param end_time: Python datetime to be injected into query
  :param unique_id: An unused flag that allows the scheduler to hash this
                    function uniquely based on its args when it passes through
  """
  # XXX(derek): DEPRECATION WARNING
  # Use of the implicit Kronos client in pycode queries is deprecated.
  client = KronosClient(self._app.config['KRONOS_URL'],
                        namespace=self._app.config['KRONOS_NAMESPACE'],
                        blocking=False,
                        sleep_block=0.2)
  locals_dict = {
    'kronos_client': client,
    'events': [],
    'start_time': start_time,
    'end_time': end_time,
  }
  try:
    exec self._query in {}, locals_dict  # No globals.
  except:
    _, exception, tb = sys.exc_info()
    raise PyCodeError(exception, traceback.format_tb(tb))
  # Retrieve the `events` variable as computed by the pycode.
  events = locals_dict.get('events', [])
  return events

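For reference, a hypothetical example of the pycode blob that _run_query could execute; the injected names (kronos_client, events, start_time, end_time) come from locals_dict above, while the stream name is made up for illustration:

# Hypothetical pycode query string (the value of self._query). It runs with
# the injected locals and must leave its output in `events`.
EXAMPLE_QUERY = """
events = list(kronos_client.get('jia.example_stream', start_time, end_time))
"""
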
def main(args):
  client = KronosClient(args.kronos_url)
  if args.fetch_timeout:
    start = args.start
    end = args.end
    limit = None
  else:
    start = 0
    end = datetime.utcnow()
    limit = 1000
  for stream in client.get_streams(namespace=args.namespace):
    if not (args.read_latency or args.fetch_timeout):
      print stream
    elif check_stream(client, args.namespace, stream, start, end, limit,
                      args.fetch_timeout, args.read_latency):
      print stream

def main(args):
  client = KronosClient(args.kronos_url, namespace=args.namespace)
  unfiltered_streams = [(stream, None, args.user_field)
                        for stream in args.streams]
  stream_sizes = funnel_analyze(client, unfiltered_streams, args.start,
                                args.end, args.end, {}, None)
  # TODO(marcua): print something more meaningful here.
  print stream_sizes

def setUp(self):
  self.client = KronosClient('http://localhost:9191/',
                             blocking=False,
                             sleep_block=0.2)
  self.total_events = 500
  self.computed_namespace = 'computed'
  self.increment = timedelta(minutes=1)
  self.start_time = datetime(2014, 6, 4, 22)
  self.bucket_width = timedelta(minutes=20)

def setUp(self):
  self.kronos_client = KronosClient('http://localhost:9191')
  self.index_path = '1.0/index'
  self.source_path = '1.0/sources'
  self.streams_path = '1.0/streams/kronos'
  self.schema_path = '1.0/streams/kronos/%s'
  self.query_path = '1.0/query'
  self.server_url = 'http://localhost:9192/%s'
  self.executor = None

def main(args):
  client = KronosClient(args.kronos_url, namespace=args.namespace,
                        blocking=False)
  increment = timedelta(microseconds=args.microseconds_between_events)
  event = {'property%s' % (idx): idx
           for idx in xrange(args.properties_per_event)}
  start_time = time.time()
  for idx in xrange(args.num_events):
    event[TIMESTAMP_FIELD] = args.start + (idx * increment)
    client.put({args.stream: [event]})
    if (idx % args.chunk_size) == 0:
      print 'Completed', idx, 'events', time.time() - start_time
      client.flush()
  client.flush()

def setUp(self):
  self.kronos_client = KronosClient('http://localhost:9191')
  self.index_path = '1.0/index'
  self.query_path = '1.0/query'
  self.server_url = 'http://localhost:9192/%s'
  self.executor = None

def execute(self, node, executor):
  client = KronosClient(node._host, blocking=True)
  return client.get(node.stream, node.start_time, node.end_time,
                    namespace=node.namespace)

def setUp(self):
  self.blocking_client = KronosClient('http://localhost:9191/',
                                      blocking=True)
  self.nonblocking_client = KronosClient('http://localhost:9191/',
                                         blocking=False,
                                         sleep_block=0.2)

def infer_schema(stream_name=None):
  kc = KronosClient(current_app.config['KRONOS_URL'],
                    namespace=current_app.config['KRONOS_NAMESPACE'])
  schema = kc.infer_schema(stream_name,
                           namespace=current_app.config['KRONOS_NAMESPACE'])
  return schema

def __init__(self, query, timeframe, bucket_width=None, untrusted_time=None,
             metis=False):
  """Initialize QueryCompute.

  :param query: A string of python code to execute as a Jia query.
  :param timeframe: A timeframe dictionary. It specifies a mode, which can be
                    'recent' or 'range'. Depending on which mode is selected,
                    some of the other parameters will be unused. The unused
                    parameters come from the frontend for the purposes of
                    storing default/previous values. If the mode is 'recent',
                    only 'value' and 'scale' are used. If the mode is 'range',
                    only 'from' and 'to' are used. Example timeframe:
                    timeframe = {
                      'mode': 'recent',
                      'value': 1,
                      'scale': 'days',
                      'from': 'Sat Jun 10 2014 00:00:00',
                      'to': 'Sun Jun 11 2014 00:00:00',
                    }
  :param bucket_width: Optional bucket width in seconds
  :param untrusted_time: Optional untrusted time interval in seconds
  :param metis: Send `query` to metis for computation
  """
  try:
    self._app = current_app
    self._app.config  # The above line won't fail, but this one will.
  except RuntimeError:
    from scheduler import get_app
    self._app = get_app()
  self._query = query
  self._bucket_width = bucket_width
  self._untrusted_time = untrusted_time
  self._metis = metis
  self._start_time, self._end_time = self._get_timeframe_bounds(timeframe,
                                                                 bucket_width)
  self._cache_client = KronosClient(
    self._app.config['CACHE_KRONOS_URL'],
    namespace=self._app.config['CACHE_KRONOS_NAMESPACE'],
    blocking=False,
    sleep_block=0.2)
  # The query is sent through as an unused unique_id argument so that the
  # QueryCache hash can properly uniquely identify it.
  unique = {'unique_id': self._query}
  if self._metis:
    query_func = self._run_metis
  elif self._app.config['ALLOW_PYCODE']:
    query_func = self._run_query
  else:
    raise ValueError("`metis` must be `True` if ALLOW_PYCODE is not enabled")
  if self._bucket_width:
    bucket_width_timedelta = datetime.timedelta(seconds=bucket_width)
    self._query_cache = QueryCache(
      self._cache_client, query_func, bucket_width_timedelta,
      self._app.config['CACHE_KRONOS_NAMESPACE'],
      query_function_kwargs=unique)

def infer_schema(stream_name=None):
  client = KronosClient(app.config['KRONOS_URL'],
                        namespace=app.config['KRONOS_NAMESPACE'])
  schema = client.infer_schema(stream_name,
                               namespace=app.config['KRONOS_NAMESPACE'])
  return schema

def main(args):
  client1 = KronosClient(args.kronos_url1, namespace=args.namespace1)
  client2 = KronosClient(args.kronos_url2, namespace=args.namespace2)

  if args.streams_file:
    streams = map(lambda s: (s, s),  # Use same stream name for both.
                  filter(lambda s: len(s),
                         open(args.streams_file).read().split('\n')))
  else:
    streams = [(args.stream1, args.stream2)]

  for stream1_name, stream2_name in streams:
    if args.num_samples:
      samples = []
      for _ in xrange(args.num_samples):
        start = random.randint(args.start, args.end - args.sample_interval)
        samples.append((start, start + args.sample_interval))
    else:
      samples = [(args.start, args.end)]

    total_stream1 = 0
    extra_stream1 = 0
    total_stream2 = 0
    extra_stream2 = 0

    for start, end in samples:
      stream1 = client1.get(stream1_name, start, end)
      stream2 = client2.get(stream2_name, start, end)

      # Sorting of events with the same timestamp may vary across backends,
      # hence we can't do a simple loop comparison. We need to aggregate all
      # events with the same timestamp from both streams and then compare the
      # two sets.
      stream1_hashes = set()
      stream2_hashes = set()
      current_timestamp = None
      while True:
        event1 = get_next(stream1)
        event2 = get_next(stream2)

        # Are both streams exhausted?
        if not (event1 or event2):
          break

        # Pick the smaller timestamp from the two events.
        min_timestamp = min(event1.get(TIMESTAMP_FIELD, sys.maxint),
                            event2.get(TIMESTAMP_FIELD, sys.maxint))
        if current_timestamp is None:
          current_timestamp = min_timestamp

        # If min_timestamp is greater than current_timestamp, then aggregate
        # stats for current_timestamp and roll over.
        if min_timestamp > current_timestamp:
          total_stream1 += len(stream1_hashes)
          total_stream2 += len(stream2_hashes)
          extra_stream1 += len(stream1_hashes - stream2_hashes)
          extra_stream2 += len(stream2_hashes - stream1_hashes)
          stream1_hashes.clear()
          stream2_hashes.clear()
          current_timestamp = min_timestamp

        if event1:
          assert event1[TIMESTAMP_FIELD] >= current_timestamp
          if event1[TIMESTAMP_FIELD] == current_timestamp:
            del event1[ID_FIELD]
            stream1_hashes.add(
              hashlib.sha224(json.dumps(event1, sort_keys=True)).hexdigest())
          else:
            stream1 = push_back(event1, stream1)

        if event2:
          assert event2[TIMESTAMP_FIELD] >= current_timestamp
          if event2[TIMESTAMP_FIELD] == current_timestamp:
            del event2[ID_FIELD]
            stream2_hashes.add(
              hashlib.sha224(json.dumps(event2, sort_keys=True)).hexdigest())
          else:
            stream2 = push_back(event2, stream2)

    print 'Diff: [%s/%s], [%s/%s]' % (args.namespace1, stream1_name,
                                      args.namespace2, stream2_name)
    print '< total: %d' % total_stream1
    print '> total: %d' % total_stream2
    print '< extra: %d' % extra_stream1
    print '> extra: %d' % extra_stream2
    print

import time
import uuid

from kronos.common.time import epoch_time_to_kronos_time
from kronos.conf.constants import TIMESTAMP_FIELD
from pykronos import KronosClient

kronos = KronosClient('http://localhost:9191/', blocking=True)


def timeit(desc, func, *args, **kwargs):
  start = time.time()
  func(*args, **kwargs)
  print ' - %s took %ss' % (desc, time.time() - start)


def _make_event(t):
  return {
    TIMESTAMP_FIELD: epoch_time_to_kronos_time(t),
    'property1': str(uuid.uuid4()),
    'property2': str(uuid.uuid4()),
    'property3': str(uuid.uuid4())
  }


def insert(stream, n, chunk_size=10000):
  for _ in xrange(0, n, chunk_size):
    events = []
    for t in xrange(0, chunk_size):
      events.append(_make_event(t % 1000))
    kronos.put({stream: events})

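A minimal driver sketch for the benchmark helpers above; the stream name and event count are illustrative and not taken from the original script:

# Hypothetical invocation of the helpers defined above; times a bulk insert.
if __name__ == '__main__':
  timeit('insert 100k events', insert, 'benchmark_stream', 100000)
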
class QueryCacheTest(unittest.TestCase):
  def setUp(self):
    self.client = KronosClient('http://localhost:9191/',
                               blocking=False,
                               sleep_block=0.2)
    self.total_events = 500
    self.computed_namespace = 'computed'
    self.increment = timedelta(minutes=1)
    self.start_time = datetime(2014, 6, 4, 22)
    self.bucket_width = timedelta(minutes=20)

  def compute_cache_test(function):
    """A wrapper that sets up a stream with test data.

    The stream takes the name of the function being run, and contains
    `self.total_events` events. The events are each one `self.increment`
    apart.
    """
    @functools.wraps(function)
    def wrapper(self):
      self.stream = 'ComputeCacheTest_%s' % (function.__name__)
      for i in xrange(self.total_events):
        self.client.put({
          self.stream: [{TIMESTAMP_FIELD: self.start_time +
                         (self.increment * i),
                         'a': i % 5, 'b': i}]})
      self.client.flush()
      function(self)
    return wrapper

  def filter_and_sum(self, start_time, end_time):
    """Bin `self.stream` into buckets, returning the sum of `b` when `a` == 2.

    For all events between `start_time` and `end_time`, create an event for
    every 20-minute interval of events that contains the sum of `b` when
    `a` == 2.
    """
    events = self.client.get(self.stream, start_time, end_time)
    counts = defaultdict(int)
    grouping_minutes = timedelta_to_kronos_time(self.bucket_width)
    for event in events:
      if event['a'] == 2:
        counts[event['@time'] -
               (event['@time'] % grouping_minutes)] += event['b']
    for group_time in sorted(counts.iterkeys()):
      yield {'@time': group_time, 'b_sum': counts[group_time]}

  def verify_results(self, result_func, cache, expected_results,
                     expected_computations):
    with patch.object(cache, '_compute_bucket',
                      wraps=cache._compute_bucket) as mock_method:
      results = result_func()
      self.assertEqual(mock_method.call_count, expected_computations)
    self.assertEqual(len(results), expected_results)
    result_time = self.start_time
    for idx, result in enumerate(results):
      self.assertEqual(result[TIMESTAMP_FIELD],
                       datetime_to_kronos_time(result_time))
      self.assertEqual(
        result['b_sum'],
        sum([2, 7, 12, 17] +
            [idx * 4 * (self.bucket_width.total_seconds() / 60)]))
      result_time += self.bucket_width

  def test_cache_exceptions(self):
    # Bucket width shouldn't be more granular than 1 second.
    def bad_bucket_width():
      return QueryCache(self.client, self.filter_and_sum,
                        self.bucket_width + timedelta(milliseconds=1),
                        self.computed_namespace)
    self.assertRaises(ValueError, bad_bucket_width)

    # start_time and end_time should align to bucket_width boundaries.
    cache = QueryCache(self.client, self.filter_and_sum, self.bucket_width,
                       self.computed_namespace)
    start_time = self.start_time - (self.bucket_width * 3)
    end_time = self.start_time + (self.total_events * self.increment) + (
      self.bucket_width * 3)

    def bad_start_boundary():
      return list(
        cache.retrieve_interval(start_time + timedelta(minutes=1), end_time))
    self.assertRaises(ValueError, bad_start_boundary)

  @compute_cache_test
  def test_cache_layer(self):
    cache = QueryCache(self.client, self.filter_and_sum, self.bucket_width,
                       self.computed_namespace)
    start_time = self.start_time - (self.bucket_width * 3)
    end_time = self.start_time + (self.total_events * self.increment) + (
      self.bucket_width * 3)
    untrusted_time = self.start_time + (
      timedelta(minutes=(self.total_events / 2) - 25))

    # Verify all results were computed correctly.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time,
                                                           end_time,
                                                           untrusted_time)),
      cache, 25, 31)

    # Verify only trusted results are cached.
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 11, 0)

    # Running the same operations twice should result in the same results
    # as before.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time,
                                                           end_time,
                                                           untrusted_time)),
      cache, 25, 17)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 11, 0)

    # Expanding the time range without caching should also result in the
    # same results.
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time - self.bucket_width,
                                           end_time + self.bucket_width)),
      cache, 11, 0)

    # But specifying compute_missing should get all results for the
    # timerange.
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time - self.bucket_width,
                                           end_time + self.bucket_width,
                                           compute_missing=True)),
      cache, 25, 19)

    # Overlapping time queries should result in the same results as before,
    # and benefit from the cache.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(
        start_time - self.bucket_width,
        end_time + self.bucket_width,
        untrusted_time)),
      cache, 25, 19)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 11, 0)

    # Increasing the trusted time should increase the cached results.
    untrusted_time = untrusted_time + timedelta(minutes=40)
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time,
                                                           end_time,
                                                           untrusted_time)),
      cache, 25, 17)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)

    # Decreasing trusted time shouldn't remove results.
    untrusted_time = untrusted_time - timedelta(minutes=40)
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time,
                                                           end_time,
                                                           untrusted_time)),
      cache, 25, 15)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)

    # If there are two cached entries, that cached time should no longer be
    # returned.
    results = list(cache.retrieve_interval(start_time, end_time))
    duplicate_result = dict(results[10])
    duplicate_result['b_sum'] = 0
    self.client.put({cache._scratch_stream: [duplicate_result]},
                    namespace=cache._scratch_namespace)
    self.client.flush()
    safe_results = list(cache.retrieve_interval(start_time, end_time))
    self.assertEqual(results[:10] + results[11:], safe_results)

    # Rerunning the cache/computation should re-cache the corrupted element.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time,
                                                           end_time,
                                                           untrusted_time)),
      cache, 25, 16)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)

    # Forcing computation should generate the same result set.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(
        start_time, end_time, untrusted_time, force_recompute=True)),
      cache, 25, 31)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)
