Example #1
def main(args):
  read_client = KronosClient(args.read_url, namespace=args.read_namespace)
  write_client = KronosClient(args.write_url, namespace=args.write_namespace,
                              blocking=False)
  start_time = time.time()
  time_step = timedelta(seconds=args.copy_period_seconds)
  for stream in args.stream_file:
    stream = stream.rstrip()
    print 'Starting stream', stream, time.time() - start_time
    start = args.start
    # Keep track of the last ID we read, so we re-run queries from
    # there.
    last_read_id = None
    while start <= args.end:
      print '...start is', start, time.time() - start_time
      end = min(args.end, start + time_step)
      if last_read_id is None:
        read_stream = read_client.get(stream, start, end)
      else:
        read_stream = read_client.get(stream, None, end, start_id=last_read_id)
      for event in read_stream:
        if event[ID_FIELD] != last_read_id:
          last_read_id = event[ID_FIELD]
          del event[ID_FIELD]
          write_client.put({stream: [event]})
      start += time_step
      write_client.flush()
    print 'Completed stream', stream, time.time() - start_time    
Example #2
def main(args):
    read_client = KronosClient(args.read_url, namespace=args.read_namespace)
    write_client = KronosClient(args.write_url,
                                namespace=args.write_namespace,
                                blocking=False)
    start_time = time.time()
    time_step = timedelta(seconds=args.copy_period_seconds)
    for stream in args.stream_file:
        stream = stream.rstrip()
        print 'Starting stream', stream, time.time() - start_time
        start = args.start
        # Keep track of the last ID we read, so we re-run queries from
        # there.
        last_read_id = None
        while start <= args.end:
            print '...start is', start, time.time() - start_time
            end = min(args.end, start + time_step)
            if last_read_id is None:
                read_stream = read_client.get(stream, start, end)
            else:
                read_stream = read_client.get(stream,
                                              None,
                                              end,
                                              start_id=last_read_id)
            for event in read_stream:
                if event[ID_FIELD] != last_read_id:
                    last_read_id = event[ID_FIELD]
                    del event[ID_FIELD]
                    write_client.put({stream: [event]})
            start += time_step
            write_client.flush()
        print 'Completed stream', stream, time.time() - start_time
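The main() in Examples #1 and #2 expects an args object carrying read/write URLs and namespaces, a start/end window, a copy period, and a file of stream names. The parser itself is not shown on this page; below is a minimal argparse sketch consistent with those attribute names. The timestamp format and the defaults are assumptions.

import argparse
from datetime import datetime


def parse_datetime(value):
    # Assumption: --start/--end are ISO-like timestamps; main() needs real
    # datetime objects because it adds a timedelta to them.
    return datetime.strptime(value, '%Y-%m-%dT%H:%M:%S')


def parse_args():
    # Hypothetical parser; option names mirror the attributes main() reads.
    parser = argparse.ArgumentParser(
        description='Copy Kronos streams from one server/namespace to another.')
    parser.add_argument('--read-url', required=True)
    parser.add_argument('--read-namespace', default=None)
    parser.add_argument('--write-url', required=True)
    parser.add_argument('--write-namespace', default=None)
    parser.add_argument('--copy-period-seconds', type=int, default=3600)
    parser.add_argument('--stream-file', type=argparse.FileType('r'),
                        required=True,
                        help='File containing one stream name per line.')
    parser.add_argument('--start', type=parse_datetime, required=True)
    parser.add_argument('--end', type=parse_datetime, required=True)
    return parser.parse_args()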
Example #3
def main(args):
    client = KronosClient(args.kronos_url)
    results = client.get(args.stream,
                         args.start,
                         args.end,
                         namespace=args.namespace)
    if args.display == 'print':
        if args.type == 'json':
            events = []
            for event in results:
                events.append(event)
            print json.dumps(events)
        elif args.type == 'one-per-line':
            for event in results:
                print event
    elif args.display == 'csv':
        writer = csv.DictWriter(sys.stdout, args.fields)
        if not args.remove_header:
            writer.writeheader()
        for event in results:
            row_values = {}
            for field in args.fields:
                field_value = get_property(event, field)
                row_values[field] = (field_value.encode('utf-8') if isinstance(
                    field_value, unicode) else field_value)
            writer.writerow(row_values)
    elif args.display == 'aggregate':
        aggregates = aggregate_stream(results, AGGREGATORS[args.aggregator],
                                      args.field, args.time_bucket_width)
        print 'Bucket, Aggregate'
        for bucket, aggregate in aggregates:
            print '%s, %s' % (datetime.fromtimestamp(bucket), aggregate)
    else:
        raise Exception('Invalid display option {}'.format(args.display))
Example #4
def main(args):
  client = KronosClient(args.kronos_url)
  results = client.get(args.stream, args.start, args.end,
                       namespace=args.namespace)
  if args.display == 'print':
    if args.type == 'json':
      events = []
      for event in results:
        events.append(event)
      print json.dumps(events)
    elif args.type == 'one-per-line':
      for event in results:
        print event
  elif args.display == 'csv':
    writer = csv.DictWriter(sys.stdout, args.fields)
    if not args.remove_header:
      writer.writeheader()
    for event in results:
      row_values = {}
      for field in args.fields:
        field_value = get_property(event, field)
        row_values[field] = (field_value.encode('utf-8')
                             if isinstance(field_value, unicode)
                             else field_value)
      writer.writerow(row_values)
  elif args.display == 'aggregate':
    aggregates = aggregate_stream(results, AGGREGATORS[args.aggregator],
                                  args.field, args.time_bucket_width)
    print 'Bucket, Aggregate'
    for bucket, aggregate in aggregates:
      print '%s, %s' % (datetime.fromtimestamp(bucket), aggregate)
  else:
    raise Exception('Invalid display option {}'.format(args.display))
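Examples #3 and #4 call get_property(event, field) to pull each CSV column out of an event, but that helper is not shown on this page. A minimal sketch of the dotted-path lookup it could plausibly be (the real implementation may differ):

def get_property(event, field):
    # Hypothetical helper: resolve a possibly dotted field path such as
    # 'user.id' against a nested event dict; return None if any part of
    # the path is missing.
    value = event
    for part in field.split('.'):
        if not isinstance(value, dict):
            return None
        value = value.get(part)
        if value is None:
            return None
    return value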
Example #5
    def execute_kronos_stream(self, node):
        from pykronos import KronosClient

        client = KronosClient(node.host, blocking=True)
        return client.get(node.stream,
                          node.start_time,
                          node.end_time,
                          namespace=node.namespace)
Example #6
  def execute_kronos_stream(self, node):
    from pykronos import KronosClient

    client = KronosClient(node.host, blocking=True)
    return client.get(node.stream,
                      node.start_time,
                      node.end_time,
                      namespace=node.namespace)
Example #7
  def execute_kronos_stream(self, node):
    # TODO(usmanm): Read time slices of events in parallel from worker nodes.
    from pykronos import KronosClient

    client = KronosClient(node.host, blocking=True)
    events = client.get(node.stream,
                        node.start_time,
                        node.end_time,
                        namespace=node.namespace)
    return self.context.parallelize(events)
Example #8
 def get_events(i):
   client = KronosClient(node._host, blocking=True)
   start_time = node.start_time + (i * delta)
   if i == executor.parallelism - 1:
     end_time = node.end_time
   else:
     end_time = start_time + delta - 1
   return list(client.get(node.stream,
                          start_time,
                          end_time,
                          namespace=node.namespace))
Example #9
 def get_events(i):
     client = KronosClient(node._host, blocking=True)
     start_time = node.start_time + (i * delta)
     if i == executor.parallelism - 1:
         end_time = node.end_time
     else:
         end_time = start_time + delta - 1
     return list(
         client.get(node.stream,
                    start_time,
                    end_time,
                    namespace=node.namespace))
Example #10
 def get_events(i):
   from pykronos import KronosClient
   client = KronosClient(node.host, blocking=True)
   start_time = node.start_time + (i * delta)
   if i == self.parallelism - 1:
     end_time = node.end_time
   else:
     end_time = start_time + delta - 1
   return list(client.get(node.stream,
                          start_time,
                          end_time,
                          namespace=node.namespace))
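In Examples #8–#10, worker i reads the slice [start_time + i * delta, start_time + (i + 1) * delta - 1], with the last worker clamped to node.end_time. delta itself is defined outside these snippets; presumably it is the query window divided by the degree of parallelism. A standalone sketch of that slicing, using integer timestamps purely for illustration:

def time_slices(start_time, end_time, parallelism):
    # Split [start_time, end_time] into `parallelism` contiguous slices,
    # mirroring the arithmetic in get_events() above: every slice is
    # `delta` wide except the last, which absorbs the remainder.
    delta = (end_time - start_time) // parallelism
    slices = []
    for i in range(parallelism):
        slice_start = start_time + i * delta
        slice_end = end_time if i == parallelism - 1 else slice_start + delta - 1
        slices.append((slice_start, slice_end))
    return slices

# time_slices(0, 100, 4) -> [(0, 24), (25, 49), (50, 74), (75, 100)]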
Example #11
def main(args):
    client = KronosClient(args.kronos_url)
    headers = [
        'stream',
        'total_events',
        'events_per_day',
        'events_per_sec',
        'payload_total_bytes',
        'payload_avg_bytes',
        'payload_med_bytes',
        'payload_95_bytes',
        'payload_99_bytes',
        'schema',
    ]
    if args.csv:
        csv_file = open(args.csv, 'w')
        writer = csv.DictWriter(csv_file, headers)
        writer.writeheader()
    else:
        print '-' * 79
    for stream in client.get_streams():
        total_events = 0
        payloads = []
        for event in client.get(stream, args.start, args.end):
            payloads.append(len(ujson.dumps(event)))
            total_events += 1
        if total_events == 0:
            indent('%s has no events' % stream, 2)
            print '-' * 79
            continue
        timeframe_sec = (args.end - args.start).total_seconds()
        schema = client.infer_schema(stream)['schema']
        context = dict(
            zip(headers, [
                stream,
                total_events,
                (float(total_events) / timeframe_sec) * 60 * 60 * 24,
                float(total_events) / timeframe_sec,
                np.sum(payloads),
                np.mean(payloads),
                np.median(payloads),
                np.percentile(payloads, 95),
                np.percentile(payloads, 99),
                schema,
            ]))
        if args.csv:
            writer.writerow(context)
        else:
            indent(output % context, 2)
            print '-' * 79
Example #12
def main(args):
  client = KronosClient(args.kronos_url)
  headers = [
    'stream',
    'total_events',
    'events_per_day',
    'events_per_sec',
    'payload_total_bytes',
    'payload_avg_bytes',
    'payload_med_bytes',
    'payload_95_bytes',
    'payload_99_bytes',
    'schema',
  ]
  if args.csv:
    csv_file = open(args.csv, 'w')
    writer = csv.DictWriter(csv_file, headers)
    writer.writeheader()
  else:
    print '-' * 79
  for stream in client.get_streams():
    total_events = 0
    payloads = []
    for event in client.get(stream, args.start, args.end):
      payloads.append(len(ujson.dumps(event)))
      total_events += 1
    if total_events == 0:
      indent('%s has no events' % stream, 2)
      print '-' * 79
      continue
    timeframe_sec = (args.end - args.start).total_seconds()
    schema = client.infer_schema(stream)['schema']
    context = dict(zip(headers, [
      stream,
      total_events,
      (float(total_events) / timeframe_sec) * 60 * 60 * 24,
      float(total_events) / timeframe_sec,
      np.sum(payloads),
      np.mean(payloads),
      np.median(payloads),
      np.percentile(payloads, 95),
      np.percentile(payloads, 99),
      schema,
    ]))
    if args.csv:
      writer.writerow(context)
    else: 
      indent(output % context, 2)
      print '-' * 79
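Examples #11 and #12 rely on an indent() helper and an output template that are defined elsewhere in the original script. The sketch below is only an assumption consistent with how they are used here: indent() presumably prints text with a left margin, and output is presumably a %-style template keyed on the same names as headers.

def indent(text, spaces):
    # Hypothetical helper: print `text` with each line prefixed by
    # `spaces` spaces.
    prefix = ' ' * spaces
    print '\n'.join(prefix + line for line in str(text).split('\n'))


# Hypothetical report template; the keys match the `headers` list above.
output = '\n'.join('%s: %%(%s)s' % (header, header) for header in [
    'stream', 'total_events', 'events_per_day', 'events_per_sec',
    'payload_total_bytes', 'payload_avg_bytes', 'payload_med_bytes',
    'payload_95_bytes', 'payload_99_bytes', 'schema'])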
Example #13
 def execute(self, node, executor):
   client = KronosClient(node._host, blocking=True)
   return client.get(node.stream,
                     node.start_time,
                     node.end_time,
                     namespace=node.namespace)
Example #14
 def execute(self, node, executor):
     client = KronosClient(node._host, blocking=True)
     return client.get(node.stream,
                       node.start_time,
                       node.end_time,
                       namespace=node.namespace)
Example #15
def main(args):
    client1 = KronosClient(args.kronos_url1, namespace=args.namespace1)
    client2 = KronosClient(args.kronos_url2, namespace=args.namespace2)

    if args.streams_file:
        streams = map(
            lambda s: (s, s),  # Use same stream name for both.
            filter(lambda s: len(s),
                   open(args.streams_file).read().split('\n')))
    else:
        streams = [(args.stream1, args.stream2)]

    for stream1_name, stream2_name in streams:
        if args.num_samples:
            samples = []
            for _ in xrange(args.num_samples):
                start = random.randint(args.start,
                                       args.end - args.sample_interval)
                samples.append((start, start + args.sample_interval))
        else:
            samples = [(args.start, args.end)]

        total_stream1 = 0
        extra_stream1 = 0
        total_stream2 = 0
        extra_stream2 = 0

        for start, end in samples:
            stream1 = client1.get(stream1_name, start, end)
            stream2 = client2.get(stream2_name, start, end)

            # Sorting of events with the same timestamp may vary across backends,
            # hence we can't do a simple loop comparison. We need to aggregate all
            # events with the same timestamp from both streams and then compare the
            # two sets.
            stream1_hashes = set()
            stream2_hashes = set()
            current_timestamp = None
            while True:
                event1 = get_next(stream1)
                event2 = get_next(stream2)
                # Are both streams exhausted?
                if not (event1 or event2):
                    break
                # Pick the smaller timestamp from the two events.
                min_timestamp = min(event1.get(TIMESTAMP_FIELD, sys.maxint),
                                    event2.get(TIMESTAMP_FIELD, sys.maxint))
                if current_timestamp is None:
                    current_timestamp = min_timestamp
                # If min_timestamp is greater than current_timestamp, then aggregate
                # stats for current_timestamp and roll over.
                if min_timestamp > current_timestamp:
                    total_stream1 += len(stream1_hashes)
                    total_stream2 += len(stream2_hashes)
                    extra_stream1 += len(stream1_hashes - stream2_hashes)
                    extra_stream2 += len(stream2_hashes - stream1_hashes)
                    stream1_hashes.clear()
                    stream2_hashes.clear()
                    current_timestamp = min_timestamp

                if event1:
                    assert event1[TIMESTAMP_FIELD] >= current_timestamp
                    if event1[TIMESTAMP_FIELD] == current_timestamp:
                        del event1[ID_FIELD]
                        stream1_hashes.add(
                            hashlib.sha224(json.dumps(
                                event1, sort_keys=True)).hexdigest())
                    else:
                        stream1 = push_back(event1, stream1)

                if event2:
                    assert event2[TIMESTAMP_FIELD] >= current_timestamp
                    if event2[TIMESTAMP_FIELD] == current_timestamp:
                        del event2[ID_FIELD]
                        stream2_hashes.add(
                            hashlib.sha224(json.dumps(
                                event2, sort_keys=True)).hexdigest())
                    else:
                        stream2 = push_back(event2, stream2)

        print 'Diff: [%s/%s], [%s/%s]' % (args.namespace1, stream1_name,
                                          args.namespace2, stream2_name)
        print '< total: %d' % total_stream1
        print '> total: %d' % total_stream2
        print '< extra: %d' % extra_stream1
        print '> extra: %d' % extra_stream2
        print
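Examples #15 and #16 use two helpers that are not shown on this page: get_next(stream) and push_back(event, stream). From their usage, an exhausted stream must yield a falsy value that still supports .get() (so the min() over timestamps keeps working), and a peeked event must be re-attachable to the front of the iterator. A plausible sketch under those assumptions:

import itertools


def get_next(stream):
    # Return the next event, or an empty dict once the stream is
    # exhausted (falsy, but .get() still works on it).
    try:
        return next(stream)
    except StopIteration:
        return {}


def push_back(event, stream):
    # Return an iterator that yields `event` first, then the rest of
    # `stream`.
    return itertools.chain([event], stream)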
Example #16
def main(args):
  client1 = KronosClient(args.kronos_url1, namespace=args.namespace1)
  client2 = KronosClient(args.kronos_url2, namespace=args.namespace2)

  if args.streams_file:
    streams = map(lambda s: (s, s), # Use same stream name for both.
                  filter(lambda s: len(s),
                         open(args.streams_file).read().split('\n')))
  else:
    streams = [(args.stream1, args.stream2)]

  for stream1_name, stream2_name in streams:
    if args.num_samples:
      samples = []
      for _ in xrange(args.num_samples):
        start = random.randint(args.start, args.end - args.sample_interval)
        samples.append((start, start + args.sample_interval))
    else:
      samples = [(args.start, args.end)]

    total_stream1 = 0
    extra_stream1 = 0
    total_stream2 = 0
    extra_stream2 = 0

    for start, end in samples:
      stream1 = client1.get(stream1_name, start, end)
      stream2 = client2.get(stream2_name, start, end)

      # Sorting of events with the same timestamp may vary across backends,
      # hence we can't do a simple loop comparison. We need to aggregate all
      # events with the same timestamp from both streams and then compare the
      # two sets.
      stream1_hashes = set()
      stream2_hashes = set()
      current_timestamp = None
      while True:
        event1 = get_next(stream1)
        event2 = get_next(stream2)
        # Are both streams exhausted?
        if not (event1 or event2):
          break
        # Pick the smaller timestamp from the two events.
        min_timestamp = min(event1.get(TIMESTAMP_FIELD, sys.maxint),
                            event2.get(TIMESTAMP_FIELD, sys.maxint))
        if current_timestamp is None:
          current_timestamp = min_timestamp
        # If min_timestamp is greater than current_timestamp, then aggregate
        # stats for current_timestamp and roll over.
        if min_timestamp > current_timestamp:
          total_stream1 += len(stream1_hashes)
          total_stream2 += len(stream2_hashes)
          extra_stream1 += len(stream1_hashes - stream2_hashes)
          extra_stream2 += len(stream2_hashes - stream1_hashes)
          stream1_hashes.clear()
          stream2_hashes.clear()
          current_timestamp = min_timestamp
    
        if event1:
          assert event1[TIMESTAMP_FIELD] >= current_timestamp
          if event1[TIMESTAMP_FIELD] == current_timestamp:
            del event1[ID_FIELD]
            stream1_hashes.add(
              hashlib.sha224(json.dumps(event1, sort_keys=True)).hexdigest())
          else:
            stream1 = push_back(event1, stream1)

        if event2:
          assert event2[TIMESTAMP_FIELD] >= current_timestamp
          if event2[TIMESTAMP_FIELD] == current_timestamp:
            del event2[ID_FIELD]
            stream2_hashes.add(
              hashlib.sha224(json.dumps(event2, sort_keys=True)).hexdigest())
          else:
            stream2 = push_back(event2, stream2)

    print 'Diff: [%s/%s], [%s/%s]' % (args.namespace1, stream1_name,
                                      args.namespace2, stream2_name)
    print '< total: %d' % total_stream1
    print '> total: %d' % total_stream2
    print '< extra: %d' % extra_stream1
    print '> extra: %d' % extra_stream2
    print
Example #17
class QueryCacheTest(unittest.TestCase):
  def setUp(self):
    self.client = KronosClient('http://localhost:9191/',
                               blocking=False,
                               sleep_block=0.2)
    self.total_events = 500
    self.computed_namespace = 'computed'
    self.increment = timedelta(minutes=1)
    self.start_time = datetime(2014, 6, 4, 22)
    self.bucket_width = timedelta(minutes=20)

  def compute_cache_test(function):
    """A wrapper that sets up a stream with test data.

    The stream takes the name of the function being run, and contains
    `self.total_events` events.  The events are each one
    `self.increment` apart.
    """
    @functools.wraps(function)
    def wrapper(self):
      self.stream = 'ComputeCacheTest_%s' % (function.__name__)
      for i in xrange(self.total_events):
        self.client.put({
          self.stream: [{TIMESTAMP_FIELD:
                         self.start_time + (self.increment * i),
                         'a': i % 5, 'b': i}]})
      self.client.flush()
      function(self)
    return wrapper

  def filter_and_sum(self, start_time, end_time):
    """Bin `self.stream` into buckets, returning the sum of `b` when `a` == 2.

    For all events between `start_time` and `end_time`, create an
    event for every 20-minute interval of events that contains the sum
    of `b` when `a`==2.
    """
    events = self.client.get(self.stream, start_time, end_time)
    counts = defaultdict(int)
    grouping_minutes = timedelta_to_kronos_time(self.bucket_width)
    for event in events:
      if event['a'] == 2:
        counts[event['@time'] -
               (event['@time'] % grouping_minutes)] += event['b']
    for group_time in sorted(counts.iterkeys()):
      yield {'@time': group_time, 'b_sum': counts[group_time]}

  def verify_results(self, result_func, cache, expected_results,
                     expected_computations):
    with patch.object(cache, '_compute_bucket',
                      wraps=cache._compute_bucket) as mock_method:
      results = result_func()
      self.assertEqual(mock_method.call_count, expected_computations)

    self.assertEqual(len(results), expected_results)
    result_time = self.start_time
    for idx, result in enumerate(results):
      self.assertEqual(result[TIMESTAMP_FIELD],
                       datetime_to_kronos_time(result_time))
      self.assertEqual(
        result['b_sum'],
        sum([2, 7, 12, 17] +
            [idx * 4 * (self.bucket_width.total_seconds() / 60)]))
      result_time += self.bucket_width

  def test_cache_exceptions(self):
    # Bucket width shouldn't be more granular than 1 second.
    def bad_bucket_width():
      return QueryCache(self.client, self.filter_and_sum,
                        self.bucket_width + timedelta(milliseconds=1),
                        self.computed_namespace)
    self.assertRaises(ValueError, bad_bucket_width)

    # start_time and end_time should align to bucket_width boundaries.
    cache = QueryCache(self.client, self.filter_and_sum,
                       self.bucket_width, self.computed_namespace)
    start_time = self.start_time - (self.bucket_width * 3)
    end_time = self.start_time + (self.total_events * self.increment) + (
      self.bucket_width * 3)

    def bad_start_boundary():
      return list(
        cache.retrieve_interval(start_time + timedelta(minutes=1),
                                end_time))
    self.assertRaises(ValueError, bad_start_boundary)

  @compute_cache_test
  def test_cache_layer(self):
    cache = QueryCache(self.client, self.filter_and_sum,
                       self.bucket_width, self.computed_namespace)
    start_time = self.start_time - (self.bucket_width * 3)
    end_time = self.start_time + (self.total_events * self.increment) + (
      self.bucket_width * 3)
    untrusted_time = self.start_time + (
      timedelta(minutes=(self.total_events / 2) - 25))

    # Verify all results were computed correctly.
    self.verify_results(lambda: list(
        cache.compute_and_cache_missing_buckets(start_time, end_time,
                                                untrusted_time)),
                        cache, 25, 31)

    # Verify only trusted results are cached.
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 11, 0)

    # Running the same operations twice should result in the same
    # results as before.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time, end_time,
                                                           untrusted_time)),
      cache, 25, 17)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 11, 0)

    # Expanding the time range without caching should also result in the same
    # results
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time - self.bucket_width,
                                           end_time + self.bucket_width)),
      cache, 11, 0)

    # But specifying compute_missing should get all results for the time range.
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time - self.bucket_width,
                                           end_time + self.bucket_width,
                                           compute_missing=True)),
      cache, 25, 19)

    # Overlapping time queries should result in the same
    # results as before, and benefit from the cache.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time -
                                                           self.bucket_width,
                                                           end_time +
                                                           self.bucket_width,
                                                           untrusted_time)),
      cache, 25, 19)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 11, 0)

    # Increasing the trusted time should increase the cached results.
    untrusted_time = untrusted_time + timedelta(minutes=40)
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time, end_time,
                                                           untrusted_time)),
      cache, 25, 17)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)

    # Decreasing trusted time shouldn't remove results.
    untrusted_time = untrusted_time - timedelta(minutes=40)
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time, end_time,
                                                           untrusted_time)),
      cache, 25, 15)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)

    # If there are two cached entries, that cached time should no
    # longer be returned.
    results = list(cache.retrieve_interval(start_time, end_time))
    duplicate_result = dict(results[10])
    duplicate_result['b_sum'] = 0
    self.client.put({cache._scratch_stream: [duplicate_result]},
                    namespace=cache._scratch_namespace)
    self.client.flush()
    safe_results = list(cache.retrieve_interval(start_time, end_time))
    self.assertEqual(results[:10] + results[11:], safe_results)

    # Rerunning the cache/computation should re-cache the corrupted
    # element.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time, end_time,
                                                           untrusted_time)),
      cache, 25, 16)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)

    # Forcing computation should generate the same result set.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(
          start_time, end_time, untrusted_time, force_recompute=True)),
      cache, 25, 31)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)
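Stripped of the unittest scaffolding, the QueryCache calls exercised above reduce to the pattern below. The stream name, URL, bucket width, and transformation are placeholders, and the import path for QueryCache is an assumption; adjust it to wherever the class lives in your pykronos install.

from datetime import datetime, timedelta

from pykronos import KronosClient
# Import path assumed; see the note above.
from pykronos.utils.cache import QueryCache

client = KronosClient('http://localhost:9191/', blocking=False, sleep_block=0.2)


def bucketed_transform(start_time, end_time):
    # Any generator over the interval works here, analogous to
    # filter_and_sum() in the test above.
    for event in client.get('my_stream', start_time, end_time):
        yield event  # placeholder: aggregate per bucket in real code


cache = QueryCache(client, bucketed_transform, timedelta(minutes=20), 'computed')
start = datetime(2014, 6, 4, 22)
end = start + timedelta(hours=8)
untrusted = end - timedelta(hours=1)

# Compute whatever buckets are missing, caching only those that fall
# entirely before `untrusted`.
fresh = list(cache.compute_and_cache_missing_buckets(start, end, untrusted))
# Later reads can then be served from the cached namespace.
cached = list(cache.retrieve_interval(start, end))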