Example #1
def main(args):
    read_client = KronosClient(args.read_url, namespace=args.read_namespace)
    write_client = KronosClient(args.write_url,
                                namespace=args.write_namespace,
                                blocking=False)
    start_time = time.time()
    time_step = timedelta(seconds=args.copy_period_seconds)
    for stream in args.stream_file:
        stream = stream.rstrip()
        print 'Starting stream', stream, time.time() - start_time
        start = args.start
        # Keep track of the last ID we read, so we can re-run queries from
        # there.
        last_read_id = None
        while start <= args.end:
            print '...start is', start, time.time() - start_time
            end = min(args.end, start + time_step)
            if last_read_id is None:
                read_stream = read_client.get(stream, start, end)
            else:
                read_stream = read_client.get(stream,
                                              None,
                                              end,
                                              start_id=last_read_id)
            for event in read_stream:
                if event[ID_FIELD] != last_read_id:
                    last_read_id = event[ID_FIELD]
                    del event[ID_FIELD]
                    write_client.put({stream: [event]})
            start += time_step
            write_client.flush()
        print 'Completed stream', stream, time.time() - start_time
Example #2
def main(args):
  read_client = KronosClient(args.read_url, namespace=args.read_namespace)
  write_client = KronosClient(args.write_url, namespace=args.write_namespace,
                              blocking=False)
  start_time = time.time()
  time_step = timedelta(seconds=args.copy_period_seconds)
  for stream in args.stream_file:
    stream = stream.rstrip()
    print 'Starting stream', stream, time.time() - start_time
    start = args.start
    # Keep track of the last ID we read, so we can re-run queries from
    # there.
    last_read_id = None
    while start <= args.end:
      print '...start is', start, time.time() - start_time
      end = min(args.end, start + time_step)
      if last_read_id is None:
        read_stream = read_client.get(stream, start, end)
      else:
        read_stream = read_client.get(stream, None, end, start_id=last_read_id)
      for event in read_stream:
        if event[ID_FIELD] != last_read_id:
          last_read_id = event[ID_FIELD]
          del event[ID_FIELD]
          write_client.put({stream: [event]})
      start += time_step
      write_client.flush()
    print 'Completed stream', stream, time.time() - start_time
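The copy routine above pulls all of its configuration off an argparse-style namespace. A minimal sketch of the wiring it assumes follows; the flag names, date format, and defaults are hypothetical, chosen only to match the attributes main() reads (args.read_url, args.stream_file, args.start, and so on).

import argparse
from datetime import datetime


def parse_args():
  # Hypothetical CLI for the copy script; only the attribute names are
  # dictated by main() above.
  parser = argparse.ArgumentParser(
    description='Copy Kronos streams from one server to another.')
  parser.add_argument('--read-url', required=True)
  parser.add_argument('--read-namespace', default=None)
  parser.add_argument('--write-url', required=True)
  parser.add_argument('--write-namespace', default=None)
  parser.add_argument('--copy-period-seconds', type=int, default=3600)
  # A file with one stream name per line.
  parser.add_argument('--stream-file', type=argparse.FileType('r'),
                      required=True)
  # main() does datetime arithmetic on these, so parse them into datetimes.
  parser.add_argument('--start', required=True,
                      type=lambda s: datetime.strptime(s, '%Y-%m-%d'))
  parser.add_argument('--end', required=True,
                      type=lambda s: datetime.strptime(s, '%Y-%m-%d'))
  return parser.parse_args()


if __name__ == '__main__':
  main(parse_args())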
Example #3
def load_test_data(args):
    donations = ZipFile(StringIO(urllib2.urlopen(DONATIONS_FILE_URL).read()))
    donations = StringIO(donations.read('%s.csv' % DONATIONS_FILE_NAME))

    events = []
    rows = csv.DictReader(donations)
    for row in rows:
        row[TIMESTAMP_FIELD] = parse(row['contb_receipt_dt'])
        events.append(row)

    kc = KronosClient(args.kronos_url)
    kc.put({'donations': events})
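load_test_data assumes a module preamble roughly like the sketch below. The exact import paths (dateutil's parse, the Kronos names) and the two DONATIONS_* constants are assumptions for illustration; in particular, the URL is a placeholder rather than the dataset's real location.

import csv
import urllib2
from StringIO import StringIO
from zipfile import ZipFile

from dateutil.parser import parse            # flexible date string -> datetime
from pykronos import KronosClient             # assumed import path
from pykronos.client import TIMESTAMP_FIELD   # assumed import path

DONATIONS_FILE_NAME = 'donations'                        # placeholder name
DONATIONS_FILE_URL = 'http://example.com/donations.zip'  # placeholder URL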
Example #4
def load_test_data(args):
  donations = ZipFile(StringIO(urllib2.urlopen(DONATIONS_FILE_URL).read()))
  donations = StringIO(donations.read('%s.csv' % DONATIONS_FILE_NAME))

  events = []
  rows = csv.DictReader(donations)
  for row in rows:
    row[TIMESTAMP_FIELD] = parse(row['contb_receipt_dt'])
    events.append(row)

  kc = KronosClient(args.kronos_url)
  kc.put({'donations': events})
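Once loaded, the donation events can be read back over a time range with the same client. A minimal sketch, assuming a local Kronos endpoint; the URL and date range are placeholders.

from datetime import datetime
from pykronos import KronosClient  # assumed import path

kc = KronosClient('http://localhost:9191')
for event in kc.get('donations', datetime(2011, 1, 1), datetime(2013, 1, 1)):
  # Each event carries its Kronos '@time' and '@id' fields plus the CSV columns.
  print event['contb_receipt_dt']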
Example #5
def main(args):
  client = KronosClient(args.kronos_url, namespace=args.namespace,
                        blocking=False)
  increment = timedelta(microseconds=args.microseconds_between_events)
  event = {'property%s' % (idx): idx 
           for idx in xrange(args.properties_per_event)}
  start_time = time.time()
  for idx in xrange(args.num_events):
    event[TIMESTAMP_FIELD] = args.start + (idx * increment)
    client.put({args.stream: [event]})
    if (idx % args.chunk_size) == 0:
      print 'Completed', idx, 'events', time.time() - start_time
      client.flush()
  client.flush()
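The generator above, like the copy script earlier, leans on the client's non-blocking mode: with blocking=False, put() buffers events on the client and flush() blocks until the buffered events have been sent to the server. A minimal sketch of the pattern, with a placeholder URL and stream name:

from datetime import datetime
from pykronos import KronosClient  # assumed import path

client = KronosClient('http://localhost:9191', blocking=False)
client.put({'my_stream': [{'@time': datetime(2014, 6, 4, 22), 'value': 1}]})
client.flush()  # wait until everything buffered so far has been written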
Example #6
class QueryCacheTest(unittest.TestCase):
  def setUp(self):
    self.client = KronosClient('http://localhost:9191/',
                               blocking=False,
                               sleep_block=0.2)
    self.total_events = 500
    self.computed_namespace = 'computed'
    self.increment = timedelta(minutes=1)
    self.start_time = datetime(2014, 6, 4, 22)
    self.bucket_width = timedelta(minutes=20)

  def compute_cache_test(function):
    """A wrapper that sets up a stream with test data.

    The stream takes the name of the function being run, and contains
    `self.total_events` events.  The events are each one
    `self.increment` apart.
    """
    @functools.wraps(function)
    def wrapper(self):
      self.stream = 'ComputeCacheTest_%s' % (function.__name__)
      for i in xrange(self.total_events):
        self.client.put({
          self.stream: [{TIMESTAMP_FIELD:
                         self.start_time + (self.increment * i),
                         'a': i % 5, 'b': i}]})
      self.client.flush()
      function(self)
    return wrapper

  def filter_and_sum(self, start_time, end_time):
    """Bin `self.stream` into buckets, returning the sum of `b` when `a` == 2.

    For all events between `start_time` and `end_time`, create an
    event for every 20-minute interval of events that contains the sum
    of `b` when `a`==2.
    """
    events = self.client.get(self.stream, start_time, end_time)
    counts = defaultdict(int)
    grouping_minutes = timedelta_to_kronos_time(self.bucket_width)
    for event in events:
      if event['a'] == 2:
        counts[event['@time'] -
               (event['@time'] % grouping_minutes)] += event['b']
    for group_time in sorted(counts.iterkeys()):
      yield {'@time': group_time, 'b_sum': counts[group_time]}

  def verify_results(self, result_func, cache, expected_results,
                     expected_computations):
    with patch.object(cache, '_compute_bucket',
                      wraps=cache._compute_bucket) as mock_method:
      results = result_func()
      self.assertEqual(mock_method.call_count, expected_computations)

    self.assertEqual(len(results), expected_results)
    result_time = self.start_time
    for idx, result in enumerate(results):
      self.assertEqual(result[TIMESTAMP_FIELD],
                       datetime_to_kronos_time(result_time))
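      # Each 20-minute bucket spans 20 one-minute events; the four with
      # a == 2 have b values 20*idx + {2, 7, 12, 17}, so bucket idx sums
      # to 38 + 80*idx, which is what the expression below computes.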
      self.assertEqual(
        result['b_sum'],
        sum([2, 7, 12, 17] +
            [idx * 4 * (self.bucket_width.total_seconds() / 60)]))
      result_time += self.bucket_width

  def test_cache_exceptions(self):
    # Bucket width shouldn't be more granular than 1 second.
    def bad_bucket_width():
      return QueryCache(self.client, self.filter_and_sum,
                        self.bucket_width + timedelta(milliseconds=1),
                        self.computed_namespace)
    self.assertRaises(ValueError, bad_bucket_width)

    # start_time and end_time should align to bucket_width boundaries.
    cache = QueryCache(self.client, self.filter_and_sum,
                       self.bucket_width, self.computed_namespace)
    start_time = self.start_time - (self.bucket_width * 3)
    end_time = self.start_time + (self.total_events * self.increment) + (
      self.bucket_width * 3)

    def bad_start_boundary():
      return list(
        cache.retrieve_interval(start_time + timedelta(minutes=1),
                                end_time))
    self.assertRaises(ValueError, bad_start_boundary)

  @compute_cache_test
  def test_cache_layer(self):
    cache = QueryCache(self.client, self.filter_and_sum,
                       self.bucket_width, self.computed_namespace)
    start_time = self.start_time - (self.bucket_width * 3)
    end_time = self.start_time + (self.total_events * self.increment) + (
      self.bucket_width * 3)
    untrusted_time = self.start_time + (
      timedelta(minutes=(self.total_events / 2) - 25))

    # Verify all results were computed correctly.
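    # The queried range spans 31 buckets: 25 buckets of data (500 one-minute
    # events / 20 minutes per bucket) plus 3 empty buckets on each side, so 31
    # buckets are computed but only 25 produce results.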
    self.verify_results(lambda: list(
        cache.compute_and_cache_missing_buckets(start_time, end_time,
                                                untrusted_time)),
                        cache, 25, 31)

    # Verify only trusted results are cached.
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 11, 0)

    # Running the same operations twice should result in the same
    # results as before.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time, end_time,
                                                           untrusted_time)),
      cache, 25, 17)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 11, 0)

    # Expanding the time range without caching should also produce the same
    # results.
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time - self.bucket_width,
                                           end_time + self.bucket_width)),
      cache, 11, 0)

    # But specifying compute_missing should return all results for the time range.
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time - self.bucket_width,
                                           end_time + self.bucket_width,
                                           compute_missing=True)),
      cache, 25, 19)

    # Overlapping time queries should result in the same
    # results as before, and benefit from the cache.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time -
                                                           self.bucket_width,
                                                           end_time +
                                                           self.bucket_width,
                                                           untrusted_time)),
      cache, 25, 19)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 11, 0)

    # Increasing the trusted time should increase the cached results.
    untrusted_time = untrusted_time + timedelta(minutes=40)
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time, end_time,
                                                           untrusted_time)),
      cache, 25, 17)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)

    # Decreasing trusted time shouldn't remove results.
    untrusted_time = untrusted_time - timedelta(minutes=40)
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time, end_time,
                                                           untrusted_time)),
      cache, 25, 15)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)

    # If there are two cached entries for the same bucket, that bucket's
    # cached time should no longer be returned.
    results = list(cache.retrieve_interval(start_time, end_time))
    duplicate_result = dict(results[10])
    duplicate_result['b_sum'] = 0
    self.client.put({cache._scratch_stream: [duplicate_result]},
                    namespace=cache._scratch_namespace)
    self.client.flush()
    safe_results = list(cache.retrieve_interval(start_time, end_time))
    self.assertEqual(results[:10] + results[11:], safe_results)

    # Rerunning the cache/computation should re-cache the corrupted
    # element.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time, end_time,
                                                           untrusted_time)),
      cache, 25, 16)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)

    # Forcing computation should generate the same result set.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(
          start_time, end_time, untrusted_time, force_recompute=True)),
      cache, 25, 31)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)
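Outside of a test, the QueryCache workflow exercised above boils down to the following sketch. The import path for QueryCache, the server URL, and the stream name are assumptions; as test_cache_exceptions checks, the bucket width cannot be more granular than one second and the queried interval must align to bucket_width boundaries.

from datetime import datetime, timedelta
from pykronos import KronosClient            # assumed import path
from pykronos.utils.cache import QueryCache  # assumed import path

client = KronosClient('http://localhost:9191/', blocking=False)

def b_sum_per_bucket(start_time, end_time):
  # Mirrors filter_and_sum above: sum `b` over events with `a` == 2 in the
  # requested interval and emit one timestamped event for it (timestamping
  # at the interval start is a simplification of filter_and_sum's grouping).
  total = 0
  for event in client.get('some_stream', start_time, end_time):
    if event.get('a') == 2:
      total += event['b']
  yield {'@time': start_time, 'b_sum': total}

cache = QueryCache(client, b_sum_per_bucket, timedelta(minutes=20), 'computed')

start = datetime(2014, 6, 4)
end = datetime(2014, 6, 5)
untrusted_time = datetime(2014, 6, 4, 12)

# Compute every missing bucket in [start, end); buckets that fall entirely
# before untrusted_time are considered final and get cached in the
# 'computed' namespace.
fresh = list(cache.compute_and_cache_missing_buckets(start, end, untrusted_time))

# Later reads come straight from the cache. Pass compute_missing=True to fill
# holes on read, or force_recompute=True above to rebuild buckets from scratch.
cached = list(cache.retrieve_interval(start, end))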