Example #1
def main(args):
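    # Copy every stream listed in args.stream_file from the read client to the
    # write client, one copy_period_seconds window at a time, resuming from the
    # last event ID seen so events are not written twice.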
    read_client = KronosClient(args.read_url, namespace=args.read_namespace)
    write_client = KronosClient(args.write_url,
                                namespace=args.write_namespace,
                                blocking=False)
    start_time = time.time()
    time_step = timedelta(seconds=args.copy_period_seconds)
    for stream in args.stream_file:
        stream = stream.rstrip()
        print 'Starting stream', stream, time.time() - start_time
        start = args.start
        # Keep track of the last ID we read so we can resume queries from there.
        last_read_id = None
        while start <= args.end:
            print '...start is', start, time.time() - start_time
            end = min(args.end, start + time_step)
            if last_read_id is None:
                read_stream = read_client.get(stream, start, end)
            else:
                read_stream = read_client.get(stream,
                                              None,
                                              end,
                                              start_id=last_read_id)
            for event in read_stream:
                if event[ID_FIELD] != last_read_id:
                    last_read_id = event[ID_FIELD]
                    del event[ID_FIELD]
                    write_client.put({stream: [event]})
            start += time_step
            write_client.flush()
        print 'Completed stream', stream, time.time() - start_time
Example #2
def main(args):
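  # Fetch events from a single stream and display them as JSON / one-per-line
  # prints, CSV rows, or time-bucketed aggregates, depending on args.display.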
  client = KronosClient(args.kronos_url)
  results = client.get(args.stream, args.start, args.end,
                       namespace=args.namespace)
  if args.display == 'print':
    if args.type == 'json':
      events = []
      for event in results:
        events.append(event)
      print json.dumps(events)
    elif args.type == 'one-per-line':
      for event in results:
        print event
  elif args.display == 'csv':
    writer = csv.DictWriter(sys.stdout, args.fields)
    if not args.remove_header:
      writer.writeheader()
    for event in results:
      row_values = {}
      for field in args.fields:
        field_value = get_property(event, field)
        row_values[field] = (field_value.encode('utf-8')
                             if isinstance(field_value, unicode)
                             else field_value)
      writer.writerow(row_values)
  elif args.display == 'aggregate':
    aggregates = aggregate_stream(results, AGGREGATORS[args.aggregator],
                                  args.field, args.time_bucket_width)
    print 'Bucket, Aggregate'
    for bucket, aggregate in aggregates:
      print '%s, %s' % (datetime.fromtimestamp(bucket), aggregate)
  else:
    raise Exception('Invalid display option {}'.format(args.display))
Example #3
def main(args):
    client = KronosClient(args.kronos_url)
    results = client.get(args.stream,
                         args.start,
                         args.end,
                         namespace=args.namespace)
    if args.display == 'print':
        if args.type == 'json':
            events = []
            for event in results:
                events.append(event)
            print json.dumps(events)
        elif args.type == 'one-per-line':
            for event in results:
                print event
    elif args.display == 'csv':
        writer = csv.DictWriter(sys.stdout, args.fields)
        if not args.remove_header:
            writer.writeheader()
        for event in results:
            row_values = {}
            for field in args.fields:
                field_value = get_property(event, field)
                row_values[field] = (field_value.encode('utf-8') if isinstance(
                    field_value, unicode) else field_value)
            writer.writerow(row_values)
    elif args.display == 'aggregate':
        aggregates = aggregate_stream(results, AGGREGATORS[args.aggregator],
                                      args.field, args.time_bucket_width)
        print 'Bucket, Aggregate'
        for bucket, aggregate in aggregates:
            print '%s, %s' % (datetime.fromtimestamp(bucket), aggregate)
    else:
        raise Exception('Invalid display option {}'.format(args.display))
Example #4
def streams():
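  # Return the sorted list of stream names in the configured Kronos namespace.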
  kc = KronosClient(current_app.config['KRONOS_URL'],
                    namespace=current_app.config['KRONOS_NAMESPACE'])
  kstreams = kc.get_streams(namespace=current_app.config['KRONOS_NAMESPACE'])
  kstreams = sorted(kstreams)
  return {
    'streams': kstreams,
  }
Example #5
    def execute_kronos_stream(self, node):
        from pykronos import KronosClient

        client = KronosClient(node.host, blocking=True)
        return client.get(node.stream,
                          node.start_time,
                          node.end_time,
                          namespace=node.namespace)
Example #6
def streams():
  client = KronosClient(app.config['KRONOS_URL'],
                        namespace=app.config['KRONOS_NAMESPACE'])
  kronos_streams = client.get_streams(namespace=app.config['KRONOS_NAMESPACE'])
  kronos_streams = sorted(kronos_streams)
  return {
    'streams': kronos_streams,
  }
Example #7
def streams():
    kc = KronosClient(current_app.config['KRONOS_URL'],
                      namespace=current_app.config['KRONOS_NAMESPACE'])
    kstreams = kc.get_streams(namespace=current_app.config['KRONOS_NAMESPACE'])
    kstreams = sorted(kstreams)
    return {
        'streams': kstreams,
    }
Example #8
  def execute_kronos_stream(self, node):
    from pykronos import KronosClient

    client = KronosClient(node.host, blocking=True)
    return client.get(node.stream,
                      node.start_time,
                      node.end_time,
                      namespace=node.namespace)
Example #9
def streams():
  client = KronosClient(app.config['KRONOS_URL'],
                        namespace=app.config['KRONOS_NAMESPACE'])
  kronos_streams = client.get_streams(namespace=app.config['KRONOS_NAMESPACE'])
  kronos_streams = sorted(kronos_streams)
  return {
    'streams': kronos_streams,
  }
Example #10
  def execute_kronos_stream(self, node):
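    # Fetch the node's time range of events from Kronos and distribute them as
    # a parallel collection via the execution context.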
    # TODO(usmanm): Read time slices of events in parallel from worker nodes.
    from pykronos import KronosClient

    client = KronosClient(node.host, blocking=True)
    events = client.get(node.stream,
                        node.start_time,
                        node.end_time,
                        namespace=node.namespace)
    return self.context.parallelize(events)
Example #11
 def get_events(i):
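   # Fetch the i-th of `executor.parallelism` equal time slices of the stream;
   # the last slice absorbs any remainder up to node.end_time.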
   client = KronosClient(node._host, blocking=True)
   start_time = node.start_time + (i * delta)
   if i == executor.parallelism - 1:
     end_time = node.end_time
   else:
     end_time = start_time + delta - 1
   return list(client.get(node.stream,
                          start_time,
                          end_time,
                          namespace=node.namespace))
Example #12
 def get_events(i):
     client = KronosClient(node._host, blocking=True)
     start_time = node.start_time + (i * delta)
     if i == executor.parallelism - 1:
         end_time = node.end_time
     else:
         end_time = start_time + delta - 1
     return list(
         client.get(node.stream,
                    start_time,
                    end_time,
                    namespace=node.namespace))
Example #13
def load_test_data(args):
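  # Download the donations zip archive, parse its CSV, timestamp each row from
  # its 'contb_receipt_dt' column, and write all rows to the 'donations' stream.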
  donations = ZipFile(StringIO(urllib2.urlopen(DONATIONS_FILE_URL).read()))
  donations = StringIO(donations.read('%s.csv' % DONATIONS_FILE_NAME))

  events = []
  rows = csv.DictReader(donations)
  for row in rows:
    row[TIMESTAMP_FIELD] = parse(row['contb_receipt_dt'])
    events.append(row)

  kc = KronosClient(args.kronos_url)
  kc.put({'donations': events})
Example #14
def load_test_data(args):
    donations = ZipFile(StringIO(urllib2.urlopen(DONATIONS_FILE_URL).read()))
    donations = StringIO(donations.read('%s.csv' % DONATIONS_FILE_NAME))

    events = []
    rows = csv.DictReader(donations)
    for row in rows:
        row[TIMESTAMP_FIELD] = parse(row['contb_receipt_dt'])
        events.append(row)

    kc = KronosClient(args.kronos_url)
    kc.put({'donations': events})
Example #15
 def get_events(i):
   from pykronos import KronosClient
   client = KronosClient(node.host, blocking=True)
   start_time = node.start_time + (i * delta)
   if i == self.parallelism - 1:
     end_time = node.end_time
   else:
     end_time = start_time + delta - 1
   return list(client.get(node.stream,
                          start_time,
                          end_time,
                          namespace=node.namespace))
Example #16
def main(args):
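    # For every stream, report event counts, per-day and per-second rates,
    # payload-size statistics, and the inferred schema over [args.start,
    # args.end], either as CSV or as indented console output.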
    client = KronosClient(args.kronos_url)
    headers = [
        'stream',
        'total_events',
        'events_per_day',
        'events_per_sec',
        'payload_total_bytes',
        'payload_avg_bytes',
        'payload_med_bytes',
        'payload_95_bytes',
        'payload_99_bytes',
        'schema',
    ]
    if args.csv:
        csv_file = open(args.csv, 'w')
        writer = csv.DictWriter(csv_file, headers)
        writer.writeheader()
    else:
        print '-' * 79
    for stream in client.get_streams():
        total_events = 0
        payloads = []
        for event in client.get(stream, args.start, args.end):
            payloads.append(len(ujson.dumps(event)))
            total_events += 1
        if total_events == 0:
            indent('%s has no events' % stream, 2)
            print '-' * 79
            continue
        timeframe_sec = (args.end - args.start).total_seconds()
        schema = client.infer_schema(stream)['schema']
        context = dict(
            zip(headers, [
                stream,
                total_events,
                (float(total_events) / timeframe_sec) * 60 * 60 * 24,
                float(total_events) / timeframe_sec,
                np.sum(payloads),
                np.mean(payloads),
                np.median(payloads),
                np.percentile(payloads, 95),
                np.percentile(payloads, 99),
                schema,
            ]))
        if args.csv:
            writer.writerow(context)
        else:
            indent(output % context, 2)
            print '-' * 79
Example #17
def main(args):
  read_client = KronosClient(args.read_url, namespace=args.read_namespace)
  write_client = KronosClient(args.write_url, namespace=args.write_namespace,
                              blocking=False)
  start_time = time.time()
  time_step = timedelta(seconds=args.copy_period_seconds)
  for stream in args.stream_file:
    stream = stream.rstrip()
    print 'Starting stream', stream, time.time() - start_time
    start = args.start
    # Keep track of the last ID we read so we can resume queries from there.
    last_read_id = None
    while start <= args.end:
      print '...start is', start, time.time() - start_time
      end = min(args.end, start + time_step)
      if last_read_id is None:
        read_stream = read_client.get(stream, start, end)
      else:
        read_stream = read_client.get(stream, None, end, start_id=last_read_id)
      for event in read_stream:
        if event[ID_FIELD] != last_read_id:
          last_read_id = event[ID_FIELD]
          del event[ID_FIELD]
          write_client.put({stream: [event]})
      start += time_step
      write_client.flush()
    print 'Completed stream', stream, time.time() - start_time
Example #18
def main(args):
  client = KronosClient(args.kronos_url)
  headers = [
    'stream',
    'total_events',
    'events_per_day',
    'events_per_sec',
    'payload_total_bytes',
    'payload_avg_bytes',
    'payload_med_bytes',
    'payload_95_bytes',
    'payload_99_bytes',
    'schema',
  ]
  if args.csv:
    csv_file = open(args.csv, 'w')
    writer = csv.DictWriter(csv_file, headers)
    writer.writeheader()
  else:
    print '-' * 79
  for stream in client.get_streams():
    total_events = 0
    payloads = []
    for event in client.get(stream, args.start, args.end):
      payloads.append(len(ujson.dumps(event)))
      total_events += 1
    if total_events == 0:
      indent('%s has no events' % stream, 2)
      print '-' * 79
      continue
    timeframe_sec = (args.end - args.start).total_seconds()
    schema = client.infer_schema(stream)['schema']
    context = dict(zip(headers, [
      stream,
      total_events,
      (float(total_events) / timeframe_sec) * 60 * 60 * 24,
      float(total_events) / timeframe_sec,
      np.sum(payloads),
      np.mean(payloads),
      np.median(payloads),
      np.percentile(payloads, 95),
      np.percentile(payloads, 99),
      schema,
    ]))
    if args.csv:
      writer.writerow(context)
    else: 
      indent(output % context, 2)
      print '-' * 79
Example #19
  def _run_query(self, start_time, end_time, unique_id=None):
    """Executes a Python query string and returns events

    Acts as a wrapper around exec that injects necessary local variables into
    the scope of the user-provided query blob.

    :param start_time: Python datetime to be injected into query
    :param end_time: Python datetime to be injected into query
    :param unique_id: An unused flag that allows the scheduler to hash this
    function uniquely based on its args when it passes through
    """
    client = KronosClient(self._app.config['KRONOS_URL'],
                          namespace=self._app.config['KRONOS_NAMESPACE'],
                          blocking=False,
                          sleep_block=0.2)

    locals_dict = {
      'kronos_client': client,
      'events': [],
      'start_time': start_time,
      'end_time': end_time,
    }

    try:
      exec self._query in {}, locals_dict  # No globals.
    except:
      _, exception, tb = sys.exc_info()
      raise PyCodeError(exception, traceback.format_tb(tb))

    events = sorted(locals_dict.get('events', []),
                    key=lambda event: event['@time'])

    return events
Example #20
  def _run_query(self, start_time, end_time, unique_id=None):
    """Executes a Python query string and returns events

    Acts as a wrapper around exec that injects necessary local variables into
    the scope of the user-provided query blob.

    :param start_time: Python datetime to be injected into query
    :param end_time: Python datetime to be injected into query
    :param unique_id: An unused flag that allows the scheduler to hash this
    function uniquely based on its args when it passes through
    """
    # XXX(derek): DEPRECATION WARNING
    # Use of the implicit Kronos client in pycode queries is deprecated
    client = KronosClient(self._app.config['KRONOS_URL'],
                          namespace=self._app.config['KRONOS_NAMESPACE'],
                          blocking=False,
                          sleep_block=0.2)
    locals_dict = {
      'kronos_client': client,
      'events': [],
      'start_time': start_time,
      'end_time': end_time,
    }
    try:
      exec self._query in {}, locals_dict  # No globals.
    except:
      _, exception, tb = sys.exc_info()
      raise PyCodeError(exception, traceback.format_tb(tb))

    # Retrieve the `events` variable as computed by the pycode.
    events = locals_dict.get('events', [])

    return events
Example #21
def main(args):
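  # List streams in args.namespace; when read-latency or fetch-timeout checks
  # are requested, only print streams that pass check_stream.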
  client = KronosClient(args.kronos_url)
  if args.fetch_timeout:
    start = args.start
    end = args.end
    limit = None
  else:
    start = 0
    end = datetime.utcnow()
    limit = 1000

  for stream in client.get_streams(namespace=args.namespace):
    if not (args.read_latency or args.fetch_timeout):
      print stream
    elif check_stream(client, args.namespace, stream, start, end, limit,
                      args.fetch_timeout, args.read_latency):
      print stream
Example #22
def main(args):
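    # Run funnel_analyze over the given streams and print the resulting
    # stream sizes.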
    client = KronosClient(args.kronos_url, namespace=args.namespace)
    unfiltered_streams = [(stream, None, args.user_field)
                          for stream in args.streams]
    stream_sizes = funnel_analyze(client, unfiltered_streams, args.start,
                                  args.end, args.end, {}, None)
    # TODO(marcua): print something more meaningful here.
    print stream_sizes
Example #23
def main(args):
    client = KronosClient(args.kronos_url)
    if args.fetch_timeout:
        start = args.start
        end = args.end
        limit = None
    else:
        start = 0
        end = datetime.utcnow()
        limit = 1000

    for stream in client.get_streams(namespace=args.namespace):
        if not (args.read_latency or args.fetch_timeout):
            print stream
        elif check_stream(client, args.namespace, stream, start, end, limit,
                          args.fetch_timeout, args.read_latency):
            print stream
Example #24
 def setUp(self):
   self.client = KronosClient('http://localhost:9191/',
                              blocking=False,
                              sleep_block=0.2)
   self.total_events = 500
   self.computed_namespace = 'computed'
   self.increment = timedelta(minutes=1)
   self.start_time = datetime(2014, 6, 4, 22)
   self.bucket_width = timedelta(minutes=20)
Example #25
 def setUp(self):
     self.kronos_client = KronosClient('http://localhost:9191')
     self.index_path = '1.0/index'
     self.source_path = '1.0/sources'
     self.streams_path = '1.0/streams/kronos'
     self.schema_path = '1.0/streams/kronos/%s'
     self.query_path = '1.0/query'
     self.server_url = 'http://localhost:9192/%s'
     self.executor = None
Example #26
def main(args):
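  # Write args.num_events synthetic events to one stream, spaced
  # args.microseconds_between_events apart, flushing the non-blocking client
  # every chunk_size events.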
  client = KronosClient(args.kronos_url, namespace=args.namespace,
                        blocking=False)
  increment = timedelta(microseconds=args.microseconds_between_events)
  event = {'property%s' % (idx): idx 
           for idx in xrange(args.properties_per_event)}
  start_time = time.time()
  for idx in xrange(args.num_events):
    event[TIMESTAMP_FIELD] = args.start + (idx * increment)
    client.put({args.stream: [event]})
    if (idx % args.chunk_size) == 0:
      print 'Completed', idx, 'events', time.time() - start_time
      client.flush()
  client.flush()
Example #27
 def setUp(self):
     self.kronos_client = KronosClient('http://localhost:9191')
     self.index_path = '1.0/index'
     self.query_path = '1.0/query'
     self.server_url = 'http://localhost:9192/%s'
     self.executor = None
Example #28
 def execute(self, node, executor):
   client = KronosClient(node._host, blocking=True)
   return client.get(node.stream,
                     node.start_time,
                     node.end_time,
                     namespace=node.namespace)
Example #29
 def setUp(self):
     self.blocking_client = KronosClient('http://localhost:9191/',
                                         blocking=True)
     self.nonblocking_client = KronosClient('http://localhost:9191/',
                                            blocking=False,
                                            sleep_block=0.2)
Example #30
def infer_schema(stream_name=None):
  kc = KronosClient(current_app.config['KRONOS_URL'],
                    namespace=current_app.config['KRONOS_NAMESPACE'])
  schema = kc.infer_schema(stream_name,
                           namespace=current_app.config['KRONOS_NAMESPACE'])
  return schema
Example #31
  def __init__(self, query, timeframe, bucket_width=None, untrusted_time=None,
               metis=False):
    """Initialize QueryCompute
    :param query: A string of python code to execute as a Jia query.
    :param timeframe: A timeframe dictionary. It specifies a mode, which can be
    'recent' or 'range'. Depending on which mode is selected, some of the other
    parameters will be unused. The unused parameters come from the frontend for
    the purposes of storing default/previous values. If the mode is recent,
    only 'value' and 'scale' are used. If the mode is 'range', only 'from' and
    'to' are used.

    Example timeframe:
    timeframe = {
      'mode': 'recent',
      'value': 1,
      'scale': 'days',
      'from': 'Sat Jun 10 2014 00:00:00',
      'to': 'Sun Jun 11 2014 00:00:00',
    }

    :param bucket_width: Optional bucket width in seconds
    :param untrusted_time: Optional untrusted time interval in seconds
    :param metis: Send `query` to metis for computation
    """
    try:
      self._app = current_app
      self._app.config  # The above line won't fail, but this one will
    except RuntimeError:
      from scheduler import get_app 
      self._app = get_app() 
    self._query = query
    self._bucket_width = bucket_width
    self._untrusted_time = untrusted_time
    self._metis = metis
    self._start_time, self._end_time = self._get_timeframe_bounds(timeframe,
                                                                  bucket_width)

    self._cache_client = KronosClient(
        self._app.config['CACHE_KRONOS_URL'],
        namespace=self._app.config['CACHE_KRONOS_NAMESPACE'],
        blocking=False, sleep_block=0.2)

    # The query is sent through as an unused unique_id argument so that the
    # QueryCache hash can properly uniquely identify it
    unique = {
      'unique_id': self._query
    }

    if self._metis:
      query_func = self._run_metis
    elif self._app.config['ALLOW_PYCODE']:
      query_func = self._run_query
    else:
      raise ValueError("`metis` must be `True` if ALLOW_PYCODE is not enabled")

    if self._bucket_width:
      bucket_width_timedelta = datetime.timedelta(seconds=bucket_width)
      self._query_cache = QueryCache(self._cache_client, query_func,
                                     bucket_width_timedelta,
                                     self._app.config['CACHE_KRONOS_NAMESPACE'],
                                     query_function_kwargs=unique)
Example #32
 def execute(self, node, executor):
     client = KronosClient(node._host, blocking=True)
     return client.get(node.stream,
                       node.start_time,
                       node.end_time,
                       namespace=node.namespace)
Example #33
def infer_schema(stream_name=None):
  client = KronosClient(app.config['KRONOS_URL'],
                        namespace=app.config['KRONOS_NAMESPACE'])
  schema = client.infer_schema(stream_name,
                               namespace=app.config['KRONOS_NAMESPACE'])
  return schema
Example #34
def main(args):
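    # Compare two streams (possibly from different Kronos servers or
    # namespaces) by hashing all events that share a timestamp, then report
    # the total and extra event counts on each side.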
    client1 = KronosClient(args.kronos_url1, namespace=args.namespace1)
    client2 = KronosClient(args.kronos_url2, namespace=args.namespace2)

    if args.streams_file:
        streams = map(
            lambda s: (s, s),  # Use same stream name for both.
            filter(lambda s: len(s),
                   open(args.streams_file).read().split('\n')))
    else:
        streams = [(args.stream1, args.stream2)]

    for stream1_name, stream2_name in streams:
        if args.num_samples:
            samples = []
            for _ in xrange(args.num_samples):
                start = random.randint(args.start,
                                       args.end - args.sample_interval)
                samples.append((start, start + args.sample_interval))
        else:
            samples = [(args.start, args.end)]

        total_stream1 = 0
        extra_stream1 = 0
        total_stream2 = 0
        extra_stream2 = 0

        for start, end in samples:
            stream1 = client1.get(stream1_name, start, end)
            stream2 = client2.get(stream2_name, start, end)

            # Sorting of events with the same timestamp may vary across backends,
            # hence we can't do a simple loop comparison. We need to aggregate all
            # events with the same timestamp from both streams and then compare the
            # two sets.
            stream1_hashes = set()
            stream2_hashes = set()
            current_timestamp = None
            while True:
                event1 = get_next(stream1)
                event2 = get_next(stream2)
                # Are both streams exhausted?
                if not (event1 or event2):
                    break
                # Pick the smaller timestamp from the two events.
                min_timestamp = min(event1.get(TIMESTAMP_FIELD, sys.maxint),
                                    event2.get(TIMESTAMP_FIELD, sys.maxint))
                if current_timestamp is None:
                    current_timestamp = min_timestamp
                # If min_timestamp is greater than current_timestamp, then aggregate
                # stats for current_timestamp and roll over.
                if min_timestamp > current_timestamp:
                    total_stream1 += len(stream1_hashes)
                    total_stream2 += len(stream2_hashes)
                    extra_stream1 += len(stream1_hashes - stream2_hashes)
                    extra_stream2 += len(stream2_hashes - stream1_hashes)
                    stream1_hashes.clear()
                    stream2_hashes.clear()
                    current_timestamp = min_timestamp

                if event1:
                    assert event1[TIMESTAMP_FIELD] >= current_timestamp
                    if event1[TIMESTAMP_FIELD] == current_timestamp:
                        del event1[ID_FIELD]
                        stream1_hashes.add(
                            hashlib.sha224(json.dumps(
                                event1, sort_keys=True)).hexdigest())
                    else:
                        stream1 = push_back(event1, stream1)

                if event2:
                    assert event2[TIMESTAMP_FIELD] >= current_timestamp
                    if event2[TIMESTAMP_FIELD] == current_timestamp:
                        del event2[ID_FIELD]
                        stream2_hashes.add(
                            hashlib.sha224(json.dumps(
                                event2, sort_keys=True)).hexdigest())
                    else:
                        stream2 = push_back(event2, stream2)

        print 'Diff: [%s/%s], [%s/%s]' % (args.namespace1, stream1_name,
                                          args.namespace2, stream2_name)
        print '< total: %d' % total_stream1
        print '> total: %d' % total_stream2
        print '< extra: %d' % extra_stream1
        print '> extra: %d' % extra_stream2
        print
Example #35
import time
import uuid

from kronos.common.time import epoch_time_to_kronos_time
from kronos.conf.constants import TIMESTAMP_FIELD
from pykronos import KronosClient

kronos = KronosClient('http://localhost:9191/', blocking=True)


def timeit(desc, func, *args, **kwargs):
    start = time.time()
    func(*args, **kwargs)
    print '  - %s took %ss' % (desc, time.time() - start)


def _make_event(t):
    return {
        TIMESTAMP_FIELD: epoch_time_to_kronos_time(t),
        'property1': str(uuid.uuid4()),
        'property2': str(uuid.uuid4()),
        'property3': str(uuid.uuid4())
    }


def insert(stream, n, chunk_size=10000):
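    # Insert n events into `stream` in chunks of chunk_size; timestamps cycle
    # over the first 1000 seconds of the epoch.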
    for _ in xrange(0, n, chunk_size):
        events = []
        for t in xrange(0, chunk_size):
            events.append(_make_event(t % 1000))
        kronos.put({stream: events})
Example #36
def infer_schema(stream_name=None):
    kc = KronosClient(current_app.config['KRONOS_URL'],
                      namespace=current_app.config['KRONOS_NAMESPACE'])
    schema = kc.infer_schema(stream_name,
                             namespace=current_app.config['KRONOS_NAMESPACE'])
    return schema
Example #37
class QueryCacheTest(unittest.TestCase):
  def setUp(self):
    self.client = KronosClient('http://localhost:9191/',
                               blocking=False,
                               sleep_block=0.2)
    self.total_events = 500
    self.computed_namespace = 'computed'
    self.increment = timedelta(minutes=1)
    self.start_time = datetime(2014, 6, 4, 22)
    self.bucket_width = timedelta(minutes=20)

  def compute_cache_test(function):
    """A wrapper that sets up a stream with test data.

    The stream takes the name of the function being run, and contains
    `self.total_events` events.  The events are each one
    `self.increment` apart.
    """
    @functools.wraps(function)
    def wrapper(self):
      self.stream = 'ComputeCacheTest_%s' % (function.__name__)
      for i in xrange(self.total_events):
        self.client.put({
          self.stream: [{TIMESTAMP_FIELD:
                         self.start_time + (self.increment * i),
                         'a': i % 5, 'b': i}]})
      self.client.flush()
      function(self)
    return wrapper

  def filter_and_sum(self, start_time, end_time):
    """Bin `self.stream` into buckets, returning the sum of `b` when `a` == 2.

    For all events between `start_time` and `end_time`, create an
    event for every 20-minute interval of events that contains the sum
    of `b` when `a`==2.
    """
    events = self.client.get(self.stream, start_time, end_time)
    counts = defaultdict(int)
    grouping_minutes = timedelta_to_kronos_time(self.bucket_width)
    for event in events:
      if event['a'] == 2:
        counts[event['@time'] -
               (event['@time'] % grouping_minutes)] += event['b']
    for group_time in sorted(counts.iterkeys()):
      yield {'@time': group_time, 'b_sum': counts[group_time]}

  def verify_results(self, result_func, cache, expected_results,
                     expected_computations):
    with patch.object(cache, '_compute_bucket',
                      wraps=cache._compute_bucket) as mock_method:
      results = result_func()
      self.assertEqual(mock_method.call_count, expected_computations)

    self.assertEqual(len(results), expected_results)
    result_time = self.start_time
    for idx, result in enumerate(results):
      self.assertEqual(result[TIMESTAMP_FIELD],
                       datetime_to_kronos_time(result_time))
      self.assertEqual(
        result['b_sum'],
        sum([2, 7, 12, 17] +
            [idx * 4 * (self.bucket_width.total_seconds() / 60)]))
      result_time += self.bucket_width

  def test_cache_exceptions(self):
    # Bucket width shouldn't be more granular than 1 second.
    def bad_bucket_width():
      return QueryCache(self.client, self.filter_and_sum,
                        self.bucket_width + timedelta(milliseconds=1),
                        self.computed_namespace)
    self.assertRaises(ValueError, bad_bucket_width)

    # start_time and end_time should align to bucket_width boundaries.
    cache = QueryCache(self.client, self.filter_and_sum,
                       self.bucket_width, self.computed_namespace)
    start_time = self.start_time - (self.bucket_width * 3)
    end_time = self.start_time + (self.total_events * self.increment) + (
      self.bucket_width * 3)

    def bad_start_boundary():
      return list(
        cache.retrieve_interval(start_time + timedelta(minutes=1),
                                end_time))
    self.assertRaises(ValueError, bad_start_boundary)

  @compute_cache_test
  def test_cache_layer(self):
    cache = QueryCache(self.client, self.filter_and_sum,
                       self.bucket_width, self.computed_namespace)
    start_time = self.start_time - (self.bucket_width * 3)
    end_time = self.start_time + (self.total_events * self.increment) + (
      self.bucket_width * 3)
    untrusted_time = self.start_time + (
      timedelta(minutes=(self.total_events / 2) - 25))

    # Verify all results were computed correctly.
    self.verify_results(lambda: list(
        cache.compute_and_cache_missing_buckets(start_time, end_time,
                                                untrusted_time)),
                        cache, 25, 31)

    # Verify only trusted results are cached.
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 11, 0)

    # Running the same operations twice should result in the same
    # results as before.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time, end_time,
                                                           untrusted_time)),
      cache, 25, 17)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 11, 0)

    # Expanding the time range without caching should also result in the same
    # results
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time - self.bucket_width,
                                           end_time + self.bucket_width)),
      cache, 11, 0)

    # But specifying compute_missing should get all results for the time range.
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time - self.bucket_width,
                                           end_time + self.bucket_width,
                                           compute_missing=True)),
      cache, 25, 19)

    # Overlapping time queries should result in the same
    # results as before, and benefit from the cache.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time -
                                                           self.bucket_width,
                                                           end_time +
                                                           self.bucket_width,
                                                           untrusted_time)),
      cache, 25, 19)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 11, 0)

    # Increasing the trusted time should increase the cached results.
    untrusted_time = untrusted_time + timedelta(minutes=40)
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time, end_time,
                                                           untrusted_time)),
      cache, 25, 17)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)

    # Decreasing trusted time shouldn't remove results.
    untrusted_time = untrusted_time - timedelta(minutes=40)
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time, end_time,
                                                           untrusted_time)),
      cache, 25, 15)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)

    # If there are two cached entries, that cached time should no
    # longer be returned.
    results = list(cache.retrieve_interval(start_time, end_time))
    duplicate_result = dict(results[10])
    duplicate_result['b_sum'] = 0
    self.client.put({cache._scratch_stream: [duplicate_result]},
                    namespace=cache._scratch_namespace)
    self.client.flush()
    safe_results = list(cache.retrieve_interval(start_time, end_time))
    self.assertEqual(results[:10] + results[11:], safe_results)

    # Rerunning the cache/computation should re-cache the corrupted
    # element.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time, end_time,
                                                           untrusted_time)),
      cache, 25, 16)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)

    # Forcing computation should generate the same result set.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(
          start_time, end_time, untrusted_time, force_recompute=True)),
      cache, 25, 31)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)
Example #38
def main(args):
  client1 = KronosClient(args.kronos_url1, namespace=args.namespace1)
  client2 = KronosClient(args.kronos_url2, namespace=args.namespace2)

  if args.streams_file:
    streams = map(lambda s: (s, s), # Use same stream name for both.
                  filter(lambda s: len(s),
                         open(args.streams_file).read().split('\n')))
  else:
    streams = [(args.stream1, args.stream2)]

  for stream1_name, stream2_name in streams:
    if args.num_samples:
      samples = []
      for _ in xrange(args.num_samples):
        start = random.randint(args.start, args.end - args.sample_interval)
        samples.append((start, start + args.sample_interval))
    else:
      samples = [(args.start, args.end)]

    total_stream1 = 0
    extra_stream1 = 0
    total_stream2 = 0
    extra_stream2 = 0

    for start, end in samples:
      stream1 = client1.get(stream1_name, start, end)
      stream2 = client2.get(stream2_name, start, end)

      # Sorting of events with the same timestamp may vary across backends,
      # hence we can't do a simple loop comparison. We need to aggregate all
      # events with the same timestamp from both streams and then compare the
      # two sets.
      stream1_hashes = set()
      stream2_hashes = set()
      current_timestamp = None
      while True:
        event1 = get_next(stream1)
        event2 = get_next(stream2)
        # Are both streams exhausted?
        if not (event1 or event2):
          break
        # Pick the smaller timestamp from the two events.
        min_timestamp = min(event1.get(TIMESTAMP_FIELD, sys.maxint),
                            event2.get(TIMESTAMP_FIELD, sys.maxint))
        if current_timestamp is None:
          current_timestamp = min_timestamp
        # If min_timestamp is greater than current_timestamp, then aggregate
        # stats for current_timestamp and roll over.
        if min_timestamp > current_timestamp:
          total_stream1 += len(stream1_hashes)
          total_stream2 += len(stream2_hashes)
          extra_stream1 += len(stream1_hashes - stream2_hashes)
          extra_stream2 += len(stream2_hashes - stream1_hashes)
          stream1_hashes.clear()
          stream2_hashes.clear()
          current_timestamp = min_timestamp
    
        if event1:
          assert event1[TIMESTAMP_FIELD] >= current_timestamp
          if event1[TIMESTAMP_FIELD] == current_timestamp:
            del event1[ID_FIELD]
            stream1_hashes.add(
              hashlib.sha224(json.dumps(event1, sort_keys=True)).hexdigest())
          else:
            stream1 = push_back(event1, stream1)

        if event2:
          assert event2[TIMESTAMP_FIELD] >= current_timestamp
          if event2[TIMESTAMP_FIELD] == current_timestamp:
            del event2[ID_FIELD]
            stream2_hashes.add(
              hashlib.sha224(json.dumps(event2, sort_keys=True)).hexdigest())
          else:
            stream2 = push_back(event2, stream2)

    print 'Diff: [%s/%s], [%s/%s]' % (args.namespace1, stream1_name,
                                      args.namespace2, stream2_name)
    print '< total: %d' % total_stream1
    print '> total: %d' % total_stream2
    print '< extra: %d' % extra_stream1
    print '> extra: %d' % extra_stream2
    print