Exemplo n.º 1
0
 def test_filter(self):
     """Filter randomly generated events and verify every returned event.

     The filter condition is
     (timestamp > 500 OR b <= 100) AND (a in c OR d matches 'lolcat').
     """
     # Insert 2000 events with random timestamps and field values.
     for _ in xrange(2000):
         event = {
             constants.TIMESTAMP_FIELD: random.randint(0, 999),
             'a': random.randint(0, 10),
             'b': random.randint(50, 150),
             'c': [random.randint(0, 20) for _ in xrange(10)]
         }
         event['d'] = ('iamlolcat' if random.randint(0, 100) > 50
                       else 'helloworld')
         self.kronos_client.put({self.stream: [event]})

     source = KronosSource('kronos', self.stream, 0, 1000)
     late_or_small_b = (
         Condition(Condition.Op.GT,
                   Property(constants.TIMESTAMP_FIELD),
                   Constant(500)) |
         Condition(Condition.Op.LTE, Property('b'), Constant(100)))
     a_in_c_or_lolcat = (
         Condition(Condition.Op.CONTAINS, Property('c'), Property('a')) |
         Condition(Condition.Op.REGEX, Property('d'), Constant('lolcat')))
     plan = Filter(source, late_or_small_b & a_in_c_or_lolcat).to_dict()
     events = self.query(plan)

     # Some, but not all, of the inserted events are expected to match.
     num_events = len(events)
     self.assertTrue(num_events > 0)
     self.assertTrue(num_events < 2000)

     for event in events:
         timestamp = event[constants.TIMESTAMP_FIELD]
         self.assertTrue(timestamp > 500 or event['b'] <= 100)
         self.assertTrue(event['a'] in event['c'] or 'lolcat' in event['d'])
         # Returned timestamps stay inside the queried [0, 1000) window.
         self.assertTrue(0 <= timestamp < 1000)
Exemplo n.º 2
0
 def test_parsing_and_to_dict(self):
   """Check that parse() followed by to_dict() reproduces each input dict."""
   def assert_round_trip(parser, spec):
     # Parse a deep copy and compare the result against the fixture itself.
     self.assertEqual(parser.parse(deepcopy(spec)).to_dict(), spec)

   # Values.
   _property = {'type': 'property', 'name': 'x'}
   constant = {'type': 'constant', 'value': 1}
   function = {'type': 'function', 'name': 'add',
               'arguments': [deepcopy(_property), deepcopy(constant)]}
   for value in (constant, _property, function):
     assert_round_trip(Value, value)

   # Streams.
   kronos = {'type': 'kronos', 'host': 'localhost', 'stream': 'mystream',
             'start_time': 100, 'end_time': 200}
   assert_round_trip(Stream, kronos)

   # Conditions: two comparisons plus their disjunction.
   condition_lt = {'op': 'lt', 'left': deepcopy(_property),
                   'right': deepcopy(constant)}
   condition_eq = {'op': 'eq', 'left': deepcopy(function),
                   'right': deepcopy(_property)}
   condition_or = {'type': 'or', 'conditions': [deepcopy(condition_lt),
                                                deepcopy(condition_eq)]}
   for condition in (condition_lt, condition_eq, condition_or):
     assert_round_trip(Condition, condition)

   # Aggregators, with and without arguments.
   avg = {'op': 'avg', 'arguments': [deepcopy(_property)], 'alias': 'myavg'}
   count = {'op': 'count', 'alias': 'mycount'}
   for aggregator in (avg, count):
     assert_round_trip(Aggregator, aggregator)

   # Group-by: a list holding the function value with an alias added.
   group_by = [dict(deepcopy(function), alias='mygroup')]
   assert_round_trip(GroupBy, group_by)

   # Transforms, each one nested on top of the previous.
   project = {'type': 'project', 'fields': [deepcopy(_property)],
              'stream': deepcopy(kronos)}
   _filter = {'type': 'filter', 'condition': condition_lt,
              'stream': deepcopy(project)}
   aggregate = {'type': 'aggregate',
                'group_by': deepcopy(group_by),
                'aggregates': [deepcopy(avg), deepcopy(count)],
                'stream': deepcopy(_filter)}
   join = {'type': 'join', 'left': deepcopy(aggregate),
           'right': deepcopy(project), 'condition': deepcopy(condition_or)}
   for transform in (project, _filter, aggregate, join):
     assert_round_trip(Transform, transform)
Exemplo n.º 3
0
  def test_parsing_and_to_dict(self):
    """Round-trip each query-plan dict through parse() and to_dict()."""
    # Value fixtures.
    _property = {'type': 'property', 'name': 'x'}
    constant = {'type': 'constant', 'value': 1}
    function = {'type': 'function', 'name': 'add',
                'arguments': [deepcopy(_property), deepcopy(constant)]}

    # Data-access operator fixture.
    kronos = {'type': 'data_access', 'source': 'kronos', 'stream': 'mystream',
              'start_time': 100, 'end_time': 200}

    # Condition fixtures: two comparisons plus their disjunction.
    condition_lt = {'op': 'lt', 'left': deepcopy(_property),
                    'right': deepcopy(constant)}
    condition_eq = {'op': 'eq', 'left': deepcopy(function),
                    'right': deepcopy(_property)}
    condition_or = {'type': 'or', 'conditions': [deepcopy(condition_lt),
                                                 deepcopy(condition_eq)]}

    # Aggregator fixtures, with and without arguments.
    avg = {'op': 'avg', 'arguments': [deepcopy(_property)], 'alias': 'myavg'}
    count = {'op': 'count', 'alias': 'mycount'}

    # Group-by fixture: a list holding the function value with an alias added.
    group_by = [dict(deepcopy(function), alias='mygroup')]

    # Operator fixtures, each one nested on top of the previous.
    project = {'type': 'project', 'fields': [deepcopy(_property)],
               'source': deepcopy(kronos)}
    _filter = {'type': 'filter', 'condition': condition_lt,
               'source': deepcopy(project)}
    aggregate = {'type': 'aggregate',
                 'group_by': deepcopy(group_by),
                 'aggregates': [deepcopy(avg), deepcopy(count)],
                 'source': deepcopy(_filter)}
    join = {'type': 'join', 'left': deepcopy(aggregate),
            'right': deepcopy(project), 'condition': deepcopy(condition_or)}

    # (parser, fixture) pairs, checked in this order.
    round_trips = [
      (Value, constant),
      (Value, _property),
      (Value, function),
      (Operator, kronos),
      (Condition, condition_lt),
      (Condition, condition_eq),
      (Condition, condition_or),
      (Aggregator, avg),
      (Aggregator, count),
      (GroupBy, group_by),
      (Operator, project),
      (Operator, _filter),
      (Operator, aggregate),
      (Operator, join),
    ]
    for parser, expected in round_trips:
      # Parse a deep copy and compare the result against the fixture itself.
      self.assertEqual(parser.parse(deepcopy(expected)).to_dict(), expected)
Exemplo n.º 4
0
 def test_join(self):
   """Join two streams on equal `a` and greater `b`; check the result fields.

   The left stream is aliased 'j1'; the right stream is given no alias and
   its fields are addressed with the 'right.' prefix.
   """
   # Insert 100 events into stream '1', then 100 events into stream '2'.
   for suffix in ('1', '2'):
     for timestamp in xrange(100):
       self.kronos_client.put({
         self.stream + suffix: [{constants.TIMESTAMP_FIELD: timestamp,
                                 'a': random.randint(0, 2),
                                 'b': random.randint(0, 5)}]
       })

   left_stream = KronosStream('http://localhost:9191',
                              self.stream + '1',
                              0,
                              200,
                              alias='j1')
   right_stream = KronosStream('http://localhost:9191',
                               self.stream + '2',
                               0,
                               200)
   same_a = Condition(Condition.Op.EQ,
                      Property('j1.a'),
                      Property('right.a'))
   bigger_b = Condition(Condition.Op.GT,
                        Property('j1.b'),
                        Property('right.b'))
   plan = Join(left_stream, right_stream, same_a & bigger_b).to_dict()
   events = self.query(plan)
   self.assertTrue(len(events) > 0)

   # Every joined event must carry exactly these prefixed fields.
   expected_fields = {'j1.%s' % constants.TIMESTAMP_FIELD,
                      'right.%s' % constants.TIMESTAMP_FIELD,
                      'j1.%s' % constants.ID_FIELD,
                      'right.%s' % constants.ID_FIELD,
                      'j1.a', 'right.a',
                      'j1.b', 'right.b'}
   for event in events:
     self.assertEqual(event['j1.a'], event['right.a'])
     self.assertTrue(event['j1.b'] > event['right.b'])
     self.assertEqual(set(event), expected_fields)
Exemplo n.º 5
0
def cohort_queryplan(plan):
  """
  Build a metis-compatible query plan that returns a cohort analysis.

  Input:
  {
   'source': 'kronos', # Name of data source from settings.
   'cohort':
    {'stream': CohortTest.EMAIL_STREAM, # Kronos stream to define cohort from.
     'transform': lambda x: x,          # Optional transformation on the
                                        # stream.
     'start': date.today(),             # The day of the first cohort.
     'unit': DateUnit.XX,               # Users are in the same cohort if
                                        # they fall in the same unit. Used
                                        # as a keyword name for timedelta.
     'cohorts': 5,                      # How many cohorts (units) to track.
     'grouping_key': 'user'},           # Event key compared for equality
                                        # with the action stream's
                                        # grouping key.

   'action':
    {'stream': CohortTest.FRONTPAGE_STREAM, # Stream users take actions on.
     'transform': lambda x: x,              # Optional transformation on the
                                            # stream.
     'unit': DateUnit.XX,                   # Unit in which action steps are
                                            # tracked. Used as a keyword
                                            # name for timedelta.
     'repetitions': 14,                     # How many units to track.
     'grouping_key': 'user_id'}             # Event key compared for equality
                                            # with the cohort stream's
                                            # grouping key.
  }

  Output:
  The `to_dict()` form of a two-level aggregation over the join of the
  cohort and action streams: joined events are first grouped by cohort
  date, cohort grouping key and action step, and those groups are then
  counted per (cohort date, action step) under the alias 'cohort_actions'.
  """
  cohort = plan['cohort']
  action = plan['action']
  source = plan['source']

  # Work out the time window, in Kronos time, that each stream has to cover.
  # The action window starts with the first cohort and extends
  # `repetitions` units beyond the end of the last cohort.
  first_day = cohort['start']
  cohort_start = datetime_to_kronos_time(_date_to_datetime(first_day))
  last_cohort_day = first_day + timedelta(
    **{cohort['unit']: cohort['cohorts']})
  last_action_day = last_cohort_day + timedelta(
    **{action['unit']: action['repetitions']})
  cohort_end = datetime_to_kronos_time(_date_to_datetime(last_cohort_day)) + 1
  action_end = datetime_to_kronos_time(_date_to_datetime(last_action_day)) + 1

  cohort_stream = _cohort_stream_transform(
    source, cohort['stream'], cohort_start, cohort_end,
    cohort.get('transform'), cohort['grouping_key'], cohort['unit'])
  action_stream = _cohort_stream_transform(
    source, action['stream'], cohort_start, action_end,
    action.get('transform'), action['grouping_key'], action['unit'])

  # Length, in Kronos time, of the period after a cohort event during which
  # actions are still matched to it.
  additional_action_time = (DateUnit.unit_to_kronos_time(action['unit']) *
                            action['repetitions'])

  # The aliases set here are the prefixes used in the property names below.
  cohort_stream.alias = 'cohort'
  action_stream.alias = 'action'

  cohort_time = 'cohort.%s' % TIMESTAMP_FIELD
  action_time = 'action.%s' % TIMESTAMP_FIELD
  cohort_group = 'cohort.%s' % cohort['grouping_key']
  action_group = 'action.%s' % action['grouping_key']

  # Pair a cohort event with an action event when both share the grouping
  # key and the action happens in
  # [cohort time, cohort time + additional_action_time).
  same_group = Condition(Condition.Op.EQ,
                         Property(cohort_group),
                         Property(action_group))
  not_before_cohort = Condition(Condition.Op.GTE,
                                Property(action_time),
                                Property(cohort_time))
  within_window = Condition(Condition.Op.LT,
                            Property(action_time),
                            Add([Property(cohort_time),
                                 Constant(additional_action_time)]))
  joined = Join(cohort_stream,
                action_stream,
                same_group & not_before_cohort & within_window)

  # First level: one group per (cohort date, grouping key, action step).
  # `action_step` is Floor applied to the action/cohort time difference and
  # the Kronos-time length of one action unit.
  action_step = Floor(
    [Subtract([Property(action_time), Property(cohort_time)]),
     Constant(DateUnit.unit_to_kronos_time(action['unit']))],
    alias='action_step')
  user_aggregated = Aggregate(
    joined,
    GroupBy([Property('cohort.date', alias=TIMESTAMP_FIELD),
             Property(cohort_group, alias='group'),
             action_step]),
    [Count([], alias='count')])

  # Second level: count the first-level groups per cohort date and step.
  aggregated = Aggregate(
    user_aggregated,
    GroupBy([Property(TIMESTAMP_FIELD, alias=TIMESTAMP_FIELD),
             Property('action_step', alias='action_step')]),
    [Count([], alias='cohort_actions')])

  # TODO(marcua): Also sum up the cohort sizes, join with the plan.
  return aggregated.to_dict()
Exemplo n.º 6
0
 def parse(self, _dict):
     """Build a Filter from its dict form.

     The 'condition' entry of `_dict` is replaced in place by the parsed
     Condition; all entries are then passed to Filter as keyword arguments.
     """
     parsed_condition = Condition.parse(_dict['condition'])
     _dict['condition'] = parsed_condition
     return Filter(**_dict)
Exemplo n.º 7
0
 def parse(self, _dict):
     """Build a Join from its dict form.

     The 'left', 'right' and 'condition' entries of `_dict` are replaced in
     place by their parsed objects; all entries are then passed to Join as
     keyword arguments.
     """
     # Parse the two inputs first, left before right.
     for side in ('left', 'right'):
         _dict[side] = _parse_stream_or_transform(_dict[side])
     _dict['condition'] = Condition.parse(_dict['condition'])
     return Join(**_dict)
Exemplo n.º 8
0
 def parse(self, _dict):
   """Build a Join from its dict form.

   The 'left' and 'right' entries of `_dict` are parsed with Operator.parse
   and the 'condition' entry with Condition.parse, each replacing the
   original entry in place. All entries are then passed to Join as keyword
   arguments.
   """
   for side in ('left', 'right'):
     _dict[side] = Operator.parse(_dict[side])
   _dict['condition'] = Condition.parse(_dict['condition'])
   return Join(**_dict)
Exemplo n.º 9
0
 def parse(self, _dict):
   """Build a Filter from its dict form.

   Replaces `_dict['condition']` in place with the parsed Condition, then
   passes every entry of `_dict` to Filter as a keyword argument.
   """
   condition = Condition.parse(_dict['condition'])
   _dict['condition'] = condition
   return Filter(**_dict)
Exemplo n.º 10
0
def filter(query_plan, operands):
    """Wrap `query_plan` in a Filter built from `operands`.

    `operands` must provide the keys 'op', 'lhs' and 'rhs'. Both sides are
    passed through `cpf` before being handed to Condition.
    """
    # Read the operands in the order: operator, left side, right side.
    op = operands['op']
    lhs = cpf(operands['lhs'])
    rhs = cpf(operands['rhs'])
    return Filter(query_plan, Condition(op, lhs, rhs))
Exemplo n.º 11
0
  def test_join_eq(self):
    """Exercise equality joins: on a property, on a function, and filtered."""
    # Stream '1' holds (a, b) = (i, i + 1); stream '2' holds (i + 1, i + 2).
    for suffix, offset in (('1', 0), ('2', 1)):
      for i in xrange(200):
        self.kronos_client.put({
          self.stream + suffix: [{
            constants.TIMESTAMP_FIELD: random.randint(0, 999),
            'a': i + offset,
            'b': i + offset + 1}]
        })

    def stream(suffix):
      # A fresh stream object is built for every query.
      return KronosStream('http://localhost:9191',
                          self.stream + suffix,
                          0,
                          1000)

    def run_join(condition):
      return self.query(Join(stream('1'), stream('2'), condition).to_dict())

    # 1-1 join with property: left.b == right.a
    events = run_join(Condition(Condition.Op.EQ,
                                Property('left.b'),
                                Property('right.a')))
    self.assertEqual(len(events), 200)
    for event in events:
      self.assertEqual(event['left.b'], event['right.a'])

    # 1-1 join with function: left.a == (right.a - 1)
    events = run_join(Condition(Condition.Op.EQ,
                                Property('left.a'),
                                Subtract([Property('right.a'),
                                          Constant(1)])))
    self.assertEqual(len(events), 200)
    for event in events:
      self.assertEqual(event['left.a'], event['right.a'] - 1)

    # 1-1 eqjoin with filtering: additionally require that the left event
    # is more than 10 time units later than the right event.
    left_time = 'left.%s' % constants.TIMESTAMP_FIELD
    right_time = 'right.%s' % constants.TIMESTAMP_FIELD
    b_matches_a = Condition(Condition.Op.EQ,
                            Property('left.b'),
                            Property('right.a'))
    left_is_later = Condition(Condition.Op.GT,
                              Property(left_time),
                              Add([Property(right_time),
                                   Constant(10)]))
    events = run_join(b_matches_a & left_is_later)
    # The extra filter must keep some, but not all, of the 200 matches.
    self.assertTrue(len(events) > 0)
    self.assertTrue(len(events) < 200)
    for event in events:
      self.assertEqual(event['left.b'], event['right.a'])
      self.assertTrue(event[left_time] > event[right_time] + 10)
Exemplo n.º 12
0
 def parse(self, _dict):
   """Build a Join from its dict form.

   Each of the 'left', 'right' and 'condition' entries of `_dict` is
   replaced in place by its parsed object, in that order. All entries are
   then passed to Join as keyword arguments.
   """
   left = _parse_stream_or_transform(_dict['left'])
   _dict['left'] = left
   right = _parse_stream_or_transform(_dict['right'])
   _dict['right'] = right
   condition = Condition.parse(_dict['condition'])
   _dict['condition'] = condition
   return Join(**_dict)