예제 #1
0
 def test_project(self):
   """Project should add aliased, constant and computed fields to each event.

   With merge=True the assertions also expect the original fields ('i',
   'i+1' and the timestamp) to still be present on every returned event.
   """
   num_events = 25
   for n in xrange(num_events):
     payload = {constants.TIMESTAMP_FIELD: random.randint(0, 999),
                'i': n,
                'i+1': n + 1}
     self.kronos_client.put({self.stream: [payload]})

   source = KronosStream('http://localhost:9191', self.stream, 0, 1000)
   fields = [
     Property('i', alias='I'),
     Constant(10, alias='const'),
     # func = i + (i + 1) + 5, which is what the 2 * i + 6 check relies on.
     Add([Property('i'), Property('i+1'), Constant(5)], alias='func'),
   ]
   plan = Project(source, fields, merge=True)
   events = self.query(plan.to_dict())

   self.assertEqual(len(events), num_events)
   for projected in events:
     self.assertEqual(projected['i'], projected['I'])
     self.assertEqual(projected['const'], 10)
     self.assertEqual(projected['func'], projected['i'] * 2 + 6)
     self.assertEqual(projected['i+1'], projected['i'] + 1)
     # Timestamps were drawn from [0, 999] when the events were inserted.
     timestamp = projected[constants.TIMESTAMP_FIELD]
     self.assertTrue(timestamp >= 0)
     self.assertTrue(timestamp < 1000)
예제 #2
0
 def test_filter(self):
     """Filter should return only events matching a compound condition.

     The condition is (time > 500 OR b <= 100) AND (a in c OR d matches
     'lolcat'); every returned event is re-checked against it in Python.
     """
     for _ in xrange(2000):
         event = {
             constants.TIMESTAMP_FIELD: random.randint(0, 999),
             'a': random.randint(0, 10),
             'b': random.randint(50, 150),
             'c': [random.randint(0, 20) for _ in xrange(10)],
         }
         has_lolcat = random.randint(0, 100) > 50
         event['d'] = 'iamlolcat' if has_lolcat else 'helloworld'
         self.kronos_client.put({self.stream: [event]})

     source = KronosSource('kronos', self.stream, 0, 1000)
     late_or_small_b = (
         Condition(Condition.Op.GT,
                   Property(constants.TIMESTAMP_FIELD),
                   Constant(500))
         | Condition(Condition.Op.LTE, Property('b'), Constant(100)))
     a_in_c_or_lolcat = (
         Condition(Condition.Op.CONTAINS, Property('c'), Property('a'))
         | Condition(Condition.Op.REGEX, Property('d'), Constant('lolcat')))
     plan = Filter(source, late_or_small_b & a_in_c_or_lolcat)
     events = self.query(plan.to_dict())

     # Some, but not all, of the random events are expected to match.
     self.assertTrue(len(events) > 0)
     self.assertTrue(len(events) < 2000)
     for matched in events:
         self.assertTrue(matched[constants.TIMESTAMP_FIELD] > 500
                         or matched['b'] <= 100)
         self.assertTrue(matched['a'] in matched['c']
                         or 'lolcat' in matched['d'])
         self.assertTrue(matched[constants.TIMESTAMP_FIELD] >= 0)
         self.assertTrue(matched[constants.TIMESTAMP_FIELD] < 1000)
예제 #3
0
 def test_get_properties_accessed_by_value(self):
     """Check which property names are reported for different value kinds."""
     # A bare property reports its own name.
     single = get_properties_accessed_by_value(Property('lolcat'))
     self.assertEqual(single, ['lolcat'])

     # A constant reads no properties at all.
     none_read = get_properties_accessed_by_value(Constant(0))
     self.assertEqual(none_read, [])

     # Properties nested inside function arguments are reported as well.
     inner_sum = Add([Property('hello'), Constant(2)])
     outer_sum = Add([Property('lolcat'), Constant(1), inner_sum])
     nested = get_properties_accessed_by_value(outer_sum)
     self.assertEqual(nested, ['lolcat', 'hello'])
예제 #4
0
  def test_order_by(self):
    """OrderBy should sort by ('a', 'b') ascending (default) or descending."""
    def make_stream():
      return KronosStream('http://localhost:9191', self.stream, 0, 1000)

    def assert_ordered(rows, sentinel, in_order):
      # Walk the rows checking 'a' is ordered overall and 'b' is ordered
      # within each run of equal 'a'; `sentinel` restarts the 'b' comparison
      # whenever 'a' changes.
      prev_a = prev_b = sentinel
      for row in rows:
        if prev_a != row['a']:
          prev_b = sentinel
        self.assertTrue(in_order(prev_a, row['a']))
        self.assertTrue(in_order(prev_b, row['b']))
        prev_a, prev_b = row['a'], row['b']

    for _ in xrange(100):
      record = {constants.TIMESTAMP_FIELD: random.randint(0, 999),
                'a': random.randint(0, 5),
                'b': random.randint(1000, 10000)}
      self.kronos_client.put({self.stream: [record]})

    sort_keys = [Property('a'), Property('b')]

    # NOP projection to ensure events flow through Spark.
    nop_plan = Project(make_stream(), [Property('a', alias='a')], merge=True)
    events = self.query(nop_plan.to_dict())
    # Without an OrderBy, events are expected in time order.
    self.assertEqual(len(events), 100)
    times = [event[constants.TIMESTAMP_FIELD] for event in events]
    self.assertEqual(times, sorted(times))

    # ResultOrder.ASCENDING should put events in ascending order.
    ascending_plan = OrderBy(make_stream(), sort_keys,
                             OrderBy.ResultOrder.ASCENDING)
    events = self.query(ascending_plan.to_dict())
    self.assertEqual(len(events), 100)
    assert_ordered(events, -float('inf'), lambda prev, cur: prev <= cur)

    # Omitting the order argument should behave like ASCENDING.
    default_plan = OrderBy(make_stream(), [Property('a'), Property('b')])
    events2 = self.query(default_plan.to_dict())
    self.assertEqual(events, events2)

    # ResultOrder.DESCENDING should put events in descending order.
    descending_plan = OrderBy(make_stream(), sort_keys,
                              OrderBy.ResultOrder.DESCENDING)
    events = self.query(descending_plan.to_dict())
    self.assertEqual(len(events), 100)
    assert_ordered(events, float('inf'), lambda prev, cur: prev >= cur)
예제 #5
0
def _cohort_stream_transform(source, stream, start, end,
                             transform, grouping_key, unit):
  """Build the per-(grouping key, date bucket) aggregation over one stream.

  Reads `stream` from `source` between `start` and `end`, optionally passes
  it through `transform` (skipped when falsy), projects each event down to
  its timestamp, its `grouping_key` and a 'date' field, and then groups by
  (grouping key, date), keeping the minimum timestamp of each group.

  The 'date' field is a Floor over the timestamp, the length of `unit` in
  Kronos time and `start` -- presumably the start of the bucket the event
  falls into, with buckets aligned to `start`; confirm against Floor.

  Returns the resulting Aggregate operator (not yet converted to a dict).
  """
  events = KronosSource(source, stream, start, end)
  if transform:
    events = transform(events)

  timestamp_field = Property(TIMESTAMP_FIELD, alias=TIMESTAMP_FIELD)
  key_field = Property(grouping_key, alias=grouping_key)
  date_field = Floor([Property(TIMESTAMP_FIELD),
                      Constant(DateUnit.unit_to_kronos_time(unit)),
                      Constant(start)],
                     alias='date')
  projected = Project(events, [timestamp_field, key_field, date_field])

  # Grouping this way leaves a single event per (user, unit time) pair.
  per_key_and_date = GroupBy([Property(grouping_key, alias=grouping_key),
                              Property('date', alias='date')])
  # The first time the user performed the event in that bucket.
  first_seen = Min([Property(TIMESTAMP_FIELD)], alias=TIMESTAMP_FIELD)
  return Aggregate(projected, per_key_and_date, [first_seen])
예제 #6
0
 def test_join(self):
   """Join should pair events from two streams that satisfy the condition.

   The left stream is aliased 'j1'; the right one is given no alias and the
   assertions expect its fields to come back prefixed with 'right.'.
   """
   # Fill stream '1' completely, then stream '2', with 100 events each.
   for suffix in ('1', '2'):
     for timestamp in xrange(100):
       row = {constants.TIMESTAMP_FIELD: timestamp,
              'a': random.randint(0, 2),
              'b': random.randint(0, 5)}
       self.kronos_client.put({self.stream + suffix: [row]})

   left = KronosStream('http://localhost:9191', self.stream + '1', 0, 200,
                       alias='j1')
   right = KronosStream('http://localhost:9191', self.stream + '2', 0, 200)
   same_a = Condition(Condition.Op.EQ,
                      Property('j1.a'),
                      Property('right.a'))
   left_b_bigger = Condition(Condition.Op.GT,
                             Property('j1.b'),
                             Property('right.b'))
   events = self.query(Join(left, right, same_a & left_b_bigger).to_dict())

   self.assertTrue(len(events) > 0)
   # Every joined event should carry exactly these prefixed fields.
   field_names = (constants.TIMESTAMP_FIELD, constants.ID_FIELD, 'a', 'b')
   expected_keys = {'%s.%s' % (side, field)
                    for side in ('j1', 'right')
                    for field in field_names}
   for joined in events:
     self.assertEqual(joined['j1.a'], joined['right.a'])
     self.assertTrue(joined['j1.b'] > joined['right.b'])
     self.assertEqual(set(joined), expected_keys)
예제 #7
0
def cpf(args, alias=None):
    """Build a query value from its dictionary description.

    ``args['cpf_type']`` selects what is built:

    * ``'constant'`` -- ``Constant`` wrapping ``args['constant_value']``,
      converted to ``float`` when possible and passed through unchanged
      otherwise.
    * ``'property'`` -- ``Property`` named ``args['property_name']``.
    * ``'function'`` -- the attribute of ``metis.core.query.value`` named
      ``args['function_name']``, called with the list obtained by
      recursively converting each entry of ``args['function_args']``.

    Args:
        args: dictionary description of the value. It is not modified.
        alias: alias passed to the top-level value only; nested function
            arguments are built without an alias.

    Returns:
        The constructed ``Constant``, ``Property`` or function value.

    Raises:
        ValueError: if ``cpf_type`` is not one of the three known kinds.
        KeyError: if a key required by the selected kind is missing.
    """
    cpf_type = args['cpf_type']
    if cpf_type == 'constant':
        raw_value = args['constant_value']
        try:
            constant = float(raw_value)
        except (TypeError, ValueError, OverflowError):
            # Not representable as a float: keep the value as supplied.
            constant = raw_value
        return Constant(constant, alias=alias)
    if cpf_type == 'property':
        return Property(args['property_name'], alias=alias)
    if cpf_type == 'function':
        # Build a new list rather than overwriting args['function_args'], so
        # the caller's dictionary stays reusable for a later call.
        func_args = [cpf(arg) for arg in args['function_args']]
        # NOTE(review): function_name is looked up without validation; if
        # `args` can come from untrusted input, consider restricting it to a
        # known set of function names.
        func = getattr(metis.core.query.value, args['function_name'])
        return func(func_args, alias=alias)
    raise ValueError("cpf_type must be constant, property, or function")
예제 #8
0
    def test_aggregate(self):
        """Aggregate should compute count/sum/min/max/avg per time bucket.

        200 events with timestamps 0..199 and a random 'a' in [0, 2] are
        inserted, then grouped into buckets of 50 time units. The expected
        per-bucket sums are tracked locally while inserting.
        """
        num_events = 200
        bucket_size = 50
        sums = defaultdict(int)
        for i in xrange(num_events):
            a = random.randint(0, 2)
            self.kronos_client.put(
                {self.stream: [{
                    constants.TIMESTAMP_FIELD: i,
                    'a': a
                }]})
            # Explicit floor division: the bucket key must stay an integer
            # even when `/` means true division.
            sums[bucket_size * (i // bucket_size)] += a

        # Group on a timestamp that was floored by a preceding projection.
        events = self.query(
            Aggregate(
                Project(KronosSource('kronos', self.stream, 0, 1000), [
                    Floor([Property(constants.TIMESTAMP_FIELD),
                           Constant(bucket_size)],
                          alias=constants.TIMESTAMP_FIELD)
                ],
                        merge=True),
                GroupBy(
                    Property(constants.TIMESTAMP_FIELD,
                             alias=constants.TIMESTAMP_FIELD)), [
                                 Count([], alias='count'),
                                 Sum([Property('a')], alias='sum'),
                                 Min([Property('a')], alias='min'),
                                 Max([Property('a')], alias='max'),
                                 Avg([Property('a')], alias='avg')
                             ]).to_dict())
        self.assertEqual(len(events), num_events // bucket_size)
        for event in events:
            self.assertEqual(event[constants.TIMESTAMP_FIELD] % bucket_size,
                             0)
            self.assertEqual(event['count'], bucket_size)
            # With 50 random draws from {0, 1, 2} per bucket, both extremes
            # are overwhelmingly likely to occur.
            self.assertEqual(event['min'], 0)
            self.assertEqual(event['max'], 2)
            self.assertEqual(event['sum'],
                             sums[event[constants.TIMESTAMP_FIELD]])
            # avg is a float; compare avg * count with sum within a tolerance.
            self.assertAlmostEqual(event['avg'] * bucket_size, event['sum'],
                                   delta=0.1)

        # Group directly on the Floor expression, without a projection.
        events = self.query(
            Aggregate(
                KronosSource('kronos', self.stream, 0, 1000),
                GroupBy(
                    Floor([Property(constants.TIMESTAMP_FIELD),
                           Constant(bucket_size)],
                          alias=constants.TIMESTAMP_FIELD)),
                [Count([], alias='count')]).to_dict())
        self.assertEqual(len(events), num_events // bucket_size)
예제 #9
0
def cohort_queryplan(plan):
  """
  Build a Metis query plan that computes a cohort analysis.

  Input:
  {
   'source': 'kronos',  # Name of data source from settings.
   'cohort':
     {'stream': CohortTest.EMAIL_STREAM,  # Kronos stream to define cohort from.
      'transform': lambda x: x,           # Optional transformation applied to
                                          # the stream operator.
      'start': <date>,                    # The day of the first cohort.
      'unit': DateUnit.XX,                # Users are in the same cohort if
                                          # they fall in the same unit; also
                                          # used as a timedelta keyword.
      'cohorts': 5,                       # How many cohorts (units) to track.
      'grouping_key': 'user'},            # Key in a cohort event that is
                                          # matched against the action
                                          # stream's grouping key.
   'action':
     {'stream': CohortTest.FRONTPAGE_STREAM,  # Stream users take actions on.
      'transform': lambda x: x,               # Optional transformation
                                              # applied to the stream operator.
      'unit': DateUnit.XX,                    # Unit actions are tracked in;
                                              # also a timedelta keyword.
      'repetitions': 14,                      # How many units to track.
      'grouping_key': 'user_id'}              # Key in an action event that is
                                              # matched against the cohort
                                              # stream's grouping key.
  }

  Output:
  A metis-compatible query plan (as a dictionary) to return a cohort analysis.
  """
  cohort = plan['cohort']
  action = plan['action']
  source = plan['source']

  # Work out, in Kronos time, the window of the cohort stream and of the
  # action stream that is relevant. Both end bounds get +1 added.
  first_day = cohort['start']
  cohort_start = datetime_to_kronos_time(_date_to_datetime(first_day))
  cohort_last_day = first_day + timedelta(
    **{cohort['unit']: cohort['cohorts']})
  action_last_day = cohort_last_day + timedelta(
    **{action['unit']: action['repetitions']})
  cohort_end = datetime_to_kronos_time(
    _date_to_datetime(cohort_last_day)) + 1
  action_end = datetime_to_kronos_time(
    _date_to_datetime(action_last_day)) + 1

  # Both sides start at cohort_start; only the end bound differs.
  left = _cohort_stream_transform(source,
                                  cohort['stream'], cohort_start, cohort_end,
                                  cohort.get('transform'),
                                  cohort['grouping_key'], cohort['unit'])
  right = _cohort_stream_transform(source,
                                   action['stream'], cohort_start, action_end,
                                   action.get('transform'),
                                   action['grouping_key'], action['unit'])

  # Length, in Kronos time, of the window after a cohort event in which an
  # action still counts.
  additional_action_time = (DateUnit.unit_to_kronos_time(action['unit']) *
                            action['repetitions'])

  left.alias = 'cohort'
  right.alias = 'action'

  cohort_key = 'cohort.%s' % cohort['grouping_key']
  action_key = 'action.%s' % action['grouping_key']
  cohort_time = 'cohort.%s' % TIMESTAMP_FIELD
  action_time = 'action.%s' % TIMESTAMP_FIELD

  # Same user, and the action happened at or after the cohort event but
  # before the tracking window ran out.
  same_group = Condition(Condition.Op.EQ,
                         Property(cohort_key),
                         Property(action_key))
  not_before_cohort = Condition(Condition.Op.GTE,
                                Property(action_time),
                                Property(cohort_time))
  inside_window = Condition(Condition.Op.LT,
                            Property(action_time),
                            Add([Property(cohort_time),
                                 Constant(additional_action_time)]))
  joined = Join(left, right, same_group & not_before_cohort & inside_window)

  # One row per (cohort date, group, action step).
  action_step = Floor(
    [Subtract([Property(action_time), Property(cohort_time)]),
     Constant(DateUnit.unit_to_kronos_time(action['unit']))],
    alias='action_step')
  per_user = GroupBy([Property('cohort.date', alias=TIMESTAMP_FIELD),
                      Property(cohort_key, alias='group'),
                      action_step])
  user_aggregated = Aggregate(joined, per_user, [Count([], alias='count')])

  # Count those rows per (cohort date, action step).
  per_cohort_step = GroupBy([Property(TIMESTAMP_FIELD, alias=TIMESTAMP_FIELD),
                             Property('action_step', alias='action_step')])
  aggregated = Aggregate(user_aggregated,
                         per_cohort_step,
                         [Count([], alias='cohort_actions')])

  # TODO(marcua): Also sum up the cohort sizes, join with the plan.
  return aggregated.to_dict()
예제 #10
0
  def test_join_eq(self):
    """Equality joins on a property, on a function, and with an extra filter.

    Stream '1' holds (a=i, b=i+1) and stream '2' holds (a=i+1, b=i+2) for
    i in 0..199, so both equality conditions below match each left event
    with exactly one right event.
    """
    def left_stream():
      return KronosStream('http://localhost:9191', self.stream + '1',
                          0, 1000)

    def right_stream():
      return KronosStream('http://localhost:9191', self.stream + '2',
                          0, 1000)

    for i in xrange(200):
      left_row = {constants.TIMESTAMP_FIELD: random.randint(0, 999),
                  'a': i,
                  'b': i + 1}
      self.kronos_client.put({self.stream + '1': [left_row]})
    for i in xrange(200):
      right_row = {constants.TIMESTAMP_FIELD: random.randint(0, 999),
                   'a': i + 1,
                   'b': i + 2}
      self.kronos_client.put({self.stream + '2': [right_row]})

    left_time = 'left.%s' % constants.TIMESTAMP_FIELD
    right_time = 'right.%s' % constants.TIMESTAMP_FIELD

    # 1-1 join with property: left.b == right.a
    b_equals_a = Condition(Condition.Op.EQ,
                           Property('left.b'),
                           Property('right.a'))
    events = self.query(
      Join(left_stream(), right_stream(), b_equals_a).to_dict())
    self.assertEqual(len(events), 200)
    for pair in events:
      self.assertEqual(pair['left.b'], pair['right.a'])

    # 1-1 join with function: left.a == (right.a - 1)
    a_equals_a_minus_one = Condition(Condition.Op.EQ,
                                     Property('left.a'),
                                     Subtract([Property('right.a'),
                                               Constant(1)]))
    events = self.query(
      Join(left_stream(), right_stream(), a_equals_a_minus_one).to_dict())
    self.assertEqual(len(events), 200)
    for pair in events:
      self.assertEqual(pair['left.a'], pair['right.a'] - 1)

    # 1-1 eqjoin with filtering: additionally require the left event to be
    # more than 10 time units later than the right one.
    left = left_stream()
    right = right_stream()
    keys_match = Condition(Condition.Op.EQ,
                           Property('left.b'),
                           Property('right.a'))
    left_is_later = Condition(Condition.Op.GT,
                              Property(left_time),
                              Add([Property(right_time), Constant(10)]))
    events = self.query(Join(left, right, keys_match & left_is_later).to_dict())
    # The timestamps are random, so only some of the 200 pairs should remain.
    self.assertTrue(len(events) > 0)
    self.assertTrue(len(events) < 200)
    for pair in events:
      self.assertEqual(pair['left.b'], pair['right.a'])
      self.assertTrue(pair[left_time] > pair[right_time] + 10)