def test_project(self):
    """Check that Project emits aliased, constant and computed fields.

    25 events with random timestamps in [0, 999] are written, each carrying
    'i' and 'i+1'. The projection runs with merge=True, and the assertions
    expect the original fields to still be present next to the projected
    'I', 'const' and 'func' fields.
    """
    for n in xrange(25):
        record = {
            constants.TIMESTAMP_FIELD: random.randint(0, 999),
            'i': n,
            'i+1': n + 1,
        }
        self.kronos_client.put({self.stream: [record]})

    source = KronosStream('http://localhost:9191', self.stream, 0, 1000)
    projections = [
        Property('i', alias='I'),
        Constant(10, alias='const'),
        Add([Property('i'), Property('i+1'), Constant(5)], alias='func'),
    ]
    plan = Project(source, projections, merge=True).to_dict()
    results = self.query(plan)

    self.assertEqual(len(results), 25)
    for row in results:
        self.assertEqual(row['i'], row['I'])
        self.assertEqual(row['const'], 10)
        # func = i + (i + 1) + 5
        self.assertEqual(row['func'], row['i'] * 2 + 6)
        self.assertEqual(row['i+1'], row['i'] + 1)
        # The query window was [0, 1000).
        timestamp = row[constants.TIMESTAMP_FIELD]
        self.assertTrue(timestamp >= 0)
        self.assertTrue(timestamp < 1000)
def test_filter(self):
    """Check that Filter honours a compound AND/OR condition.

    The filter is
      (time > 500 OR b <= 100) AND (c CONTAINS a OR d REGEX 'lolcat').
    2000 random events are written, and every returned event is re-checked
    against the same predicate in plain Python.
    """
    for _ in xrange(2000):
        record = {
            constants.TIMESTAMP_FIELD: random.randint(0, 999),
            'a': random.randint(0, 10),
            'b': random.randint(50, 150),
            'c': [random.randint(0, 20) for _slot in xrange(10)],
        }
        record['d'] = (
            'iamlolcat' if random.randint(0, 100) > 50 else 'helloworld')
        self.kronos_client.put({self.stream: [record]})

    source = KronosSource('kronos', self.stream, 0, 1000)
    late_or_small_b = (
        Condition(Condition.Op.GT,
                  Property(constants.TIMESTAMP_FIELD),
                  Constant(500)) |
        Condition(Condition.Op.LTE, Property('b'), Constant(100)))
    member_or_lolcat = (
        Condition(Condition.Op.CONTAINS, Property('c'), Property('a')) |
        Condition(Condition.Op.REGEX, Property('d'), Constant('lolcat')))
    plan = Filter(source, late_or_small_b & member_or_lolcat).to_dict()
    results = self.query(plan)

    # Some, but not all, of the events are expected to pass the filter.
    self.assertTrue(len(results) > 0)
    self.assertTrue(len(results) < 2000)
    for row in results:
        self.assertTrue(row[constants.TIMESTAMP_FIELD] > 500 or
                        row['b'] <= 100)
        self.assertTrue(row['a'] in row['c'] or 'lolcat' in row['d'])
        timestamp = row[constants.TIMESTAMP_FIELD]
        self.assertTrue(timestamp >= 0)
        self.assertTrue(timestamp < 1000)
def test_get_properties_accessed_by_value(self):
    """Check which property names get_properties_accessed_by_value reports.

    Covers a bare Property, a Constant (no properties), and a nested Add
    whose properties are expected in the order they appear.
    """
    nested_sum = Add([
        Property('lolcat'),
        Constant(1),
        Add([Property('hello'), Constant(2)]),
    ])
    cases = [
        (Property('lolcat'), ['lolcat']),
        (Constant(0), []),
        (nested_sum, ['lolcat', 'hello']),
    ]
    for value, expected in cases:
        self.assertEqual(get_properties_accessed_by_value(value), expected)
def test_order_by(self):
    """Check default time ordering and OrderBy in both directions.

    100 events with random 'a' in [0, 5] and 'b' in [1000, 10000] are
    written. Results ordered by ('a', 'b') must be sorted on 'a', and on
    'b' within each run of equal 'a' values.
    """
    for _ in xrange(100):
        record = {
            constants.TIMESTAMP_FIELD: random.randint(0, 999),
            'a': random.randint(0, 5),
            'b': random.randint(1000, 10000),
        }
        self.kronos_client.put({self.stream: [record]})

    def stream():
        # A fresh source node for every query plan.
        return KronosStream('http://localhost:9191', self.stream, 0, 1000)

    # NOP projection to ensure events flow through Spark.
    projected = self.query(
        Project(stream(), [Property('a', alias='a')], merge=True).to_dict())
    # By default, should be ordered by time.
    self.assertEqual(len(projected), 100)
    timestamps = [row[constants.TIMESTAMP_FIELD] for row in projected]
    self.assertEqual(timestamps, sorted(timestamps))

    # ResultOrder.ASCENDING should put events in ascending order.
    ascending = self.query(
        OrderBy(stream(),
                [Property('a'), Property('b')],
                OrderBy.ResultOrder.ASCENDING).to_dict())
    self.assertEqual(len(ascending), 100)
    prev_a = prev_b = -float('inf')
    for row in ascending:
        if prev_a != row['a']:
            # A new 'a' group starts, so the 'b' ordering restarts too.
            prev_b = -float('inf')
        self.assertTrue(prev_a <= row['a'])
        self.assertTrue(prev_b <= row['b'])
        prev_a = row['a']
        prev_b = row['b']

    # Test that ResultOrder.ASCENDING is default.
    default_order = self.query(
        OrderBy(stream(), [Property('a'), Property('b')]).to_dict())
    self.assertEqual(ascending, default_order)

    # ResultOrder.DESCENDING should put events in descending order.
    descending = self.query(
        OrderBy(stream(),
                [Property('a'), Property('b')],
                OrderBy.ResultOrder.DESCENDING).to_dict())
    self.assertEqual(len(descending), 100)
    prev_a = prev_b = float('inf')
    for row in descending:
        if prev_a != row['a']:
            prev_b = float('inf')
        self.assertTrue(prev_a >= row['a'])
        self.assertTrue(prev_b >= row['b'])
        prev_a = row['a']
        prev_b = row['b']
def _cohort_stream_transform(source, stream, start, end,
                             transform, grouping_key, unit):
    """Build a plan node with one event per (grouping key, time bucket).

    Args:
      source: data source name passed to KronosSource.
      stream: stream name to read.
      start, end: time bounds passed to KronosSource; `start` is also
        handed to Floor as its third argument.
      transform: optional callable applied to the KronosSource node; it is
        skipped when falsy.
      grouping_key: name of the event property to group on.
      unit: DateUnit value, converted with DateUnit.unit_to_kronos_time to
        get the bucket width.

    Returns:
      An Aggregate node grouped by (grouping_key, 'date') that carries the
      minimum timestamp of each group.
    """
    events = KronosSource(source, stream, start, end)
    if transform:
        events = transform(events)

    # NOTE(review): Floor presumably rounds the timestamp down to a bucket
    # of `unit` width counted from `start` -- confirm against Floor.
    bucket = Floor([Property(TIMESTAMP_FIELD),
                    Constant(DateUnit.unit_to_kronos_time(unit)),
                    Constant(start)],
                   alias='date')
    projected = Project(events, [
        Property(TIMESTAMP_FIELD, alias=TIMESTAMP_FIELD),
        Property(grouping_key, alias=grouping_key),
        bucket,
    ])

    # This leaves us with a single event per (user, unit time) pair.
    grouping = GroupBy([Property(grouping_key, alias=grouping_key),
                        Property('date', alias='date')])
    # The first time the user performed the event in that bucket.
    first_seen = Min([Property(TIMESTAMP_FIELD)], alias=TIMESTAMP_FIELD)
    return Aggregate(projected, grouping, [first_seen])
def test_join(self):
    """Check a Join on an equality plus an inequality condition.

    Two streams (suffixes '1' and '2') each get 100 events at timestamps
    0..99. The left side is aliased 'j1'; the right side has no alias and
    the assertions expect its fields under the 'right.' prefix.
    """
    for suffix in ('1', '2'):
        for timestamp in xrange(100):
            record = {
                constants.TIMESTAMP_FIELD: timestamp,
                'a': random.randint(0, 2),
                'b': random.randint(0, 5),
            }
            self.kronos_client.put({self.stream + suffix: [record]})

    left = KronosStream('http://localhost:9191', self.stream + '1', 0, 200,
                        alias='j1')
    right = KronosStream('http://localhost:9191', self.stream + '2', 0, 200)
    same_a = Condition(Condition.Op.EQ,
                       Property('j1.a'), Property('right.a'))
    bigger_b = Condition(Condition.Op.GT,
                         Property('j1.b'), Property('right.b'))
    results = self.query(Join(left, right, same_a & bigger_b).to_dict())

    self.assertTrue(len(results) > 0)
    # Every joined event should carry exactly these prefixed fields.
    expected_fields = set(
        '%s.%s' % (side, field)
        for side in ('j1', 'right')
        for field in (constants.TIMESTAMP_FIELD, constants.ID_FIELD,
                      'a', 'b'))
    for row in results:
        self.assertEqual(row['j1.a'], row['right.a'])
        self.assertTrue(row['j1.b'] > row['right.b'])
        self.assertEqual(set(row), expected_fields)
def cpf(args, alias=None):
    """Build a Constant, Property, or function value from a dict spec.

    Args:
      args: dict with a 'cpf_type' key selecting the kind of value:
        - 'constant': reads 'constant_value'; it is converted to float when
          possible and otherwise passed through unchanged.
        - 'property': reads 'property_name'.
        - 'function': reads 'function_name' (an attribute name looked up on
          metis.core.query.value) and 'function_args' (a list of nested
          specs, each converted recursively).
      alias: optional alias forwarded to the constructed value.

    Returns:
      The constructed value object.

    Raises:
      ValueError: if 'cpf_type' is not one of the three supported kinds.
      KeyError: if a key required by the selected kind is missing.

    The input spec is not modified, so the same dict can be converted more
    than once.
    """
    cpf_type = args['cpf_type']
    if cpf_type == 'constant':
        raw = args['constant_value']
        try:
            constant = float(raw)
        except (TypeError, ValueError, OverflowError):
            # Not numeric (or too large for a float): keep the raw value.
            constant = raw
        return Constant(constant, alias=alias)
    if cpf_type == 'property':
        return Property(args['property_name'], alias=alias)
    if cpf_type == 'function':
        # Convert into a new list rather than overwriting the caller's
        # 'function_args' entries in place.
        func_args = [cpf(arg) for arg in args['function_args']]
        # NOTE(review): any attribute of metis.core.query.value can be
        # reached through 'function_name'; consider an allow-list if the
        # spec can come from untrusted input.
        func = getattr(metis.core.query.value, args['function_name'])
        return func(func_args, alias=alias)
    raise ValueError("cpf_type must be constant, property, or function")
def test_aggregate(self):
    """Check Aggregate's count/sum/min/max/avg over 50-wide time buckets.

    200 events are written at timestamps 0..199 with a random 'a' in
    [0, 2]. The expected per-bucket sums are tracked locally and compared
    with the query result. A second query groups directly on a Floor
    expression instead of a projected field.
    """
    bucket_width = 50
    sums = defaultdict(int)
    for i in xrange(200):
        a = random.randint(0, 2)
        self.kronos_client.put({
            self.stream: [{constants.TIMESTAMP_FIELD: i, 'a': a}]
        })
        # Explicit floor division: the bucket key must stay an integer
        # multiple of the width even when `/` is true division.
        sums[bucket_width * (i // bucket_width)] += a
    expected_buckets = 200 // bucket_width

    bucketed = Project(
        KronosSource('kronos', self.stream, 0, 1000),
        [Floor([Property(constants.TIMESTAMP_FIELD), Constant(50)],
               alias=constants.TIMESTAMP_FIELD)],
        merge=True)
    events = self.query(
        Aggregate(
            bucketed,
            GroupBy(Property(constants.TIMESTAMP_FIELD,
                             alias=constants.TIMESTAMP_FIELD)),
            [Count([], alias='count'),
             Sum([Property('a')], alias='sum'),
             Min([Property('a')], alias='min'),
             Max([Property('a')], alias='max'),
             Avg([Property('a')], alias='avg')]).to_dict())
    self.assertEqual(len(events), expected_buckets)
    for event in events:
        self.assertEqual(event[constants.TIMESTAMP_FIELD] % 50, 0)
        # One event was written per timestamp, so each bucket holds 50.
        self.assertEqual(event['count'], 50)
        self.assertEqual(event['min'], 0)
        self.assertEqual(event['max'], 2)
        self.assertEqual(event['sum'],
                         sums[event[constants.TIMESTAMP_FIELD]])
        # avg is a float; compare it with the sum within a tolerance.
        self.assertTrue(event['avg'] * 50 > event['sum'] - 0.1)
        self.assertTrue(event['avg'] * 50 < event['sum'] + 0.1)

    # Group directly on the Floor expression, without a projection.
    events = self.query(
        Aggregate(
            KronosSource('kronos', self.stream, 0, 1000),
            GroupBy(
                Floor([Property(constants.TIMESTAMP_FIELD), Constant(50)],
                      alias=constants.TIMESTAMP_FIELD)),
            [Count([], alias='count')]).to_dict())
    self.assertEqual(len(events), expected_buckets)
def cohort_queryplan(plan):
    """Build a metis query plan that computes a cohort analysis.

    Input:
    {
     'source': 'kronos',  # Name of data source from settings.
     'cohort':
      {'stream': CohortTest.EMAIL_STREAM,  # Kronos stream to define cohort
                                           # from.
       'transform': lambda x: x,  # Optional transformation of the stream.
       'start': date.today(),     # The day of the first cohort.
       'unit': DateUnit.XX,       # Users are in the same cohort if they
                                  # are in the same day/week.
       'cohorts': 5,              # How many cohorts (days/weeks/months)
                                  # to track.
       'grouping_key': 'user'},   # What key in an event should we tie to
                                  # a key in the action stream?
     'action':
      {'stream': CohortTest.FRONTPAGE_STREAM,  # Stream users take actions
                                               # on.
       'transform': lambda x: x,  # Optional transformation of the stream.
       'unit': DateUnit.XX,       # Track events in day/week/months.
       'repetitions': 14,         # How many days/weeks/months to track.
       'grouping_key': 'user_id'} # What key in an event should we tie to
                                  # a key in the cohort stream?
    }

    Output:
    A metis-compatible query plan (the dict form of the final Aggregate)
    to return a cohort analysis.
    """
    cohort = plan['cohort']
    action = plan['action']
    source = plan['source']

    # Calculate the start and end dates, in Kronos time, of the
    # beginning and end of the cohort and action streams that will be
    # relevant.
    cohort_start = datetime_to_kronos_time(_date_to_datetime(cohort['start']))
    cohort_end_date = (cohort['start'] +
                       timedelta(**{cohort['unit']: cohort['cohorts']}))
    action_end_date = (cohort_end_date +
                       timedelta(**{action['unit']: action['repetitions']}))
    # NOTE(review): the +1 presumably makes the end bound inclusive --
    # confirm against KronosSource's range semantics.
    cohort_end = datetime_to_kronos_time(
        _date_to_datetime(cohort_end_date)) + 1
    action_end = datetime_to_kronos_time(
        _date_to_datetime(action_end_date)) + 1

    left = _cohort_stream_transform(source,
                                    cohort['stream'],
                                    cohort_start,
                                    cohort_end,
                                    cohort.get('transform'),
                                    cohort['grouping_key'],
                                    cohort['unit'])
    right = _cohort_stream_transform(source,
                                     action['stream'],
                                     cohort_start,
                                     action_end,
                                     action.get('transform'),
                                     action['grouping_key'],
                                     action['unit'])

    # Length, in Kronos time, of the window in which actions are counted.
    additional_action_time = (DateUnit.unit_to_kronos_time(action['unit']) *
                              action['repetitions'])

    left.alias = 'cohort'
    right.alias = 'action'

    cohort_key = 'cohort.%s' % cohort['grouping_key']
    action_key = 'action.%s' % action['grouping_key']
    cohort_time = 'cohort.%s' % TIMESTAMP_FIELD
    action_time = 'action.%s' % TIMESTAMP_FIELD

    # Same entity, and the action falls in
    # [cohort time, cohort time + additional_action_time).
    same_entity = Condition(Condition.Op.EQ,
                            Property(cohort_key),
                            Property(action_key))
    not_before_cohort = Condition(Condition.Op.GTE,
                                  Property(action_time),
                                  Property(cohort_time))
    inside_window = Condition(
        Condition.Op.LT,
        Property(action_time),
        Add([Property(cohort_time), Constant(additional_action_time)]))
    joined = Join(left, right,
                  same_entity & not_before_cohort & inside_window)

    # Offset of the action from the cohort event, floored to action units.
    action_step = Floor(
        [Subtract([Property(action_time), Property(cohort_time)]),
         Constant(DateUnit.unit_to_kronos_time(action['unit']))],
        alias='action_step')
    user_aggregated = Aggregate(
        joined,
        GroupBy([Property('cohort.date', alias=TIMESTAMP_FIELD),
                 Property(cohort_key, alias='group'),
                 action_step]),
        [Count([], alias='count')])

    # Count the (cohort date, group, step) rows per (cohort date, step).
    aggregated = Aggregate(
        user_aggregated,
        GroupBy([Property(TIMESTAMP_FIELD, alias=TIMESTAMP_FIELD),
                 Property('action_step', alias='action_step')]),
        [Count([], alias='cohort_actions')])

    # TODO(marcua): Also sum up the cohort sizes, join with the plan.
    return aggregated.to_dict()
def test_join_eq(self):
    """Check equality joins: plain, against a function, and filtered.

    Stream '1' holds (a=i, b=i+1) and stream '2' holds (a=i+1, b=i+2) for
    i in 0..199, all with random timestamps, so each left event matches
    exactly one right event on left.b == right.a.
    """
    # Stream '2' carries the same values as stream '1', shifted by one.
    for suffix, shift in (('1', 0), ('2', 1)):
        for i in xrange(200):
            record = {
                constants.TIMESTAMP_FIELD: random.randint(0, 999),
                'a': i + shift,
                'b': i + shift + 1,
            }
            self.kronos_client.put({self.stream + suffix: [record]})

    def left_stream():
        return KronosStream('http://localhost:9191', self.stream + '1',
                            0, 1000)

    def right_stream():
        return KronosStream('http://localhost:9191', self.stream + '2',
                            0, 1000)

    # 1-1 join with property.
    # left.b == right.a
    results = self.query(
        Join(left_stream(),
             right_stream(),
             Condition(Condition.Op.EQ,
                       Property('left.b'),
                       Property('right.a'))).to_dict())
    self.assertEqual(len(results), 200)
    for row in results:
        self.assertEqual(row['left.b'], row['right.a'])

    # 1-1 join with function.
    # left.a == (right.a - 1)
    results = self.query(
        Join(left_stream(),
             right_stream(),
             Condition(Condition.Op.EQ,
                       Property('left.a'),
                       Subtract([Property('right.a'),
                                 Constant(1)]))).to_dict())
    self.assertEqual(len(results), 200)
    for row in results:
        self.assertEqual(row['left.a'], row['right.a'] - 1)

    # 1-1 eqjoin with filtering.
    left_time = 'left.%s' % constants.TIMESTAMP_FIELD
    right_time = 'right.%s' % constants.TIMESTAMP_FIELD
    results = self.query(
        Join(left_stream(),
             right_stream(),
             (Condition(Condition.Op.EQ,
                        Property('left.b'),
                        Property('right.a')) &
              Condition(Condition.Op.GT,
                        Property(left_time),
                        Add([Property(right_time),
                             Constant(10)])))).to_dict())
    # The timestamp condition should drop some, but not all, of the pairs.
    self.assertTrue(len(results) > 0)
    self.assertTrue(len(results) < 200)
    for row in results:
        self.assertEqual(row['left.b'], row['right.a'])
        self.assertTrue(row[left_time] > row[right_time] + 10)