def test_stream_inversion_timestamps(self):
  """Check that inverting a stream keeps the previous step's timestamps.

  When a funnel step is inverted, the timestamp recorded for the last
  action must be the timestamp of the last action in the preceding
  funnel step.
  """
  # NOTE(review): an identical definition of this test appears later in
  # the file and shadows this one -- confirm and remove the duplicate.
  from analysis import IdentityDict
  from analysis import _stream_earliest_action

  kronos_client = Mock()
  kronos_client.get = Mock(side_effect=[self.get_stream5()])
  window_start = datetime.datetime(2014, 3, 20)
  window_end = datetime.datetime(2014, 3, 21)
  inverted_step = FunnelStep('stream5', invert=True)
  prior_actions = {'0': datetime_to_kronos_time(window_start),
                   '1': datetime_to_kronos_time(window_start)}
  fuzzy = timedelta_to_kronos_time(datetime.timedelta(minutes=5))
  result = _stream_earliest_action(kronos_client, inverted_step,
                                   window_start, window_end, fuzzy,
                                   prior_actions,
                                   {'userId': IdentityDict()})
  surviving_actions = result['user_action']
  self.assertEqual(len(surviving_actions), 1)
  self.assertEqual(surviving_actions['1'],
                   datetime_to_kronos_time(window_start))
def test_stream_inversion_timestamps(self):
  """Verify the inverted-stream behavior of _stream_earliest_action.

  If a funnel step is inverted, the timestamp of the last action should be
  carried over from the last action in the previous funnel step.
  """
  from analysis import IdentityDict
  from analysis import _stream_earliest_action

  client = Mock()
  client.get = Mock(side_effect=[self.get_stream5()])
  begin = datetime.datetime(2014, 3, 20)
  finish = datetime.datetime(2014, 3, 21)
  step = FunnelStep('stream5', invert=True)
  previous_step_actions = {
      '0': datetime_to_kronos_time(begin),
      '1': datetime_to_kronos_time(begin),
  }
  output = _stream_earliest_action(
      client, step, begin, finish,
      timedelta_to_kronos_time(datetime.timedelta(minutes=5)),
      previous_step_actions, {'userId': IdentityDict()})
  actions = output['user_action']
  self.assertEqual(len(actions), 1)
  self.assertEqual(actions['1'], datetime_to_kronos_time(begin))
def funnel_analyze(client, streams, start, end, end_first_funnel_step,
                   user_id_mappers, user_filter,
                   fuzzy_time=timedelta(minutes=5)):
  """Run a funnel analysis over a sequence of event streams.

  `streams`: a list of FunnelStep objects, each representing a step in the
    funnel. The funnel is composed from these objects.
  `start`/`end`: the start and end datetimes to analyze.
  `end_first_funnel_step`: the end time of the first funnel step. You
    sometimes want this to be earlier than the rest of the other steps so
    you can study how a cohort takes certain actions down the line.
  `user_id_mappers`: a dictionary of the form
    {user_id_field: user_id_mapping_function}. A user_id_field entry should
    exist for any user_id fieldname of `streams` subsequent to the first
    stream in the funnel. For example, if `streams` is:
    [(s1, f1, 'userId'), (s2, f2, 'userId'), (s3, f3, 'username')], then
    user_id_mappings should be:
    {'username': function_from_userId_to_username(userId)}
  `user_filter`: a function that returns True/False depending on whether an
    event from a user should be considered (for segmentation, for
    instance). If user_filter is None, all users will be accepted.
  `fuzzy_time`: a timedelta representing the time that two events in
    subsequent streams can be out-of-order with one-another.

  Returns a FunnelOutput accumulating each step's output.
  """
  assert end >= end_first_funnel_step
  streams, user_id_mappers = _sanity_check_args(streams, user_id_mappers)
  last_user_action = FilterCache(user_filter)
  fuzzy_time = timedelta_to_kronos_time(fuzzy_time)
  funnel_output = FunnelOutput()
  user_id_mappings = {}
  for idx, stream in enumerate(streams):
    # BUG FIX: the message previously had no '%s' placeholder, so passing
    # stream.stream_name as a lazy format argument made the logging
    # machinery's 'msg % args' fail with a formatting error.
    log.debug('Processing stream %s', stream.stream_name)
    step_end = end
    if idx == 0:
      # The first stream defines the canonical user-id space; its ids map
      # to themselves.
      user_id_mappings[stream.user_field] = IdentityDict()
      step_end = end_first_funnel_step
    output = _stream_earliest_action(client, stream, start, step_end,
                                     fuzzy_time, last_user_action,
                                     user_id_mappings)
    funnel_output.add(output)
    last_user_action = output['user_action']
    # For the first stream in the funnel, load the mappings to other
    # user_id formats we'll find in subsequent streams.
    if idx == 0:
      log.debug('Loading user_id mappings')
      _load_user_id_mappings(user_id_mappings, user_id_mappers,
                             last_user_action)
  return funnel_output
def filter_and_sum(self, start_time, end_time):
  """Bin `self.stream` into buckets, summing `b` over events where `a` == 2.

  For all events between `start_time` and `end_time`, yield one event per
  `self.bucket_width`-wide interval containing the sum of `b` for the
  events in that interval with `a` == 2.  Buckets are yielded in
  increasing time order; intervals with no matching events are omitted.
  """
  events = self.client.get(self.stream, start_time, end_time)
  counts = defaultdict(int)
  # Bucket width expressed in Kronos time units (the original name,
  # `grouping_minutes`, was misleading); hoisted out of the loop.
  bucket_width = timedelta_to_kronos_time(self.bucket_width)
  for event in events:
    if event['a'] == 2:
      # Truncate the event time down to the start of its bucket.
      bucket_start = event['@time'] - (event['@time'] % bucket_width)
      counts[bucket_start] += event['b']
  # BUG FIX / compat: dict.iterkeys() does not exist in Python 3.
  # Iterating the dict directly is equivalent in both Python 2 and 3.
  for group_time in sorted(counts):
    yield {'@time': group_time, 'b_sum': counts[group_time]}