def _generate_daily_hour_buckets(from_date, to_date):
    """ Generate a list of hour-level timeslots for each day in the interval. """
    timeslot_ranges = []
    day_timeslots = gen_timeslots(from_date, to_date, level='day')
    for ts in day_timeslots:
        from_date_d = timeslot_to_datetime(ts)
        to_date_d = from_date_d.replace(hour=23, minute=59)
        hourly_levels = list(gen_timeslots(from_date_d, to_date_d, level='hour'))
        timeslot_ranges.append(hourly_levels)
    return timeslot_ranges

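# Illustrative sketch (not part of the module, stdlib only): the bucket shape
# _generate_daily_hour_buckets() is expected to return -- one inner list of
# hourly slots per day in the interval. The real helpers (gen_timeslots,
# timeslot_to_datetime from solariat.utils.timeslot) are assumed above.
from datetime import datetime, timedelta

def _example_daily_hour_buckets(from_date, to_date):
    buckets = []
    day = from_date.replace(hour=0, minute=0, second=0, microsecond=0)
    while day <= to_date:
        buckets.append([day + timedelta(hours=h) for h in range(24)])
        day += timedelta(days=1)
    return buckets

# len(_example_daily_hour_buckets(datetime(2014, 1, 1), datetime(2014, 1, 2))) == 2
# each inner list holds the 24 hourly datetimes of one day
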
def aggregate_stats(user, channel, from_, to_, level, stats=('volume', 'latency')):
    data = {}
    for stat_name in stats:
        data[stat_name] = []

    by_ts = {}
    for stat in ServiceChannelStats.objects.by_time_span(user,
                                                         channel,
                                                         start_time=from_,
                                                         end_time=to_,
                                                         level=level):
        by_ts[stat.time_slot] = stat

    counts = defaultdict(int)
    for slot in gen_timeslots(from_, to_, level):
        for stat in stats:
            stat_obj = by_ts.get(slot, None)
            if stat_obj:
                value = getattr(stat_obj, 'average_latency' if stat == 'latency' else stat)
            else:
                value = 0
            data[stat].append([timeslot_to_timestamp_ms(slot), value])
            counts[stat] += value
    return data, counts

def get_time_data(groups, y_axis):
    # from_ts, to_ts, plot_type, get_feature_key and _get_count are expected
    # from the enclosing scope.
    total_counts = defaultdict(int)
    total_items = defaultdict(int)
    data = defaultdict(list)

    for slot in gen_timeslots(from_ts, to_ts):
        timestamp = timeslot_to_timestamp_ms(slot)
        features_data = groups.get(slot, {})
        for feature in y_axis:
            feature_key = get_feature_key(feature)
            if features_data.get(feature_key):
                count = _get_count(features_data[feature_key])
                total_counts[feature_key] += count
                total_items[feature_key] += 1
                data[feature_key].append([timestamp, count])
            else:
                data[feature_key].append([timestamp, 0])

    if plot_type == 'response-time':
        # return the average as the result
        result_counts = defaultdict(float)
        for key, value in total_counts.iteritems():
            if total_items.get(key):
                # float() guards against Python 2 integer division before rounding
                result_counts[key] = round(float(value) / total_items[key], 2)
            else:
                result_counts[key] = 0
    else:
        result_counts = total_counts
    return data, result_counts

def _get_data(int_id):
    data = []
    for slot in gen_timeslots(from_dt, to_dt, level):
        timestamp = timeslot_to_timestamp_ms(slot)
        count = ts_counts.get(slot, 0)
        data.append((timestamp, count))
    return data

def _get_performance_stats(user, channel, from_, to_, level, stats_type):
    """ Return list of items for Performance stats graph """
    if not isinstance(stats_type, list):
        raise RuntimeError('stats_type should be an array')

    result = []
    for stype in stats_type:
        if stype not in ['number_of_posts',
                         'number_of_actionable_posts',
                         'number_of_impressions',
                         'number_of_clicks',
                         'number_of_rejected_posts']:
            raise RuntimeError("unsupported stats_type %s" % stype)

        values = _get_channel_stats_values(user, channel, from_, to_, level, stype)

        data = []
        count = 0
        for slot in gen_timeslots(from_, to_, level):
            value = values.get(slot, 0)
            data.append([timeslot_to_timestamp_ms(slot), value])
            count += value
        result.append(dict(data=data, label=stype.split("_")[2], count=count))

    return jsonify(ok=True, list=result, level=level)

def purge_days(channel):
    '''
    Starting from now, purge the days within our maintained history
    window that have not been purged yet.
    '''
    # Purge all the days in the intersection of [last_purged, today] and
    # [14 days ago, today] (the default look-back used below).
    if channel.last_purged:
        range_start = utc(channel.last_purged)
    else:
        range_start = now() - relativedelta(days=14)

    days_to_purge = list(gen_timeslots(range_start, now(), level='day'))

    trend_stats = [0, 0, 0]
    topic_stats = [0, 0, 0]
    for day in days_to_purge:
        topic_res = mark_and_sweep_topics(channel, day)
        topic_stats = [x + y for x, y in zip(topic_stats, topic_res)]
        #LOGGER.debug("TOPIC STATS: %s", topic_res)
        trend_res = purge_corresponding_trends(channel=channel, timeslot=day)
        trend_stats = [x + y for x, y in zip(trend_stats, trend_res)]
    return days_to_purge, topic_stats, trend_stats

def purge_months(channel):
    '''
    Starting from now, purge the months within our maintained history
    window that have not been purged yet.
    '''
    if channel.last_purged:
        range_start = utc(channel.last_purged)
    else:
        range_start = now() - relativedelta(months=2)

    mday = localtime().tm_mday
    if mday > 7:
        range_end = now()
    else:
        range_end = now() - relativedelta(months=1)

    months_to_purge = []
    trend_stats = [0, 0, 0]
    topic_stats = [0, 0, 0]
    if range_start <= range_end:
        months_to_purge = list(gen_timeslots(range_start, range_end, level='month'))
    for month in months_to_purge:
        topic_res = mark_and_sweep_topics(channel, month)
        topic_stats = [x + y for x, y in zip(topic_stats, topic_res)]
        #LOGGER.debug("TOPIC STATS: %s", topic_res)
        trend_res = purge_corresponding_trends(channel=channel, timeslot=month)
        trend_stats = [x + y for x, y in zip(trend_stats, trend_res)]
    return months_to_purge, topic_stats, trend_stats

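# Illustrative sketch (hypothetical helper, not part of the module): how the
# purge_months() window behaves around the start of a month -- the current
# month only becomes eligible for purging once we are past the 7th; before
# that the window stops at the previous month.
from datetime import datetime
from dateutil.relativedelta import relativedelta

def _month_purge_window(last_purged, today):
    range_start = last_purged if last_purged else today - relativedelta(months=2)
    range_end = today if today.day > 7 else today - relativedelta(months=1)
    return range_start, range_end

# _month_purge_window(None, datetime(2014, 3, 5))  -> (2014-01-05, 2014-02-05)
# _month_purge_window(None, datetime(2014, 3, 15)) -> (2014-01-15, 2014-03-15)
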
def compute_customer_timeline(customer, from_dt, to_dt):

    def _get_platform(event):
        platform = event._t[0]
        if platform.endswith('Post') and platform != 'Post':
            platform = platform[:-len('Post')]
        return platform

    timeline_data = []
    for monthly_slot in reversed(list(timeslot.gen_timeslots(from_dt, to_dt, 'month'))):
        _month_start, _month_end = timeslot.Timeslot(monthly_slot).interval
        # Clamp the month boundaries to the requested range so that both the
        # count and the per-day scan below only cover the current month.
        _range_start = max(utc(from_dt), _month_start)
        _range_end = min(utc(to_dt), _month_end)
        _month_events_count = Event.objects.range_query_count(_range_start, _range_end, customer)
        if not _month_events_count:
            continue

        if _month_start.month == to_dt.month:
            month_label = 'This Month'
        elif _month_start.month == to_dt.month - 1:
            month_label = 'Last Month'
        else:
            month_label = _month_start.strftime('%B')
        timeline_data.append([month_label, []])

        for daily_slot in reversed(list(timeslot.gen_timeslots(_range_start, _range_end, 'day'))):
            _day_start, _day_end = timeslot.Timeslot(daily_slot).interval
            _day_events = list(Event.objects.range_query(max(utc(from_dt), _day_start),
                                                         min(utc(to_dt), _day_end),
                                                         customer))
            if not _day_events:
                continue
            day_label = _day_start.strftime('%b %d')
            timeline_data[-1][-1].append([day_label, []])

            grouper = itertools.groupby(_day_events, _get_platform)
            for platform, platform_events in grouper:
                _events = list(platform_events)
                event_interval_ids = (str(_events[0].id), str(_events[-1].id))
                timeline_data[-1][-1][-1][-1].append(
                    (platform, len(_events), event_interval_ids))

    return customer, timeline_data

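# Illustrative sketch (hypothetical): a year-safe variant of the month-label
# check used above. Comparing only .month means a January `to_dt` never yields
# a 'Last Month' label (to_dt.month - 1 == 0), and the same month of an earlier
# year can collide with 'This Month'; comparing (year, month) pairs avoids both.
from datetime import datetime
from dateutil.relativedelta import relativedelta

def _month_label(month_start, to_dt):
    this_month = (to_dt.year, to_dt.month)
    last_month_dt = to_dt - relativedelta(months=1)
    last_month = (last_month_dt.year, last_month_dt.month)
    key = (month_start.year, month_start.month)
    if key == this_month:
        return 'This Month'
    if key == last_month:
        return 'Last Month'
    return month_start.strftime('%B')

# _month_label(datetime(2013, 12, 1), datetime(2014, 1, 20)) -> 'Last Month'
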
def _generate_day_level_ranges(from_date, to_date):
    """ Generate a [from-date, to-date] range for each month in the interval. """
    timeslot_ranges = []
    month_timeslots = gen_timeslots(from_date, to_date, level='month')
    for ts in month_timeslots:
        from_date_m = timeslot_to_datetime(ts)
        timeslot_ranges.append(_get_month_day_range(from_date_m))
    return timeslot_ranges

def by_time_span(self, channel=None, parent_topic=None, intentions=None,
                 statuses=None, agents=None, languages=None,
                 from_ts=None, to_ts=None, limit=100):
    # Use the aggregation framework to resolve the counts:
    #   match on channel + slot + hashed_parents [+ status [+ intention_type]]
    #   group on topic, sum(leaf or node count?)
    #   sort(count, -1)
    #   limit(100)
    F = ChannelHotTopics.F

    from_ts = Timeslot(from_ts).timeslot
    to_ts = Timeslot(to_ts or from_ts).timeslot
    time_range = list(gen_timeslots(from_ts, to_ts, closed_range=False))
    assert len(time_range) <= 7, \
        "Max allowed range is 7 days, got %s %s" % (len(time_range), time_range)

    if len(time_range) == 1:
        time_query = {F("time_slot"): time_range[0]}
    else:
        time_query = {F("time_slot"): {"$in": time_range}}

    channel_num = get_channel_num(channel)
    if parent_topic is None:
        parents = []
    else:
        parents = get_topic_hash(parent_topic)

    intention_ids = set(intentions or [ALL_INTENTIONS_ID])
    intention_ids = map(get_intention_id, intention_ids)
    statuses = set(statuses or SpeechActMap.STATUS_NAME_MAP)
    statuses = map(get_status_code, statuses)
    languages = map(get_lang_id, languages or [])

    match_query_base = {
        F("channel_num"): channel_num,
        F("status"): {"$in": statuses},
        F("hashed_parents"): parents,
    }
    match_query_base.update(time_query)

    agent_ids = [a.agent_id for a in (agents or [])] or [ALL_AGENTS]
    match_query_filters = {
        "es.at": {"$in": agent_ids},
        "es.in": {"$in": intention_ids}
    }
    match_query_filters.update(make_lang_query(languages))

    return self.execute_pipeline(match_query_base, match_query_filters, limit)

def _get_data(from_dt, to_dt, level, pairs, stat_type):
    count = len(pairs)
    date_counts = defaultdict(int)
    total = 0
    for p in pairs:
        # p[0] - time slot
        # p[1] - increment
        date_counts[p[0]] += p[1]
        total += p[1]

    data = []
    for slot in gen_timeslots(from_dt, to_dt, level):
        js_time_stamp = timeslot_to_timestamp_ms(slot)
        data.append((js_time_stamp, date_counts[slot]))

    if stat_type == 'clicks':
        count = total
    return count, data

def get_time_data(self, groups, y_axis):
    """ Return data formatted in a FLOT-specific format,
        e.g. [[time, count], [time, count]], so that we can use it for time plots.
    """
    real_counts = defaultdict(int)
    # We need to actually count the response volume across this data, not timeslots,
    # for an accurate average over response time
    for feature in y_axis:
        feature_key = self.get_feature_key(feature)
        for _, value in groups.iteritems():
            if feature_key in value:
                real_counts[feature_key] += value[feature_key].get('rv', 0)

    total_counts = defaultdict(int)
    total_items = defaultdict(int)
    data = defaultdict(list)
    for slot in gen_timeslots(self.from_ts, self.to_ts):
        timestamp = timeslot_to_timestamp_ms(slot)
        features_data = groups.get(slot, {})
        for feature in y_axis:
            feature_key = self.get_feature_key(feature)
            if features_data.get(feature_key):
                count = features_data[feature_key].get('count', 0)
                total_counts[feature_key] += count * features_data[feature_key].get('rv', 1)
                total_items[feature_key] += 1
                data[feature_key].append([timestamp, count])
            else:
                data[feature_key].append([timestamp, 0])

    result_counts = defaultdict(float)
    for key, value in total_counts.iteritems():
        if total_items.get(key) and real_counts[key]:
            # float() guards against Python 2 integer division before rounding
            result_counts[key] = round(float(value) / real_counts[key], 2)
        else:
            result_counts[key] = 0
    return data, result_counts, total_items

def get_time_data(self, groups, y_axis):
    """ Return data formatted in a FLOT-specific format,
        e.g. [[time, count], [time, count]], so that we can use it for time plots.
    """
    total_counts = defaultdict(int)
    total_items = defaultdict(int)
    data = defaultdict(list)
    for slot in gen_timeslots(self.from_ts, self.to_ts):
        timestamp = timeslot_to_timestamp_ms(slot)
        features_data = groups.get(slot, {})
        for feature in y_axis:
            feature_key = self.get_feature_key(feature)
            if features_data.get(feature_key):
                count = features_data[feature_key].get('count', 0)
                total_counts[feature_key] += count
                total_items[feature_key] += 1
                data[feature_key].append([timestamp, count])
            else:
                data[feature_key].append([timestamp, 0])
    return data, total_counts, total_items

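# Illustrative sketch (hypothetical, stdlib only): the [[timestamp_ms, count], ...]
# series shape that these get_time_data() helpers return for FLOT time plots.
# timeslot_to_timestamp_ms() is assumed to produce JavaScript-style millisecond
# timestamps, which is what calendar.timegm() mirrors here.
import calendar
from datetime import datetime, timedelta

def _to_ms(dt):
    # seconds since epoch (UTC) * 1000 -> millisecond timestamp for FLOT
    return calendar.timegm(dt.timetuple()) * 1000

start = datetime(2014, 1, 1)
series = [[_to_ms(start + timedelta(hours=h)), 0] for h in range(3)]
# series == [[1388534400000, 0], [1388538000000, 0], [1388541600000, 0]]
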
def purge_corresponding_trends(channel, timeslot):
    ts_date, ts_level = decode_timeslot(timeslot)
    sub_level = {"month": "day", "day": "hour"}[ts_level]

    range_start = ts_date
    if "month" == ts_level:
        range_end = ts_date + relativedelta(months=1)
    else:
        range_end = ts_date + relativedelta(days=1)
    # range_end is the start of the next month/day; [:-1] keeps only the
    # sub-level slots that fall inside the current one.
    timeslots_to_purge = list(gen_timeslots(range_start, range_end, level=sub_level))[:-1]

    topics = trends_find_topics(timeslot, channel)

    trend_stats = [0, 0, 0]
    total_number = len(timeslots_to_purge)
    for i, ts in enumerate(timeslots_to_purge):
        LOGGER.info('timeslot info: channel: %s; current timeslot "%s"; timeslot %s of %s',
                    channel.title, decode_timeslot(ts), i, total_number)
        trend_res = mark_and_sweep_trends(channel, ts, topics)
        trend_stats = [x + y for x, y in zip(trend_stats, trend_res)]
    return tuple(trend_stats)

    except Channel.DoesNotExist, e:
        return jsonify(ok=False, error=str(e))

    from_dt, to_dt = parse_date_interval(data['from'], data['to'])
    #level = guess_timeslot_level(from_dt, to_dt)
    #print from_dt, to_dt, level

    intention_type_ids = [SATYPE_NAME_TO_ID_MAP[intention]
                          for intention in data['intentions']]

    intention_types = defaultdict(int)
    for speech_act in SpeechActMap.objects.find_by_user(
            user,
            channels__in=[channel.id],
            intention_type_id__in=intention_type_ids,
            time_slot__in=list(gen_timeslots(from_dt, to_dt, 'hour'))):
        intention_types[speech_act.intention_type_id] += 1

    res = []
    for (intention_type_id, count) in intention_types.items():
        res.append({'label': SATYPE_ID_TO_NAME_MAP[str(intention_type_id)],
                    'data': count})
    return jsonify(ok=True, list=res)


@app.route('/performance/trends/json', methods=['POST'])
@login_required()
def performance_trends(user):

def compute_account_stats(account, idx, from_date, to_date, levels=('hour', 'day'),
                          output_stream=None, raise_on_diffs=False, test_mode=True,
                          ignore_purging=False, ignore_topics=False):
    from solariat_bottle.db.channel.base import Channel
    from solariat_bottle.db.post.utils import get_platform_class
    from solariat_bottle.db.speech_act import SpeechActMap
    from solariat.utils.timeslot import gen_timeslots

    start_processing = datetime.now()
    all_channels = Channel.objects.find(account=account)[:]
    all_channels = [c for c in all_channels if not c.is_service]
    if not all_channels:
        return
    Post = get_platform_class(all_channels[0].platform)
    computed_months = list(gen_timeslots(from_date, to_date, level='month'))
    base_channels = [c for c in all_channels if not c.is_smart_tag]
    timeslot_ranges = _generate_daily_hour_buckets(from_date, to_date)

    post_count = 0
    for day_timeslot in timeslot_ranges:
        # Since we don't want to get the posts for every channel, but we also want to keep
        # post batches small so we don't overflow memory, we need to do partial upserts
        # on an hourly timeslot basis for day-level trends. In order to do this we need
        # to keep track of which stats were partially computed, so we increment values
        # instead of just removing + batch inserting.
        partial_updates = {}
        channel_trends_caches = {}
        for channel in all_channels:
            partial_updates[channel.id] = {'ctt': set([]), 'cht': set([])}
            channel_trends_caches[channel.id] = {}

        day_speech_act_filter = _compute_sam_match_query(base_channels,
                                                         day_timeslot[0],
                                                         day_timeslot[-1])
        post_ids = [sa['pt'] for sa in SpeechActMap.objects.coll.find(day_speech_act_filter)]
        if len(post_ids) > MAX_BATCH_SIZE:
            ## This is really in case of very high load channels. Go in batches of maximum
            ## MAX_BATCH_SIZE so we don't lock the mongo connection.
            sams_batches = int(ceil(len(post_ids) / float(MAX_BATCH_SIZE)))
        else:
            sams_batches = 1

        for sams_batch_idx in xrange(sams_batches):
            # Go in MAX_BATCH_SIZE increments through the posts
            from_idx = sams_batch_idx * MAX_BATCH_SIZE
            to_idx = (sams_batch_idx + 1) * MAX_BATCH_SIZE
            posts = Post.objects.find(id__in=post_ids[from_idx:to_idx])[:]
            for channel in all_channels:
                # For now always ignore purging, it's a huge performance leak
                if not ignore_topics:
                    chts_cache, ctts_cache = _process_channel(channel, posts, computed_months,
                                                              True, levels,
                                                              channel_trends_caches[channel.id])
                    # Now do the partial updates on hourly level stats
                    _upsert_channel_topic_trends(ctts_cache,
                                                 partial_keys=partial_updates[channel.id]['ctt'])
                    _upsert_channel_hot_topics(chts_cache,
                                               partial_keys=partial_updates[channel.id]['cht'])
                else:
                    _process_channel(channel, posts, computed_months, True, levels,
                                     channel_trends_caches[channel.id])
            post_count += len(posts)
            logger.info("Finished processing %s posts in %s " % (post_count,
                                                                 datetime.now() - start_processing))
            _memory_usage_psutil()
        _upsert_channel_trends(channel_trends_caches)

    for channel in all_channels:
        days_to_purge = list(gen_timeslots(from_date, to_date, level='day'))
        top_topics = set([])
        for time_slot in days_to_purge:
            _get_top_topics(channel, time_slot, top_topics, 0)
        if not ignore_topics:
            _update_monthly_cht_values(channel, from_date, to_date, top_topics)

    logger.info("Computed stats for account %s with post count %s in %s." % (
        account.name, post_count, datetime.now() - start_processing))
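
# Illustrative sketch (hypothetical values, not part of the module): the batch-index
# arithmetic compute_account_stats() uses when slicing post_ids into MAX_BATCH_SIZE
# chunks, shown with plain lists so the slicing behaviour is easy to verify by hand.
from math import ceil

MAX_BATCH_SIZE = 4          # assumption for the example only; the real constant is module-level
post_ids = list(range(10))  # stands in for the ids pulled from SpeechActMap

if len(post_ids) > MAX_BATCH_SIZE:
    sams_batches = int(ceil(len(post_ids) / float(MAX_BATCH_SIZE)))
else:
    sams_batches = 1

chunks = [post_ids[i * MAX_BATCH_SIZE:(i + 1) * MAX_BATCH_SIZE] for i in range(sams_batches)]
# chunks == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]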