def create_metrics_for_article(article_id, preview=False):
    """
    Build term metrics for one analyzed article.

    The article is loaded through read_analyzed_article_from_db(). Every
    term found in its "term_histogram" bumps the module-level
    `terms_processed` counter and, unless `preview` is set, is pushed to
    update_daily_metrics() and update_monthly_metrics(). Afterwards the
    module-level `metricized_articles` counter is incremented and a
    progress report is printed when `preview` is set or the counter is a
    multiple of `updt_freq`.

    If no article is returned for `article_id`, an error line is printed
    and nothing else happens.

    Parameters:
        article_id -- identifier handed to read_analyzed_article_from_db()
        preview    -- when true, count terms and always print the progress
                      report, but skip the metric update calls
    """
    global metricized_articles, terms_processed

    article = read_analyzed_article_from_db(article_id)
    if not article:
        print(" ERROR: No document with id of '%s' in DB" % article_id)
        return

    # Design note on how metric data can be written:
    #   (1) Make sure the documents are allocated, then upsert only the
    #       changed values.
    #   (2) Upsert a whole document each time, with every value zero
    #       except one.
    #   (3) Upsert daily term docs one by one and aggregate them into
    #       higher level data later.
    # Per the original author's note, approach (1) is the one in use:
    # more small DB reads, but smaller writes.

    # Date parts of the publication timestamp
    pub_date = article["published"]
    year = pub_date.year
    month = pub_date.month
    day = pub_date.day
    month_start = datetime(year, month, 1)
    month_length = date_util.get_days_in_month(year, month)

    # One pass over the term histogram
    histogram = article["term_histogram"]
    for term in histogram:
        terms_processed += 1
        if preview:
            continue
        # NOTE(review): both updates are assumed to be skipped in preview
        # mode -- confirm against the intended nesting.
        update_daily_metrics(term, year, month, day, month_start,
                             month_length, histogram[term])
        update_monthly_metrics(term, year, month, histogram[term])

    # Count the article, then report progress when due
    metricized_articles += 1
    report_due = preview or metricized_articles % updt_freq == 0
    if report_due:
        print(" * Articles Metricized: %d..." % metricized_articles)
        print(" Terms: %d Daily Docs %d Monthly Docs %d" %
              (terms_processed, docs_created_daily, docs_created_monthly))
        print(" Monthly: Read: %s, Create: %s, Write: %s" %
              (mr_time, mc_time, mw_time))
        print(" Daily: Read: %s, Create: %s, Write: %s" %
              (dr_time, dc_time, dw_time))
def get_daily_term_data(term, time_start, time_end, granularity):
    """
    Get 'daily' granularity metric data for the given term and time range.

    Documents are read from `db_metric_data_daily`, sorted by "date", and
    matched to the requested range one month at a time. For each month the
    values under result["daily"][str(day)] are emitted for every day from
    the current start day up to the last requested day of that month.
    Months without a matching document are filled with zeros.

    Parameters:
        term        -- term whose metrics are requested
        time_start  -- start of the requested range (used as a DB query
                       bound, so presumably a datetime -- confirm)
        time_end    -- end of the requested range
        granularity -- not used by this function; kept for interface
                       compatibility

    Returns:
        (data, total, avg, max_val) where `data` is the list of daily
        values, `total` their sum, `avg` the total divided by the number
        of values in `data` (0 when empty) and `max_val` the largest
        value seen (never below 0).
    """
    data = []

    # Both bounds have to live in a single sub-document: repeating the
    # "date" key in a dict literal silently keeps only the last entry,
    # which would drop the lower bound altogether.
    #
    # Documents are matched by month below, so the lower bound is floored
    # to the start of time_start's month; otherwise a mid-month start
    # could exclude the document that covers the first requested month.
    range_start = time_start.replace(day=1, hour=0, minute=0,
                                     second=0, microsecond=0)
    db_query = {"term": term,
                "date": {"$gte": range_start, "$lte": time_end}}

    # Walk the sorted results in step with the months of the range.
    results = iter(db_metric_data_daily.find(db_query).sort("date"))
    pending = next(results, None)

    curr_month = time_start
    while curr_month < time_end:
        # Typically the end day is the number of days in the month.
        # In the final month of the range it is capped by the end date.
        last_day = date_util.get_days_in_month(
            curr_month.year, curr_month.month)
        if date_util.is_same_month(curr_month, time_end):
            last_day = time_end.day
        days = range(curr_month.day, last_day + 1)

        if pending and date_util.is_same_month(curr_month, pending["date"]):
            # This month has stored data: copy it and move to the next
            # result (None once the results are exhausted).
            daily = pending["daily"]
            data.extend([daily[str(day)] for day in days])
            pending = next(results, None)
        else:
            # Gap in the stored data: fill the month with zeros.
            data.extend([0] * len(days))

        curr_month = date_util.get_next_month(curr_month)

    total = sum(data)
    max_val = max([0] + data)

    # Average over the values actually emitted, so a range that starts
    # mid-month is not diluted by days that were never included.
    # NOTE(review): under Python 2 without `from __future__ import
    # division` this truncates when the values are integers -- confirm
    # whether callers expect that.
    avg = 0
    if data:
        avg = total / len(data)

    return data, total, avg, max_val