def RunOnce(client, job, callback):
  """Get list of files and call processing function."""
  logs_paths = logs_util.ServerLogsPaths('viewfinder', 'full')
  merged_store = ObjectStore.GetInstance(logs_paths.MERGED_LOGS_BUCKET)

  start_date = options.options.start_date
  if options.options.smart_scan:
    # Search for successful full-scan run in the last week.
    last_run = yield gen.Task(job.FindLastSuccess, with_payload_key='stats.last_day')
    if last_run is None:
      logging.info('No previous successful scan found, rerun with --start_date')
      callback(None)
      return

    last_run_start = last_run['start_time']
    if util.HoursSince(last_run_start) < options.options.hours_between_runs:
      logging.info('Last successful run started at %s, less than %d hours ago; skipping.' %
                   (time.asctime(time.localtime(last_run_start)), options.options.hours_between_runs))
      callback(None)
      return

    last_day = last_run['stats.last_day']
    # Set scan_start to start of previous run - 1d. The extra 1d is in case some logs were pushed to S3 late.
    # This really recomputes two days (the last day that was successfully processed and the one prior).
    start_time = util.ISO8601ToUTCTimestamp(last_day, hour=12) - constants.SECONDS_PER_DAY
    start_date = util.TimestampUTCToISO8601(start_time)
    logging.info('Last successful analyze_logs run (%s) scanned up to %s, setting analysis start date to %s' %
                 (time.asctime(time.localtime(last_run_start)), last_day, start_date))

  # Fetch list of merged logs.
  files = yield gen.Task(GetMergedLogsFileList, merged_store, logs_paths, start_date)
  last_day = yield gen.Task(ProcessFiles, merged_store, logs_paths, files)

  callback(last_day)
  return
def _GetMetric(self, day, h=12, m=0, s=0):
  """Return the metric payload stored for 'day' (noon UTC by default) as a DotDict, or None if missing."""
  timestamp = util.ISO8601ToUTCTimestamp(day, hour=h, minute=m, second=s)
  existing_metrics = self._RunAsync(metric.Metric.QueryTimespan, self._client,
                                    self._group_key, timestamp, timestamp)
  if len(existing_metrics) == 0:
    return None
  return DotDict(json.loads(existing_metrics[0].payload))
def DayTimeStringsToUTCTimestamp(day, time):
  """Given day (YYYY-MM-DD) and time (HH:MM:SS:ms) strings, return the timestamp in UTC,
  or None if parsing failed.
  """
  try:
    hour, minute, second, _ = re.match(kTimeRe, time).groups()
    return util.ISO8601ToUTCTimestamp(day, hour=int(hour), minute=int(minute), second=int(second))
  except Exception as e:
    logging.warning('Error parsing day and time strings: %s %s, error: %r' % (day, time, e))
    return None
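# Stand-alone sketch of the same parsing (not part of the module): a local regex and
# calendar.timegm stand in for kTimeRe and util.ISO8601ToUTCTimestamp, assuming kTimeRe
# captures the hour/minute/second/millisecond groups of a 'HH:MM:SS:ms' string.
# The millisecond group is dropped, so the result has whole-second resolution.
import calendar
import logging
import re
import time

_TIME_RE_SKETCH = re.compile(r'^(\d{2}):(\d{2}):(\d{2}):(\d{1,3})$')

def _DayTimeToUTCTimestampSketch(day, hms):
  """Parse 'YYYY-MM-DD' and 'HH:MM:SS:ms' strings into a UTC timestamp, or None on failure."""
  try:
    hour, minute, second, _ms = _TIME_RE_SKETCH.match(hms).groups()
    st = time.strptime(day, '%Y-%m-%d')
    return calendar.timegm((st.tm_year, st.tm_mon, st.tm_mday,
                            int(hour), int(minute), int(second), 0, 0, 0))
  except Exception as e:
    logging.warning('Error parsing day and time strings: %s %s, error: %r' % (day, hms, e))
    return None

# _DayTimeToUTCTimestampSketch('2013-02-01', '14:30:05:123') == 1359729005
# _DayTimeToUTCTimestampSketch('2013-02-01', 'bad') is None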
def RunOnce(client, job, apple_id, password, callback):
  """Fetch iTunes Connect trends data for each day since the computed start date (excluding
  today) and write the results to the metrics table, replacing the 'itunes' category.
  """
  start_date = yield gen.Task(DetermineStartDate, client, job)
  if start_date is None:
    logging.info('Start date not specified, last run too recent, or smart_scan could not determine a date; exiting.')
    callback(None)
    return

  query_dates = []
  start_time = util.ISO8601ToUTCTimestamp(start_date)
  today = util.NowUTCToISO8601()
  while start_time < time.time():
    date = util.TimestampUTCToISO8601(start_time)
    if date != today:
      query_dates.append(date)
    start_time += constants.SECONDS_PER_DAY

  logging.info('fetching data for dates: %r' % query_dates)
  results = {}
  itc = ITunesTrends(apple_id, password, options.options.vendor_id)
  failed = False
  for day in query_dates:
    if failed:
      break
    try:
      result = yield gen.Task(itc.FetchOneDay, day)
      if not result:
        # We don't get an exception when iTunesConnect has no data. We also don't want to
        # fail as there's no way it will have this data later.
        logging.warning('No data for day %s' % day)
      else:
        results[day] = result
    except Exception:
      msg = traceback.format_exc()
      logging.warning('Error fetching iTunes Connect data for day %s: %s', day, msg)
      failed = True

  if len(results) == 0:
    callback(None)
  else:
    # Replace the entire 'itunes' category of previous metrics. This is so we can fix any
    # processing errors we may have had.
    hms = logs_util.kDailyMetricsTimeByLogType['itunes_trends']
    yield gen.Task(logs_util.UpdateMetrics, client, results, prefix_to_erase='itunes',
                   dry_run=options.options.dry_run, hms_tuple=hms)
    callback(sorted(results.keys())[-1])
def RunOnce(client, job, callback):
  """Get list of files and call processing function."""
  merged_store = ObjectStore.GetInstance(logs_util.UserAnalyticsLogsPaths.MERGED_LOGS_BUCKET)

  start_date = options.options.start_date
  if options.options.smart_scan:
    # Search for successful full-scan run in the last week.
    last_run = yield gen.Task(job.FindLastSuccess, with_payload_key='stats.last_day')
    if last_run is None:
      logging.info('No previous successful scan found, rerun with --start_date')
      callback(None)
      return

    last_run_start = last_run['start_time']
    if util.HoursSince(last_run_start) < options.options.hours_between_runs:
      logging.info('Last successful run started at %s, less than %d hours ago; skipping.' %
                   (time.asctime(time.localtime(last_run_start)), options.options.hours_between_runs))
      callback(None)
      return

    last_day = last_run['stats.last_day']
    # Set scan_start to the last processed day minus one week, in case some logs were pushed to S3 late
    # during the previous run.
    start_time = util.ISO8601ToUTCTimestamp(last_day, hour=12) - constants.SECONDS_PER_WEEK
    start_date = util.TimestampUTCToISO8601(start_time)
    logging.info('Last successful analyze_analytics run (%s) scanned up to %s, setting analysis start date to %s' %
                 (time.asctime(time.localtime(last_run_start)), last_day, start_date))

  # Fetch list of merged logs.
  files = yield gen.Task(GetMergedLogsFileList, merged_store, start_date)
  day_stats = yield gen.Task(ProcessFiles, merged_store, files)

  # Write per-day stats to dynamodb.
  if len(day_stats) > 0:
    hms = logs_util.kDailyMetricsTimeByLogType['analytics_logs']
    yield gen.Task(logs_util.UpdateMetrics, client, day_stats,
                   dry_run=options.options.dry_run, hms_tuple=hms)
    last_day = sorted(day_stats.keys())[-1]
    callback(last_day)
  else:
    callback(None)
def DetermineStartDate(client, job, callback):
  """If smart_scan is true, lookup the start date from previous job summaries, otherwise
  use --start_date.
  --start_date and job summary days are of the form YYYY-MM-DD.
  """
  start_date = options.options.start_date

  # Lookup previous runs started in the last week.
  if options.options.smart_scan:
    # Search for successful full-scan run in the last week.
    last_run = yield gen.Task(job.FindLastSuccess, with_payload_key='stats.last_day')
    if last_run is None:
      logging.info('No previous successful scan found, rerun with --start_date')
      callback(None)
      return

    last_run_start = last_run['start_time']
    if (last_run_start + options.options.hours_between_runs * constants.SECONDS_PER_HOUR > time.time()):
      logging.info('Last successful run started at %s, less than %d hours ago; skipping.' %
                   (time.asctime(time.localtime(last_run_start)), options.options.hours_between_runs))
      callback(None)
      return

    # Set start_date to the last processed day + 1.
    last_day = last_run['stats.last_day']
    start_time = util.ISO8601ToUTCTimestamp(last_day) + constants.SECONDS_PER_DAY
    start_date = util.TimestampUTCToISO8601(start_time)
    logging.info('Last successful run (%s) scanned up to %s, setting start date to %s' %
                 (time.asctime(time.localtime(last_run_start)), last_day, start_date))

  callback(start_date)
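# Stand-alone sketch of the day arithmetic above (not part of the module): calendar.timegm
# and time.gmtime/strftime stand in for util.ISO8601ToUTCTimestamp and util.TimestampUTCToISO8601.
# Working in UTC seconds keeps the "last processed day + 1" step simple and DST-free.
import calendar
import time

SECONDS_PER_DAY = 24 * 60 * 60  # stand-in for constants.SECONDS_PER_DAY

def _ISO8601ToUTCTimestampSketch(day):
  """'YYYY-MM-DD' -> UTC timestamp at midnight of that day."""
  return calendar.timegm(time.strptime(day, '%Y-%m-%d'))

def _TimestampUTCToISO8601Sketch(timestamp):
  """UTC timestamp -> 'YYYY-MM-DD'."""
  return time.strftime('%Y-%m-%d', time.gmtime(timestamp))

# Next day after the last successfully processed one:
# _TimestampUTCToISO8601Sketch(_ISO8601ToUTCTimestampSketch('2013-02-01') + SECONDS_PER_DAY) == '2013-02-02'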
def _WriteMetric(self, day, dotdict):
  """Create and write a 'logs_daily' metric for 'day' at noon UTC with 'dotdict' as its JSON payload."""
  new_metric = metric.Metric.Create(self._group_key, 'logs_daily',
                                    util.ISO8601ToUTCTimestamp(day, hour=12),
                                    json.dumps(dotdict))
  self._RunAsync(new_metric.Update, self._client)
def UpdateMetrics(db_client, day_stats, callback, dry_run=True, prefix_to_erase=None, hms_tuple=None):
  """Write 'day_stats' to the metrics table. First lookup any existing metrics and update them.
  'day_stats' is a dictionary of {day_in_iso8601: DotDict}.
  If 'dry_run' is True, don't commit the changes to the metrics table, but perform all the work and log to info.
  If 'prefix_to_erase' is not None, we first replace the passed-in prefix with an empty dotdict.
  If 'hms_tuple' is not None, the timestamp for the metric entry will use the specified hour/minute/second,
  otherwise we use noon. To help with consistency, hms_tuple should come from kDailyMetricsTimeByLogType above.

  For example, given the existing metric:
    { itunes: { downloads: { 'US': 5, 'UK': 3 }, update: { ... }}}

  We can either:
  - Replace the downloads numbers (the entire tree under 'prefix_to_erase' gets replaced):
      UpdateMetrics({'2013-02-01': {'itunes': {'downloads': { 'DE': 3, 'FR': 1 }}}},
                    prefix_to_erase='itunes.downloads')
    resulting in:
      { itunes: { downloads: { 'DE': 3, 'FR': 1 }, update: { ... }}}
  - Or update with partial stats (no prefix_to_erase):
      UpdateMetrics({'2013-02-01': {'itunes': {'downloads': { 'DE': 3, 'FR': 1 }}}})
    resulting in:
      { itunes: { downloads: { 'US': 5, 'UK': 3, 'DE': 3, 'FR': 1 }, update: { ... }}}
  """
  if len(day_stats) == 0:
    callback()
    return

  cluster = metric.LOGS_STATS_NAME
  group_key = metric.Metric.EncodeGroupKey(cluster, metric.Metric.FindIntervalForCluster(cluster, 'daily'))

  # Convert YYYY-MM-DD into a full timestamp (noon UTC unless hms_tuple is given).
  h, m, s = hms_tuple if hms_tuple is not None else (12, 0, 0)
  timestamps = [(util.ISO8601ToUTCTimestamp(day, hour=h, minute=m, second=s), day)
                for day in sorted(day_stats.keys())]

  # Query Metrics table for all metrics between the timestamps we have data for.
  existing_metrics = yield gen.Task(metric.Metric.QueryTimespan, db_client, group_key,
                                    timestamps[0][0], timestamps[-1][0])
  existing_dict = dict((m.timestamp, m) for m in existing_metrics)

  tasks = []
  for t, day in timestamps:
    data = day_stats[day]
    prev_metric = existing_dict.get(t, None)
    payload = json.dumps(data)
    if prev_metric is None:
      logging.info('%s: new metric: %r' % (day, payload))
    else:
      prev_payload = prev_metric.payload
      # We do this twice, it's simpler than making deepcopy work on DotDict.
      prev_data = DotDict(json.loads(prev_payload))
      new_data = DotDict(json.loads(prev_payload))
      if prefix_to_erase is not None:
        # We can't call 'del' on a DotDict's internals, so simply replace with an empty
        # dotdict, we'll be repopulating.
        new_data[prefix_to_erase] = DotDict()

      # DotDict doesn't have an update() method.
      flat = new_data.flatten()
      flat.update(data.flatten())
      new_data = DotDict(flat)

      payload = json.dumps(new_data)
      if new_data.flatten() == prev_data.flatten():
        logging.info('%s: metric has not changed, skipping' % day)
        continue
      else:
        logging.info('%s: changed metric: %s -> %s' % (day, prev_payload, payload))

    if not dry_run:
      new_metric = metric.Metric.Create(group_key, 'logs_daily', t, payload)
      tasks.append(gen.Task(new_metric.Update, db_client))

  yield tasks
  callback()
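# Self-contained illustration of the merge semantics documented above (not part of the
# module): plain dicts plus local flatten/unflatten helpers stand in for DotDict. The
# erase step approximates setting 'prefix_to_erase' to an empty DotDict by dropping the
# old subtree; the flatten/update/unflatten step overlays the new day's values onto
# whatever remains, matching the two docstring examples.
def _flatten(d, prefix=''):
  """Nested dict -> flat dict with dotted keys ('a.b.c')."""
  flat = {}
  for k, v in d.items():
    key = '%s.%s' % (prefix, k) if prefix else k
    if isinstance(v, dict):
      flat.update(_flatten(v, key))
    else:
      flat[key] = v
  return flat

def _unflatten(flat):
  """Flat dict with dotted keys -> nested dict."""
  nested = {}
  for key, v in flat.items():
    node = nested
    parts = key.split('.')
    for p in parts[:-1]:
      node = node.setdefault(p, {})
    node[parts[-1]] = v
  return nested

existing = {'itunes': {'downloads': {'US': 5, 'UK': 3}}}
new_day = {'itunes': {'downloads': {'DE': 3, 'FR': 1}}}

# With prefix_to_erase='itunes.downloads': drop the old subtree, then overlay.
flat = dict((k, v) for k, v in _flatten(existing).items() if not k.startswith('itunes.downloads.'))
flat.update(_flatten(new_day))
print(_unflatten(flat))  # {'itunes': {'downloads': {'DE': 3, 'FR': 1}}}

# Without prefix_to_erase: overlay only, existing keys are kept.
flat = _flatten(existing)
flat.update(_flatten(new_day))
print(_unflatten(flat))  # {'itunes': {'downloads': {'US': 5, 'UK': 3, 'DE': 3, 'FR': 1}}}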
def RunOnce(client, job, callback):
  """Get list of files and call processing function."""
  logs_paths = logs_util.ServerLogsPaths('viewfinder', 'full')
  merged_store = ObjectStore.GetInstance(logs_paths.MERGED_LOGS_BUCKET)

  start_date = options.options.start_date
  if options.options.smart_scan:
    # Search for successful full-scan run in the last week.
    last_run = yield gen.Task(job.FindLastSuccess, with_payload_key='stats.last_day')
    if last_run is None:
      logging.info('No previous successful scan found, rerun with --start_date')
      callback(None)
      return

    last_run_start = last_run['start_time']
    if util.HoursSince(last_run_start) < options.options.hours_between_runs:
      logging.info('Last successful run started at %s, less than %d hours ago; skipping.' %
                   (time.asctime(time.localtime(last_run_start)), options.options.hours_between_runs))
      callback(None)
      return

    # Compute stats for all days since the last run (inclusive).
    start_date = last_run['stats.last_day']
    logging.info('Last successful server_log_metrics run (%s) scanned up to %s' %
                 (time.asctime(time.localtime(last_run_start)), start_date))

  assert start_date is not None, 'No start date provided. Use --start_date=YYYY-MM-DD or --smart_scan=True'

  last_day = None
  if options.options.compute_user_requests:
    # We look for files going back 32 days (30 because we need 30-day active users, and +2 for extra safety).
    user_start_time = util.ISO8601ToUTCTimestamp(start_date, hour=12) - 32 * constants.SECONDS_PER_DAY
    user_start_date = util.TimestampUTCToISO8601(user_start_time)

    # Fetch list of per-day user request stats.
    files = yield gen.Task(GetFileList, merged_store, 'user_requests', user_start_date)
    user_request_stats = yield gen.Task(ComputeUserRequests, merged_store, files)

    # Write per-day stats to dynamodb.
    if len(user_request_stats) > 0:
      # We do not replace the 'active_users' category in previous metrics as we may not be recomputing all data
      # eg: we need to analyze 30 days of logs to get the full metric info.
      hms = logs_util.kDailyMetricsTimeByLogType['active_users']
      yield gen.Task(logs_util.UpdateMetrics, client, user_request_stats,
                     dry_run=options.options.dry_run, hms_tuple=hms)
      last_day = sorted(user_request_stats.keys())[-1]

  if options.options.compute_registration_delay:
    # We compute stats for days within the registration window (plus an extra two for safety).
    device_start_time = (util.ISO8601ToUTCTimestamp(start_date, hour=12) -
                         (kDeviceRegistrationWindowDays + 2) * constants.SECONDS_PER_DAY)
    device_start_date = util.TimestampUTCToISO8601(device_start_time)
    # However, we search for files for an extra 15 days. This is so that unregistered devices that ping every day
    # are not counted as starting on the first day of stats.
    device_search_time = device_start_time - 15 * constants.SECONDS_PER_DAY
    device_search_date = util.TimestampUTCToISO8601(device_search_time)

    # Fetch list of merged logs.
    files = yield gen.Task(GetFileList, merged_store, 'device_details', device_search_date)
    device_stats = yield gen.Task(ComputeRegistrationDelay, merged_store, files, device_start_date)

    # Write per-day stats to dynamodb.
    if len(device_stats) > 0:
      # Replace the entire 'device_installs.registration' category.
      hms = logs_util.kDailyMetricsTimeByLogType['device_installs']
      yield gen.Task(logs_util.UpdateMetrics, client, device_stats,
                     dry_run=options.options.dry_run, hms_tuple=hms,
                     prefix_to_erase='device_installs.registration')
      last_day_device = sorted(device_stats.keys())[-1]
      if last_day is None or last_day > last_day_device:
        # We consider the last successful processing day to be the earlier of the two.
        last_day = last_day_device

  if options.options.compute_app_versions:
    # Look at an extra two days for safety.
    version_start_time = util.ISO8601ToUTCTimestamp(start_date, hour=12) - 2 * constants.SECONDS_PER_DAY
    version_start_date = util.TimestampUTCToISO8601(version_start_time)

    # Fetch list of merged logs.
    files = yield gen.Task(GetFileList, merged_store, 'device_details', version_start_date)
    version_stats = yield gen.Task(ComputeAppVersions, merged_store, files)

    # Write per-day stats to dynamodb.
    if len(version_stats) > 0:
      # Replace the entire 'device_count' category.
      hms = logs_util.kDailyMetricsTimeByLogType['device_count']
      yield gen.Task(logs_util.UpdateMetrics, client, version_stats,
                     dry_run=options.options.dry_run, hms_tuple=hms,
                     prefix_to_erase='device_count')
      last_day_device = sorted(version_stats.keys())[-1]
      if last_day is None or last_day > last_day_device:
        # We consider the last successful processing day to be the earlier of the two.
        last_day = last_day_device

  if options.options.compute_traces or options.options.compute_aborts:
    # Fetch an extra --extra_trace_days days of files so that previously-seen traces can be filtered out.
    trace_start_time = (util.ISO8601ToUTCTimestamp(start_date, hour=12) -
                        options.options.extra_trace_days * constants.SECONDS_PER_DAY)
    trace_start_date = util.TimestampUTCToISO8601(trace_start_time)

    # Fetch list of merged logs.
    files = yield gen.Task(GetFileList, merged_store, 'traces', trace_start_date)

    # Run them separately, we'll be sending separate email reports.
    if options.options.compute_traces:
      yield gen.Task(ComputeTraces, merged_store, files, start_date)
    if options.options.compute_aborts:
      yield gen.Task(ComputeAborts, merged_store, files, start_date)

  callback(last_day)