def process_month_hours(month_date, start_hour=0, days=None):
    """Process hourly logs from entire month.

    Complete monthly backfill requires running [verify_month_inputs,]
    process_month_hours, aggregate_month, [verify_month_outputs,] and
    report_entire_month.

    """
    year, month = month_date.split('-')
    year, month = int(year), int(month)
    days = days or xrange(1, calendar.monthrange(year, month)[1] + 1)
    hours = xrange(start_hour, 24)
    for day in days:
        for hour in hours:
            hour_date = '%04d-%02d-%02d-%02d' % (year, month, day, hour)
            log_path = os.path.join(RAW_LOG_DIR, '%s.log.gz' % hour_date)
            if not s3_key_exists(s3_connection, log_path):
                log_path = os.path.join(RAW_LOG_DIR, '%s.log.bz2' % hour_date)
                if not s3_key_exists(s3_connection, log_path):
                    print 'Missing log for %s' % hour_date
                    continue
            print 'Processing %s' % log_path
            process_pixel_log(log_path, fast=True)
        # start_hour only applies to the first day; later days cover all 24 hours
        hours = xrange(24)
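# Illustrative sketch, not part of the original module: the docstring of
# process_month_hours names the full monthly backfill sequence. This helper
# assumes aggregate_month and report_entire_month are defined elsewhere in
# this module, as that docstring implies; '2014-03' is a made-up example month.
def backfill_month_example(month_date='2014-03'):
    verify_month_inputs(month_date)    # prints hours whose raw logs are missing
    process_month_hours(month_date)    # process every hourly pixel log
    aggregate_month(month_date)        # roll hours up into day/month aggregates
    verify_month_outputs(month_date)   # prints intervals missing aggregate output
    report_entire_month(month_date)    # publish the finished month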
def process_hour(hour_date):
    """Process hour_date's traffic.

    Can't fire at the very start of an hour because it takes time to bzip
    and upload the file to S3. Check the bucket for the file and sleep if
    it doesn't exist.

    """
    SLEEPTIME = 180
    log_dir = os.path.join(RAW_LOG_DIR, hour_date)
    files_missing = [os.path.join(log_dir, '%s.log.bz2' % h)
                     for h in g.TRAFFIC_LOG_HOSTS]
    files_missing = [f for f in files_missing
                     if not s3_key_exists(s3_connection, f)]
    while files_missing:
        print 'Missing log(s) %s, sleeping' % files_missing
        sleep(SLEEPTIME)
        files_missing = [f for f in files_missing
                         if not s3_key_exists(s3_connection, f)]
    process_pixel_log(os.path.join(log_dir, '*'))
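# Illustrative sketch, not part of the original module: process_hour blocks
# (sleeping in SLEEPTIME increments) until every host's bz2 log is in S3, so a
# periodic job can simply hand it the hour that just ended. This assumes the
# same 'YYYY-MM-DD-HH' hour format used by process_month_hours above.
def process_previous_hour_example():
    from datetime import datetime, timedelta
    last_hour = datetime.utcnow() - timedelta(hours=1)
    process_hour(last_hour.strftime('%Y-%m-%d-%H'))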
def verify_month_outputs(month_date):
    """Check existence of all hour, day, month aggregates for month_date."""
    year, month = month_date.split('-')
    year, month = int(year), int(month)
    missing = []
    for day in xrange(1, calendar.monthrange(year, month)[1] + 1):
        for hour in xrange(24):
            hour_date = '%04d-%02d-%02d-%02d' % (year, month, day, hour)
            for category_cls in traffic_categories:
                for d in [AGGREGATE_DIR, os.path.join(PROCESSED_DIR, 'hour')]:
                    path = _get_processed_path(d, hour_date, category_cls,
                                               'part-r-00000')
                    if not s3_key_exists(s3_connection, path):
                        missing.append(hour_date)
        day_date = '%04d-%02d-%02d' % (year, month, day)
        for category_cls in traffic_categories:
            for d in [AGGREGATE_DIR, os.path.join(PROCESSED_DIR, 'day')]:
                path = _get_processed_path(d, day_date, category_cls,
                                           'part-r-00000')
                if not s3_key_exists(s3_connection, path):
                    missing.append(day_date)
    month_date = '%04d-%02d' % (year, month)
    for category_cls in traffic_categories:
        path = _get_processed_path(AGGREGATE_DIR, month_date, category_cls,
                                   'part-r-00000')
        if not s3_key_exists(s3_connection, path):
            missing.append(month_date)
    for d in sorted(list(set(missing))):
        print d
def verify_month_inputs(month_date):
    """Check existence of all hourly traffic logs for month_date."""
    year, month = month_date.split('-')
    year, month = int(year), int(month)
    missing = []
    for day in xrange(1, calendar.monthrange(year, month)[1] + 1):
        for hour in xrange(24):
            hour_date = '%04d-%02d-%02d-%02d' % (year, month, day, hour)
            log_path = os.path.join(RAW_LOG_DIR, '%s.log.gz' % hour_date)
            if not s3_key_exists(s3_connection, log_path):
                log_path = os.path.join(RAW_LOG_DIR, '%s.log.bz2' % hour_date)
                if not s3_key_exists(s3_connection, log_path):
                    missing.append(hour_date)
    for d in missing:
        print d
def get_aggregate(interval, category_cls):
    """Return the aggregate output file from S3."""
    part = 0
    data = {}
    while True:
        path = _get_processed_path(AGGREGATE_DIR, interval, category_cls,
                                   'part-r-%05d' % part)
        if not s3_key_exists(s3_connection, path):
            break

        # Sometimes S3 doesn't let us read immediately after key is written
        for i in xrange(5):
            try:
                txt = get_text_from_s3(s3_connection, path)
            except S3ResponseError as e:
                print 'S3ResponseError on %s, retrying' % path
                sleep(300)
            else:
                break
        else:
            print 'Could not retrieve %s' % path
            raise e

        for line in txt.splitlines():
            tuples = line.rstrip('\n').split('\t')
            group, uniques, pageviews = tuples[:-2], tuples[-2], tuples[-1]
            if len(group) > 1:
                group = tuple(group)
            else:
                group = group[0]
            data[group] = (int(uniques), int(pageviews))
        part += 1

    if not data:
        raise ValueError("No data for %s/%s" % (interval, category_cls.__name__))
    return data
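# Illustrative sketch, not part of the original module: get_aggregate parses
# tab-separated reducer output where the last two fields are uniques and
# pageviews and any leading fields form the group key (a tuple when there is
# more than one). The sample lines and counts below are made up purely to show
# the shapes involved.
def _parse_aggregate_lines_example():
    data = {}
    for line in ['GB\t1500\t8200', 'GB\t/r/pics\t1500\t8200']:
        tuples = line.rstrip('\n').split('\t')
        group, uniques, pageviews = tuples[:-2], tuples[-2], tuples[-1]
        group = tuple(group) if len(group) > 1 else group[0]
        data[group] = (int(uniques), int(pageviews))
    assert data['GB'] == (1500, 8200)
    assert data[('GB', '/r/pics')] == (1500, 8200)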