def process_pixel_log(log_path, fast=False):
    """Process an hourly pixel log file.

    Extract data from raw hourly log and aggregate it and report it. Also
    depending on the specific date and options, aggregate and report the
    day and month.

    Setting fast=True is appropriate for backfilling as it eliminates
    redundant steps.
    """
    # The log's date and hour are encoded in its filename as
    # "YYYY-MM-DD-HH.<ext>". A trailing "/*" glob means we were handed a
    # directory of logs; the directory name carries the date fields instead.
    if log_path.endswith('/*'):
        date_source = log_path[:-len('/*')]
    else:
        date_source = log_path
    date_fields = os.path.basename(date_source).split('.', 1)[0].split('-')
    year, month, day, hour = (int(i) for i in date_fields)
    hour_date = '%s-%02d-%02d-%02d' % (year, month, day, hour)
    day_date = '%s-%02d-%02d' % (year, month, day)
    month_date = '%s-%02d' % (year, month)

    # All logs from this day use the same jobflow
    jobflow_name = 'Traffic Processing (%s)' % day_date

    # Extract the raw hourly log into the processed-hours area, then
    # aggregate that hour. The extraction output is the aggregation input,
    # so compute the path once.
    hour_path = os.path.join(PROCESSED_DIR, 'hour', hour_date)
    extract_hour(emr_connection, jobflow_name, log_path, hour_path,
                 log_uri=AWS_LOG_DIR)

    output_path = os.path.join(AGGREGATE_DIR, hour_date)
    aggregate_interval(emr_connection, jobflow_name, hour_path, output_path,
                       log_uri=AWS_LOG_DIR)
    if not fast:
        report_interval(hour_date)

    if hour == 23 or (not fast and (hour == 0 or hour % 4 == 3)):
        # Don't aggregate and report day on every hour
        input_path = os.path.join(PROCESSED_DIR, 'hour', '%s-*' % day_date)
        output_path = os.path.join(AGGREGATE_DIR, day_date)
        aggregate_interval(emr_connection, jobflow_name, input_path,
                           output_path, log_uri=AWS_LOG_DIR)
        if not fast:
            report_interval(day_date)

    if hour == 23:
        # Special tasks for final hour of the day: coalesce the day's
        # hourly output, shut down the shared jobflow, and roll the month up.
        input_path = os.path.join(PROCESSED_DIR, 'hour', '%s-*' % day_date)
        output_path = os.path.join(PROCESSED_DIR, 'day', day_date)
        coalesce_interval(emr_connection, jobflow_name, input_path,
                          output_path, log_uri=AWS_LOG_DIR)
        terminate_jobflow(emr_connection, jobflow_name)
        if not fast:
            aggregate_month(month_date)
            report_interval(month_date)