示例#1
0
def aggregate_month(month_date):
    """Aggregate a month of processed daily data, then end the jobflow.

    Calls ``aggregate_interval`` with the ``<month_date>-*`` pattern under
    the ``day`` subdirectory of ``PROCESSED_DIR`` as input and
    ``AGGREGATE_DIR/<month_date>`` as output, then calls
    ``terminate_jobflow`` with the same jobflow name.

    ``month_date`` is the month label; it is interpolated into the jobflow
    name and into both paths.

    """
    monthly_jobflow = 'Traffic Processing (%s)' % month_date
    daily_inputs = os.path.join(PROCESSED_DIR, 'day', '%s-*' % month_date)
    monthly_output = os.path.join(AGGREGATE_DIR, month_date)

    # The month-wide run explicitly requests m2.2xlarge slave instances.
    step_options = {
        'log_uri': AWS_LOG_DIR,
        'slave_instance_type': 'm2.2xlarge',
    }
    aggregate_interval(emr_connection, monthly_jobflow, daily_inputs,
                       monthly_output, **step_options)

    # The monthly jobflow has its own name, so it is shut down right away.
    terminate_jobflow(emr_connection, monthly_jobflow)
示例#2
0
def aggregate_month(month_date):
    """Run the monthly aggregation and terminate its jobflow.

    The input handed to ``aggregate_interval`` is the ``<month_date>-*``
    pattern inside the ``day`` subdirectory of ``PROCESSED_DIR``; the
    output location is ``month_date`` joined onto ``AGGREGATE_DIR``. Once
    that call returns, ``terminate_jobflow`` is invoked for the jobflow
    name built from ``month_date``.

    """
    name = 'Traffic Processing (%s)' % month_date
    source = os.path.join(PROCESSED_DIR, 'day', '%s-*' % month_date)
    destination = os.path.join(AGGREGATE_DIR, month_date)

    aggregate_interval(
        emr_connection, name, source, destination,
        log_uri=AWS_LOG_DIR, slave_instance_type='m2.2xlarge')
    terminate_jobflow(emr_connection, name)
示例#3
0
def process_pixel_log(log_path, fast=False):
    """Run the hourly pipeline for a single pixel log.

    The hour being processed is parsed from the last component of
    ``log_path`` (or from the directory name when the path ends in ``/*``).
    The text before the first dot must be ``YEAR-MONTH-DAY-HOUR``.

    The hour is extracted and aggregated on every call. The day is
    aggregated on hour 23 and, unless ``fast`` is set, also on hour 0 and
    whenever ``hour % 4 == 3``. On hour 23 the day's hourly output is also
    coalesced and the day's jobflow is terminated.

    With ``fast=True`` every ``report_interval`` call and the month
    aggregation are skipped, which suits backfilling because it avoids
    redundant steps.

    """
    # A trailing '/*' is dropped so the date comes from the directory name.
    glob_suffix = '/*'
    if log_path.endswith(glob_suffix):
        dated_path = log_path[:-len(glob_suffix)]
    else:
        dated_path = log_path
    stamp = os.path.basename(dated_path).split('.', 1)[0]
    year, month, day, hour = (int(piece) for piece in stamp.split('-'))

    month_date = '%s-%02d' % (year, month)
    day_date = '%s-%02d-%02d' % (year, month, day)
    hour_date = '%s-%02d-%02d-%02d' % (year, month, day, hour)
    is_final_hour = hour == 23

    # Every hour of a given day is submitted under the same jobflow name.
    jobflow_name = 'Traffic Processing (%s)' % day_date

    # Extract the raw log, then aggregate what was just extracted.
    hour_dir = os.path.join(PROCESSED_DIR, 'hour', hour_date)
    extract_hour(emr_connection, jobflow_name, log_path, hour_dir,
                 log_uri=AWS_LOG_DIR)
    aggregate_interval(emr_connection, jobflow_name, hour_dir,
                       os.path.join(AGGREGATE_DIR, hour_date),
                       log_uri=AWS_LOG_DIR)
    if not fast:
        report_interval(hour_date)

    # Pattern covering all of this day's extracted hours.
    day_hours = os.path.join(PROCESSED_DIR, 'hour', '%s-*' % day_date)

    # The day is only refreshed on selected hours, not on every run.
    on_day_schedule = hour == 0 or hour % 4 == 3
    if is_final_hour or (on_day_schedule and not fast):
        aggregate_interval(emr_connection, jobflow_name, day_hours,
                           os.path.join(AGGREGATE_DIR, day_date),
                           log_uri=AWS_LOG_DIR)
        if not fast:
            report_interval(day_date)

    if not is_final_hour:
        return

    # End-of-day work: coalesce the hourly output and end the jobflow.
    coalesce_interval(emr_connection, jobflow_name, day_hours,
                      os.path.join(PROCESSED_DIR, 'day', day_date),
                      log_uri=AWS_LOG_DIR)
    terminate_jobflow(emr_connection, jobflow_name)

    if fast:
        return
    aggregate_month(month_date)
    report_interval(month_date)
示例#4
0
def process_pixel_log(log_path, fast=False):
    """Extract, aggregate and report one hour of pixel log data.

    ``log_path`` names either a log file or, when it ends in ``/*``, the
    contents of a log directory. In both cases the file or directory name
    up to its first dot must read ``YEAR-MONTH-DAY-HOUR``.

    Each call extracts and aggregates that hour. Day-level aggregation
    runs on hour 23 and, when ``fast`` is false, also on hour 0 and on
    hours where ``hour % 4 == 3``. Hour 23 additionally coalesces the
    day's hourly output and terminates the day's jobflow. When ``fast``
    is false it then aggregates and reports the month.

    ``fast=True`` suppresses all ``report_interval`` calls and the month
    aggregation. It is intended for backfilling, where those steps would
    be redundant.

    """
    # Strip a trailing '/*' so the date is read from the directory name.
    dated_path = (log_path[:-len('/*')] if log_path.endswith('/*')
                  else log_path)
    stem = os.path.basename(dated_path).split('.', 1)[0]
    year, month, day, hour = (int(token) for token in stem.split('-'))

    # Build the three interval labels from progressively shorter prefixes.
    stamp = (year, month, day, hour)
    hour_date = '%s-%02d-%02d-%02d' % stamp
    day_date = '%s-%02d-%02d' % stamp[:3]
    month_date = '%s-%02d' % stamp[:2]

    reporting = not fast
    last_hour = hour == 23

    # One jobflow name is shared by every log of the same day.
    jobflow_name = 'Traffic Processing (%s)' % day_date

    extracted_hour = os.path.join(PROCESSED_DIR, 'hour', hour_date)
    extract_hour(
        emr_connection, jobflow_name, log_path, extracted_hour,
        log_uri=AWS_LOG_DIR)

    hour_aggregate = os.path.join(AGGREGATE_DIR, hour_date)
    aggregate_interval(
        emr_connection, jobflow_name, extracted_hour, hour_aggregate,
        log_uri=AWS_LOG_DIR)
    if reporting:
        report_interval(hour_date)

    # Day aggregation is skipped on most hours to avoid repeating it.
    if last_hour:
        refresh_day = True
    else:
        refresh_day = reporting and (hour == 0 or hour % 4 == 3)

    if refresh_day:
        hours_of_day = os.path.join(PROCESSED_DIR, 'hour',
                                    '%s-*' % day_date)
        day_aggregate = os.path.join(AGGREGATE_DIR, day_date)
        aggregate_interval(
            emr_connection, jobflow_name, hours_of_day, day_aggregate,
            log_uri=AWS_LOG_DIR)
        if reporting:
            report_interval(day_date)

    if last_hour:
        # Closing out the day: coalesce the hours, then end the jobflow.
        hours_of_day = os.path.join(PROCESSED_DIR, 'hour',
                                    '%s-*' % day_date)
        coalesced_day = os.path.join(PROCESSED_DIR, 'day', day_date)
        coalesce_interval(
            emr_connection, jobflow_name, hours_of_day, coalesced_day,
            log_uri=AWS_LOG_DIR)
        terminate_jobflow(emr_connection, jobflow_name)

        if reporting:
            aggregate_month(month_date)
            report_interval(month_date)