Python aggregate_interval示例，r2.lib.traffic.emr_traffic.aggregate_interval Python示例

示例#1

0

显示文件

文件： traffic.py 项目： GodOfConquest/reddit

def aggregate_month(month_date):
    """Aggregate a month of processed daily data, then end its jobflow.

    ``month_date`` is interpolated into the jobflow name, the input glob
    under the processed 'day' directory, and the aggregate output path.
    """
    name = 'Traffic Processing (%s)' % month_date
    # Glob for every entry under 'day' whose name starts with the month.
    daily_glob = '%s-*' % month_date
    aggregate_interval(
        emr_connection,
        name,
        os.path.join(PROCESSED_DIR, 'day', daily_glob),
        os.path.join(AGGREGATE_DIR, month_date),
        log_uri=AWS_LOG_DIR,
        slave_instance_type='m2.2xlarge'
    )
    # The month uses its own jobflow name, so it is terminated right away.
    terminate_jobflow(emr_connection, name)

示例#2

0

显示文件

def aggregate_month(month_date):
    """Run the monthly aggregation step and terminate the jobflow it used.

    The jobflow name, the source glob (processed 'day' entries prefixed with
    ``month_date``) and the destination path are all derived from
    ``month_date``.
    """
    jobflow = 'Traffic Processing (%s)' % month_date
    source = os.path.join(PROCESSED_DIR, 'day', '%s-*' % month_date)
    destination = os.path.join(AGGREGATE_DIR, month_date)

    # Keyword options forwarded unchanged to the aggregation step.
    options = dict(log_uri=AWS_LOG_DIR, slave_instance_type='m2.2xlarge')
    aggregate_interval(emr_connection, jobflow, source, destination,
                       **options)
    terminate_jobflow(emr_connection, jobflow)

示例#3

0

显示文件

文件： traffic.py 项目： GodOfConquest/reddit

def process_pixel_log(log_path, fast=False):
    """Run the traffic pipeline for one hourly pixel log.

    The year, month, day and hour are parsed from the dash-separated text
    before the first '.' in the basename of ``log_path`` (or of its parent
    directory when ``log_path`` ends with '/*').

    The hour is always extracted and aggregated. The day is aggregated on
    hour 23 and, unless ``fast`` is set, also on hours 0, 3, 7, 11, 15 and
    19. On hour 23 the day's hours are additionally coalesced and the
    day's jobflow is terminated; unless ``fast`` is set the month is then
    aggregated and reported. All report steps are skipped when ``fast`` is
    true, which makes it suitable for backfilling.

    """

    # A trailing '/*' glob means the date is encoded in the directory name.
    if log_path.endswith('/*'):
        dated_path = log_path[:-len('/*')]
    else:
        dated_path = log_path
    stem = os.path.basename(dated_path).split('.', 1)[0]
    year, month, day, hour = (int(part) for part in stem.split('-'))

    month_date = '%s-%02d' % (year, month)
    day_date = '%s-%02d-%02d' % (year, month, day)
    hour_date = '%s-%02d-%02d-%02d' % (year, month, day, hour)

    # Every hour of a given day shares one jobflow.
    emr_job = 'Traffic Processing (%s)' % day_date
    reporting = not fast
    final_hour = hour == 23

    # Extraction output doubles as the input of the hourly aggregation.
    hour_dir = os.path.join(PROCESSED_DIR, 'hour', hour_date)
    extract_hour(emr_connection, emr_job, log_path, hour_dir,
                 log_uri=AWS_LOG_DIR)
    aggregate_interval(emr_connection, emr_job, hour_dir,
                       os.path.join(AGGREGATE_DIR, hour_date),
                       log_uri=AWS_LOG_DIR)
    if reporting:
        report_interval(hour_date)

    # Glob over all of the day's processed hours; used by both day steps.
    day_hours = os.path.join(PROCESSED_DIR, 'hour', '%s-*' % day_date)

    # The day is only re-aggregated periodically, not on every hour.
    if final_hour or (reporting and (hour == 0 or hour % 4 == 3)):
        aggregate_interval(emr_connection, emr_job, day_hours,
                           os.path.join(AGGREGATE_DIR, day_date),
                           log_uri=AWS_LOG_DIR)
        if reporting:
            report_interval(day_date)

    if not final_hour:
        return

    # End-of-day work: fold the hours into a day and close the jobflow.
    coalesce_interval(emr_connection, emr_job, day_hours,
                      os.path.join(PROCESSED_DIR, 'day', day_date),
                      log_uri=AWS_LOG_DIR)
    terminate_jobflow(emr_connection, emr_job)

    if reporting:
        aggregate_month(month_date)
        report_interval(month_date)

示例#4

0

显示文件

def process_pixel_log(log_path, fast=False):
    """Process a single hourly pixel log through extract/aggregate/report.

    ``log_path`` must carry a 'year-month-day-hour' prefix (up to the first
    '.') in its basename, or in the basename of its directory when the path
    ends with '/*'.

    Steps, in order:
      * extract the hour, aggregate it, and report it unless ``fast``;
      * aggregate the day on hour 23, or (when not ``fast``) on hour 0 and
        any hour where ``hour % 4 == 3``; report the day unless ``fast``;
      * on hour 23, coalesce the day's hours, terminate the day's jobflow
        and, unless ``fast``, aggregate and report the month.

    ``fast=True`` drops the redundant steps and is meant for backfilling.

    """

    # Strip a trailing '/*' so the date is read from the directory name.
    dated = log_path[:-len('/*')] if log_path.endswith('/*') else log_path
    date_fields = os.path.basename(dated).split('.', 1)[0].split('-')
    year, month, day, hour = (int(field) for field in date_fields)

    hour_date = '%s-%02d-%02d-%02d' % (year, month, day, hour)
    day_date = '%s-%02d-%02d' % (year, month, day)
    month_date = '%s-%02d' % (year, month)

    # One jobflow is shared by all logs of the same day.
    jobflow = 'Traffic Processing (%s)' % day_date

    processed_hour = os.path.join(PROCESSED_DIR, 'hour', hour_date)
    extract_hour(emr_connection, jobflow, log_path, processed_hour,
                 log_uri=AWS_LOG_DIR)

    hour_aggregate = os.path.join(AGGREGATE_DIR, hour_date)
    aggregate_interval(emr_connection, jobflow, processed_hour,
                       hour_aggregate, log_uri=AWS_LOG_DIR)
    if not fast:
        report_interval(hour_date)

    last_hour_of_day = hour == 23
    periodic_refresh = hour == 0 or hour % 4 == 3

    if last_hour_of_day or (not fast and periodic_refresh):
        # The day is refreshed only on selected hours, not every hour.
        hours_glob = os.path.join(PROCESSED_DIR, 'hour', '%s-*' % day_date)
        day_aggregate = os.path.join(AGGREGATE_DIR, day_date)
        aggregate_interval(emr_connection, jobflow, hours_glob,
                           day_aggregate, log_uri=AWS_LOG_DIR)
        if not fast:
            report_interval(day_date)

    if last_hour_of_day:
        # Final hour: merge the hours into a day and shut the jobflow down.
        hours_glob = os.path.join(PROCESSED_DIR, 'hour', '%s-*' % day_date)
        processed_day = os.path.join(PROCESSED_DIR, 'day', day_date)
        coalesce_interval(emr_connection, jobflow, hours_glob,
                          processed_day, log_uri=AWS_LOG_DIR)
        terminate_jobflow(emr_connection, jobflow)

        if not fast:
            aggregate_month(month_date)
            report_interval(month_date)