def process_pixel_log(log_path, fast=False):
    """Process an hourly pixel log file.

    Extract data from raw hourly log and aggregate it and report it. Also
    depending on the specific date and options, aggregate and report the
    day and month.

    Setting fast=True is appropriate for backfilling as it eliminates
    redundant steps.
    """
    # The log's date and hour are encoded in its filename as
    # "YYYY-MM-DD-HH.<ext>". A trailing "/*" glob means we were handed a
    # directory of logs; the directory name carries the date fields instead.
    if log_path.endswith('/*'):
        date_source = log_path[:-len('/*')]
    else:
        date_source = log_path
    date_fields = os.path.basename(date_source).split('.', 1)[0].split('-')
    year, month, day, hour = (int(i) for i in date_fields)
    hour_date = '%s-%02d-%02d-%02d' % (year, month, day, hour)
    day_date = '%s-%02d-%02d' % (year, month, day)
    month_date = '%s-%02d' % (year, month)

    # All logs from this day use the same jobflow
    jobflow_name = 'Traffic Processing (%s)' % day_date

    # Extract the raw hourly log into the processed-hours area, then
    # aggregate that hour. The extraction output is the aggregation input,
    # so compute the path once.
    hour_path = os.path.join(PROCESSED_DIR, 'hour', hour_date)
    extract_hour(emr_connection, jobflow_name, log_path, hour_path,
                 log_uri=AWS_LOG_DIR)

    output_path = os.path.join(AGGREGATE_DIR, hour_date)
    aggregate_interval(emr_connection, jobflow_name, hour_path, output_path,
                       log_uri=AWS_LOG_DIR)
    if not fast:
        report_interval(hour_date)

    if hour == 23 or (not fast and (hour == 0 or hour % 4 == 3)):
        # Don't aggregate and report day on every hour
        input_path = os.path.join(PROCESSED_DIR, 'hour', '%s-*' % day_date)
        output_path = os.path.join(AGGREGATE_DIR, day_date)
        aggregate_interval(emr_connection, jobflow_name, input_path,
                           output_path, log_uri=AWS_LOG_DIR)
        if not fast:
            report_interval(day_date)

    if hour == 23:
        # Special tasks for final hour of the day: coalesce the day's
        # hourly output, shut down the shared jobflow, and roll the month up.
        input_path = os.path.join(PROCESSED_DIR, 'hour', '%s-*' % day_date)
        output_path = os.path.join(PROCESSED_DIR, 'day', day_date)
        coalesce_interval(emr_connection, jobflow_name, input_path,
                          output_path, log_uri=AWS_LOG_DIR)
        terminate_jobflow(emr_connection, jobflow_name)
        if not fast:
            aggregate_month(month_date)
            report_interval(month_date)