Example #1
def process_pixel_log(log_path, fast=False):
    """Process an hourly pixel log file.

    Extract data from the raw hourly log, aggregate it, and report it. Also,
    depending on the specific date and options, aggregate and report the day
    and month. Setting fast=True is appropriate for backfilling as it
    eliminates redundant steps.

    """

    if log_path.endswith('/*'):
        log_dir = log_path[:-len('/*')]
        date_fields = os.path.basename(log_dir).split('.', 1)[0].split('-')
    else:
        date_fields = os.path.basename(log_path).split('.', 1)[0].split('-')
    year, month, day, hour = (int(i) for i in date_fields)
    hour_date = '%s-%02d-%02d-%02d' % (year, month, day, hour)
    day_date = '%s-%02d-%02d' % (year, month, day)
    month_date = '%s-%02d' % (year, month)

    # All logs from this day use the same jobflow
    jobflow_name = 'Traffic Processing (%s)' % day_date

    output_path = os.path.join(PROCESSED_DIR, 'hour', hour_date)
    extract_hour(emr_connection, jobflow_name, log_path, output_path,
                 log_uri=AWS_LOG_DIR)

    input_path = os.path.join(PROCESSED_DIR, 'hour', hour_date)
    output_path = os.path.join(AGGREGATE_DIR, hour_date)
    aggregate_interval(emr_connection, jobflow_name, input_path, output_path,
                       log_uri=AWS_LOG_DIR)
    if not fast:
        report_interval(hour_date)

    if hour == 23 or (not fast and (hour == 0 or hour % 4 == 3)):
        # Don't aggregate and report day on every hour
        input_path = os.path.join(PROCESSED_DIR, 'hour', '%s-*' % day_date)
        output_path = os.path.join(AGGREGATE_DIR, day_date)
        aggregate_interval(emr_connection, jobflow_name, input_path,
                           output_path, log_uri=AWS_LOG_DIR)
        if not fast:
            report_interval(day_date)

    if hour == 23:
        # Special tasks for final hour of the day
        input_path = os.path.join(PROCESSED_DIR, 'hour', '%s-*' % day_date)
        output_path = os.path.join(PROCESSED_DIR, 'day', day_date)
        coalesce_interval(emr_connection, jobflow_name, input_path,
                          output_path, log_uri=AWS_LOG_DIR)
        terminate_jobflow(emr_connection, jobflow_name)

        if not fast:
            aggregate_month(month_date)
            report_interval(month_date)
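
A minimal sketch of the date parsing used above, assuming the hourly log is
named "<year>-<month>-<day>-<hour>" (optionally with an extension and a
trailing '/*' glob); the helper name and the sample path are illustrative
only, not part of the original code:

import os

def parse_hour_from_path(log_path):
    # Mirror the parsing in process_pixel_log: drop a trailing '/*' glob,
    # then read "YYYY-MM-DD-HH" from the basename (everything before the
    # first '.').
    if log_path.endswith('/*'):
        log_path = log_path[:-len('/*')]
    date_fields = os.path.basename(log_path).split('.', 1)[0].split('-')
    return tuple(int(i) for i in date_fields)

# For example (hypothetical path):
#   parse_hour_from_path('logs/2014-06-01-23.log.gz/*')  ->  (2014, 6, 1, 23)
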
Example #2
def process_pixel_log(log_path, fast=False):
    """Process an hourly pixel log file.

    Extract data from the raw hourly log, aggregate it, and report it. Also,
    depending on the specific date and options, aggregate and report the day
    and month. Setting fast=True is appropriate for backfilling as it
    eliminates redundant steps.

    """

    if log_path.endswith('/*'):
        log_dir = log_path[:-len('/*')]
        date_fields = os.path.basename(log_dir).split('.', 1)[0].split('-')
    else:
        date_fields = os.path.basename(log_path).split('.', 1)[0].split('-')
    year, month, day, hour = (int(i) for i in date_fields)
    hour_date = '%s-%02d-%02d-%02d' % (year, month, day, hour)
    day_date = '%s-%02d-%02d' % (year, month, day)
    month_date = '%s-%02d' % (year, month)

    # All logs from this day use the same jobflow
    jobflow_name = 'Traffic Processing (%s)' % day_date

    output_path = os.path.join(PROCESSED_DIR, 'hour', hour_date)
    extract_hour(emr_connection,
                 jobflow_name,
                 log_path,
                 output_path,
                 log_uri=AWS_LOG_DIR)

    input_path = os.path.join(PROCESSED_DIR, 'hour', hour_date)
    output_path = os.path.join(AGGREGATE_DIR, hour_date)
    aggregate_interval(emr_connection,
                       jobflow_name,
                       input_path,
                       output_path,
                       log_uri=AWS_LOG_DIR)
    if not fast:
        report_interval(hour_date)

    if hour == 23 or (not fast and (hour == 0 or hour % 4 == 3)):
        # Don't aggregate and report day on every hour
        input_path = os.path.join(PROCESSED_DIR, 'hour', '%s-*' % day_date)
        output_path = os.path.join(AGGREGATE_DIR, day_date)
        aggregate_interval(emr_connection,
                           jobflow_name,
                           input_path,
                           output_path,
                           log_uri=AWS_LOG_DIR)
        if not fast:
            report_interval(day_date)

    if hour == 23:
        # Special tasks for final hour of the day
        input_path = os.path.join(PROCESSED_DIR, 'hour', '%s-*' % day_date)
        output_path = os.path.join(PROCESSED_DIR, 'day', day_date)
        coalesce_interval(emr_connection,
                          jobflow_name,
                          input_path,
                          output_path,
                          log_uri=AWS_LOG_DIR)
        terminate_jobflow(emr_connection, jobflow_name)

        if not fast:
            aggregate_month(month_date)
            report_interval(month_date)
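
The day-level aggregation in both examples runs only on certain hours. A
minimal sketch of that schedule, with an illustrative helper name that is not
present in the original code:

def runs_day_aggregation(hour, fast=False):
    # Same condition as in process_pixel_log: the final hour of the day
    # always aggregates; outside fast mode, hour 0 and every fourth hour
    # (3, 7, 11, 15, 19, 23) do as well.
    return hour == 23 or (not fast and (hour == 0 or hour % 4 == 3))

# [h for h in range(24) if runs_day_aggregation(h)]        -> [0, 3, 7, 11, 15, 19, 23]
# [h for h in range(24) if runs_day_aggregation(h, True)]  -> [23]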