Пример #1
0
def process_month_hours(month_date, start_hour=0, days=None):
    """Run the pixel-log processor over each hourly log of a month.

    month_date is a 'YYYY-MM' string. days optionally restricts the run to
    the given days of the month; by default every day of the month is used.
    start_hour only applies to the first day processed; every later day is
    processed from hour 0.

    For each hour the '.log.gz' key is tried first and '.log.bz2' second.
    Hours for which neither key exists are reported and skipped.

    Complete monthly backfill requires running [verify_month_inputs,]
    process_month_hours, aggregate_month, [verify_month_outputs,] and
    report_entire_month.

    """

    year_text, month_text = month_date.split('-')
    year = int(year_text)
    month = int(month_text)

    if not days:
        days_in_month = calendar.monthrange(year, month)[1]
        days = xrange(1, days_in_month + 1)

    # Candidate key names, in order of preference.
    name_templates = ('%s.log.gz', '%s.log.bz2')

    first_hour = start_hour
    for day in days:
        for hour in xrange(first_hour, 24):
            hour_date = '%04d-%02d-%02d-%02d' % (year, month, day, hour)
            for template in name_templates:
                log_path = os.path.join(RAW_LOG_DIR, template % hour_date)
                if s3_key_exists(s3_connection, log_path):
                    break
            else:
                # Neither compressed variant was found for this hour.
                print('Missing log for %s' % hour_date)
                continue
            print('Processing %s' % log_path)
            process_pixel_log(log_path, fast=True)
        # Only the first day honours start_hour.
        first_hour = 0
Пример #2
0
def process_hour(hour_date):
    """Wait for every host's log for hour_date, then process them.

    The job can't run at the very start of an hour because compressing and
    uploading the logs to S3 takes time. One '<host>.log.bz2' key is
    expected per entry in g.TRAFFIC_LOG_HOSTS; the bucket is polled, with a
    sleep between polls, until none of them is missing.

    """

    poll_interval = 180  # passed to sleep() between polls

    def still_absent(paths):
        # Keep only the keys that S3 does not have yet.
        return [p for p in paths if not s3_key_exists(s3_connection, p)]

    hour_dir = os.path.join(RAW_LOG_DIR, hour_date)
    expected = [
        os.path.join(hour_dir, '%s.log.bz2' % host)
        for host in g.TRAFFIC_LOG_HOSTS
    ]

    pending = still_absent(expected)
    while pending:
        print('Missing log(s) %s, sleeping' % pending)
        sleep(poll_interval)
        pending = still_absent(pending)

    process_pixel_log(os.path.join(hour_dir, '*'))
Пример #3
0
def verify_month_outputs(month_date):
    """Check existence of all hour, day, month aggregates for month_date.

    month_date is a 'YYYY-MM' string. For every traffic category the
    'part-r-00000' output is looked up on S3:

    * for each hour and each day of the month, under both AGGREGATE_DIR and
      the matching PROCESSED_DIR subdirectory ('hour' or 'day');
    * for the month itself, under AGGREGATE_DIR only.

    Every interval with at least one absent output is printed once, in
    sorted order. Nothing is returned.
    """
    year, month = month_date.split('-')
    year, month = int(year), int(month)
    # A set, because one interval can be missing for several categories.
    missing = set()

    hour_dirs = [AGGREGATE_DIR, os.path.join(PROCESSED_DIR, 'hour')]
    day_dirs = [AGGREGATE_DIR, os.path.join(PROCESSED_DIR, 'day')]

    for day in xrange(1, calendar.monthrange(year, month)[1] + 1):
        for hour in xrange(24):
            hour_date = '%04d-%02d-%02d-%02d' % (year, month, day, hour)
            for category_cls in traffic_categories:
                for d in hour_dirs:
                    path = _get_processed_path(d, hour_date, category_cls,
                                               'part-r-00000')
                    if not s3_key_exists(s3_connection, path):
                        missing.add(hour_date)

        day_date = '%04d-%02d-%02d' % (year, month, day)
        for category_cls in traffic_categories:
            for d in day_dirs:
                path = _get_processed_path(d, day_date, category_cls,
                                           'part-r-00000')
                if not s3_key_exists(s3_connection, path):
                    missing.add(day_date)

    month_date = '%04d-%02d' % (year, month)
    # Iterate with category_cls itself so each category's month aggregate is
    # checked, rather than re-checking whichever category an earlier loop
    # happened to leave bound.
    for category_cls in traffic_categories:
        path = _get_processed_path(AGGREGATE_DIR, month_date, category_cls,
                                   'part-r-00000')
        if not s3_key_exists(s3_connection, path):
            missing.add(month_date)

    for interval in sorted(missing):
        print(interval)
Пример #4
0
def verify_month_outputs(month_date):
    """Check existence of all hour, day, month aggregates for month_date.

    month_date is a 'YYYY-MM' string. The 'part-r-00000' output of every
    traffic category is looked up on S3 for each hour and day of the month
    (under AGGREGATE_DIR and under PROCESSED_DIR/'hour' or
    PROCESSED_DIR/'day' respectively) and for the month as a whole (under
    AGGREGATE_DIR only).

    Each interval that lacks at least one output is printed exactly once,
    in sorted order. The function returns nothing.
    """
    year, month = month_date.split('-')
    year, month = int(year), int(month)
    # Collected in a set: the same interval may be absent for many
    # category/directory combinations.
    missing = set()

    hour_dirs = [AGGREGATE_DIR, os.path.join(PROCESSED_DIR, 'hour')]
    day_dirs = [AGGREGATE_DIR, os.path.join(PROCESSED_DIR, 'day')]
    last_day = calendar.monthrange(year, month)[1]

    for day in xrange(1, last_day + 1):
        for hour in xrange(24):
            hour_date = '%04d-%02d-%02d-%02d' % (year, month, day, hour)
            for category_cls in traffic_categories:
                for d in hour_dirs:
                    path = _get_processed_path(d, hour_date, category_cls,
                                               'part-r-00000')
                    if not s3_key_exists(s3_connection, path):
                        missing.add(hour_date)

        day_date = '%04d-%02d-%02d' % (year, month, day)
        for category_cls in traffic_categories:
            for d in day_dirs:
                path = _get_processed_path(d, day_date, category_cls,
                                           'part-r-00000')
                if not s3_key_exists(s3_connection, path):
                    missing.add(day_date)

    month_date = '%04d-%02d' % (year, month)
    # The loop variable must be the one passed to _get_processed_path;
    # otherwise only the category left over from the day loop is verified.
    for category_cls in traffic_categories:
        path = _get_processed_path(AGGREGATE_DIR, month_date, category_cls,
                                   'part-r-00000')
        if not s3_key_exists(s3_connection, path):
            missing.add(month_date)

    for interval in sorted(missing):
        print(interval)
Пример #5
0
def process_month_hours(month_date, start_hour=0, days=None):
    """Process every available hourly log of the month given as 'YYYY-MM'.

    When days is not given (or is empty) all days of the month are
    processed. The first day is processed from start_hour onwards; all
    following days are processed from hour 0. The gzip log is preferred,
    with the bzip2 log as a fallback; an hour with neither is reported and
    skipped.

    Complete monthly backfill requires running [verify_month_inputs,]
    process_month_hours, aggregate_month, [verify_month_outputs,] and
    report_entire_month.

    """

    year_part, month_part = month_date.split('-')
    year = int(year_part)
    month = int(month_part)

    if not days:
        days = xrange(1, calendar.monthrange(year, month)[1] + 1)

    for position, day in enumerate(days):
        # start_hour is honoured on the first processed day only.
        if position == 0:
            hours = xrange(start_hour, 24)
        else:
            hours = xrange(24)

        for hour in hours:
            hour_date = '%04d-%02d-%02d-%02d' % (year, month, day, hour)
            gz_path = os.path.join(RAW_LOG_DIR, '%s.log.gz' % hour_date)
            bz2_path = os.path.join(RAW_LOG_DIR, '%s.log.bz2' % hour_date)

            if s3_key_exists(s3_connection, gz_path):
                log_path = gz_path
            elif s3_key_exists(s3_connection, bz2_path):
                log_path = bz2_path
            else:
                print('Missing log for %s' % hour_date)
                continue

            print('Processing %s' % log_path)
            process_pixel_log(log_path, fast=True)
Пример #6
0
def verify_month_inputs(month_date):
    """Print every hour of month_date ('YYYY-MM') that has no traffic log.

    An hour counts as present when either its '.log.gz' or its '.log.bz2'
    key exists under RAW_LOG_DIR on S3. Missing hours are printed one per
    line in chronological order.
    """
    year_text, month_text = month_date.split('-')
    year = int(year_text)
    month = int(month_text)
    absent = []

    last_day = calendar.monthrange(year, month)[1]
    for day in xrange(1, last_day + 1):
        for hour in xrange(24):
            stamp = '%04d-%02d-%02d-%02d' % (year, month, day, hour)

            gz_path = os.path.join(RAW_LOG_DIR, '%s.log.gz' % stamp)
            if s3_key_exists(s3_connection, gz_path):
                continue

            bz2_path = os.path.join(RAW_LOG_DIR, '%s.log.bz2' % stamp)
            if s3_key_exists(s3_connection, bz2_path):
                continue

            absent.append(stamp)

    for stamp in absent:
        print(stamp)
Пример #7
0
def verify_month_inputs(month_date):
    """Report the hours of month_date ('YYYY-MM') whose raw log is absent.

    For each hour of each day in the month, the gzip key is looked up first
    and the bzip2 key second; the hour is recorded as missing only when
    neither exists. The recorded hours are then printed one per line.
    """
    year_part, month_part = month_date.split('-')
    year = int(year_part)
    month = int(month_part)

    # Candidate key names, tried in this order.
    templates = ('%s.log.gz', '%s.log.bz2')
    day_count = calendar.monthrange(year, month)[1]

    missing_hours = []
    for day in xrange(1, day_count + 1):
        for hour in xrange(24):
            hour_date = '%04d-%02d-%02d-%02d' % (year, month, day, hour)
            # any() stops at the first key that exists.
            found = any(
                s3_key_exists(s3_connection,
                              os.path.join(RAW_LOG_DIR, template % hour_date))
                for template in templates
            )
            if not found:
                missing_hours.append(hour_date)

    for hour_date in missing_hours:
        print(hour_date)
Пример #8
0
def process_hour(hour_date):
    """Process the traffic logs for hour_date once all of them are on S3.

    Logs take time to be bzipped and uploaded, so this cannot run at the
    very start of an hour. The bucket is checked for one log per host in
    g.TRAFFIC_LOG_HOSTS; while any is still missing the function sleeps and
    checks again.

    """

    wait_between_checks = 180

    hour_dir = os.path.join(RAW_LOG_DIR, hour_date)
    pending = []
    for host in g.TRAFFIC_LOG_HOSTS:
        pending.append(os.path.join(hour_dir, "%s.log.bz2" % host))

    while True:
        # Drop the keys that have shown up since the last check.
        pending = [
            path for path in pending
            if not s3_key_exists(s3_connection, path)
        ]
        if not pending:
            break
        print("Missing log(s) %s, sleeping" % pending)
        sleep(wait_between_checks)

    process_pixel_log(os.path.join(hour_dir, "*"))
Пример #9
0
def get_aggregate(interval, category_cls):
    """Return the aggregate output for interval/category_cls read from S3.

    Consecutive part files ('part-r-00000', 'part-r-00001', ...) under
    AGGREGATE_DIR are read until the first one that does not exist. Every
    line is tab-separated: the last two fields are the unique and pageview
    counts, and the fields before them form the group key.

    Returns a dict mapping group -> (uniques, pageviews) with both counts
    as ints. The group is a single string when there is one key field and
    a tuple of strings when there are several.

    Raises S3ResponseError when a part still cannot be read after the last
    attempt, and ValueError when no rows were found at all. A line with
    fewer than three fields raises IndexError.
    """
    max_attempts = 5
    retry_delay = 300  # passed to sleep() between attempts

    part = 0
    data = {}

    while True:
        path = _get_processed_path(AGGREGATE_DIR, interval, category_cls,
                                   'part-r-%05d' % part)
        if not s3_key_exists(s3_connection, path):
            break

        # Sometimes S3 doesn't let us read immediately after key is written
        for attempt in xrange(1, max_attempts + 1):
            try:
                txt = get_text_from_s3(s3_connection, path)
            except S3ResponseError:
                if attempt == max_attempts:
                    # Out of attempts: give up right away instead of
                    # sleeping once more, and re-raise with the original
                    # traceback intact.
                    print('Could not retrieve %s' % path)
                    raise
                print('S3ResponseError on %s, retrying' % path)
                sleep(retry_delay)
            else:
                break

        # splitlines() already drops the line terminators.
        for line in txt.splitlines():
            fields = line.split('\t')
            group, uniques, pageviews = fields[:-2], fields[-2], fields[-1]
            if len(group) > 1:
                group = tuple(group)
            else:
                group = group[0]
            data[group] = (int(uniques), int(pageviews))

        part += 1

    if not data:
        raise ValueError("No data for %s/%s" %
                         (interval, category_cls.__name__))

    return data
Пример #10
0
def get_aggregate(interval, category_cls):
    """Return the aggregate output file from S3.

    Part files 'part-r-00000', 'part-r-00001', ... under AGGREGATE_DIR are
    read in order until one is found not to exist. Each line is split on
    tabs; the final two fields are parsed as the unique and pageview
    counts and the leading fields identify the group.

    Returns a dict of group -> (uniques, pageviews), both ints. A group
    made of one field is keyed by that string; a group made of several
    fields is keyed by a tuple of them.

    Raises S3ResponseError if a part cannot be read on any attempt,
    ValueError if no rows were collected, and IndexError for a line with
    fewer than three fields.
    """
    max_attempts = 5
    retry_delay = 300  # passed to sleep() between attempts

    part = 0
    data = {}

    while True:
        path = _get_processed_path(AGGREGATE_DIR, interval, category_cls,
                                   'part-r-%05d' % part)
        if not s3_key_exists(s3_connection, path):
            break

        # Sometimes S3 doesn't let us read immediately after key is written
        for attempt in xrange(1, max_attempts + 1):
            try:
                txt = get_text_from_s3(s3_connection, path)
            except S3ResponseError:
                if attempt == max_attempts:
                    # No attempts left: skip the pointless final sleep and
                    # let the active exception propagate unchanged.
                    print('Could not retrieve %s' % path)
                    raise
                print('S3ResponseError on %s, retrying' % path)
                sleep(retry_delay)
            else:
                break

        # Line terminators are already removed by splitlines().
        for line in txt.splitlines():
            fields = line.split('\t')
            group, uniques, pageviews = fields[:-2], fields[-2], fields[-1]
            if len(group) > 1:
                group = tuple(group)
            else:
                group = group[0]
            data[group] = (int(uniques), int(pageviews))

        part += 1

    if not data:
        raise ValueError("No data for %s/%s" % (interval,
                                                category_cls.__name__))

    return data