Example #1
# NOTE: the original snippet starts mid-function. The imports and the enclosing
# signature below are reconstructed from usage and are assumptions, not the
# verbatim pybabe source. The code targets Python 2 (urllib.urlopen).
import datetime
import json
import logging
import os
import urllib
from multiprocessing.dummy import Pool  # thread pool: the lambda below needs no pickling
from subprocess import Popen, PIPE

from pybabe import Babe, StreamFooter

log = logging.getLogger(__name__)


def pull_kontagent(start_time, end_time, referent_timezone,
                   kt_user, kt_pass, kt_filecache,
                   sample_mode=False, discard_names=None, **kwargs):
    # enumerate_period_per_hour, get_url, read_url_with_cache, process_file
    # and the kt_msg header template are defined elsewhere in the module.
    if discard_names is not None:
        discard_names = set(discard_names)
    else:
        discard_names = set()
    if not os.path.exists(kt_filecache):
        os.makedirs(kt_filecache)
    kt_appid = Babe.get_config_with_env("kontagent", "KT_APPID", kwargs)
    for hour in enumerate_period_per_hour(start_time, end_time, referent_timezone):
        url = get_url(hour, kt_user, kt_pass, kt_appid)
        log.info("Kontagent: retrieving list: %s" % url)
        s = urllib.urlopen(url).read()
        if s == "No files available":
            continue
        file_urls = json.loads(s)
        if sample_mode and len(file_urls) > 0:
            # Sample mode: just process the first file.
            file_urls = file_urls[:1]
        # Fetch the hour's files in parallel, hitting the local cache first.
        p = Pool(8)
        downloaded_files = p.map(lambda url: read_url_with_cache(url, kt_user, kt_pass, kt_filecache), file_urls)
        p.close()
        header = kt_msg.replace(partition=[("date", datetime.date(hour.year, hour.month, hour.day)), ("hour", hour.hour)])
        yield header
        # Decompress each file through an external gzip process and stream
        # its rows out between the header and the footer.
        gzips = [Popen(['gzip', '-d', '-c', f], stdin=PIPE, stdout=PIPE) for f in downloaded_files]
        for gzip in gzips:
            for row in process_file(hour, gzip.stdout, discard_names):
                yield row
            gzip.stdin.close()
            gzip.wait()
        yield StreamFooter()

Babe.register("pull_kontagent", pull_kontagent)
Example #2
from pybabe import Babe, StreamHeader, StreamMeta


def unpivot(stream, common_fields, unpivot_name_field, unpivot_value_field):
    """Unpivot a table: keep common_fields on every output row and emit
    one (name, value) row per remaining field."""
    for row in stream:
        if isinstance(row, StreamHeader):
            # New header: the common fields plus the name/value pair.
            # Assumes the stream starts with its StreamHeader, as pybabe
            # streams do; header and other_fields are reused below.
            header = row.replace(
                fields=common_fields + [unpivot_name_field, unpivot_value_field])
            other_fields = [field for field in row.fields
                            if field not in common_fields]
            yield header
        elif isinstance(row, StreamMeta):
            # Pass other stream metadata (e.g. footers) through unchanged.
            yield row
        else:
            commons = [getattr(row, StreamHeader.keynormalize(f)) for f in common_fields]
            for field in other_fields:
                yield header.t._make(commons + [field, getattr(row, StreamHeader.keynormalize(field))])


Babe.register('unpivot', unpivot)
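To make the transformation concrete without pybabe's stream classes, here is a self-contained analogue using a plain namedtuple; the Row type, the demo rows, and the metric/value field names are invented for illustration. With fields (date, clicks, views) and common_fields=['date'], each input row yields one output row per non-common field, mirroring the loop in unpivot above.

from collections import namedtuple

Row = namedtuple('Row', ['date', 'clicks', 'views'])
rows = [Row('2012-01-01', 10, 200), Row('2012-01-02', 7, 150)]

common_fields = ['date']
other_fields = [f for f in Row._fields if f not in common_fields]

Out = namedtuple('Out', common_fields + ['metric', 'value'])
for row in rows:
    commons = [getattr(row, f) for f in common_fields]
    for field in other_fields:
        print(Out._make(commons + [field, getattr(row, field)]))
# Out(date='2012-01-01', metric='clicks', value=10)
# Out(date='2012-01-01', metric='views', value=200)
# Out(date='2012-01-02', metric='clicks', value=7)
# Out(date='2012-01-02', metric='views', value=150)

Because unpivot is registered via Babe.register, it composes with pybabe's other stream transforms rather than being called in isolation; the generator form keeps it streaming, so no table is ever materialized in memory.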