    # Normalize the discard list to a set for fast membership tests.
    if discard_names:
        discard_names = set(discard_names)
    else:
        discard_names = set()
    if not os.path.exists(kt_filecache):
        os.makedirs(kt_filecache)
    kt_appid = Babe.get_config_with_env("kontagent", "KT_APPID", kwargs)
    for hour in enumerate_period_per_hour(start_time, end_time, referent_timezone):
        url = get_url(hour, kt_user, kt_pass, kt_appid)
        log.info("Kontagent: retrieving list: %s" % url)
        s = urllib.urlopen(url).read()
        if s == "No files available":
            continue
        file_urls = json.loads(s)
        if sample_mode and len(file_urls) > 0:
            # Sample mode: just process the first file.
            file_urls = file_urls[:1]
        # Download the hour's files in parallel, reusing the local file cache.
        p = Pool(8)
        downloaded_files = p.map(
            lambda url: read_url_with_cache(url, kt_user, kt_pass, kt_filecache),
            file_urls)
        p.close()
        # One partition (header ... footer) per hour of data.
        header = kt_msg.replace(partition=[
            ("date", datetime.date(hour.year, hour.month, hour.day)),
            ("hour", hour.hour)])
        yield header
        # Decompress each downloaded file with an external gzip process
        # and stream its rows.
        gzips = [Popen(['gzip', '-d', '-c', f], stdin=PIPE, stdout=PIPE)
                 for f in downloaded_files]
        for gzip in gzips:
            for row in process_file(hour, gzip.stdout, discard_names):
                yield row
            gzip.stdin.close()
            gzip.wait()
        yield StreamFooter()

Babe.register("pull_kontagent", pull_kontagent)
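# Hedged usage sketch for the source registered above. It assumes that
# Babe.register() exposes pull_kontagent as a chainable method on Babe
# instances and that its keyword arguments match the names used in the
# body (start_time, end_time, referent_timezone, kt_user, kt_pass,
# kt_filecache, sample_mode, discard_names); all values are illustrative.
from datetime import datetime
from pybabe import Babe

babe = Babe()
rows = babe.pull_kontagent(
    start_time=datetime(2012, 6, 1, 0),
    end_time=datetime(2012, 6, 1, 6),
    referent_timezone='UTC',
    kt_user='my_kt_user',
    kt_pass='my_kt_pass',
    kt_filecache='/tmp/kt_filecache',
    sample_mode=True,           # only fetch the first file per hour
    discard_names=['pgr'])      # message names to skip
rows.push(filename='kontagent_sample.csv')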
from pybabe import Babe, StreamHeader, StreamMeta


def unpivot(stream, common_fields, unpivot_name_field, unpivot_value_field):
    """Unpivot a table: keep common_fields on every row and turn each
    remaining field into a (name, value) pair in its own row."""
    for row in stream:
        if isinstance(row, StreamHeader):
            # New header: the common fields plus the name/value columns.
            header = row.replace(
                fields=common_fields + [unpivot_name_field, unpivot_value_field])
            other_fields = [field for field in row.fields
                            if field not in common_fields]
            yield header
        elif isinstance(row, StreamMeta):
            yield row
        else:
            commons = [getattr(row, StreamHeader.keynormalize(f))
                       for f in common_fields]
            # Emit one output row per unpivoted field.
            for field in other_fields:
                yield header.t._make(
                    commons + [field, getattr(row, StreamHeader.keynormalize(field))])

Babe.register('unpivot', unpivot)
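# Hedged usage sketch for unpivot: assuming Babe.register() makes it
# available as a chainable step on pybabe streams, a wide table such as
# (country, jan, feb) becomes (country, month, value) rows, e.g.
#   input row:  country=FR, jan=10, feb=12
#   output:     (FR, 'jan', 10) and (FR, 'feb', 12)
# The file names, column names and chaining calls below are illustrative
# assumptions, not part of the source above.
from pybabe import Babe

babe = Babe()
babe.pull(filename='sales.csv') \
    .unpivot(common_fields=['country'],
             unpivot_name_field='month',
             unpivot_value_field='value') \
    .push(filename='sales_long.csv')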