import glob
import gzip
import time

# logger, get_valid_tables, and format_seconds are provided by the
# surrounding module.


def run(conn, args):
    debug = args.debug

    # Filenames listed in the import file were handled by a previous run.
    already_processed = set()
    if args.import_file is not None:
        with open(args.import_file, 'r') as f:
            already_processed = set(line.strip() for line in f)

    # The export file records each filename as soon as it has been committed,
    # so an interrupted run can be resumed later via --import-file.
    export_file = None
    if args.export_file is not None:
        export_file = open(args.export_file, 'a')

    try:
        if not debug:
            logger.info("processing googleclusterdata, "
                        "this might take a *long* time...")

        for table in get_valid_tables():
            logger.info("")
            logger.info("*********************************************")
            logger.info("processing table '{}'".format(table.name))

            start_time = time.time()
            filenames = sorted(glob.glob(table.get_glob()))
            num_filenames = len(filenames)
            actually_processed = 0.0

            for i, filename in enumerate(filenames):
                if filename in already_processed:
                    logger.info("skipping file '{}'".format(filename))
                    continue

                logger.info("processing file '{}'".format(filename))
                # Stream the gzipped CSV straight into the table.
                with gzip.GzipFile(filename, 'r') as f:
                    with conn.cursor() as c:
                        c.copy_from(f, table.name, sep=',', null='')

                if debug:
                    logger.info("skipping remainder because "
                                "we're in debug mode")
                    break

                conn.commit()
                if export_file is not None:
                    export_file.write("{}\n".format(filename))

                actually_processed += 1
                total_elapsed_time = time.time() - start_time
                mean_elapsed_time = total_elapsed_time / actually_processed
                time_to_go = (num_filenames - i - 1) * mean_elapsed_time
                logger.info("Estimated time remaining for this table: "
                            "{}".format(format_seconds(time_to_go)))
    finally:
        if export_file is not None:
            export_file.close()
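
# A hypothetical driver for run() above, as a minimal sketch: the flag names
# mirror the attributes the function reads (debug, import_file, export_file),
# and the DSN is a placeholder, not part of the original tool.
import argparse
import psycopg2


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("--import-file", default=None)
    parser.add_argument("--export-file", default=None)
    args = parser.parse_args()

    conn = psycopg2.connect("dbname=clusterdata")  # assumed DSN
    try:
        run(conn, args)
    finally:
        conn.close()


if __name__ == "__main__":
    main()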
def run(conn):
    with conn.cursor() as c:
        for table in get_valid_tables():
            name = table.name
            desc = [col.describe() for col in table]

            logger.info("dropping table '{}', if it exists".format(name))
            cmd = "DROP TABLE IF EXISTS {};".format(name)
            c.execute(cmd)

            logger.info("creating table '{}'".format(name))
            cmd = "CREATE TABLE {} ({});".format(name, ", ".join(desc))
            c.execute(cmd)

        # execute special commands: task_usage is queried by time range,
        # so it gets an index on (start_time, end_time)
        name = "task_usage"
        logger.info("creating index on table {}".format(name))
        cmd = "CREATE INDEX ON {} (start_time, end_time);".format(name)
        c.execute(cmd)
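
# Hypothetical sketch of the objects the code above expects:
# get_valid_tables() is assumed to yield Table instances that iterate over
# their columns, where each column's describe() returns an SQL fragment
# such as "start_time BIGINT". These class names and fields are
# illustrative assumptions, not taken from the original code.
class Column(object):
    def __init__(self, name, sql_type):
        self.name = name
        self.sql_type = sql_type

    def describe(self):
        # e.g. "start_time BIGINT"
        return "{} {}".format(self.name, self.sql_type)


class Table(object):
    def __init__(self, name, columns, pattern):
        self.name = name
        self.columns = columns
        self.pattern = pattern

    def __iter__(self):
        return iter(self.columns)

    def get_glob(self):
        # e.g. "task_usage/part-*-of-*.csv.gz"
        return self.pattern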
import glob
import gzip
import time

import h5py
import numpy as np


def run(args):
    # Pre-fill the output array: column 0 holds the sample times,
    # column 1 accumulates CPU usage.
    times = np.arange(args.start, args.end, args.resolution)
    output = np.zeros((len(times), 2))
    output[:, 0] = times
    with h5py.File(args.output, 'w') as h5f:
        h5ds = h5f.require_dataset("cpu_usage", shape=output.shape,
                                   dtype=np.float64)
        h5ds[:] = output

    already_processed = set()
    if args.import_file is not None:
        with open(args.import_file, 'r') as f:
            already_processed = set(line.strip() for line in f)

    export_file = None
    if args.export_file is not None:
        export_file = open(args.export_file, 'a')

    try:
        # next() instead of filter(...)[0], which breaks on Python 3.
        table = next(t for t in get_valid_tables()
                     if t.name == "task_usage")

        start_time = time.time()
        filenames = sorted(glob.glob(table.get_glob()))
        num_filenames = len(filenames)
        actually_processed = 0.0

        for i, filename in enumerate(filenames):
            if filename in already_processed:
                logger.info("skipping file '{}'".format(filename))
                continue

            logger.info("processing file '{}'".format(filename))
            # Reopen the HDF5 file per input file so partial progress
            # is persisted after every file.
            with h5py.File(args.output, 'a') as h5f:
                h5ds = h5f.require_dataset("cpu_usage", shape=output.shape,
                                           dtype=np.float64)
                output[:] = h5ds[:]
                with gzip.GzipFile(filename, 'r') as f:
                    process_csv(f, args.start, args.end, args.resolution,
                                output[:, 1])
                h5ds[:] = output[:]

            if export_file is not None:
                export_file.write("{}\n".format(filename))

            actually_processed += 1
            total_elapsed_time = time.time() - start_time
            mean_elapsed_time = total_elapsed_time / actually_processed
            time_to_go = (num_filenames - i - 1) * mean_elapsed_time
            logger.info("Estimated time remaining for this table: "
                        "{}".format(format_seconds(time_to_go)))
    finally:
        if export_file is not None:
            export_file.close()
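
# Hypothetical sketch of process_csv(): accumulate each record's mean CPU
# usage into every time bin its [start_time, end_time) interval overlaps.
# The column positions (0 = start_time, 1 = end_time, 5 = mean CPU usage
# rate, following the public clusterdata task_usage schema) and the simple
# unweighted accumulation are assumptions, not the original implementation.
import csv


def process_csv(f, start, end, resolution, out):
    # GzipFile yields bytes; decode so csv.reader gets text.
    reader = csv.reader(line.decode("utf-8") for line in f)
    num_bins = len(out)
    for row in reader:
        t0, t1 = int(row[0]), int(row[1])
        cpu = float(row[5])
        lo = max(int((t0 - start) // resolution), 0)
        hi = min(int((t1 - start) // resolution) + 1, num_bins)
        for b in range(lo, hi):
            out[b] += cpu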