Example #1
0
def run(conn, args):
    """Bulk-load the googleclusterdata CSV dumps into the database.

    For every valid table, COPY each gzipped CSV shard into the database,
    committing after each file.  Filenames listed in ``args.import_file``
    (one per line) are skipped, and each successfully committed filename is
    appended to ``args.export_file`` so a later run can resume where this
    one stopped.  In debug mode only the first unprocessed file of each
    table is copied and the transaction is deliberately left uncommitted.
    """
    debug = args.debug

    # Filenames already loaded by a previous run.
    done = set()
    if args.import_file is not None:
        with open(args.import_file, 'r') as fh:
            done = {line.strip() for line in fh}

    progress_log = None
    if args.export_file is not None:
        progress_log = open(args.export_file, 'a')

    try:
        if not debug:
            logger.info("processing googleclusterdata, "
                        "this might take a *long* time...")
        for table in get_valid_tables():
            logger.info("")
            logger.info("*********************************************")
            logger.info("processing table '{}'".format(table.name))

            t0 = time.time()

            shard_paths = sorted(glob.glob(table.get_glob()))
            total = len(shard_paths)
            loaded = 0.0

            for idx, path in enumerate(shard_paths):
                if path in done:
                    logger.info("skipping file '{}'".format(path))
                    continue
                logger.info("processing file '{}'".format(path))

                with gzip.GzipFile(path, 'r') as fh:
                    with conn.cursor() as cur:
                        cur.copy_from(fh, table.name, sep=',', null='')
                # NOTE: the debug break happens *before* commit, so debug
                # runs never persist the COPY above.
                if debug:
                    logger.info("skipping remainder because "
                                "we're in debug mode")
                    break
                conn.commit()
                if progress_log is not None:
                    progress_log.write("{}\n".format(path))

                loaded += 1

                # Rough ETA: mean per-file time so far times files left.
                elapsed = time.time() - t0
                per_file = elapsed / loaded
                remaining = (total - idx - 1) * per_file
                logger.info("Estimated time remaining for this table: "
                            "{}".format(format_seconds(remaining)))

    finally:
        if progress_log is not None:
            progress_log.close()
Example #2
0
def run(conn):
    """(Re)create every valid table, then index task_usage by time range.

    Each table is dropped if it already exists and recreated from its
    columns' ``describe()`` output; finally a (start_time, end_time)
    index is built on the task_usage table.
    """
    with conn.cursor() as c:
        for table in get_valid_tables():
            name = table.name
            # Column DDL fragments, one per column of the table.
            columns = ", ".join(col.describe() for col in table)

            logger.info("dropping table '{}', if it exists".format(name))
            c.execute("DROP TABLE IF EXISTS {};".format(name))

            logger.info("creating table '{}'".format(name))
            c.execute("CREATE TABLE {} ({});".format(name, columns))

        # task_usage is queried by time range, so it gets a dedicated index.
        name = "task_usage"
        logger.info("creating index on table {}".format(name))
        c.execute("CREATE INDEX ON {} (start_time, end_time);".format(name))
def run(args):
    """Aggregate task_usage CPU data into a single resampled time series.

    Builds a (timestamp, usage) array covering ``args.start``..``args.end``
    at ``args.resolution``, initializes the "cpu_usage" dataset in the HDF5
    file ``args.output`` with it, then folds every gzipped task_usage CSV
    shard into that dataset one file at a time.  Filenames listed in
    ``args.import_file`` are skipped, and each processed filename is
    appended to ``args.export_file`` so an interrupted run can resume.
    """
    times = np.arange(args.start, args.end, args.resolution)
    output = np.zeros((len(times), 2))
    output[:, 0] = times

    # Create (or truncate) the output file and seed the dataset.
    with h5py.File(args.output, 'w') as h5f:
        h5ds = h5f.require_dataset("cpu_usage",
                                   shape=output.shape, dtype=np.float64)
        h5ds[:] = output

    # Filenames already handled by a previous run (one per line).
    already_processed = set()
    if args.import_file is not None:
        with open(args.import_file, 'r') as f:
            already_processed = {line.strip() for line in f}

    export_file = None
    if args.export_file is not None:
        export_file = open(args.export_file, 'a')

    try:
        # BUG FIX: on Python 3, filter() returns a lazy iterator that is
        # not subscriptable, so filter(...)[0] raised TypeError.  Use
        # next() on a generator expression instead (raises StopIteration
        # if no task_usage table exists, matching the old IndexError in
        # signaling a missing table).
        table = next(t for t in get_valid_tables()
                     if t.name == "task_usage")

        start_time = time.time()

        g = table.get_glob()

        filenames = sorted(glob.glob(g))
        num_filenames = len(filenames)
        actually_processed = 0.0

        for i, filename in enumerate(filenames):
            if filename in already_processed:
                logger.info("skipping file '{}'".format(filename))
                continue
            logger.info("processing file '{}'".format(filename))

            # Read-modify-write: accumulate this shard's contribution
            # into the running usage column, then persist it.
            with h5py.File(args.output, 'a') as h5f:
                h5ds = h5f.require_dataset("cpu_usage",
                                           shape=output.shape,
                                           dtype=np.float64)
                output[:] = h5ds[:]
                with gzip.GzipFile(filename, 'r') as f:
                    process_csv(f,
                                args.start, args.end, args.resolution,
                                output[:, 1])
                h5ds[:] = output[:]

            if export_file is not None:
                export_file.write("{}\n".format(filename))

            actually_processed += 1

            # Rough ETA: mean per-file time so far times files remaining.
            total_elapsed_time = time.time() - start_time
            mean_elapsed_time = total_elapsed_time / actually_processed
            time_to_go = (num_filenames-i-1) * mean_elapsed_time
            logger.info("Estimated time remaining for this table: "
                        "{}".format(format_seconds(time_to_go)))

    finally:
        if export_file is not None:
            export_file.close()