Example #1
def main(argv=None):
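    # Generate an .rst documentation stub for every tool module found in
    # src/daisy/tools. Assumes this script's module-level imports (glob,
    # os, re, plus the cgat-core style helpers E and IOTools) and a
    # TEMPLATE_TOOL format string defined elsewhere in the script.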

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    tools = glob.glob(
        os.path.join(os.path.dirname(__file__), "..", "src", "daisy", "tools",
                     "*.py"))

    counter = E.Counter()
    for tool in tools:
        counter.found += 1
        tool_module = re.sub(r"\.py$", "", os.path.basename(tool))
        tool_name = re.sub("_", "-", tool_module)
        if tool_module in ("__init__", "cli"):
            counter.ignored += 1
            continue

        dest = os.path.join("tools", "{}.rst".format(tool_name))

        if os.path.exists(dest) and not options.output_force:
            counter.skipped += 1
            continue

        with IOTools.openFile(dest, "w") as outf:
            outf.write(TEMPLATE_TOOL.format(**locals()))

        counter.new += 1

    E.info(counter)
    E.stop()
Example #2
def main(argv=None):
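    # Report the version of every registered tool, metric, split and
    # collate runner as a TSV table on stdout. Assumes the
    # map_*_to_runner dictionaries are imported at module level.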

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    total_counter = E.Counter()
    table = []

    for section, map_task2runner in [("tool", map_tool_to_runner),
                                     ("metric", map_metric_to_runner),
                                     ("split", map_split_to_runner),
                                     ("collate", map_collate_to_runner)]:
        E.debug("processing section: {}".format(section))
        counter = E.Counter()

        for task, taskf in sorted(map_task2runner.items()):
            counter.ntasks += 1
            comments = []
            try:
                version = taskf().get_version()
                counter.version_ok += 1
            except Exception:
                version = ""
                comments.append("unavailable")
                counter.version_fail += 1

            comments = "; ".join(comments)
            table.append((section, task, version, comments))

        E.info("{}: {}".format(section, counter))
        total_counter += counter

    options.stdout.write("section\ttask\tversion\tcomments\n")
    for row in table:
        options.stdout.write("\t".join(map(str, row)) + "\n")

    E.info("{}: {}".format("total", counter))
    E.stop()
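Example #3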
def main(argv=None):
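    # Remove all records belonging to a single run from the benchmark
    # database. Assumes Storage (daisy's database layer) is imported at
    # module level.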

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-r",
                      "--run-id",
                      dest="run_id",
                      type="int",
                      help="numerical identifier of a run [%default]")

    parser.add_option("-d",
                      "--database-url",
                      dest="database_url",
                      type="string",
                      help="database url [%default]")

    parser.add_option("-n",
                      "--dry-run",
                      dest="dry_run",
                      action="store_true",
                      help="only show statements to be executed [%default]")

    parser.set_defaults(
        run_id=None,
        database_url="sqlite:///./csvdb",
        dry_run=False,
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    Storage.purge_run_id(options.run_id,
                         options.database_url,
                         dry_run=options.dry_run)

    E.stop()
Example #4
def main(argv=None):
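    # Discover and run the TaskLibrary test suites for tools and
    # metrics. Assumes os, re, unittest and yaml plus the daisy helpers
    # (P, IOTools, TestTool, TestMetric, add_tests, clear_tests,
    # arvados_enabled and the map_*_to_runner dictionaries) at module
    # level.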

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r",
        "--restrict-regex",
        dest="restrict_regex",
        action="append",
        help="pattern to restrict tests to certain tools/metrics. "
        "Can be specified multiple times [%default]")

    parser.add_option(
        "--data-directory",
        dest="data_directory",
        help="directory with sample data sets. This will override the default "
        "datadir in the configuration file and the environment variable "
        "DAISY_TEST_DATADIR [%default]")

    parser.add_option(
        "--library-directory",
        dest="library_directories",
        action="append",
        help="directory with TaskLibrary functions. Will be added to the "
        "built-in library and the one specified in the DAISY_TASKLIBRARY "
        "environment variable [%default]")

    parser.add_option("--always-mount",
                      dest="always_mount",
                      action="store_true",
                      help="force mounting of arvados keep [%default]")

    parser.add_option("--keep-failed-temp",
                      dest="keep_failed_temp",
                      action="store_true",
                      help="keep temporary files of failed tests [%default]")

    parser.set_defaults(
        restrict_regex=[],
        always_mount=False,
        data_directory=None,
        keep_failed_temp=False,
        library_directories=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    P.get_parameters()

    # load the built-in tests
    filenames = [
        os.path.join(os.path.dirname(os.path.dirname(__file__)), "TaskLibrary",
                     "test_task_library.yml")
    ]
    if "DAISY_TASKLIBRARY" in os.environ:
        filenames.append(
            os.path.join(os.environ["DAISY_TASKLIBRARY"],
                         "test_task_library.yml"))
    filenames.extend(
        os.path.join(x, "test_task_library.yml")
        for x in options.library_directories)

    master_config = None
    for fn in filenames:
        if not os.path.exists(fn):
            E.warn("file {} does not exist".format(fn))
            continue
        with IOTools.open_file(fn) as inf:
            raw_txt = inf.read()
            test_config = yaml.safe_load(raw_txt)
            if test_config is None:
                E.warn("file {} is empty".format(fn))
                continue

            data_directory = os.environ.get("DAISY_TEST_DATADIR",
                                            test_config.get("data_directory"))

            if options.data_directory:
                data_directory = options.data_directory

            # reload config with placeholders replaced
            test_config = yaml.safe_load(
                re.sub("DATADIR", data_directory, raw_txt))
            if master_config is None:
                master_config = test_config
            else:
                # add additional tool/test metrics
                master_config["tool"].update(test_config.get("tool", {}))
                master_config["metric"].update(test_config.get("metric", {}))

    if master_config is None:
        raise ValueError("no valid test configuration found")

    for test_section, testclass, map_name_to_runner in [
        ("tool", TestTool, map_tool_to_runner),
        ("metric", TestMetric, map_metric_to_runner)
    ]:

        ignore = master_config.get(test_section, {}).get("ignore", [])
        # propagate config variables
        testclass.test_config = master_config

        for task, taskf in sorted(map_name_to_runner.items()):
            found = False
            for to_ignore in ignore:
                if re.match(to_ignore, task):
                    found = True
            if found:
                continue
            if options.restrict_regex:
                take = False
                for x in options.restrict_regex:
                    if re.search(x, task):
                        take = True
                if not take:
                    continue
            add_tests(task, taskf, testclass)

    failed = False
    with arvados_enabled(always_mount=options.always_mount):
        for testclass in [TestTool, TestMetric]:
            suite = unittest.TestLoader().loadTestsFromTestCase(testclass)
            result = unittest.TextTestRunner(verbosity=2).run(suite)
            failed |= not result.wasSuccessful()

            # remove all tests in test class - necessary if function is
            # called repeatedly
            clear_tests(testclass)

    E.stop()
    return failed
Example #5
def main(argv=sys.argv):
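    # Match result directories from a previous benchmark run against the
    # directory names implied by the current configuration and rename
    # (or symlink) them accordingly. Assumes glob, os, copy, itertools
    # and collections plus the daisy helpers (P, Toolkit, Workflow,
    # IOTools, map_tool_to_runner) at module level.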

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-n",
        "--dry-run",
        dest="dry_run",
        action="store_true",
        help="only show what will be done, don't do it [%default]")

    parser.add_option("-l",
                      "--link",
                      dest="link",
                      action="store_true",
                      help="link instead of rename [%default]")

    parser.set_defaults(dry_run=False, link=False)

    (options, args) = E.start(parser, argv)

    config = P.get_parameters("benchmark.yml")

    old_data, new_data = [], []

    for old_info in glob.glob("*.dir/tool.info"):
        old_dir, old_file = os.path.split(old_info)
        old_info = Toolkit.read_data(old_info)
        old_data.append((old_dir, old_info))

    tool_functions = Workflow.build_tool_functions(map_tool_to_runner, config)

    config_files = Workflow.expand_globs(config["input"])
    input_combos = Workflow.build_combinations(config_files)

    map_property_to_dir = collections.defaultdict(list)

    for toolf, input_files in itertools.product(tool_functions, input_combos):

        # create a copy of the task function and give it its unique name
        # by mangling it with the input_files
        taskf = copy.copy(toolf)
        taskf.register_input(input_files)
        result_dir = os.path.basename(taskf.__name__ + ".dir")
        new_data.append((result_dir, taskf))
        for a, x, y in IOTools.nested_iter(taskf.input_files):
            map_property_to_dir[(x, y)].append(result_dir)
        map_property_to_dir[("name", taskf.name)].append(result_dir)
        for x, y in list(taskf._option_dict.items()):
            map_property_to_dir[(x, y)].append(result_dir)

    # match by input_files
    options.stdout.write("\t".join(("old", "new", "matching")) + "\n")

    for old_dir, old_info in old_data:
        targets = []
        for a, x, y in IOTools.nested_iter(old_info["input_files"]):
            if (x, y) in map_property_to_dir:
                targets.extend(map_property_to_dir[(x, y)])
        for x, y in list(old_info.items()):
            try:
                targets.extend(map_property_to_dir[(x, y)])
            except TypeError:
                pass

        counts = collections.Counter(targets)
        if not counts:
            E.warn("no matches for {}, ignored".format(old_dir))
            continue
        max_count = max(counts.values())
        max_count_items = [
            x for x, y in list(counts.items()) if y == max_count
        ]

        if len(max_count_items) > 1:
            E.warn("multiple matches for {}, ignored".format(old_dir))
            continue

        new_dir = max_count_items[0]

        options.stdout.write("\t".join(map(str, (old_dir, new_dir,
                                                 max_count))) + "\n")

        if os.path.exists(new_dir):
            raise ValueError("directory {} already exists".format(new_dir))

        if options.dry_run:
            continue

        if options.link:
            os.symlink(old_dir, new_dir)
        else:
            os.rename(old_dir, new_dir)

    E.stop()
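Example #6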
def main(argv=sys.argv):
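    # Merge several benchmark sqlite databases into a single SQL dump on
    # stdout, shifting run and instance ids by per-database offsets so
    # they remain unique. Assumes sqlite3 and re plus the helper
    # apply_run_filter defined elsewhere in this script.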

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-d",
        "--database",
        dest="databases",
        action="append",
        help="database to merge. Can be specified multiple times "
        "[%default]")

    parser.add_option("-f",
                      "--filter",
                      dest="filter_method",
                      type="choice",
                      choices=("first", "last"),
                      help="only input a selection of results")

    parser.add_option("-t",
                      "--target",
                      dest="target_database",
                      type="string",
                      help="the target database [%default]")

    parser.set_defaults(filter_method=None, databases=[])

    (options, args) = E.start(parser, argv)

    run_id_offset = 0
    instance_id_offset = 0

    for database in options.databases:
        source_db = sqlite3.connect(database)
        is_instance = False
        is_run = False

        cc = source_db.cursor()
        min_run_id = cc.execute("SELECT MIN (id) FROM run").fetchall()[0][0]
        max_run_id = cc.execute("SELECT MAX (id) FROM run").fetchall()[0][0]
        max_instance_id = cc.execute(
            "SELECT MAX (id) FROM instance").fetchall()[0][0]

        E.info("{}: min_run_id={}, max_run_id={}, max_instance_id={}".format(
            database, min_run_id, max_run_id, max_instance_id))

        for line in source_db.iterdump():

            if line.startswith("CREATE TABLE"):
                try:
                    tablename = re.search("CREATE TABLE \"(\S+)\"",
                                          line).groups()[0]
                except AttributeError:
                    tablename = re.search("CREATE TABLE (\S+)",
                                          line).groups()[0]

                is_instance = False
                is_run = False
                if tablename == "run":
                    offset = run_id_offset
                    pos = "first"
                    is_run = True
                elif tablename == "tags":
                    offset = run_id_offset
                    pos = "first"
                elif tablename == "instance":
                    is_instance = True
                elif tablename == "tool_timings":
                    offset = instance_id_offset
                    pos = "last"
                elif tablename == "metric_timings":
                    offset = instance_id_offset
                    pos = "last"
                else:
                    # metric table
                    offset = instance_id_offset
                    pos = "last"

            elif line.startswith("INSERT INTO"):

                if is_instance:
                    i, n = re.search(r"VALUES\((\d+),(\d+),", line).groups()
                    if apply_run_filter(n, options.filter_method, min_run_id,
                                        max_run_id):
                        line = None
                    else:
                        line = re.sub(
                            r"VALUES\({},{},".format(i, n),
                            "VALUES({},{},".format(
                                int(i) + instance_id_offset,
                                int(n) + run_id_offset), line)
                else:
                    if pos == "last":
                        n = re.search(",(\d+)\)", line).groups()[0]
                        line = re.sub(",{}\)".format(n),
                                      ",{})".format(int(n) + offset), line)
                    elif pos == "first":
                        n = re.search("VALUES\((\d+),", line).groups()[0]
                        line = re.sub("VALUES\({},".format(n),
                                      "VALUES({},".format(int(n) + offset),
                                      line)
                        if is_run:
                            if apply_run_filter(n, options.filter_method,
                                                min_run_id, max_run_id):
                                line = None

            if line is not None:
                print(line)

        run_id_offset += max_run_id
        instance_id_offset += max_instance_id

        E.info("{}: updated offsets to run_id={}, instance_id={}".format(
            database, run_id_offset, instance_id_offset))

    E.stop()
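Example #7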
def main(argv=sys.argv):
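    # Summarise task and job runtimes from a cgat-core/ruffus pipeline
    # logfile as a TSV profile. Assumes re, collections and datetime
    # plus IOTools and a duration-tracking Counter class defined
    # elsewhere in this script (not collections.Counter).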

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-l", "--logfile", dest="logfile", type="string",
                      help="name of logfile [default=%default]")

    parser.add_option("-t", "--time", dest="time", type="choice",
                      choices=("seconds", "milliseconds"),
                      help="time to show [default=%default]")

    parser.add_option(
        "--no-reset", dest="reset", action="store_false",
        help="do not reset counters when a new pipeline run starts. "
        "The default is to reset so that only the counts from the latest "
        "pipeline execution are shown "
        "[default=%default]")

    parser.add_option(
        "-f", "--filter-method", dest="filter", type="choice",
        choices=("unfinished", "running", "completed", "all"),
        help="apply filter to output [default=%default]")

    parser.add_option(
        "-s", "--sort-order", dest="sort_order", type="choice",
        choices=("object", "ncalls", "duration", "percall", "running"),
        help="sort order of the output table [default=%default]")

    parser.add_option(
        "-i", "--ignore-errors", dest="ignore_errors", action="store_true",
        help="ignore errors [default=%default]")

    parser.set_defaults(sections=[],
                        logfile="pipeline.log",
                        filter="all",
                        reset=True,
                        sort_order="duration",
                        time="seconds")

    (options, args) = E.start(parser, argv)

    if options.sections:
        profile_sections = options.sections
    else:
        profile_sections = ("task", "job")

    counts = {}
    for section in profile_sections:
        counts[section] = collections.defaultdict(Counter)

    def line_grouper(filename):
        # group continuation lines with the timestamped line they belong to
        rx = re.compile(r"\d{4}-\d{2}-\d{2} ")
        with IOTools.open_file(filename) as infile:
            last_line = None
            for line in infile:
                line = line.strip()
                if not rx.match(line):
                    if last_line is not None:
                        last_line = " ".join((last_line, line))
                else:
                    if last_line:
                        yield last_line
                    last_line = line
            if last_line is not None:
                yield last_line

    for line in line_grouper(options.logfile):

        data = line.split()
        if len(data) < 5:
            continue
        date, time, level, pipeline, source = data[:5]

        if re.search("output generated by", line):
            if options.reset:
                E.info("resetting counts at line=%s" % line[:-1])
                for section in profile_sections:
                    counts[section] = collections.defaultdict(Counter)
            continue

        # filter for log messages from task module
        if source != "task":
            continue

        dt = datetime.datetime.strptime(
            " ".join((date, time)), "%Y-%m-%d %H:%M:%S,%f")

        msg = "".join(data[5:])

        started_task, completed_task, started_job, completed_job = \
            (None, None, None, None)

        if re.search("task.log_at_level.\d+Task=(\S+)", msg):
            checked_task = re.search(
                "task.log_at_level.\d+Task=(\S+)", msg).groups()[0]
        elif re.search("Job=\[(\S+)->(\S+)\].*Missingfile[s]*\[(\S+)\]", msg):
            started_infiles, started_job, missing = re.search(
                "Job=\[(\S+)->(\S+)\].*Missingfile[s]*\[(\S+)\]", msg).groups()
        elif re.search("Job=\[(\S+)->(\S+)\].*Missingoutputfile", msg):
            started_infiles, started_job = re.search(
                "Job=\[(\S+)->(\S+)\].*Missingoutputfile", msg).groups()
        elif re.search("Job=\[(\S+)->(\S+)\].*Missingfile[s]*", msg):
            started_infiles, started_job = re.search(
                "Job=\[(\S+)->(\S+)\].*Missingfile[s]*", msg).groups()
        elif re.search("Taskentersqueue=(\S+)", msg):
            started_task = re.search("Taskentersqueue=(\S+)", msg).groups()[0]
        elif re.search("Job=\[(\S+)->(\S+)\]completed", msg):
            completed_infiles, completed_job = re.search(
                "Job=\[(\S+)->(\S+)\]completed", msg).groups()
        elif re.search("CompletedTask=(\S+)", msg):
            completed_task = re.search("CompletedTask=(\S+)", msg).groups()[0]
        elif re.search("UptodateTask=(\S+)", msg):
            completed_task = re.search("UptodateTask=(\S+)", msg).groups()[0]
        else:
            continue

        try:
            if started_task:
                counts["task"][(pipeline, started_task)].add(True, dt, started_task)
            elif completed_task:
                counts["task"][(pipeline, completed_task)].add(False, dt, completed_task)
            elif started_job:
                counts["job"][(pipeline, started_job)].add(True, dt, started_job)
            elif completed_job:
                counts["job"][(pipeline, completed_job)].add(False, dt, completed_job)
            else:
                raise ValueError("unknown action")
        except ValueError as exc:
            if not options.ignore_errors:
                raise ValueError(str(exc) + "\nat line %s" % line)

    def to_milliseconds(d):
        return d.total_seconds() * 1000

    def to_seconds(d):
        return d.total_seconds()

    if options.time == "milliseconds":
        f = to_milliseconds
    elif options.time == "seconds":
        f = to_seconds

    for section in profile_sections:
        running = []
        rows = []
        for objct, c in list(counts[section].items()):

            # apply filters
            if options.filter in ("unfinished", "running") and c.running == 0:
                continue
            if options.filter == "completed" and c.running > 0:
                continue

            d = f(c.duration)
            if c.calls > 0:
                percall = "%6.3f" % (d / float(c.calls))
            else:
                percall = "na"

            rows.append((section,
                         objct[0],
                         objct[1],
                         c.calls,
                         d,
                         percall,
                         c.running))
            running.extend([x for x, y in c._started.items() if y != 0])

        header = ("section", "pipeline", "object", "ncalls",
                  "duration", "percall", "running")

        options.stdout.write("\t".join((header)) + "\n")
        idx = header.index(options.sort_order)
        rows = sorted(rows, key=lambda x: x[idx])

        options.stdout.write("\n".join(
            ["\t".join(map(str, x)) for x in rows]) + "\n")

        options.stdout.write("#//\n\n")

        if running:
            options.stdout.write("# running %ss\n" % section)
            options.stdout.write("\n".join(map(str, running)) + "\n")
            options.stdout.write("#//\n\n")

    E.stop()