Example #1
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-n",
        "--dry-run",
        dest="dry_run",
        action="store_true",
        help="only show what will be done, don't do it [%default]")

    parser.add_option("-l",
                      "--link",
                      dest="link",
                      action="store_true",
                      help="link instead of rename [%default]")

    parser.set_defaults(dry_run=False, link=False)

    (options, args) = E.start(parser, argv)

    config = P.get_parameters("benchmark.yml")

    old_data, new_data = [], []

    for old_info in glob.glob("*.dir/tool.info"):
        old_dir, old_file = os.path.split(old_info)
        old_info = Toolkit.read_data(old_info)
        old_data.append((old_dir, old_info))

    tool_functions = Workflow.build_tool_functions(map_tool_to_runner, config)

    config_files = Workflow.expand_globs(config["input"])
    input_combos = Workflow.build_combinations(config_files)

    map_property_to_dir = collections.defaultdict(list)

    for toolf, input_files in itertools.product(tool_functions, input_combos):

        # create a copy of the task function and give it its unique name
        # by mangling it with the input_files
        taskf = copy.copy(toolf)
        taskf.register_input(input_files)
        result_dir = taskf.__name__ + ".dir"
        new_data.append((result_dir, taskf))
        for a, x, y in IOTools.nested_iter(taskf.input_files):
            map_property_to_dir[(x, y)].append(result_dir)
        map_property_to_dir[("name", taskf.name)].append(result_dir)
        for x, y in list(taskf._option_dict.items()):
            map_property_to_dir[(x, y)].append(result_dir)

    # match by input_files
    options.stdout.write("\t".join(("old", "new", "matching")) + "\n")

    for old_dir, old_info in old_data:
        targets = []
        for a, x, y in IOTools.nested_iter(old_info["input_files"]):
            if (x, y) in map_property_to_dir:
                targets.extend(map_property_to_dir[(x, y)])
        for x, y in list(old_info.items()):
            try:
                targets.extend(map_property_to_dir[(x, y)])
            except TypeError:
                # skip values that are unhashable (e.g. lists) and thus
                # can not be used as dictionary keys
                pass

        counts = collections.Counter(targets)
        if not counts:
            E.warn("no match found for {}, ignored".format(old_dir))
            continue
        max_count = max(counts.values())
        max_count_items = [
            x for x, y in list(counts.items()) if y == max_count
        ]

        if len(max_count_items) > 1:
            E.warn("multiple matches for {}, ignored".format(old_dir))
            continue

        new_dir = max_count_items[0]

        options.stdout.write("\t".join(map(str, (old_dir, new_dir,
                                                 max_count))) + "\n")

        if os.path.exists(new_dir):
            raise ValueError("directory {} already exists".format(new_dir))

        if options.dry_run:
            continue

        if options.link:
            os.symlink(old_dir, new_dir)
        else:
            os.rename(old_dir, new_dir)

    E.stop()
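
The core of the matching above is a simple voting scheme: every candidate directory is indexed by (property, value) pairs, and an old directory is assigned to the candidate that shares the most pairs with it. A self-contained sketch of that idea, using hypothetical data instead of the script's tool/input metadata:

import collections

# index candidate directories by (property, value) pairs -- hypothetical data
map_property_to_dir = collections.defaultdict(list)
map_property_to_dir[("input", "sampleA.bam")].extend(["toolA.dir", "toolB.dir"])
map_property_to_dir[("name", "toolA")].append("toolA.dir")

# properties recorded for one old result directory
old_properties = [("input", "sampleA.bam"), ("name", "toolA")]

# collect one "vote" per shared (property, value) pair
targets = []
for key in old_properties:
    targets.extend(map_property_to_dir.get(key, []))

counts = collections.Counter(targets)   # toolA.dir: 2, toolB.dir: 1
max_count = max(counts.values())
best = [d for d, c in counts.items() if c == max_count]
assert best == ["toolA.dir"]
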
Example #2
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r",
        "--restrict-regex",
        dest="restrict_regex",
        action="append",
        help="pattern to restrict tests to certain tools/metrics. "
        "Can be specified multiple times [%default]")

    parser.add_option(
        "--data-directory",
        dest="data_directory",
        help="directory with sample data sets. This will override the default "
        "datadir in the configuration file and the environment variable "
        "DAISY_TEST_DATADIR [%default]")

    parser.add_option(
        "--library-directory",
        dest="library_directories",
        action="append",
        help="directory with TaskLibrary functions. Will be added to the "
        "built-in library and the one specified in the DAISY_TASKLIBRARY "
        "environment variable [%default]")

    parser.add_option("--always-mount",
                      dest="always_mount",
                      action="store_true",
                      help="force mounting of arvados keep [%default]")

    parser.add_option("--keep-failed-temp",
                      dest="keep_failed_temp",
                      action="store_true",
                      help="keep temporary files of failed tests [%default]")

    parser.set_defaults(
        restrict_regex=[],
        always_mount=False,
        data_directory=None,
        keep_failed_temp=False,
        library_directories=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    P.get_parameters()

    # load the built-in tests
    filenames = [
        os.path.join(os.path.dirname(os.path.dirname(__file__)), "TaskLibrary",
                     "test_task_library.yml")
    ]
    if "DAISY_TASKLIBRARY" in os.environ:
        filenames.append(
            os.path.join(os.environ["DAISY_TASKLIBRARY"],
                         "test_task_library.yml"))
    filenames.extend(options.library_directories)

    master_config = None
    for fn in filenames:
        if not os.path.exists(fn):
            E.warn("file {} does not exist".format(fn))
            continue
        with IOTools.open_file(fn) as inf:
            raw_txt = inf.read()
            test_config = yaml.safe_load(raw_txt)
            if test_config is None:
                E.warn("file {} is empty".format(fn))
                continue

            data_directory = os.environ.get("DAISY_TEST_DATADIR",
                                            test_config.get("data_directory"))

            if options.data_directory:
                data_directory = options.data_directory

            # reload config with placeholders replaced
            test_config = yaml.safe_load(
                re.sub("DATADIR", data_directory, raw_txt))
            if master_config is None:
                master_config = test_config
            else:
                # add additional tool/test metrics
                master_config["tool"].update(test_config.get("tool", {}))
                master_config["metric"].update(test_config.get("metric", {}))

    for test_section, testclass, map_name_to_runner in [
        ("tool", TestTool, map_tool_to_runner),
        ("metric", TestMetric, map_metric_to_runner)
    ]:

        ignore = master_config[test_section].get("ignore", [])
        # propagate config variables
        testclass.test_config = master_config

        for task, taskf in sorted(map_name_to_runner.items()):
            found = False
            for to_ignore in ignore:
                if re.match(to_ignore, task):
                    found = True
            if found:
                continue
            if options.restrict_regex:
                take = False
                for x in options.restrict_regex:
                    if re.search(x, task):
                        take = True
                if not take:
                    continue
            add_tests(task, taskf, testclass)

    failed = False
    with arvados_enabled(always_mount=options.always_mount):
        for testclass in [TestTool, TestMetric]:
            suite = unittest.TestLoader().loadTestsFromTestCase(testclass)
            result = unittest.TextTestRunner(verbosity=2).run(suite)
            failed |= not result.wasSuccessful()

            # remove all tests in test class - necessary if function is
            # called repeatedly
            clear_tests(testclass)

    E.stop()
    return failed
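
For context, a minimal sketch of the dynamic test registration this example relies on. The real add_tests() and clear_tests() belong to the test harness and are not shown here; the assumption below is only that they attach and remove test methods on the TestCase classes at runtime:

import unittest


class TestTool(unittest.TestCase):
    # the real harness also propagates the configuration via test_config
    test_config = None


def add_tests(task, taskf, testclass):
    # attach a test method named test_<task> to the class
    def test(self):
        self.assertTrue(callable(taskf))
    setattr(testclass, "test_{}".format(task), test)


def clear_tests(testclass):
    # drop the dynamically added test methods again
    for name in [x for x in dir(testclass)
                 if x.startswith("test_") and callable(getattr(testclass, x))]:
        delattr(testclass, name)


add_tests("example_tool", lambda: None, TestTool)
suite = unittest.TestLoader().loadTestsFromTestCase(TestTool)
result = unittest.TextTestRunner(verbosity=2).run(suite)
failed = not result.wasSuccessful()
clear_tests(TestTool)
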
Example #3
def main(argv):

    options, args = P.parse_commandline(argv)

    if options.config_file:
        PARAMS = P.get_parameters(options.config_file)
    else:
        sys.exit(P.main(options, args))

    with arvados_enabled(always_mount=options.always_mount):
        mountpoint = PARAMS.get("mount_point", None)
        if mountpoint:
            redirect_defaults2mountpoint(mountpoint)

        with LibraryContext(PARAMS, options, args, argv, "daisy"):
            # A selection of command line arguments is added to PARAMS
            # as 'extras'; these are not implemented in ruffus 2.6.3.
            kwargs = collections.defaultdict(dict)
            if options.only_info:
                kwargs["extras"].update({'only_info': True})
                P.PARAMS["only_info"] = True
            if options.is_test:
                kwargs["extras"].update({'is_test': True})
                P.PARAMS["is_test"] = True

            E.debug("construction of workflow started")
            pipeline = ruffus.Pipeline('benchmark')
            # Tool execution
            suffix, tool_runners = add_tools_to_pipeline(pipeline,
                                                         map_tool_to_runner,
                                                         config=P.PARAMS,
                                                         **kwargs)

            E.debug("added tools to workflow ")
            # Optionally, add externally computed files as
            # pseudo-tools:
            if "external" in P.PARAMS["setup"]:
                external_runners = add_external_data_to_pipeline(
                    pipeline, config=P.PARAMS, **kwargs)
                tool_runners.extend(external_runners)

            # Optionally, combine tool runs into aggregate
            # outputs. The type of the output is preserved
            # (VCF -> VCF, etc.)
            # For example, call individual members in a trio
            # and then build a combined VCF to analyse mendelian
            # inconsistencies.
            if "collate" in P.PARAMS["setup"]:
                collate_runners = add_collations_to_pipeline(
                    pipeline,
                    map_collate_to_runner,
                    P.PARAMS["setup"]["collate"],
                    tasks=tool_runners,
                    config=P.PARAMS)
                if P.PARAMS["setup"].get("only_collate", False):
                    tool_runners = []
                if P.PARAMS["setup"].get("no_collate_metrics", False):
                    collate_runners = []
                E.debug("added collators to workflow ")
            else:
                collate_runners = []

            # Optionally, split up the output before applying
            # additional analyses. The type of the output is preserved
            # (VCF -> VCF, etc).
            # For example, identify false positives, false negatives
            # and true positives and collect metrics individually.
            if "split" in P.PARAMS["setup"]:
                split_runners = add_splits_to_pipeline(
                    pipeline,
                    map_split_to_runner,
                    tool_runners,
                    P.PARAMS["setup"]["split"],
                    tasks=tool_runners,
                    config=P.PARAMS)
                if P.PARAMS["setup"].get("only_split", False):
                    tool_runners = []
                E.debug("added splitters to workflow ")
            else:
                split_runners = []

            metric_runners = []
            for prefix, r in zip(
                ["tool", "collate", "split"],
                [tool_runners, collate_runners, split_runners]):
                if not r:
                    continue

                metrics = None

                if prefix == "collate" and "collate_metrics" in P.PARAMS[
                        "setup"]:
                    metrics = P.PARAMS["setup"]["collate_metrics"]
                elif prefix == "split" and "split_metrics" in P.PARAMS["setup"]:
                    metrics = P.PARAMS["setup"]["split_metrics"]
                elif "metrics" in P.PARAMS["setup"]:
                    metrics = P.PARAMS["setup"]["metrics"]
                else:
                    raise KeyError(
                        "configuration file requires a 'setup:metrics' section"
                    )

                # Metric execution
                mm = add_metrics_to_pipeline(pipeline,
                                             metrics,
                                             map_metric_to_runner,
                                             r,
                                             suffix=suffix,
                                             prefix=prefix + "_",
                                             config=P.PARAMS,
                                             **kwargs)

                if len(mm) == 0:
                    raise ValueError(
                        "workflow construction error: "
                        "no metric tasks result for metrics {}".format(
                            metrics))

                metric_runners.extend(mm)
                E.debug("added {}_metrics to workflow".format(prefix))

            # add plot task
            if "aggregate" in P.PARAMS["setup"]:
                aggregate_metrics = add_collations_to_pipeline(
                    pipeline,
                    map_collate_to_runner,
                    P.PARAMS["setup"]["aggregate"],
                    metric_runners,
                    config=P.PARAMS)

                E.debug("added metric aggregation to workflow")
            else:
                aggregate_metrics = []

            add_upload_to_pipeline(pipeline,
                                   metric_runners + aggregate_metrics,
                                   P.PARAMS)
            E.debug("added upload to workflow".format(prefix))

            # add export task
            export = P.PARAMS["setup"].get("export",
                                           ["tools", "collate", "split"])
            map_export2runner = {
                "collate": collate_runners,
                "tools": tool_runners,
                "split": split_runners
            }

            export_runners = []
            for e in export:
                try:
                    export_runners.extend(map_export2runner[e])
                except KeyError:
                    raise KeyError("unknown export section: {}".format(e))

            add_export_to_pipeline(pipeline,
                                   export_runners,
                                   suffix=suffix,
                                   config=P.PARAMS)

            E.debug("added export to workflow")

            add_all_task_to_pipeline(pipeline,
                                     metric_runners + aggregate_metrics)

            # Collate output files to facilitate analysis
            if "collation" in P.PARAMS:
                collators = add_collations_to_pipeline(pipeline,
                                                       map_collate_to_runner,
                                                       P.PARAMS["collation"],
                                                       config=P.PARAMS)

            E.debug("construction of workflow completed")

    E.stop()
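
The helpers used above (add_tools_to_pipeline, add_metrics_to_pipeline and friends) belong to the benchmark framework and are not shown here. As a rough, hypothetical illustration of the underlying chaining -- tool tasks producing files that metric tasks consume -- a minimal stand-alone ruffus workflow (decorator style rather than the Pipeline object used above) might look like this:

import ruffus


@ruffus.originate(["sampleA.vcf", "sampleB.vcf"])
def run_tool(output_file):
    # stand-in for a tool runner writing its result file
    with open(output_file, "w") as outf:
        outf.write("dummy variant calls\n")


@ruffus.transform(run_tool, ruffus.suffix(".vcf"), ".metric.tsv")
def run_metric(input_file, output_file):
    # stand-in for a metric runner consuming the tool output
    with open(input_file) as inf, open(output_file, "w") as outf:
        outf.write("nlines\t{}\n".format(len(inf.readlines())))


if __name__ == "__main__":
    ruffus.pipeline_run([run_metric], verbose=1)
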