def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    tools = glob.glob(
        os.path.join(os.path.dirname(__file__), "..", "src", "daisy", "tools",
                     "*.py"))

    counter = E.Counter()
    for tool in tools:
        counter.found += 1
        tool_module = re.sub(r"\.py$", "", os.path.basename(tool))
        tool_name = re.sub("_", "-", tool_module)
        if tool_module in ("__init__", "cli"):
            counter.ignored += 1
            continue

        dest = os.path.join("tools", "{}.rst".format(tool_name))

        if os.path.exists(dest) and not options.output_force:
            counter.skipped += 1
            continue

        with IOTools.openFile(dest, "w") as outf:
            outf.write(TEMPLATE_TOOL.format(**locals()))

        counter.new += 1

    E.info(counter)
    E.stop()
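
# main() above renders TEMPLATE_TOOL for every tool module it finds, but the
# template itself is defined elsewhere in the daisy sources. The definition
# below is only a hedged sketch of what such an RST stub might look like,
# inferred from the names available at the format() call (tool_name,
# tool_module); the real template may differ.
TEMPLATE_TOOL = """\
.. _{tool_name}:

{tool_name}
=============================================================

.. automodule:: daisy.tools.{tool_module}
   :members:
"""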
    def run(self, outfile, params):

        prefix = IOTools.snip(outfile, ".vcf.gz")
        bams = resolve_argument(params.bam, ",")
        reference_fasta = get_reference(params)

        statements, gvcfs = [], []
        # TODO: sort out multi-threading
        for idx, bam in enumerate(bams.split(",")):
            output = prefix + "." + str(idx) + ".g.vcf"
            gvcfs.append(output)

            if os.path.exists(output):
                E.info("{} already exists - skipped".format(output))
                continue

            statements.append(
                "java "
                "-Djava.io.tmpdir=%(tmpdir)s "
                "-jar {self.path} "
                "--analysis_type HaplotypeCaller "
                "--input_file {bam} "
                "--reference_sequence {reference_fasta} "
                "--emitRefConfidence GVCF "
                "--logging_level INFO "
                "--log_to_file {prefix}.HaplotypeCaller.{idx}.log "
                "{params.haplotypecaller} "
                "--out {output} "
                ">& {prefix}.HaplotypeCaller.{idx}.err".format(**locals()))

        if statements:
            self.run_statements(statements, job_memory="4G")

        stmnts = []
        gvcfs = " ".join(["--variant {}".format(x) for x in gvcfs])
        vcf_output = prefix + ".raw.vcf.gz"
        stmnts.append("java "
                      "-Djava.io.tmpdir=%(tmpdir)s "
                      "-jar {self.path} "
                      "--analysis_type GenotypeGVCFs "
                      "--reference_sequence {reference_fasta} "
                      "{gvcfs} "
                      "--logging_level INFO "
                      "--log_to_file {prefix}.GenotypeGVCFs.log "
                      "{params.genotypegvcfs} "
                      "--out {vcf_output} "
                      ">& {prefix}.GenotypeGVCFs".format(**locals()))

        stmnts.extend(
            self.build_calibration_workflow(outfile, prefix, vcf_output,
                                            params))

        return self.run_statements(stmnts, job_memory="4G")
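
# The run() method above calls a resolve_argument() helper that is not part
# of this snippet. Judging only from its use (resolve_argument(params.bam, ",")
# feeding a comma-separated string into bams.split(",")), a minimal sketch
# could look like the following; the actual daisy helper may also expand
# globs or validate paths.
def resolve_argument(argument, sep=","):
    """return argument as a single sep-joined string."""
    if isinstance(argument, (list, tuple)):
        # list-valued parameters are joined with the requested separator
        return sep.join(map(str, argument))
    return str(argument)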
    def ignore_task(self, infiles, outfiles, params):
        """return True if task should be ignored.

        This method will also create the output file(s).
        """
        if self._ignore:
            m = str(outfiles)
            for ignore in IOTools.val2list(self._ignore):
                if ignore in m:
                    E.warn("task {} will be ignored".format(self.__name__))
                    for f in IOTools.val2list(outfiles):
                        E.info("creating empty file {}".format(f))
                        IOTools.touch_file(f)
                    return True
        return False
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    total_counter = E.Counter()
    table = []

    for section, map_task2runner in [("tool", map_tool_to_runner),
                                     ("metric", map_metric_to_runner),
                                     ("split", map_split_to_runner),
                                     ("collate", map_collate_to_runner)]:
        E.debug("processing section: {}".format(section))
        counter = E.Counter()

        for task, taskf in sorted(map_task2runner.items()):
            counter.ntasks += 1
            comments = []
            try:
                version = taskf().get_version()
                counter.version_ok += 1
            except Exception:
                version = ""
                comments.append("unavailable")
                counter.version_fail += 1

            comments = "; ".join(comments)
            table.append((section, task, version, comments))

        E.info("{}: {}".format(section, counter))
        total_counter += counter

    options.stdout.write("section\ttask\tversion\tcomments\n")
    for row in table:
        options.stdout.write("\t".join(map(str, row)) + "\n")

    E.info("{}: {}".format("total", counter))
    E.stop()
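
# The version table above only assumes that each runner factory in the
# map_*_to_runner dictionaries can be instantiated without arguments and
# exposes a get_version() method. A minimal stand-in that satisfies this
# protocol is sketched below for illustration; real daisy runners typically
# obtain the version by running the wrapped tool.
class DummyVersionRunner:
    """hypothetical runner exposing the interface used by main() above."""

    def get_version(self):
        # a real runner might shell out to "<tool> --version" here
        return "1.0.0"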
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-d",
        "--database",
        dest="databases",
        action="append",
        help="only show what will be done, don't do it [%default]")

    parser.add_option("-f",
                      "--filter",
                      dest="filter_method",
                      type="choice",
                      choices=("first", "last"),
                      help="only input a selection of results")

    parser.add_option("-t",
                      "--target",
                      dest="target_database",
                      type="string",
                      help="the target database [%default]")

    parser.set_defaults(filter_method=None, databases=[])

    (options, args) = E.start(parser, argv)

    run_id_offset = 0
    instance_id_offset = 0

    for database in options.databases:
        source_db = sqlite3.connect(database)
        is_instance = False
        is_run = False

        cc = source_db.cursor()
        min_run_id = cc.execute("SELECT MIN (id) FROM run").fetchall()[0][0]
        max_run_id = cc.execute("SELECT MAX (id) FROM run").fetchall()[0][0]
        max_instance_id = cc.execute(
            "SELECT MAX (id) FROM instance").fetchall()[0][0]

        E.info("{}: min_run_id={}, max_run_id={}, max_instance_id={}".format(
            database, min_run_id, max_run_id, max_instance_id))

        for line in source_db.iterdump():

            if line.startswith("CREATE TABLE"):
                try:
                    tablename = re.search("CREATE TABLE \"(\S+)\"",
                                          line).groups()[0]
                except AttributeError:
                    tablename = re.search("CREATE TABLE (\S+)",
                                          line).groups()[0]

                is_instance = False
                is_run = False
                if tablename == "run":
                    offset = run_id_offset
                    pos = "first"
                    is_run = True
                elif tablename == "tags":
                    offset = run_id_offset
                    pos = "first"
                elif tablename == "instance":
                    is_instance = True
                elif tablename == "tool_timings":
                    offset = instance_id_offset
                    pos = "last"
                elif tablename == "metric_timings":
                    offset = instance_id_offset
                    pos = "last"
                else:
                    # metric table
                    offset = instance_id_offset
                    pos = "last"

            elif line.startswith("INSERT INTO"):

                if is_instance:
                    i, n = re.search("VALUES\((\d+),(\d+),", line).groups()
                    if apply_run_filter(n, options.filter_method, min_run_id,
                                        max_run_id):
                        line = None
                    else:
                        line = re.sub(
                            r"VALUES\({},{},".format(i, n),
                            "VALUES({},{},".format(
                                int(i) + instance_id_offset,
                                int(n) + run_id_offset), line)
                else:
                    if pos == "last":
                        n = re.search(",(\d+)\)", line).groups()[0]
                        line = re.sub(",{}\)".format(n),
                                      ",{})".format(int(n) + offset), line)
                    elif pos == "first":
                        n = re.search("VALUES\((\d+),", line).groups()[0]
                        line = re.sub("VALUES\({},".format(n),
                                      "VALUES({},".format(int(n) + offset),
                                      line)
                        if is_run:
                            if apply_run_filter(n, options.filter_method,
                                                min_run_id, max_run_id):
                                line = None

            if line is not None:
                print(line)

        cc = source_db.cursor()
        run_id_offset += max_run_id
        instance_id_offset += max_instance_id

        E.info("{}: updated offsets to run_id={}, instance_id={}".format(
            database, run_id_offset, instance_id_offset))

    E.stop()
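
# The merge loop above filters runs with apply_run_filter(), which is not
# shown in this snippet. Given its call sites (a run id, the chosen filter
# method of "first" or "last", and the min/max run ids of the source
# database), a plausible sketch is the following; the actual daisy helper
# may differ.
def apply_run_filter(run_id, method, min_run_id, max_run_id):
    """return True if the row for run_id should be dropped."""
    if method is None:
        return False
    run_id = int(run_id)
    if method == "first":
        # keep only the first (lowest id) run of each database
        return run_id != int(min_run_id)
    if method == "last":
        # keep only the last (highest id) run of each database
        return run_id != int(max_run_id)
    return False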
def purge_run_id(run_id, url, dry_run=False, schemas=None):
    """remove a run from a database.
    """
    engine = sqlalchemy.create_engine(url)
    connection = engine.connect()

    # automap
    metadata = sqlalchemy.MetaData()
    metadata.reflect(engine)
    base = automap_base(metadata=metadata)
    base.prepare()

    if schemas is None:
        insp = reflection.Inspector.from_engine(engine)
        schemas = insp.get_schema_names()
        # note: default sqlite schema is "main"
        if 'public' in schemas:
            schemas.remove('public')
        if 'information_schema' in schemas:
            schemas.remove('information_schema')

    E.debug("getting instance_id list of run_id={}".format(run_id))
    instance_ids = set(get_instance_ids_for_run_id(run_id, engine))
    E.debug("found {} instances for run_id={}".format(len(instance_ids),
                                                      run_id))
    non_metric_tables = [
        'run', 'arvados_job', 'instance', 'binary_data', 'metric_timings',
        'tool_timings', 'metric_storage', 'tags'
    ]

    # delete from tables with field "instance_id"
    if instance_ids:
        for schema in schemas:
            # automap the schema
            metadata_schema = sqlalchemy.MetaData()
            metadata_schema.reflect(engine, schema=schema)
            base_schema = automap_base(metadata=metadata_schema)
            base_schema.prepare()
            for table_name in list(base_schema.metadata.tables.keys()):
                table = sqlalchemy.Table(table_name,
                                         metadata_schema,
                                         autoload=True)
                if "instance_id" not in table.c:
                    continue
                E.info("deleting data in {}".format(table_name))
                delete = table.delete().where(
                    table.c.instance_id.in_(instance_ids))
                # E.debug(delete)
                if not dry_run:
                    connection.execute(delete)

    # delete from tables with field "run_id"
    for table_name in base.metadata.tables.keys():
        table = sqlalchemy.Table(table_name, metadata, autoload=True)
        if "run_id" not in table.c:
            continue
        E.info("deleting data in {} for run_id {}".format(table_name, run_id))
        delete = table.delete().where(table.c.run_id == run_id)
        # E.debug(delete)
        if not dry_run:
            connection.execute(delete)

    table = sqlalchemy.Table('run', metadata, autoload=True)
    delete = table.delete().where(table.c.id == run_id)
    E.info("deleting data in 'run' for id {}".format(run_id))
    # E.debug(delete)
    if not dry_run:
        connection.execute(delete)
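
# purge_run_id() above depends on get_instance_ids_for_run_id(), which is not
# included in this snippet. It has to return the ids of all instances that
# belong to the given run; the sketch below is an assumption based on the
# table names used above (an "instance" table with a "run_id" column) and is
# not the actual daisy implementation.
def get_instance_ids_for_run_id(run_id, engine):
    """return a list of instance ids associated with run_id."""
    with engine.connect() as connection:
        result = connection.execute(
            sqlalchemy.text("SELECT id FROM instance WHERE run_id = :run_id"),
            {"run_id": run_id})
        return [row[0] for row in result]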
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-l", "--logfile", dest="logfile", type="string",
                      help="name of logfile [default=%default]")

    parser.add_option("-t", "--time", dest="time", type="choice",
                      choices=("seconds", "milliseconds"),
                      help="time to show [default=%default]")

    parser.add_option(
        "--no-reset", dest="reset", action="store_false",
        help="do not reset counters when a new pipeline run started "
        "The default is to reset so that only the counts from the latest "
        "pipeline execution are show "
        "[default=%default]")

    parser.add_option(
        "-f", "--filter-method", dest="filter", type="choice",
        choices=("unfinished", "running", "completed", "all"),
        help="apply filter to output [default=%default]")

    parser.add_option(
        "-s", "--sort-order", dest="sort_order", type="choice",
        choices=("object", "ncalls", "duration", "percall", "running"),
        help="apply filter to output [default=%default]")

    parser.add_option(
        "-i", "--ignore-errors", dest="ignore_errors", action="store_true",
        help="ignore errors [default=%default]")

    parser.set_defaults(sections=[],
                        logfile="pipeline.log",
                        filter="all",
                        reset=True,
                        sort_order="duration",
                        time="seconds")

    (options, args) = E.start(parser, argv)

    if options.sections:
        profile_sections = options.sections
    else:
        profile_sections = ("task", "job")

    counts = {}
    for section in profile_sections:
        counts[section] = collections.defaultdict(Counter)

    def line_grouper(filename):
        rx = re.compile(r"\d{4}-\d{2}-\d{2} ")
        with IOTools.open_file(filename) as infile:
            last_line = None
            for line in infile:
                line = line.strip()
                if not rx.match(line):
                    last_line = " ".join((last_line, line))
                else:
                    if last_line:
                        yield last_line
                    last_line = line
            yield last_line

    for line in line_grouper(options.logfile):

        data = line.split()
        if len(data) < 5:
            continue
        date, time, level, pipeline, source = data[:5]

        if re.search("output generated by", line):
            if options.reset:
                E.info("resetting counts at line=%s" % line[:-1])
                for section in profile_sections:
                    counts[section] = collections.defaultdict(Counter)
            continue

        # filter for log messages from task module
        if source != "task":
            continue

        dt = datetime.datetime.strptime(
            " ".join((date, time)), "%Y-%m-%d %H:%M:%S,%f")

        msg = "".join(data[5:])

        started_task, completed_task, started_job, completed_job = \
            (None, None, None, None)

        if re.search("task.log_at_level.\d+Task=(\S+)", msg):
            checked_task = re.search(
                "task.log_at_level.\d+Task=(\S+)", msg).groups()[0]
        elif re.search("Job=\[(\S+)->(\S+)\].*Missingfile[s]*\[(\S+)\]", msg):
            started_infiles, started_job, missing = re.search(
                "Job=\[(\S+)->(\S+)\].*Missingfile[s]*\[(\S+)\]", msg).groups()
        elif re.search("Job=\[(\S+)->(\S+)\].*Missingoutputfile", msg):
            started_infiles, started_job = re.search(
                "Job=\[(\S+)->(\S+)\].*Missingoutputfile", msg).groups()
        elif re.search("Job=\[(\S+)->(\S+)\].*Missingfile[s]*", msg):
            started_infiles, started_job = re.search(
                "Job=\[(\S+)->(\S+)\].*Missingfile[s]*", msg).groups()
        elif re.search("Taskentersqueue=(\S+)", msg):
            started_task = re.search("Taskentersqueue=(\S+)", msg).groups()[0]
        elif re.search("Job=\[(\S+)->(\S+)\]completed", msg):
            completed_infiles, completed_job = re.search(
                "Job=\[(\S+)->(\S+)\]completed", msg).groups()
        elif re.search("CompletedTask=(\S+)", msg):
            completed_task = re.search("CompletedTask=(\S+)", msg).groups()[0]
        elif re.search("UptodateTask=(\S+)", msg):
            completed_task = re.search("UptodateTask=(\S+)", msg).groups()[0]
        else:
            continue

        try:
            if started_task:
                counts["task"][(pipeline, started_task)].add(True, dt, started_task)
            elif completed_task:
                counts["task"][(pipeline, completed_task)].add(False, dt, completed_task)
            elif started_job:
                counts["job"][(pipeline, started_job)].add(True, dt, started_job)
            elif completed_job:
                counts["job"][(pipeline, completed_job)].add(False, dt, completed_job)
            else:
                raise ValueError("unknown action")
        except ValueError as msg:
            if not options.ignore_errors:
                raise ValueError(str(msg) + "\nat line %s" % line)

    def to_milliseconds(d):
        return d.seconds * 1000 + d.microseconds / 1000

    def to_seconds(d):
        return d.seconds + d.microseconds / 1000000

    if options.time == "milliseconds":
        f = to_milliseconds
    elif options.time == "seconds":
        f = to_seconds

    for section in profile_sections:
        running = []
        rows = []
        for objct, c in list(counts[section].items()):

            # apply filters
            if options.filter in ("unfinished", "running") and c.running == 0:
                continue

            d = f(c.duration)
            if c.calls > 0:
                percall = "%6.3f" % (d / float(c.calls))
            else:
                percall = "na"

            rows.append((section,
                         objct[0],
                         objct[1],
                         c.calls,
                         d,
                         percall,
                         c.running))
            running.extend([x for x, y in c._started.items() if y != 0])

        header = ("section", "pipeline", "object", "ncalls",
                  "duration", "percall", "running")

        options.stdout.write("\t".join((header)) + "\n")
        idx = header.index(options.sort_order)
        rows = sorted(rows, key=lambda x: x[idx])

        options.stdout.write("\n".join(
            ["\t".join(map(str, x)) for x in rows]) + "\n")

        options.stdout.write("#//\n\n")

        if running:
            options.stdout.write("# running %ss\n" % section)
            options.stdout.write("\n".join(map(str, running)) + "\n")
            options.stdout.write("#//\n\n")

    E.stop()
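
# The profiling main() above relies on a Counter class (not the one from
# collections) that pairs start and completion events and accumulates their
# wall-clock time. The class below is a hedged reconstruction based only on
# the attributes used above (add(), calls, duration, running, _started); the
# actual daisy class may differ. It assumes the datetime and collections
# modules imported by the surrounding script.
class Counter(object):
    """accumulate call counts and durations from start/stop events."""

    def __init__(self):
        self.calls = 0
        self.running = 0
        self.duration = datetime.timedelta()
        self._started = collections.defaultdict(int)
        self._start_time = {}

    def add(self, started, timestamp, name):
        if started:
            # a task/job entered the queue or was started
            self._started[name] += 1
            self._start_time[name] = timestamp
            self.running += 1
        else:
            # a completion without a matching start is reported as an error,
            # which main() above tolerates with --ignore-errors
            if self._started[name] == 0:
                raise ValueError(
                    "completion without start for {}".format(name))
            self._started[name] -= 1
            self.duration += timestamp - self._start_time[name]
            self.calls += 1
            self.running -= 1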