import contextlib


@contextlib.contextmanager
def run_on_cluster(to_cluster):
    """open a DRMAA session if *to_cluster* is set and make sure it is
    closed again when the block exits."""
    if to_cluster:
        P.start_session()
        try:
            yield
        finally:
            P.close_session()
    else:
        yield
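# Minimal usage sketch for run_on_cluster (the statement passed to
# P.run is a placeholder; P.run accepts a list of shell statements):
#
#   with run_on_cluster(to_cluster=True):
#       P.run(["md5sum input.txt > input.md5"])
#
# The session is opened before the block runs and is closed afterwards,
# even if P.run raises.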
def setUp(self):
    TestExecutionRunLocal.setUp(self)
    P.start_session()
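# A matching tearDown (a sketch, not part of the original excerpt; it
# assumes the base class also defines tearDown). Without it, the DRMAA
# session opened in setUp would leak from one test to the next.
def tearDown(self):
    P.close_session()
    TestExecutionRunLocal.tearDown(self)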
def main(argv=None):

    parser = get_option_parser()

    (options, args) = E.start(parser, add_cluster_options=True)

    if len(args) == 0:
        raise ValueError(
            "command line argument missing - see usage information")

    options.renumber_column = [x.split(":") for x in options.renumber_column]

    cmd = args[0]
    if len(args) > 1:
        cmd += " '" + "' '".join(args[1:]) + "'"

    if options.dry_run:
        cmd = re.sub("%DIR%", "", cmd)
        retcode = subprocess.call(cmd, shell=True,
                                  stdin=sys.stdin, stdout=sys.stdout,
                                  cwd=os.getcwd(),
                                  close_fds=True)
        E.stop()
        sys.exit(0)

    failed_requests = []
    started_requests = []
    niterations = 0

    P.get_parameters()
    P.start_session()

    if not options.collect:
        tmpdir = os.path.abspath(tempfile.mkdtemp(dir=options.tmpdir))

        E.info("working in directory %s" % tmpdir)

        # select a chunking strategy; note that the name ``args`` is
        # reused here for the chunker arguments
        if options.split_at_lines:
            chunk_iterator = chunk_iterator_lines
            args = (options.split_at_lines,)
        elif options.split_at_column:
            chunk_iterator = chunk_iterator_column
            args = (options.split_at_column - 1,
                    options.max_files)
        elif options.split_at_regex:
            chunk_iterator = chunk_iterator_regex_split
            args = (re.compile(options.split_at_regex),
                    0,
                    options.chunksize,
                    options.max_lines)
        elif options.group_by_regex:
            chunk_iterator = chunk_iterator_regex_group
            args = (re.compile(options.group_by_regex),
                    0,
                    options.chunksize)
        else:
            raise ValueError("please specify a way to chunk input data")

        data = [(x, cmd, options, None, options.subdirs)
                for x in chunk_iterator(options.stdin,
                                        args,
                                        prefix=tmpdir,
                                        use_header=options.input_header)]

        statements = [build_command(x) for x in data]
        started_requests = [(x[0], x[0] + ".out") for x in data]

        if len(data) == 0:
            E.warn("no data received")
            E.stop()
            sys.exit(0)

        P.run(statements)
    else:
        tmpdir = options.collect
        started_requests = [(x[:-4], x) for x in glob.glob(tmpdir + "/*.out")]

        E.info("collecting %i files from %s" % (len(started_requests),
                                                tmpdir))

    if failed_requests:
        for fn, cmd in failed_requests:
            E.error("failed request: filename= %s, cmd= %s" % (fn, cmd))
    else:
        E.info("building result from %i parts" % len(started_requests))

        if options.renumber:
            mapper = MapperLocal(pattern=options.renumber)
        else:
            mapper = MapperEmpty()

        # deal with stdout
        name = None
        index = None

        for pattern, column in options.renumber_column:
            if re.search(pattern, "stdout"):
                try:
                    index = int(column) - 1
                except ValueError:
                    name = column
                break

        if options.binary:
            ResultBuilderBinary()(started_requests, options.stdout, options)
        else:
            regex = None
            if options.output_regex_header:
                regex = re.compile(options.output_regex_header)
            ResultBuilder(mapper=mapper,
                          field_index=index,
                          field_name=name,
                          header_regex=regex)(started_requests,
                                              options.stdout,
                                              options)

        # deal with logfiles: combine them into a single file
        rr = re.search(r"'--log=(\S+)'", cmd) or \
            re.search(r"'--L\s+(\S+)'", cmd)
        if rr:
            E.info("logging output goes to %s" % rr.groups()[0])
            logfile = iotools.open_file(rr.groups()[0], "a")
            ResultBuilderLog()(
                [(x[0], "%s.log" % x[0]) for x in started_requests],
                logfile, options)
            logfile.close()

        # deal with other files
        if options.subdirs:

            files = glob.glob("%s/*.dir/*" % tmpdir)
            # remove directory
            filenames = set([os.path.basename(x) for x in files])
            # length of the ".out" suffix to strip
            xx = len(".out")

            for filename in filenames:

                _, filetype = os.path.splitext(filename)

                name = None
                index = None

                for pattern, column in options.renumber_column:
                    if re.search(pattern, filename):
                        try:
                            index = int(column) - 1
                        except ValueError:
                            name = column
                        break

                if options.binary:
                    builder = ResultBuilderBinary(mapper=mapper)
                elif filetype in (".fa", ".fasta"):
                    builder = ResultBuilderFasta(mapper=mapper)
                elif filetype in (".mali",):
                    builder = ResultBuilderFasta(mapper=MapperEmpty())
                elif filetype in (".png",):
                    builder = ResultBuilderCopies(mapper=mapper)
                else:
                    builder = ResultBuilder(mapper=mapper,
                                            field_index=index,
                                            field_name=name)

                E.debug("chose the following builder for %s: %s: %s" %
                        (filename, filetype, str(builder)))

                E.info("collecting results for %s" % filename)

                input_filenames = []
                for fi, fn in started_requests:
                    fn = fn[:-xx] + ".dir/" + filename
                    if os.path.exists(fn):
                        input_filenames.append((fi, fn))

                E.info("output of %i files goes to %s" %
                       (len(input_filenames), filename))

                outfile = iotools.open_file(
                    options.output_pattern % filename, "w")
                builder(input_filenames, outfile, options)
                outfile.close()

    if not options.debug and (not options.resume or not options.collect):
        if len(failed_requests) == 0:
            E.info("removing directory %s" % tmpdir)
            shutil.rmtree(tmpdir)
        else:
            E.info("directory %s not removed due to %i failed jobs" %
                   (tmpdir, len(failed_requests)))

    E.info("job control: nstarted=%i, nfinished=%i, nerrors=%i, nrepeats=%i" %
           (len(started_requests),
            len(started_requests) - len(failed_requests),
            len(failed_requests),
            niterations))

    E.stop()
def main(argv=sys.argv):

    # map task names ("tool_x", "metric_y", ...) to runner factories
    TASKS = {}
    for label, collection in [("tool", map_tool_to_runner),
                              ("metric", map_metric_to_runner),
                              ("collate", map_collate_to_runner),
                              ("split", map_split_to_runner)]:
        for key, f in list(collection.items()):
            k = "{}_{}".format(label, key)
            if k in TASKS:
                raise ValueError("duplicate keys in TASKS: {} {} {}".format(
                    k, TASKS[k], f))
            TASKS[k] = f

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--input-file", dest="input_files",
                      type="string", action="append",
                      help="input file. Can be used more than once "
                      "[%default]")

    parser.add_option("-s", "--input-slot", dest="input_slots",
                      type="string", action="append",
                      help="input slot. Must be used as often as "
                      "input_files for tools [%default]")

    parser.add_option("-o", "--output-file", dest="output_files",
                      type="string", action="append",
                      help="output file. Can be used more than once "
                      "[%default]")

    parser.add_option("-n", "--dry-run", dest="dry_run",
                      action="store_true",
                      help="show statement to be executed, do not execute "
                      "[%default]")

    parser.add_option("--engine", dest="engine",
                      type="choice",
                      choices=("local", "arvados"),
                      help="engine to use [%default]")

    parser.add_option("-t", "--task", dest="task",
                      type="choice",
                      choices=sorted(TASKS.keys()),
                      help="task to run [%default]")

    parser.add_option("-l", "--list-tasks", dest="list_tasks",
                      action="store_true",
                      help="list all available tasks and exit [%default]")

    parser.add_option("--always-mount", dest="always_mount",
                      action="store_true",
                      help="force mounting of arvados keep [%default]")

    parser.set_defaults(
        input_files=[],
        input_slots=[],
        output_files=[],
        engine="local",
        dry_run=False,
        task=None,
        always_mount=False,
    )

    (options, args) = E.start(parser, argv, add_cluster_options=True)

    if options.list_tasks:
        options.stdout.write("available_tasks\n{}\n".format(
            "\n".join(sorted(TASKS.keys()))))
        E.stop()
        return

    if len(options.input_files) == 0:
        raise ValueError("no input files specified, use --input-file")

    if len(options.output_files) == 0:
        raise ValueError("no output files specified, use --output-file")

    if options.task is None:
        raise ValueError("please specify a task to run (--task)")

    P.get_parameters()

    if options.engine == "arvados":
        # arvados support is currently disabled; the code below the
        # raise is unreachable and kept for reference only.
        raise ValueError("arvados support disabled")
        # crunch_json = Arvados.build_crunch_script(argv)
        crunch_json = None
        retval = E.run('arv-crunch-job --job="$(cat {})"'.format(crunch_json))
        if retval != 0:
            raise ValueError("error while executing")
        os.unlink(crunch_json)
        E.stop()
        return retval

    # start SGE session
    if not options.without_cluster:
        P.start_session()

    params = dict(parse_args(args))

    signal.signal(signal.SIGINT, cleanup)

    # redirect all mount points in parameters and input files
    mountpoint = redirect2mounts([params, options.input_files],
                                 always_mount=options.always_mount)
    mountpoint = redirect_defaults2mountpoint(mountpoint)

    # register mountpoint with pipeline
    P.PARAMS["mount_point"] = mountpoint
    P.PARAMS["dryrun"] = options.dry_run

    try:
        # instantiate task runner
        runner = TASKS[options.task](**params)

        if len(options.output_files) == 0:
            # fallback to a throwaway output file. Note that this branch
            # cannot trigger given the earlier check requiring at least
            # one output file.
            tmpfile = tempfile.NamedTemporaryFile(delete=False)
            os.unlink(tmpfile.name)
            options.output_files.append(tmpfile.name)

        if options.task.startswith("tool"):
            if len(options.input_slots) != len(options.input_files):
                raise ValueError(
                    "for tools, provide the same number of input slots as "
                    "there are input files (--input-slot)")
            input_files = dict(zip(options.input_slots,
                                   options.input_files))
            runner.register_input(input_files)
            # check if expected is in params
            runner(list(input_files.values()), options.output_files[0])
        elif options.task.startswith("metric"):
            runner(options.input_files[0], options.output_files[0])
        elif options.task.startswith("collate"):
            runner(options.input_files, options.output_files[0])
        elif options.task.startswith("split"):
            runner(options.input_files[0], options.output_files)

        # stop SGE session
        P.close_session()
    finally:
        cleanup()

    E.stop()
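# Example invocation (a sketch; the script name "run_task.py" and the
# task and file names are hypothetical, but the flags match the option
# definitions above):
#
#   python run_task.py --task metric_mymetric \
#       --input-file data.bam --output-file metric.tsv
#
# Tool tasks additionally require one --input-slot per --input-file,
# e.g.: --task tool_mytool --input-slot reference --input-file ref.fa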