Exemplo n.º 1
0
def join_statements(statements, infile, outfile=None):
    '''join a chain of statements into a single statement.

    Each statement contains an @IN@ or a @OUT@ placeholder or both.
    These will be replaced by the names of successive temporary files.

    In the first statement, @IN@ is replaced with `infile` and, if given,
    the @OUT@ is replaced by outfile in the last statement.

    Statements after the first may additionally use @IN-1@ and @IN-2@
    to refer to the temporary files created one and two steps earlier
    than the one @IN@ refers to. These are only substituted when such
    a temporary file exists (third and fourth statement onwards,
    respectively).

    All placeholders are substituted literally, so file names may
    contain characters that are special in regular expressions
    (such as ``.`` or a backslash).

    Arguments
    ---------
    statements : list
        A list of command line statements. Must not be empty.
    infile : string
        Filename of the first data set. If None, @IN@ is left
        untouched in the first statement.
    outfile : string
        Filename of the target data set.

    Returns
    -------
    last_file : string
        Filename of last file created, outfile, if given.
    statement : string
        A command line statement built from merging the statements
    cleanup : string
        A command line statement for cleaning up.

    Raises
    ------
    ValueError
        If `statements` is empty or if no temporary file prefix
        could be obtained.

    '''
    # materialize first so that emptiness can be checked before a
    # temporary file name is requested.
    statements = list(statements)
    if not statements:
        raise ValueError("join_statements requires at least one statement")

    prefix = get_temp_filename()
    # the prefix ends up in an "rm -f <prefix>*" command; an empty
    # prefix would turn that into "rm -f *", so check explicitly
    # instead of relying on an assert that is stripped under -O.
    if prefix == "":
        raise ValueError("empty prefix for temporary files")

    pattern = "%s_%%i" % prefix

    result = []
    for x, statement in enumerate(statements):
        s = statement
        if x == 0:
            if infile is not None:
                s = s.replace("@IN@", infile)
        else:
            # input of statement x is the output of statement x - 1
            s = s.replace("@IN@", pattern % x)
            if x > 2:
                s = s.replace("@IN-2@", pattern % (x - 2))
            if x > 1:
                s = s.replace("@IN-1@", pattern % (x - 1))

        s = s.replace("@OUT@", pattern % (x + 1)).strip()

        # statements are joined with "; " below, avoid doubling up
        if s.endswith(";"):
            s = s[:-1]
        result.append(s)

    result = "; ".join(result)
    last_file = pattern % len(statements)
    if outfile:
        # redirect the output of the final statement to the target file
        result = result.replace(last_file, outfile)
        last_file = outfile

    return last_file, result, "rm -f %s*" % prefix
Exemplo n.º 2
0
    def run(self, statement_list):
        '''run a list of statements through the array job interface.

        The statements are written, one per line, into a temporary
        ``.jobs`` file in the working directory. A master statement
        then uses awk to pick the line whose number equals
        ``$SGE_TASK_ID`` and evaluates it.

        Arguments
        ---------
        statement_list : list
            Command line statements, one per array task.

        Returns
        -------
        benchmark_data : list
            The items returned by :meth:`collect_benchmark_data`.

        '''
        jobs_path = get_temp_filename(
            dir=self.workingdir, clear=True) + ".jobs"

        with open(jobs_path, "w") as jobs_handle:
            jobs_handle.write("\n".join(statement_list))

        master_statement = (
            "CMD=$(awk \"NR==$SGE_TASK_ID\" {jobsfile}); "
            "eval $CMD").format(jobsfile=jobs_path)

        full_statement, job_path = self.build_job_script(master_statement)

        job_template = self.setup_job(self.options["cluster"])

        stdout_path, stderr_path = set_drmaa_job_paths(job_template, job_path)

        started = time.time()
        job_id, stdout, stderr, resource_usage = self.run_array_job(
            self.session,
            job_template,
            stdout_path,
            stderr_path,
            full_statement,
            start=0,
            end=len(statement_list),
            increment=1)
        finished = time.time()

        benchmark_data = list(
            self.collect_benchmark_data(stdout,
                                        statement_list,
                                        started,
                                        finished,
                                        resource_usage=resource_usage))

        # best effort removal of the jobs file
        try:
            os.unlink(jobs_path)
        except OSError:
            pass

        return benchmark_data
Exemplo n.º 3
0
def _pickle_args(args, kwargs):
    '''Pickle a set of function arguments.

    Keyword arguments that are options for job submission are removed
    from `kwargs` (the dictionary is modified in place) before the
    remaining call arguments are pickled to a temporary file.

    Arguments
    ---------
    args : list
        Positional arguments of the call.
    kwargs : dict
        Keyword arguments of the call. Submission options found here
        are moved into the returned `submit_args`.

    Returns
    -------
    submit_args : dict
        The keyword arguments to use for submission.
    args_file : string
        Name of the file containing the pickled ``[args, kwargs]``.

    '''

    use_args = [
        "to_cluster", "logfile", "job_options", "job_queue", "job_threads",
        "job_memory"
    ]

    submit_args = {}

    for arg in use_args:
        if arg in kwargs:
            submit_args[arg] = kwargs.pop(arg)

    args_file = get_temp_filename(shared=True)
    # close the handle explicitly so that the data is flushed to disk
    # before the file name is handed on.
    with open(args_file, "wb") as outf:
        pickle.dump([args, kwargs], outf)
    return (submit_args, args_file)
Exemplo n.º 4
0
    def build_job_script(self, statement):
        '''build a bash job script from statement.

        The script is written to a temporary ``.sh`` file in the
        working directory and made executable for user and group.
        It changes into the working directory, creates the output
        directories, sets up a scratch directory exported as TMPDIR,
        installs an EXIT trap that runs all clean-up functions,
        optionally restricts memory via ulimit and optionally logs
        debugging information to the shell log file, before running
        the expanded statement.

        Arguments
        ---------
        statement : string
            Command line statement to run.

        Returns
        -------
        statement : string
            The statement as passed in (not the expanded version).
        job_path : string
            Absolute path of the job script.

        '''
        script_name = get_temp_filename(dir=self.workingdir, clear=True)
        script_name += ".sh"

        scratch_dir = get_temp_dir(clear=True)

        expanded_statement, cleanup_funcs = self.expand_statement(statement)

        with open(script_name, "w") as script:
            emit = script.write

            # disabled: -l -O expand_aliases\n" )
            emit("#!/bin/bash -eu\n")
            if not self.ignore_pipe_errors:
                emit("set -o pipefail\n")

            # make executable
            os.chmod(script_name, stat.S_IRWXG | stat.S_IRWXU)

            emit("\ncd {}\n".format(self.workingdir))
            if self.output_directories is not None:
                for outdir in self.output_directories:
                    if not outdir:
                        continue
                    emit("\nmkdir -p {}\n".format(outdir))

            # create and set system scratch dir for temporary files
            emit("umask 002\n")
            emit("mkdir -p {}\n".format(scratch_dir))
            emit("export TMPDIR={}\n".format(scratch_dir))

            # the "info" function outputs times whenever the script
            # exits, preserving return status
            cleanup_funcs.extend([
                ("clean_temp", "{{ rm -rf {}; }}".format(scratch_dir)),
                ("info", "{ echo 'benchmark'; hostname; times; }"),
            ])
            for func_name, func_body in cleanup_funcs:
                emit("\n{}() {}\n".format(func_name, func_body))

            all_names = "; ".join(entry[0] for entry in cleanup_funcs)
            emit("\nclean_all() {{ {}; }}\n".format(all_names))

            emit("\ntrap clean_all EXIT\n\n")

            if self.job_memory != "unlimited":
                # restrict virtual memory
                # Note that there are resources in SGE which could do this
                # directly such as v_hmem.
                # Note that limiting resident set sizes (RSS) with ulimit is
                # not possible in newer kernels.
                # -v and -m accept memory in kb
                memory_bytes = IOTools.human2bytes(self.job_memory)
                requested_memory_kb = max(
                    1000,
                    int(math.ceil(memory_bytes / 1024 * self.job_threads)))
                # unsetting error exit as often not permissions
                emit("set +e\n")
                for flag in ("-v", "-m"):
                    emit("ulimit {} {} > /dev/null \n".format(
                        flag, requested_memory_kb))
                # NOTE(review): the original comment says "set as hard
                # limit", but no value is given here, so this looks like
                # it only queries the hard limit - confirm intent.
                emit("ulimit -H -v > /dev/null \n")
                emit("set -e\n")

            if self.shellfile:
                shell_log = self.shellfile

                # make sure path exists that we want to write to
                emit("mkdir -p $(dirname \"{}\")\n".format(shell_log))

                job_name = self.job_name

                # output low-level debugging information to a shell log file
                emit('echo "%s : START -> %s" >> %s\n' %
                     (job_name, script_name, shell_log))
                # logging of the statement itself is disabled because of
                # problems with quoting
                for template in (
                        "set | sed 's/^/%s : /' &>> %s\n",
                        "pwd | sed 's/^/%s : /' &>> %s\n",
                        "hostname | sed 's/^/%s: /' &>> %s\n",
                        "cat /proc/meminfo | sed 's/^/%s: /' &>> %s\n"):
                    emit(template % (job_name, shell_log))
                emit('echo "%s : END -> %s" >> %s\n' %
                     (job_name, script_name, shell_log))
                emit("ulimit | sed 's/^/%s: /' &>> %s\n" %
                     (job_name, shell_log))

            emit(expanded_statement)
            emit("\n\n")

        job_path = os.path.abspath(script_name)

        return statement, job_path
Exemplo n.º 5
0
    def expand_statement(self, statement):
        '''add generic commands before and after statement.

        The method scans the statement for arvados mount points
        (``arv=``) and inserts appropriate prefixes to make sure that
        the mount point exists. If the option ``job_condaenv`` is set,
        the ``bin`` directory of that conda environment is prepended
        to PATH.

        Arguments
        ---------
        statement : string
            Command line statement to expand

        Returns
        -------
        statement : string
            The expanded statement.
        cleanup_funcs : list
            List of tuples ``(function_name, shell_code)`` with
            clean-up functions required by the statement.

        Raises
        ------
        ValueError
            If the statement contains an arvados mount point, but
            ARVADOS_API_TOKEN or ARVADOS_API_HOST are not set in the
            environment.

        '''

        # create local scratch if it does not already exists. Note that
        # directory itself will be not deleted while its contents should
        # be cleaned up.
        setup_cmds = [
            "umask 002",
            "mkdir -p {}".format(PARAMS["tmpdir"]),
        ]
        teardown_cmds = []
        cleanup_funcs = []

        if "arv=" in statement:

            # Todo: permit setting this in params
            api_token = os.environ.get("ARVADOS_API_TOKEN", None)
            if not api_token:
                raise ValueError(
                    "arvados mount encountered in statement {}, "
                    "but ARVADOS_API_TOKEN not defined".format(statement))

            api_host = os.environ.get("ARVADOS_API_HOST", None)
            if not api_host:
                raise ValueError(
                    "arvados mount encountered in statement {}, "
                    "but ARVADOS_API_HOST not defined".format(statement))

            mountpoint = get_temp_filename(clear=True)

            mount_setup = "\n".join(
                ('export ARVADOS_API_TOKEN="{arvados_api_token}"',
                 'export ARVADOS_API_HOST="{arvados_api_host}"',
                 'export ARVADOS_API_HOST_INSECURE=true',
                 'export ARVADOS_MOUNT_POINT="{mountpoint}"',
                 'mkdir -p "{mountpoint}"',
                 'arv-mount {arvados_options} "{mountpoint}" 2>> /dev/null'
                 ))
            setup_cmds.append(mount_setup.format(
                arvados_api_token=api_token,
                arvados_api_host=api_host,
                mountpoint=mountpoint,
                arvados_options="--disable-event-listening --read-only"))

            # point all arvados references at the mount point
            statement = re.sub("arv=", mountpoint + "/", statement)

            # "arv-mount --unmount {mountpoint}" not available in newer
            # arvados installs (0.1.20170707152712), so keep using
            # fusermount. However, do not fail if you can't clean up, as
            # there are arvados racing issues.
            unmount_code = '''{{
                                  set +e &&
                                  fusermount -u {mountpoint} &&
                                  rm -rf {mountpoint} &&
                                  set -e
                                  }}'''.format(mountpoint=mountpoint)
            cleanup_funcs.append(("unmount_arvados", unmount_code))

        if "job_condaenv" in self.options:
            # In conda < 4.4 there is an issue with parallel activations,
            # see https://github.com/conda/conda/issues/2837 .
            # This has been fixed in conda 4.4, but we are on conda
            # 4.3, presumably because we are still on py35. A work-around
            # to source activate is to add the explicit path of the
            # environment. In version >= 4.4, "conda activate <env>" could
            # be used instead.
            # Note that the work-around will not work for tools that
            # require additional environment variables.
            conda_bin = os.path.join(
                get_conda_environment_directory(self.options["job_condaenv"]),
                "bin")
            setup_cmds.append("export PATH={}:$PATH".format(conda_bin))

        sections = (
            "\n".join(setup_cmds),
            statement,
            "\n".join(teardown_cmds),
        )
        statement = "\n".join(sections)

        return statement, cleanup_funcs