Example #1
    def download(filename):
        """
        Download each file
        """

        try:

            if (not options.overwrite) and out_store.exists(filename):
                # File exists. But make sure its size is correct.

                if not options.check_size:
                    # Skip existing file. No need to check the length.
                    RealtimeLogger.info("Skipped {}".format(filename))
                    return

                out_size = out_store.get_size(filename)
                in_size = in_store.get_size(filename)
                if out_size != in_size:
                    # Complain about size mismatch and copy
                    RealtimeLogger.warning(
                        "Redownloading {}! Size was {} and not {}!".format(
                            filename, out_size, in_size))
                else:
                    # Skip existing file
                    RealtimeLogger.info("Skipped {}".format(filename))
                    return

            # Make a temp file
            (handle,
             path) = tempfile.mkstemp(dir=job.fileStore.getLocalTempDir())
            os.close(handle)

            RealtimeLogger.debug("Download {}".format(filename))

            # Download
            in_store.read_input_file(filename, path)
            # Store
            out_store.write_output_file(path, filename)

            # Clean up
            os.unlink(path)

        except:
            # Put all exception text into an exception and raise that
            raise Exception("".join(
                traceback.format_exception(*sys.exc_info())))

        RealtimeLogger.info("Copied {}".format(filename))
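
Note that download is a nested helper: it closes over job, options, in_store, and out_store from its enclosing Toil job function. A minimal sketch of what that enclosing context might look like (the function name and the listing method are assumptions for illustration, not part of the original):

import os
import sys
import tempfile
import traceback

from toil.realtimeLogger import RealtimeLogger


def copy_all_files(job, options, in_store, out_store):
    """
    Hypothetical enclosing Toil job function; 'download' has to be defined
    inside a scope like this so it can close over job, options, and the
    two stores.
    """

    def download(filename):
        ...  # body as shown above

    for filename in in_store.list_input_directory(''):
        # Copy every file the input store reports. The listing method name is
        # an assumption about the store object's API.
        download(filename)
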
Example #2
    def call_with_docker(self, job, args, work_dir, outfile, errfile,
                         check_output, tool_name):
        """
        
        Thin wrapper for docker_call that will use internal lookup to
        figure out the location of the docker file.  Only exposes docker_call
        parameters used so far.  expect args as list of lists.  if (toplevel)
        list has size > 1, then piping interface used
        
        Does support redirecting output to outfile, unless check_output is
        used, in which case output is captured.
        
        """

        RealtimeLogger.info(
            truncate_msg("Docker Run: {}".format(" | ".join(" ".join(x)
                                                            for x in args))))
        start_time = timeit.default_timer()

        # we use the first argument to look up the tool in the docker map
        # but allow overriding of this with the tool_name parameter
        name = tool_name if tool_name is not None else args[0][0]
        tool = self.docker_tool_map[name]

        # We keep an environment dict
        environment = {}

        # And an entry point override
        entrypoint = None

        # And a volumes dict for mounting
        volumes = {}

        # And a working directory override
        working_dir = None

        # Setting TMPDIR breaks Rscript. TODO: investigate how general this actually is.
        if name != 'Rscript':
            # vg uses TMPDIR for temporary files.
            # This is particularly important for GCSA indexing, which makes massive files.
            # We default to keeping these in our working directory.
            environment['TMPDIR'] = '.'

        if name == 'Rscript':
            # The R Docker images sometimes default to installing packages in non-writable directories.
            # Make sure a writable directory that exists is used instead.
            environment['R_LIBS'] = '/tmp'

        if name == 'vg':
            environment['VG_FULL_TRACEBACK'] = '1'

        # ugly hack for platypus, as default container doesn't have executable in path
        if tool == 'quay.io/biocontainers/platypus-variant:0.8.1.1--htslib1.7_1' and \
           args[0][0] == 'Platypus.py':
            args[0][0] = '/usr/local/share/platypus-variant-0.8.1.1-1/Platypus.py'

        # Force all dockers to run sort in a consistent way
        environment['LC_ALL'] = 'C'

        # set our working directory map
        if work_dir is not None:
            volumes[os.path.abspath(work_dir)] = {
                'bind': '/data',
                'mode': 'rw'
            }
            working_dir = '/data'

        if outfile is not None:
            # We need to send output to a file object

            assert not check_output

            # We can't just redirect stdout of the container from the API, so
            # we do something more complicated.

            # Now we need to populate an FD that spits out the container output.
            output_fd = None

            # We may be able to use a FIFO, or we may need a network connection.
            # FIFO sharing between host and container only works on Linux.
            use_fifo = (platform.system() == 'Linux')

            if use_fifo:
                # On a Linux host we can just use a FIFO from the container to the host

                # Set up a FIFO to receive it
                fifo_dir = tempfile.mkdtemp()
                fifo_host_path = os.path.join(fifo_dir, 'stdout.fifo')
                os.mkfifo(fifo_host_path)

                # Mount the FIFO in the container.
                # The container doesn't actually have to have the mountpoint directory in its filesystem.
                volumes[fifo_dir] = {'bind': '/control', 'mode': 'rw'}

                # Redirect the command output by tacking on another pipeline stage
                parameters = args + [['dd', 'of=/control/stdout.fifo']]

                # Open the FIFO into nonblocking mode. See
                # <https://stackoverflow.com/a/5749687> and
                # <http://shallowsky.com/blog/programming/python-read-characters.html>
                output_fd = os.open(fifo_host_path,
                                    os.O_RDONLY | os.O_NONBLOCK)

            else:
                # On a Mac host we can't because of https://github.com/docker/for-mac/issues/483
                # We need to go over the network instead.

                # Open an IPv4 TCP socket, since we know Docker uses IPv4 only
                listen_sock = socket.socket(socket.AF_INET)
                # Bind it to an OS-selected port on all interfaces, since we can't determine the Docker interface
                # TODO: socket.INADDR_ANY ought to work here but is rejected for being an int.
                listen_sock.bind(('', 0))

                # Start listening
                listen_sock.listen(1)

                # Get the port we got given
                listen_port = listen_sock.getsockname()[1]

                # Generate a random security cookie. Since we can't really stop
                # Internet randos from connecting to our socket, we bail out on
                # any connection that doesn't start with this cookie and a newline.
                security_cookie = str(uuid.uuid4())

                # Redirect the command output to that port using Bash networking
                # Your Docker needs to be 18.03+ to support host.docker.internal
                # Your container needs to have bash with networking support
                parameters = args + [[
                    'bash', '-c',
                    'exec 3<>/dev/tcp/host.docker.internal/{}; cat <(echo {}) - >&3'
                    .format(listen_port, security_cookie)
                ]]

                RealtimeLogger.debug(
                    "Listening on port {} for output from Docker container".
                    format(listen_port))

                # We can't populate the FD until we accept, which we can't do
                # until the Docker comes up and is trying to connect.

            RealtimeLogger.debug("Final Docker command: {}".format(" | ".join(
                " ".join(x) for x in parameters)))

            # Start the container detached so we don't wait on it
            container = apiDockerCall(job,
                                      tool,
                                      parameters,
                                      volumes=volumes,
                                      working_dir=working_dir,
                                      entrypoint=entrypoint,
                                      environment=environment,
                                      detach=True)

            RealtimeLogger.debug("Asked for container {}".format(container.id))

            if not use_fifo:
                # Try and accept a connection from the container.
                # Make sure there's a timeout so we don't accept forever
                listen_sock.settimeout(10)

                for attempt in range(3):

                    connection_sock, remote_address = listen_sock.accept()

                    RealtimeLogger.info(
                        "Got connection from {}".format(remote_address))

                    # Set a 10 second timeout for the cookie
                    connection_sock.settimeout(10)

                    # Check the security cookie
                    received_cookie_and_newline = connection_sock.recv(
                        len(security_cookie) + 1)

                    if received_cookie_and_newline != security_cookie + "\n":
                        # Incorrect security cookie.
                        RealtimeLogger.warning(
                            "Received incorect security cookie message from {}"
                            .format(remote_address))
                        continue
                    else:
                        # This is the container we are looking for
                        # Go into nonblocking mode, which our read loop expects
                        # (it relies on select() and handles EAGAIN/EWOULDBLOCK)
                        connection_sock.setblocking(False)
                        # Set the FD
                        output_fd = connection_sock.fileno()
                        break

                if output_fd is None:
                    # We can't get ahold of the Docker in time
                    raise RuntimeError(
                        "Could not establish network connection for Docker output!"
                    )

            # If the Docker container goes badly enough, it may not even open
            # the other end of the connection. So we can't just wait for it to
            # EOF before checking on the Docker.

            # Now read ought to throw if there is no data. But
            # <https://stackoverflow.com/q/38843278> and some testing suggest
            # that this doesn't happen, and it just looks like EOF. So we will
            # watch out for that.

            try:
                # Prevent leaking FDs

                # If this is set, and there is no data in the pipe, decide that no data is coming
                last_chance = False
                # If this is set, we have seen data in the pipe, so the other
                # end must have opened it and will eventually close it if it
                # doesn't run forever.
                saw_data = False

                while True:
                    # While there still might be data in the pipe

                    # Select on the pipe with a timeout, so we don't spin
                    # constantly waiting for data. (output_fd is always set by
                    # this point; otherwise we would have raised above.)
                    can_read, can_write, had_error = select.select(
                        [output_fd], [], [output_fd], 10)

                    if len(can_read) > 0 or len(had_error) > 0:
                        # There is data available or something else weird about our FIFO.

                        try:
                            # Do a nonblocking read. Since we checked with select we never should get "" unless there's an EOF.
                            data = os.read(output_fd, 4096)

                            if data == "":
                                # We didn't throw and we got nothing, so it must be EOF.
                                RealtimeLogger.debug("Got EOF")
                                break

                        except OSError as err:
                            if err.errno in [errno.EAGAIN, errno.EWOULDBLOCK]:
                                # There is no data right now
                                data = None
                            else:
                                # Something else has gone wrong
                                raise err

                    else:
                        # There is no data available. Don't even try to read. Treat it as if a read refused to block.
                        data = None

                    if data is not None:
                        # Send our data to the outfile
                        outfile.write(data)
                        saw_data = True
                    elif not saw_data:
                        # We timed out and there has never been any data. Maybe the container has died/never started?

                        if last_chance:
                            # The container has been dead for a while and nothing has arrived yet. Assume no data is coming.
                            RealtimeLogger.warning(
                                "Giving up on output form container {}".format(
                                    container.id))
                            break

                        # Otherwise, check on it
                        container.reload()

                        if container.status not in [
                                'created', 'restarting', 'running', 'removing'
                        ]:
                            # The container has stopped. So what are we doing waiting around for it?

                            # Wait one last time for any lingering data to percolate through the FIFO
                            time.sleep(10)
                            last_chance = True
                            continue

            finally:
                # No matter what happens, close our end of the connection
                os.close(output_fd)

                if not use_fifo:
                    # Also close the listening socket
                    listen_sock.close()

            # Now our data is all sent.
            # Wait on the container and get its return code.
            return_code = container.wait()

            if use_fifo:
                # Clean up the FIFO files
                os.unlink(fifo_host_path)
                os.rmdir(fifo_dir)

        else:
            # No piping needed.

            if len(args) == 1:
                # split off first argument as entrypoint (so we can be oblivious as to whether
                # that happens by default)
                parameters = [] if len(args[0]) == 1 else args[0][1:]
                entrypoint = args[0][0]
            else:
                # can leave as is for piped interface which takes list of args lists
                # and doesn't worry about entrypoints since everything goes through bash -c
                # todo: check we have a bash entrypoint!
                parameters = args

            # Run the container and dump the logs if it fails.
            container = apiDockerCall(job,
                                      tool,
                                      parameters,
                                      volumes=volumes,
                                      working_dir=working_dir,
                                      entrypoint=entrypoint,
                                      environment=environment,
                                      detach=True)

            # Wait on the container and get its return code.
            return_code = container.wait()

        # When we get here, the container has been run, and stdout is either in the file object we sent it to or in the Docker logs.
        # stderr is always in the Docker logs.

        if return_code != 0:
            # What were we doing?
            command = " | ".join(" ".join(x) for x in args)

            # Dump logs
            RealtimeLogger.error(
                "Docker container for command {} failed with code {}".format(
                    command, return_code))
            RealtimeLogger.error("Dumping stderr...")
            for line in container.logs(stderr=True, stdout=False, stream=True):
                # Trim trailing \n
                RealtimeLogger.error(line[:-1])

            if not check_output and outfile is None:
                # Dump stdout as well, since it's not something the caller wanted as data
                RealtimeLogger.error("Dumping stdout...")
                for line in container.logs(stderr=False,
                                           stdout=True,
                                           stream=True):
                    # Trim trailing \n
                    RealtimeLogger.error(line[:-1])

            # Raise an error since the command did not succeed
            raise RuntimeError(
                "Docker container for command {} failed with code {}".format(
                    command, return_code))
        elif errfile:
            # user wants stderr even if no crash
            for line in container.logs(stderr=True, stdout=False, stream=True):
                errfile.write(line)

        if check_output:
            # We need to collect the output. We grab it from Docker's handy on-disk buffer.
            # TODO: Bad Things can happen if the container logs too much.
            captured_stdout = container.logs(stderr=False, stdout=True)

        end_time = timeit.default_timer()
        run_time = end_time - start_time
        RealtimeLogger.info("Successfully docker ran {} in {} seconds.".format(
            " | ".join(" ".join(x) for x in args), run_time))

        if outfile:
            outfile.flush()
            os.fsync(outfile.fileno())

        if check_output is True:
            return captured_stdout
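
A minimal usage sketch of the piping interface described in the docstring, assuming a Toil job and a work_dir already exist, that runner is an instance of the class this method belongs to, and that 'samtools' is registered in its docker_tool_map (file names and arguments are hypothetical):

import os

# Sort a BAM and pipe it through a second command inside Docker,
# streaming the container's stdout into a local file.
with open(os.path.join(work_dir, 'sorted.sam'), 'wb') as out:
    runner.call_with_docker(
        job,
        [['samtools', 'sort', 'reads.bam'],   # first stage of the pipe
         ['samtools', 'view', '-h', '-']],    # second stage reads from stdin
        work_dir,                             # mounted at /data in the container
        out,                                  # container stdout is streamed here
        None,                                 # errfile: file to receive stderr on success, or None
        False,                                # check_output must be False when outfile is given
        'samtools')                           # tool_name used for the image lookup
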
Example #3
def run_chunk_alignment(job, context, gam_input_reads, bam_input_reads, sample_name, interleaved, mapper,
                        chunk_filename_ids, chunk_id, indexes,
                        bam_output=False, gbwt_penalty=None, always_check_population=True, validate=False, fasta_dict_id=None):
    """
    Align a chunk of reads.
    
    Takes a dict from index type to index file ID. Some indexes are extra and
    specifying them will change mapping behavior.
    """
                        

    RealtimeLogger.info("Starting {} alignment on {} chunk {}".format(mapper, sample_name, chunk_id))

    # How long did the alignment take to run, in seconds?
    run_time = None
    
    # Define work directory for docker calls
    work_dir = job.fileStore.getLocalTempDir()

    # Download local input files from the remote storage container
    graph_file = os.path.join(work_dir, "graph.vg")

    # Work out what index files we need
    index_files = {}
    index_files['xg'] = graph_file + ".xg"
    if mapper == 'map' or mapper == 'mpmap':
        index_files['gcsa'] = graph_file + ".gcsa"
        index_files['lcp'] = index_files['gcsa'] + ".lcp"
        
        if 'gbwt' in indexes:
            # We have a GBWT haplotype index available.
            index_files['gbwt'] = graph_file + ".gbwt"
            
    if mapper == 'mpmap':
        if 'snarls' in indexes:
            # mpmap knows how to use the snarls, and we have them, so we should use them
            
            # Note that passing them will affect mapping, if using multiple
            # tracebacks. Since we only run single path mode, if multiple
            # tracebacks aren't used, mpmap will ignore the snarls.
            index_files['snarls'] = graph_file + ".snarls"
        
    if mapper == 'giraffe':
        index_files['minimizer'] = graph_file + ".min"
        index_files['distance'] = graph_file + ".dist"
        index_files['gbwt'] = graph_file + ".gbwt"
        if 'ggbwt' in indexes:
            index_files['ggbwt'] = graph_file + ".gg"
        
    for index_type in list(index_files.keys()):
        # Download each index file
        job.fileStore.readGlobalFile(indexes[index_type], index_files[index_type])
    
    # We need the sample reads (fastq(s) or gam) for alignment
    reads_files = []
    reads_ext = 'gam' if gam_input_reads else 'bam' if bam_input_reads else 'fq.gz'
    for j, chunk_filename_id in enumerate(chunk_filename_ids):
        reads_file = os.path.join(work_dir, 'reads_chunk_{}_{}.{}'.format(chunk_id, j, reads_ext))
        job.fileStore.readGlobalFile(chunk_filename_id, reads_file)
        reads_files.append(reads_file)
    
    # And a temp file for our aligner output
    if bam_output is False:
        output_file = os.path.join(work_dir, "{}_{}.gam".format(sample_name, chunk_id))
    else:
        output_file = os.path.join(work_dir, "{}_{}.bam".format(sample_name, chunk_id))
    
    # Open the file stream for writing
    with open(output_file, 'wb') as alignment_file:

        # Start the aligner and have it write to the file

        # Plan out what to run
        vg_parts = []
        
        if mapper == 'mpmap':
            vg_parts += ['vg', 'mpmap']
            vg_parts += context.config.mpmap_opts
            if ('-F' not in vg_parts and '--output-fmt' not in vg_parts) or 'GAM' not in vg_parts:
                RealtimeLogger.warning('Adding --output-fmt GAM to mpmap options as only GAM output supported')
                vg_parts += ['--output-fmt', 'GAM']
        elif mapper == 'map':
            vg_parts += ['vg', 'map'] 
            vg_parts += context.config.map_opts
        elif mapper == 'giraffe':
            vg_parts += ['vg', 'giraffe'] 
            vg_parts += context.config.giraffe_opts
        else:
            raise RuntimeError('Unimplemented mapper "{}"'.format(mapper))
            
        for reads_file in reads_files:
            input_flag = '-G' if gam_input_reads else '-b' if bam_input_reads else '-f'
            vg_parts += [input_flag, os.path.basename(reads_file)]
        
        vg_parts += ['-t', str(context.config.alignment_cores)]
        vg_parts += ['-R', 'SM:{}'.format(sample_name)]
        
        # Override the -i flag in args with the --interleaved command-line flag
        if interleaved is True and '-i' not in vg_parts and '--interleaved' not in vg_parts:
            vg_parts += ['-i']
        elif interleaved is False and '-i' in vg_parts:
            del vg_parts[vg_parts.index('-i')]
        if interleaved is False and '--interleaved' in vg_parts:
            del vg_parts[vg_parts.index('--interleaved')]

        # Override the --surject-to option
        if bam_output is True and '--surject-to' not in vg_parts and mapper != 'giraffe':
            vg_parts += ['--surject-to', 'bam']
        elif bam_output is True and '--output-format' not in vg_parts and mapper == 'giraffe':
            vg_parts += ['--output-format', 'BAM']
        elif bam_output is False and '--surject-to' in vg_parts:
            sidx = vg_parts.index('--surject-to')
            del vg_parts[sidx]
            del vg_parts[sidx]

        # Turn indexes into options
        type_to_option = {
            'gbwt': '--gbwt-name',
            'xg': '-x',
            'gcsa': '-g',
            'lcp': None,
            'distance': '-d',
            'minimizer': '-m',
            'ggbwt': '--graph-name',
            'snarls': '--snarls'
        }
        for index_type, index_file in list(index_files.items()):
            if type_to_option[index_type] is not None:
                vg_parts += [type_to_option[index_type], os.path.basename(index_file)]

        if 'gbwt' in index_files:
            # We may have a GBWT recombination rate/penalty override
            if gbwt_penalty is not None:
                # We have a recombination penalty value to apply
                if '--recombination-penalty' in vg_parts:
                    # Make sure to strip out the penalty if it is in args already
                    sidx = vg_parts.index('--recombination-penalty')
                    del vg_parts[sidx]
                    del vg_parts[sidx]
                    
                # Both map and mpmap take this option
                vg_parts += ['--recombination-penalty', str(gbwt_penalty)]
                
            if mapper == 'mpmap' and always_check_population:
                # Always try to population-score even unambiguous reads
                # mpmap can do this
                vg_parts += ['--always-check-population']
        
        if fasta_dict_id is not None and bam_output is True:
            fasta_dict_file = os.path.join(work_dir, 'fasta.dict')
            job.fileStore.readGlobalFile(fasta_dict_id, fasta_dict_file)
            vg_parts += ['--ref-paths', os.path.basename(fasta_dict_file)]
            
        
        RealtimeLogger.info(
            "Running VG for {} against {}: {}".format(sample_name, graph_file,
            " ".join(vg_parts)))
        
        # Mark when we start the alignment
        start_time = timeit.default_timer()
        command = vg_parts
        try:
            context.runner.call(job, command, work_dir = work_dir, outfile=alignment_file)
            end_time = timeit.default_timer()
            if validate:
                alignment_file.flush()
                context.runner.call(job, ['vg', 'validate', os.path.basename(index_files['xg']),
                                          '--gam', os.path.basename(output_file)], work_dir = work_dir)
        except:
            # Dump everything we need to replicate the alignment
            end_time = timeit.default_timer()
            logging.error("Mapping failed. Dumping files.")
            for index_file in list(index_files.values()):
                context.write_output_file(job, index_file)
            for reads_file in reads_files:
                context.write_output_file(job, reads_file)
            raise
        
        # Mark when it's done
        run_time = end_time - start_time

    paired_end = '-i' in vg_parts or '--interleaved' in vg_parts or len(chunk_filename_ids) > 1
    RealtimeLogger.info("Aligned {}. Process took {} seconds with {} vg-{}".format(
        output_file, run_time, 'paired-end' if paired_end else 'single-end', mapper))

    if 'id_ranges' in indexes and bam_output is False:
        # We need to break the GAM into multiple chunks at the end, so we
        # need the file defining those chunks.
        id_ranges_file = os.path.join(work_dir, 'id_ranges.tsv')
        job.fileStore.readGlobalFile(indexes['id_ranges'], id_ranges_file)
        
        # Chunk the gam up by chromosome
        gam_chunks = split_gam_into_chroms(job, work_dir, context, index_files['xg'], id_ranges_file, output_file)
        
        # Write gam_chunks to store
        gam_chunk_ids = []
        for gam_chunk in gam_chunks:
            gam_chunk_ids.append(context.write_intermediate_file(job, gam_chunk))

        return gam_chunk_ids, run_time
    else:
        # We can just report one chunk of everything
        return [context.write_intermediate_file(job, output_file)], run_time
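
For reference, a job function like this is normally scheduled through Toil rather than called directly. A minimal sketch of a hypothetical parent job that fans out one child per read chunk (the sample name, mapper choice, and memory/disk values are placeholders; alignment_cores is the config attribute used in the code above):

def run_all_chunk_alignments(job, context, indexes, read_chunk_file_ids):
    """Hypothetical parent Toil job: align every chunk of reads as a child job."""
    results = []
    for chunk_id, chunk_filename_ids in enumerate(read_chunk_file_ids):
        child = job.addChildJobFn(run_chunk_alignment, context,
                                  False,        # gam_input_reads
                                  False,        # bam_input_reads
                                  'SAMPLE1',    # sample_name
                                  True,         # interleaved
                                  'giraffe',    # mapper
                                  chunk_filename_ids,
                                  chunk_id,
                                  indexes,
                                  bam_output=False,
                                  cores=context.config.alignment_cores,
                                  memory='16G', # Toil resource hints; values are placeholders
                                  disk='32G')
        # Each child returns (list of written output file IDs, run time in seconds).
        results.append(child.rv())
    return results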