def parseManifest(path_to_manifest):
    require(
        os.path.exists(path_to_manifest),
        "[parseManifest]Didn't find manifest file, looked "
        "{}".format(path_to_manifest))
    allowed_file_types = ["fq", "bam"]

    #allowed_file_types = ["fq-gzp", "fq", "fa-gzp", "fa", "f5-tar", "bam"]

    def parse_line(line):
        # double check input, shouldn't need to though
        require(not line.isspace() and not line.startswith("#"),
                "[parse_line]Invalid {}".format(line))
        sample = line.strip().split("\t")
        require(
            len(sample) == 4,
            "[parse_line]Invalid, len(line) != 4, offending {}".format(line))
        file_type, sample_url, sample_label, sample_filesize = sample
        # check the file_type and the URL
        require(file_type in allowed_file_types,
                "[parse_line]Unrecognized file type {}".format(file_type))
        require(
            urlparse(sample_url).scheme and urlparse(sample_url),
            "Invalid URL passed for {}".format(sample_url))
        return Sample(file_type=file_type,
                      URL=sample_url,
                      label=sample_label,
                      file_size=human2bytes(sample_filesize))

    with open(path_to_manifest, "r") as fH:
        return map(
            parse_line,
            [x for x in fH if (not x.isspace() and not x.startswith("#"))])
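
Every example in this collection leans on the same require helper from toil_lib. A minimal sketch of what it is assumed to do follows; the UserError type and the message padding are assumptions, not the library's actual implementation.

class UserError(Exception):
    pass


def require(expression, message):
    # Sketch only: fail fast with a readable message when a precondition
    # does not hold, mirroring how the examples below use it.
    if not expression:
        raise UserError('\n\n' + str(message) + '\n\n')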
Example #2
def joint_genotype_and_filter(job, gvcfs, config):
    """
    Checks for enough disk space for joint genotyping, then calls the genotype and filter pipeline function.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param dict gvcfs: Dictionary of GVCFs {Sample ID: FileStoreID}
    :param Namespace config: Input parameters and reference FileStoreIDs
        Requires the following config attributes:
        config.genome_fasta         FileStoreID for reference genome fasta file
        config.genome_fai           FileStoreID for reference genome fasta index file
        config.genome_dict          FileStoreID for reference genome sequence dictionary file
        config.available_disk       Total available disk space
    :returns: FileStoreID for the joint genotyped and filtered VCF file
    :rtype: str
    """
    # Get the total size of genome reference files
    genome_ref_size = config.genome_fasta.size + config.genome_fai.size + config.genome_dict.size

    # Require at least 2.5x the sum of the individual GVCF files
    cohort_size = sum(gvcf.size for gvcf in gvcfs.values())
    require(int(2.5 * cohort_size + genome_ref_size) < config.available_disk,
            'There is not enough disk space to joint '
            'genotype samples:\n{}'.format('\n'.join(gvcfs.keys())))

    job.fileStore.logToMaster('Merging cohort into a single GVCF file')

    return job.addChildJobFn(genotype_and_filter, gvcfs, config).rv()
Example #3
def parse_manifest(manifest_path):
    """
    Parse manifest file

    :param str manifest_path: Path to manifest file
    :return: samples
    :rtype: list[str, list]
    """
    samples = []
    with open(manifest_path, 'r') as f:
        for line in f:
            if not line.isspace() and not line.startswith('#'):
                sample = line.strip().split('\t')
                require(
                    2 <= len(sample) <= 3, 'Bad manifest format! '
                    'Expected UUID\tURL1\t[URL2] (tab separated), got: {}'.
                    format(sample))
                uuid = sample[0]
                urls = sample[1:]
                for url in urls:
                    require(
                        urlparse(url).scheme and urlparse(url),
                        'Invalid URL passed for {}'.format(url))
                samples.append([uuid, urls])
    return samples
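
For illustration, a manifest accepted by this parser could look like the commented lines below; the UUIDs, URLs, and file name are invented.

# Hypothetical manifest contents (tab separated): UUID, URL1, optional URL2.
#
#   sample-1    s3://my-bucket/sample-1_R1.fastq.gz    s3://my-bucket/sample-1_R2.fastq.gz
#   sample-2    s3://my-bucket/sample-2.fastq.gz
#
# parse_manifest would then return:
#   [['sample-1', ['s3://my-bucket/sample-1_R1.fastq.gz', 's3://my-bucket/sample-1_R2.fastq.gz']],
#    ['sample-2', ['s3://my-bucket/sample-2.fastq.gz']]]
samples = parse_manifest('manifest.tsv')  # file name is an assumption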
Example #4
def joint_genotype_and_filter(job, gvcfs, config):
    """
    Checks for enough disk space for joint genotyping, then calls the genotype and filter pipeline function.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param dict gvcfs: Dictionary of GVCFs {Sample ID: FileStoreID}
    :param Namespace config: Input parameters and reference FileStoreIDs
        Requires the following config attributes:
        config.genome_fasta         FileStoreID for reference genome fasta file
        config.genome_fai           FileStoreID for reference genome fasta index file
        config.genome_dict          FileStoreID for reference genome sequence dictionary file
        config.available_disk       Total available disk space
    :returns: FileStoreID for the joint genotyped and filtered VCF file
    :rtype: str
    """
    # Get the total size of genome reference files
    genome_ref_size = config.genome_fasta.size + config.genome_fai.size + config.genome_dict.size

    # Require at least 2.5x the sum of the individual GVCF files
    cohort_size = sum(gvcf.size for gvcf in gvcfs.values())
    require(
        int(2.5 * cohort_size + genome_ref_size) < config.available_disk,
        'There is not enough disk space to joint '
        'genotype samples:\n{}'.format('\n'.join(gvcfs.keys())))

    job.fileStore.logToMaster('Merging cohort into a single GVCF file')

    return job.addChildJobFn(genotype_and_filter, gvcfs, config).rv()
def marginAlignRootJobFunction(job, config, sample):
    def cull_sample_files():
        if sample.file_type == "fq":
            config["sample_FileStoreID"] = job.addChildJobFn(
                urlDownlodJobFunction, sample.URL, disk=sample.file_size).rv()
            return None
        elif sample.file_type == "bam":
            bwa_alignment_fid = job.addChildJobFn(urlDownlodJobFunction,
                                                  sample.URL,
                                                  disk=sample.file_size).rv()
            config["sample_FileStoreID"] = job.addChildJobFn(
                getFastqFromBam, sample, disk=(2 * sample.file_size)).rv()
            return bwa_alignment_fid
        else:
            raise RuntimeError(
                "[marginAlignRootJobFunction]Unsupported sample file type %s" %
                sample.file_type)

    # download/import the reference
    config["reference_FileStoreID"] = job.addChildJobFn(
        urlDownlodJobFunction, config["ref"], disk=config["ref_size"]).rv()

    # cull the sample, which can be a fastq or a BAM; alignment_fid will be None if we are doing BWA alignment
    alignment_fid = cull_sample_files()

    # checks if we're doing alignments or variant calling
    if config["realign"] or config["caller"]:
        # download the input model, if given. Fail if no model is given and we're performing HMM realignment without
        # doing EM
        if config["hmm_file"] is not None:
            config["input_hmm_FileStoreID"] = job.addChildJobFn(
                urlDownlodJobFunction, config["hmm_file"], disk="10M").rv()
        else:
            if config["realign"]:
                require(
                    config["EM"],
                    "[marginAlignRootJobFunction]Need to specify an input model or "
                    "set EM to True to perform HMM realignment")
            config["input_hmm_FileStoreID"] = None

        # initialize key in config for trained model if we're performing EM
        if config["EM"]:
            config["normalized_trained_model_FileStoreID"] = None

    config["sample_label"] = sample.label
    config["reference_label"] = config["ref"]

    job.fileStore.logToMaster("[run_tool]Processing sample:{}".format(
        config["sample_label"]))
    job.fileStore.logToMaster("[run_tool]Chaining   :{}".format(
        config["chain"]))
    job.fileStore.logToMaster("[run_tool]Realign    :{}".format(
        config["realign"]))
    job.fileStore.logToMaster("[run_tool]Caller     :{}".format(
        config["caller"]))
    job.fileStore.logToMaster("[run_tool]Stats      :{}".format(
        config["stats"]))

    job.addFollowOnJobFn(marginAlignJobFunction, config, alignment_fid)
Example #6
def signalAlignCheckInputJobFunction(job, config, sample):
    require(config["ref"], "[signalAlignCheckInputJobFunction]Missing reference URL")
    require(config["ledger_url"], "[signalAlignCheckInputJobFunction]Missing ledger URL")
    require(config["HMM_file"], "[signalAlignCheckInputJobFunction]Missing HMM file URL")
    require(config["HDP_file"], "[signalAlignCheckInputJobFunction]Missing HDP file URL")
    if config["degenerate"]:
        require(checkDegenerate(config["degenerate"]),
                "[signalAlignJobFunction]Degenerate %s not allowed" % config["degenerate"])
    job.addFollowOnJobFn(signalAlignRootJobFunction, config, sample)
Example #7
def kmer_dag(job,
             input_file,
             output_path,
             kmer_length,
             spark_conf,
             workers,
             cores,
             memory,
             sudo):
    '''
    Optionally launches a Spark cluster and then runs ADAM to count k-mers on an
    input file.

    :param job: Toil job
    :param input_file: URL/path to input file to count k-mers on
    :param output_path: URL/path to save k-mer counts at
    :param kmer_length: The length of k-mer substrings to count.
    :param spark_conf: Optional Spark configuration. If set, workers should \
    not be set.
    :param workers: Optional number of Spark workers to launch. If set, \
    spark_conf should not be set, and cores and memory should be set.
    :param cores: Number of cores per Spark worker. Must be set if workers is \
    set.
    :param memory: Amount of memory to provide to Spark workers. Must be set \
    if workers is set.
    :param sudo: Whether or not to run Spark containers with sudo.

    :type job: toil.Job
    :type input_file: string
    :type output_path: string
    :type kmer_length: int or string
    :type spark_conf: string or None
    :type workers: int or None
    :type cores: int or None
    :type memory: int or None
    :type sudo: boolean
    '''

    require((spark_conf is not None and workers is None) or
            (workers is not None and cores is not None and memory is not None and spark_conf is None),
            "Either worker count (--workers) must be defined or user must pass in Spark configuration (--spark-conf).")

    # if we do not have a spark configuration, then we must spawn a cluster
    if spark_conf is None:
        master_hostname = spawn_spark_cluster(job,
                                              sudo,
                                              workers,
                                              cores)
    else:
        # no cluster to spawn; the provided Spark configuration names the master
        master_hostname = None
        spark_conf = shlex.split(spark_conf)

    job.addChildJobFn(download_count_upload,
                      master_hostname,
                      input_file, output_path, kmer_length,
                      spark_conf, memory, sudo)
Example #8
def generate_file(file_path, generate_func):
    """
    Checks that the file does not already exist, generates it, and prints a confirmation message
    :param str file_path: File location to generate file
    :param function generate_func: Function used to generate file
    """
    require(not os.path.exists(file_path), file_path + ' already exists!')
    with open(file_path, 'w') as f:
        f.write(generate_func())
    print('\t{} has been generated in the current working directory.'.format(
        os.path.basename(file_path)))
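
A hedged usage sketch: generate_func can be any zero-argument callable that returns the file contents. The generator below and the output file name are stand-ins, not part of any pipeline.

import os


def generate_example_config():
    # Stand-in for a real generator such as the pipelines' generate_config.
    return 'output-dir: /tmp/output\nci-test: false\n'


generate_file(os.path.join(os.getcwd(), 'example-pipeline.config'),
              generate_example_config)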
Example #9
def _make_parameters(master_ip, default_parameters, memory, arguments,
                     override_parameters):
    """
    Makes a Spark Submit style job submission line.

    :param master_ip: The Spark leader IP address.
    :param default_parameters: Application specific Spark configuration parameters.
    :param memory: The memory to allocate to each Spark driver and executor.
    :param arguments: Arguments to pass to the submitted job.
    :param override_parameters: Parameters passed by the user, that override our defaults.
    
    :type master_ip: MasterAddress
    :type default_parameters: list of string
    :type arguments: list of string
    :type memory: int or None
    :type override_parameters: list of string or None
    """

    # exactly one of memory or override_parameters must be defined
    # (an exclusive-or, written out with and/or)
    require((override_parameters is not None or memory is not None) and
            (override_parameters is None or memory is None),
            "Either the memory setting must be defined or you must provide Spark configuration parameters.")

    # if the user hasn't provided overrides, set our defaults
    parameters = []
    if memory is not None:
        parameters = [
            "--conf",
            "spark.driver.memory=%sg" % memory, "--conf",
            "spark.executor.memory=%sg" % memory
        ]
    else:
        parameters.extend(override_parameters)

    if master_ip:
        parameters.extend([
            "--master",
            "spark://%s:%s" % (master_ip, SPARK_MASTER_PORT), "--conf",
            ("spark.hadoop.fs.default.name=hdfs://%s:%s" %
             (master_ip, HDFS_MASTER_PORT))
        ])

    # add the tool specific spark parameters
    parameters.extend(default_parameters)

    # spark submit expects a '--' to split the spark conf arguments from tool arguments
    parameters.append('--')

    # now add the tool arguments and return
    parameters.extend(arguments)

    return parameters
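
As a worked sketch of the ordering this produces, assume SPARK_MASTER_PORT is 7077 and HDFS_MASTER_PORT is 8020; the leader address, default parameters, and tool arguments below are invented.

# _make_parameters('10.0.0.5', ['--class', 'org.example.Tool'], 8,
#                  ['transform', 'in.adam', 'out.adam'], None)
# would return roughly:
#   ['--conf', 'spark.driver.memory=8g',
#    '--conf', 'spark.executor.memory=8g',
#    '--master', 'spark://10.0.0.5:7077',
#    '--conf', 'spark.hadoop.fs.default.name=hdfs://10.0.0.5:8020',
#    '--class', 'org.example.Tool',
#    '--',
#    'transform', 'in.adam', 'out.adam']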
def test_pipeline_output_with_graphs(tmpdir):
    uuid = "test_rnaseqsc_g"
    output_dir = os.path.join(str(tmpdir), uuid)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    output_file = os.path.join(output_dir, uuid + ".tar.gz")
    jobstore = os.path.join(str(tmpdir), uuid + "_jobstore")

    input = "file://" + _get_test_fastq_files(tmpdir, tarball=True)
    config = _generate_config(tmpdir, output_dir, generate_graphs=True)
    manifest = _generate_manifest(tmpdir, [[uuid, "pseudo", input]])

    subprocess.check_call([
        'toil-rnaseq-sc', 'run', '--config', config, '--manifest', manifest,
        '--maxCores', "1", jobstore
    ])
    # ensure file and directories exist
    require(os.path.isfile(output_file),
            "expected output file to exist: " + output_file)
    subprocess.check_call(['tar', '-xvf', output_file, '-C', output_dir])
    require(
        os.path.isfile(os.path.join(output_dir, uuid, "kallisto",
                                    "matrix.tsv")),
        "matrix.tsv file should exist in output tarball")
    require(os.path.isdir(os.path.join(output_dir, uuid, "kallisto", "plots")),
            "plots directory should exist in output tarball")
    require(
        len(os.listdir(os.path.join(output_dir, uuid, "kallisto", "plots"))) >
        0, "plots directory should not be empty in output tarball")
Example #11
def parse_samples(path_to_manifest):
    """
    Parses samples, specified in either a manifest or listed with --samples

    :param str path_to_manifest: Path to configuration file
    :return: Samples and their attributes as defined in the manifest
    :rtype: list[list]
    """
    samples = []
    with open(path_to_manifest, 'r') as f:
        for line in f.readlines():
            if line.isspace() or line.startswith('#'):
                continue
            sample = line.strip().split('\t')
            if len(sample) != 2:
                raise UserError(
                    'Bad manifest format! Expected 2 tab separated columns, got: {}'
                    .format(sample))

            # If a directory is passed in, use all samples in that directory
            uuid, url = sample
            if urlparse(url).scheme == '':
                url = [
                    'file://' + os.path.join(url, x) for x in os.listdir(url)
                ]
            # If url is a tarball
            elif url.endswith('tar.gz') or url.endswith('tar'):
                require(
                    urlparse(url).scheme in SCHEMES,
                    'URL "{}" not valid. Schemes:{}'.format(url, SCHEMES))
                url = [url]
            # If URL is a fastq or series of fastqs
            elif url.endswith('fastq.gz') or url.endswith(
                    'fastq') or url.endswith('fq.gz') or url.endswith('fq'):
                url = url.split(',')
                [
                    require(
                        urlparse(x).scheme in SCHEMES,
                        'URL "{}" not valid. Schemes:{}'.format(url, SCHEMES))
                    for x in url
                ]
            else:
                raise UserError(
                    'URL does not have approved extension: .tar.gz, .tar, .fastq.gz, .fastq, .fq.gz, .fq'
                )

            sample = [uuid, url]
            samples.append(sample)
    return samples
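
A hypothetical two-column manifest for this parser, showing the three URL shapes it accepts; every path and bucket name is invented.

# uuid <tab> url, one sample per line:
#
#   sample-A    s3://my-bucket/sample-A.tar.gz                            (tarball)
#   sample-B    /data/sample-B-fastqs                                     (local directory, expanded to file:// URLs)
#   sample-C    s3://my-bucket/C_R1.fq.gz,s3://my-bucket/C_R2.fq.gz       (comma separated fastq pair)
samples = parse_samples('manifest.tsv')  # file name is an assumption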
Example #12
def run_cutadapt(job, r1_id, r2_id, fwd_3pr_adapter, rev_3pr_adapter):
    """
    Adapter trimming for RNA-seq data

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq read 1
    :param str r2_id: FileStoreID of fastq read 2 (if paired data)
    :param str fwd_3pr_adapter: Adapter sequence for the forward 3' adapter
    :param str rev_3pr_adapter: Adapter sequence for the reverse 3' adapter (second fastq pair)
    :return: R1 and R2 FileStoreIDs
    :rtype: tuple
    """
    work_dir = job.fileStore.getLocalTempDir()
    if r2_id:
        require(rev_3pr_adapter,
                "Paired end data requires a reverse 3' adapter sequence.")
    # Retrieve files
    parameters = ['-a', fwd_3pr_adapter, '-m', '35']
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters.extend([
            '-A', rev_3pr_adapter, '-o', '/data/R1_cutadapt.fastq', '-p',
            '/data/R2_cutadapt.fastq', '/data/R1.fastq', '/data/R2.fastq'
        ])
    else:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        parameters.extend(['-o', '/data/R1_cutadapt.fastq', '/data/R1.fastq'])
    # Call: CutAdapt
    docker_call(
        job=job,
    docker_call(
        job=job,
        tool='quay.io/ucsc_cgl/cutadapt:1.9--6bd44edd2b8f8f17e25c5a268fedaab65fa851d2',
        work_dir=work_dir,
        parameters=parameters)
    # Write to fileStore
    if r1_id and r2_id:
        r1_cut_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'R1_cutadapt.fastq'))
        r2_cut_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'R2_cutadapt.fastq'))
    else:
        r1_cut_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'R1_cutadapt.fastq'))
        r2_cut_id = None
    return r1_cut_id, r2_cut_id
def test_quant_to_pseudo(tmpdir):
    # path resolution assumes this test is invoked from the root of the toil-rnaseq-sc directory
    input = os.path.abspath("testdata/input")
    output = os.path.join(str(tmpdir), "output")
    expected = os.path.abspath("testdata/expected")
    os.mkdir(output)
    quant_to_pseudo(job=None, input_dir=input, output_dir=output)
    filenames = os.listdir(expected)
    for file in filenames:
        with open(os.path.join(expected, file)) as expected_file, open(
                os.path.join(output, file)) as output_file:
            expected_read = expected_file.read()
            output_read = output_file.read()
            require(
                expected_read == output_read,
                "expected {} did not match actual {}".format(
                    expected_read, output_read))
Example #14
def parseManifest(path_to_manifest):
    require(os.path.exists(path_to_manifest), "[parseManifest]Didn't find manifest file, looked "
            "{}".format(path_to_manifest))

    def parse_line(line):
        # double check input, shouldn't need to though
        require(not line.isspace() and not line.startswith("#"), "[parse_line]Invalid {}".format(line))
        sample_line = line.strip().split("\t")
        require(len(sample_line) == 3, "[parse_line]Invalid, len(line) != 3, offending {}".format(line))
        url, sample_label, size = sample_line
        # check alignment URL
        require(urlparse(url).scheme and urlparse(url), "Invalid URL passed for {}".format(url))

        return SignalAlignSample(URL=url, size=size, sample_label=sample_label)

    with open(path_to_manifest, "r") as fH:
        return map(parse_line, [x for x in fH if (not x.isspace() and not x.startswith("#"))])
Example #15
def kmer_dag(job, input_file, output_path, kmer_length, spark_conf, workers,
             cores, memory, sudo):
    '''
    Optionally launches a Spark cluster and then runs ADAM to count k-mers on an
    input file.

    :param job: Toil job
    :param input_file: URL/path to input file to count k-mers on
    :param output_path: URL/path to save k-mer counts at
    :param kmer_length: The length of k-mer substrings to count.
    :param spark_conf: Optional Spark configuration. If set, workers should \
    not be set.
    :param workers: Optional number of Spark workers to launch. If set, \
    spark_conf should not be set, and cores and memory should be set.
    :param cores: Number of cores per Spark worker. Must be set if workers is \
    set.
    :param memory: Amount of memory to provide to Spark workers. Must be set \
    if workers is set.
    :param sudo: Whether or not to run Spark containers with sudo.

    :type job: toil.Job
    :type input_file: string
    :type output_path: string
    :type kmer_length: int or string
    :type spark_conf: string or None
    :type workers: int or None
    :type cores: int or None
    :type memory: int or None
    :type sudo: boolean
    '''

    require((spark_conf is not None and workers is None) or (
        workers is not None and cores is not None and memory is not None
        and spark_conf is None
    ), "Either worker count (--workers) must be defined or user must pass in Spark configuration (--spark-conf)."
            )

    # if we do not have a spark configuration, then we must spawn a cluster
    if spark_conf is None:
        master_hostname = spawn_spark_cluster(job, sudo, workers, cores)
    else:
        # no cluster to spawn; the provided Spark configuration names the master
        master_hostname = None
        spark_conf = shlex.split(spark_conf)

    job.addChildJobFn(download_count_upload, master_hostname, input_file,
                      output_path, kmer_length, spark_conf, memory, sudo)
Example #16
def makeNanoporeReadLedgerJobFunction(job, tar_fid, batchsize, readstore_dir):
    workdir        = job.fileStore.getLocalTempDir()
    minion_archive = job.fileStore.readGlobalFile(tar_fid)
    tar_handle     = tarfile.open(minion_archive, "r:gz")
    members        = tar_handle.getmembers()
    member_paths   = [os.path.join(workdir, m.name) for m in members]
    tar_handle.extractall(path=workdir)
    require(batchsize <= len(member_paths),
            "[makeNanoporeReadLedgerJobFunction]Cannot split %s members into batches of %s"
            % (len(member_paths), batchsize))

    member_iter   = [member_paths[i:i + batchsize]
                     for i in range(0, len(member_paths), batchsize)]
    tar_fids      = [archiveBatchAndUploadToFileStore(job, b, workdir) for b in member_iter]
    ledger_shards = [job.addChildJobFn(makeNanoporeReadsJobFunction, fid, readstore_dir, cores=0.5).rv()
                     for fid in tar_fids]
    tar_handle.close()
    return job.addFollowOnJobFn(consolidateLedgerShardsJobFunction, ledger_shards).rv()
Example #17
def s3am_upload(fpath, s3_dir, num_cores=1, s3_key_path=None):
    """
    Uploads a file to s3 via S3AM
    S3AM binary must be on the PATH to use this function
    For SSE-C encryption: provide a path to a 32-byte file

    :param str fpath: Path to file to upload
    :param str s3_dir: Output S3 path. Format: s3://bucket/[directory]
    :param int num_cores: Number of cores to use for up/download with S3AM
    :param str s3_key_path: (OPTIONAL) Path to 32-byte key to be used for SSE-C encryption
    """
    require(s3_dir.startswith('s3://'),
            'Format of s3_dir (s3://) is incorrect: {}'.format(s3_dir))
    s3_dir = os.path.join(s3_dir, os.path.basename(fpath))
    _s3am_with_retry(num_cores,
                     file_path=fpath,
                     s3_url=s3_dir,
                     mode='upload',
                     s3_key_path=s3_key_path)
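
A hedged usage sketch; the local path, bucket, and key file below are placeholders, and the S3AM binary must already be on the PATH as the docstring notes.

s3am_upload(fpath='/tmp/results/sample-1.vcf',
            s3_dir='s3://my-output-bucket/vcfs',
            num_cores=4,
            s3_key_path='/etc/keys/sse-c.key')  # omit for unencrypted uploads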
Example #18
def processReferenceSequence(ref_seq,
                             workdir,
                             motif_key=None,
                             sub_char="X",
                             parent_job=None):
    # make the forward and backward sequences, substituting the necessary motifs
    if motif_key is not None:
        motif, ok = getMotif(motif_key, ref_seq)
        require(
            ok,
            "[processReferenceSequence]Illegal motif_key given %s" % motif_key)
        if parent_job is not None:
            parent_job.fileStore.logToMaster(
                "[processReferenceSequence]Made %s substitutions" %
                motif.substitutionPositionCount())
        try:
            fw_refseq = motif.forwardSubstitutedSequence(sub_char)
            bw_refseq = motif.complementSubstitutedSequence(sub_char)
        except AssertionError:
            return None, None, False
    else:
        fw_refseq = ref_seq.upper()
        bw_refseq = _reverseComplement(fw_refseq,
                                       reverse=False,
                                       complement=True)

    fw_refseqfile = LocalFile(workdir=workdir)
    bw_refseqfile = LocalFile(workdir=workdir)
    sequences = [fw_refseq, bw_refseq]
    sequence_files = [fw_refseqfile, bw_refseqfile]

    for f, s in zip(sequence_files, sequences):
        _h = open(f.fullpathGetter(), "w")
        _h.write(s + "\n")
        _h.close()

    [
        require(os.path.exists(f.fullpathGetter()),
                "[processReferenceSequence]Missing %s" % f.filenameGetter())
        for f in sequence_files
    ]

    return fw_refseqfile, bw_refseqfile, True
def parse_manifest(path_to_manifest):
    """
    Parses samples, specified in either a manifest or listed with --samples

    :param str path_to_manifest: Path to configuration file
    :return: Samples and their attributes as defined in the manifest
    :rtype: list[list]
    """
    samples = []
    with open(path_to_manifest, 'r') as f:
        for line in f.readlines():
            if not line.isspace() and not line.startswith('#'):
                sample = line.strip().split('\t')
                require(len(sample) == 3, 'Bad manifest format! '
                                          'Expected 3 tab separated columns, got: {}'.format(sample))
                uuid, normal, tumor = sample
                for url in [normal, tumor]:
                    require(urlparse(url).scheme and urlparse(url), 'Invalid URL passed for {}'.format(url))
                samples.append(sample)
    return samples
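
An illustrative manifest for this parser; the UUID and BAM URLs are made up.

# Three tab separated columns per line: UUID, normal BAM URL, tumor BAM URL.
#
#   patient-1    s3://my-bucket/patient-1-normal.bam    s3://my-bucket/patient-1-tumor.bam
samples = parse_manifest('manifest-toil-exome.tsv')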
Example #20
def download_sample(job, sample, config):
    """
    Download sample and store unique attributes

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param list(str, str, str, str) sample: Sample information: filetype, paired/unpaired, UUID, and URL
    :param Namespace config: Argparse Namespace object containing argument inputs
    """
    # Create copy of config that is sample specific
    config = argparse.Namespace(**vars(config))
    config.file_type, config.paired, config.uuid, config.url = sample
    config.paired = True if config.paired == 'paired' else False
    config.cores = min(config.maxCores, multiprocessing.cpu_count())
    disk = '2G' if config.ci_test else '20G'
    job.fileStore.logToMaster(
        'UUID: {}\nURL: {}\nPaired: {}\nFile Type: {}\nCores: {}\nCIMode: {}'.
        format(config.uuid, config.url, config.paired, config.file_type,
               config.cores, config.ci_test))
    # Download or locate local file and place in the jobStore
    tar_id, fastq_ids = None, None
    if config.file_type == 'tar':
        tar_id = job.addChildJobFn(download_url_job,
                                   config.url,
                                   cghub_key_path=config.gtkey,
                                   s3_key_path=config.ssec,
                                   disk=disk).rv()
    else:
        # collect a FileStoreID promise for each fastq URL
        fastq_ids = []
        urls = config.url.split(',')
        if config.paired:
            require(
                len(urls) % 2 == 0,
                'Fastq pairs must have multiples of 2 URLS separated by comma')
        config.gz = True if urls[0].endswith('gz') else None
        for url in urls:
            fastq_ids.append(
                job.addChildJobFn(download_url_job,
                                  url,
                                  cghub_key_path=config.gtkey,
                                  s3_key_path=config.ssec,
                                  disk=disk).rv())
    job.addFollowOnJobFn(preprocessing_declaration, config, tar_id, fastq_ids)
Example #21
def parse_manifest(manifest_path):
    """
    Parse manifest file

    :param str manifest_path: Path to manifest file
    :return: samples
    :rtype: list[str, list]
    """
    samples = []
    with open(manifest_path, 'r') as f:
        for line in f:
            if not line.isspace() and not line.startswith('#'):
                sample = line.strip().split('\t')
                require(2 <= len(sample) <= 3, 'Bad manifest format! '
                                               'Expected UUID\tURL1\t[URL2] (tab separated), got: {}'.format(sample))
                uuid = sample[0]
                urls = sample[1:]
                for url in urls:
                    require(urlparse(url).scheme and urlparse(url), 'Invalid URL passed for {}'.format(url))
                samples.append([uuid, urls])
    return samples
Example #22
    def parse_line(line):
        # double check input, shouldn't need to though
        require(not line.isspace() and not line.startswith("#"), "[parse_line]Invalid {}".format(line))
        sample_line = line.strip().split("\t")
        require(len(sample_line) == 4, "[parse_line]Invalid, len(line) != 4, offending {}".format(line))
        filetype, url, sample_label, size = sample_line
        # checks:
        # check filetype
        require(filetype in allowed_file_types, "[parse_line]Unrecognized file type {}".format(filetype))
        # check URL
        require(urlparse(url).scheme and urlparse(url),
                "Invalid URL passed for {}".format(url))

        return ReadstoreSample(file_type=filetype, URL=url, size=human2bytes(size), sample_label=sample_label)
Example #23
def parseManifestReadstore(path_to_manifest):
    require(os.path.exists(path_to_manifest), "[parseManifest]Didn't find manifest file, looked "
            "{}".format(path_to_manifest))
    allowed_file_types = ("tar", "gz-tar")

    def parse_line(line):
        # double check input, shouldn't need to though
        require(not line.isspace() and not line.startswith("#"), "[parse_line]Invalid {}".format(line))
        sample_line = line.strip().split("\t")
        require(len(sample_line) == 4, "[parse_line]Invalid, len(line) != 4, offending {}".format(line))
        filetype, url, sample_label, size = sample_line
        # checks:
        # check filetype
        require(filetype in allowed_file_types, "[parse_line]Unrecognized file type {}".format(filetype))
        # check URL
        require(urlparse(url).scheme and urlparse(url),
                "Invalid URL passed for {}".format(url))

        return ReadstoreSample(file_type=filetype, URL=url, size=human2bytes(size), sample_label=sample_label)

    with open(path_to_manifest, "r") as fH:
        return map(parse_line, [x for x in fH if (not x.isspace() and not x.startswith("#"))])
Example #24
def parse_manifest(path_to_manifest):
    """
    Parses samples, specified in either a manifest or listed with --samples

    :param str path_to_manifest: Path to configuration file
    :return: Samples and their attributes as defined in the manifest
    :rtype: list[list]
    """
    samples = []
    with open(path_to_manifest, 'r') as f:
        for line in f.readlines():
            if not line.isspace() and not line.startswith('#'):
                sample = line.strip().split('\t')
                require(
                    len(sample) == 3, 'Bad manifest format! '
                    'Expected 3 tab separated columns, got: {}'.format(sample))
                uuid, normal, tumor = sample
                for url in [normal, tumor]:
                    require(
                        urlparse(url).scheme and urlparse(url),
                        'Invalid URL passed for {}'.format(url))
                samples.append(sample)
    return samples
Example #25
    def _SignalMachine(read_label, cigar, nanopore_read):
        guide_aln = LocalFile(workdir=workdir)
        _handle = open(guide_aln.fullpathGetter(), "w")
        _handle.write(cigar)
        _handle.close()
        require(os.path.exists(guide_aln.fullpathGetter()),
                "NO guide aln file")
        signalMachine_args = [
            "--sm3Hdp",
            "-s",
            "1",
            "-o",
            "%s" % degenerate_enum,
            "-L",
            "%s" % read_label,
            "-T",
            "%s%s" % (DOCKER_DIR, models.localFileName(hmmfid)),
            "-q",
            "%s%s" % (DOCKER_DIR, nanopore_read.filenameGetter()),
            "-f",
            "%s%s" % (DOCKER_DIR, fw_seqfile.filenameGetter()),
            "-b",
            "%s%s" % (DOCKER_DIR, bw_seqfile.filenameGetter()),
            "-p",
            "%s%s" % (DOCKER_DIR, guide_aln.filenameGetter()),
            "-u",
            "%s%s" % (DOCKER_DIR, posteriors.filenameGetter()),
            "-v",
            "%s%s" % (DOCKER_DIR, models.localFileName(hdpfid)),
        ]
        try:
            docker_call(job=job,
                        tool=signalMachine_image,
                        parameters=signalMachine_args,
                        work_dir=(workdir + "/"))
        except subprocess.CalledProcessError:
            pass
Example #26
def makeNanoporeReadsJobFunction(job, tar_fid, readstore_dir):
    def makeNanoporeRead(f5_path):
        # here we load the NanoporeRead and write it to a file
        np = NanoporeRead(fast_five_file=f5_path, twoD=False)  # make this a config arg
        ok = np.Initialize(job)
        if not ok:
            return None
        _l = np.read_label
        tF = job.fileStore.getLocalTempFile()
        fH = open(tF, "w")
        ok = np.Write(job, fH, initialize=False)
        if not ok:
            fH.close()
            return None
        fH.close()
        # then we gzip it and deliver it to the readstore and return the ledger line
        fn = LocalFile(workdir=workdir, filename="%s.np.gz" % _l)
        fH = open(tF, "rb")
        gz = gzip.open(fn.fullpathGetter(), "wb")
        shutil.copyfileobj(fH, gz)
        fH.close()
        gz.close()
        try:
            deliverOutput(job, fn, readstore_dir)
        except RuntimeError:
            job.fileStore.logToMaster("[makeNanoporeReadsJobFunction]Read %s failed to upload" % _l)
            return None
        return (_l, "%s%s\n" % (readstore_dir, fn.filenameGetter()))

    def write_ledger_line(line, fH):
        l = "%s\t%s" % (line[0], line[1])  # read_label, npread URL
        fH.write(l)

    workdir    = job.fileStore.getLocalTempDir()
    tar        = job.fileStore.readGlobalFile(tar_fid)
    tar_handle = tarfile.open(tar, "r:gz")
    members    = tar_handle.getmembers()
    members    = [os.path.join(workdir, m.name) for m in members]
    tar_handle.extractall(path=workdir)
    [require(os.path.exists(m), "[makeNanoporeReadsJobFunction]Missing member %s" % m) for m in members]
    ledger_lines = list(map(makeNanoporeRead, members))
    tar_handle.close()

    ledger_shard = job.fileStore.getLocalTempFile()
    _handle      = open(ledger_shard, "w")
    [write_ledger_line(l, _handle) for l in ledger_lines if l is not None]
    _handle.close()

    return job.fileStore.writeGlobalFile(ledger_shard)
def getFastqFromBam(job,
                    bam_sample,
                    samtools_image="quay.io/ucsc_cgl/samtools"):
    # n.b. this is NOT a jobFunctionWrappingJob, it just takes the parent job as
    # an argument to have access to the job store
    # download the BAM to the local directory, use a uid to avoid conflicts
    uid = uuid.uuid4().hex
    work_dir = job.fileStore.getLocalTempDir()
    local_bam = LocalFile(workdir=work_dir, filename="bam_{}.bam".format(uid))
    fastq_reads = LocalFile(workdir=work_dir,
                            filename="fastq_reads{}.fq".format(uid))

    urlDownload(parent_job=job,
                source_url=bam_sample.URL,
                destination_file=local_bam)

    require(not os.path.exists(fastq_reads.fullpathGetter()),
            "[getFastqFromBam]fastq file already exists")

    # run samtools to get the reads from the BAM
    # TODO use DOCKER_DIR and clean this up. idea: make globals.py or something
    samtools_parameters = [
        "fastq", "/data/{}".format(local_bam.filenameGetter())
    ]
    with open(fastq_reads.fullpathGetter(), 'w') as fH:
        docker_call(job=job,
                    tool=samtools_image,
                    parameters=samtools_parameters,
                    work_dir=work_dir,
                    outfile=fH)

    require(os.path.exists(fastq_reads.fullpathGetter()),
            "[getFastqFromBam]didn't generate reads")

    # upload fastq to fileStore
    return job.fileStore.writeGlobalFile(fastq_reads.fullpathGetter())
Example #28
    def parse_line(line):
        # double check input, shouldn't need to though
        require(not line.isspace() and not line.startswith("#"), "[parse_line]Invalid {}".format(line))
        sample_line = line.strip().split("\t")
        require(len(sample_line) == 3, "[parse_line]Invalid, len(line) != 3, offending {}".format(line))
        url, sample_label, size = sample_line
        # check alignment URL
        require(urlparse(url).scheme and urlparse(url), "Invalid URL passed for {}".format(url))

        return SignalAlignSample(URL=url, size=size, sample_label=sample_label)
    def _get_mount_path(self):
        """
        Returns the path of the mount point of the current container. If this method is invoked
        outside of a Docker container a NotInsideContainerError is raised. Likewise if the docker
        daemon is unreachable from inside the container a UserError is raised. This method is
        idempotent.
        """
        if self._mount_path is None:
            name = current_docker_container_id()
            if dockerd_is_reachable():
                # Get name of mounted volume
                blob = json.loads(
                    subprocess.check_output(['docker', 'inspect', name]))
                mounts = blob[0]['Mounts']
                # Ensure docker.sock is mounted correctly
                sock_mnt = [
                    x['Source'] == x['Destination'] for x in mounts
                    if 'docker.sock' in x['Source']
                ]
                require(
                    len(sock_mnt) == 1,
                    'Missing socket mount. Requires the following: '
                    'docker run -v /var/run/docker.sock:/var/run/docker.sock')
                # Ensure formatting of command for 2 mount points
                if len(mounts) == 2:
                    require(
                        all(x['Source'] == x['Destination'] for x in mounts),
                        'Docker Src/Dst mount points, invoked with the -v argument, '
                        'must be the same if only using one mount point aside from the docker '
                        'socket.')
                    work_mount = [
                        x['Source'] for x in mounts
                        if 'docker.sock' not in x['Source']
                    ]
                else:
                    # Ensure only one mirror mount exists aside from docker.sock
                    mirror_mounts = [
                        x['Source'] for x in mounts
                        if x['Source'] == x['Destination']
                    ]
                    work_mount = [
                        x for x in mirror_mounts if 'docker.sock' not in x
                    ]
                    require(
                        len(work_mount) == 1,
                        'Wrong number of mirror mounts provided, see '
                        'documentation.')
                self._mount_path = work_mount[0]
                log.info('The work mount is: %s', self._mount_path)
            else:
                raise UserError(
                    'Docker daemon is not reachable, ensure Docker is being run with: '
                    '"-v /var/run/docker.sock:/var/run/docker.sock" as an argument.'
                )
        return self._mount_path
def main():

    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest='command')
    # Generate subparsers
    subparsers.add_parser('generate-config', help='Generates an editable config in the current working directory.')
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the ADAM preprocessing pipeline')
    parser_run.add_argument('--config', default='adam_preprocessing.config', type=str,
                            help='Path to the (filled in) config file, generated with "generate-config". '
                                 '\nDefault value: "%(default)s"')
    parser_run.add_argument('--sample', help='The S3 URL or local path to the input SAM or BAM file. '
                            'NOTE: unlike other pipelines, we do not support ftp://, gnos://, etc. schemes.')
    parser_run.add_argument('--output-dir', required=True, default=None,
                            help='full path where final results will be output')
    parser_run.add_argument('-s', '--suffix', default='',
                            help='Additional suffix to add to the names of the output files')

    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()
    cwd = os.getcwd()
    if args.command == 'generate-config':
        generate_file(os.path.join(cwd, 'adam_preprocessing.config'), generate_config)
    # Pipeline execution
    elif args.command == 'run':
        require(os.path.exists(args.config), '{} not found. Please run '
                                             'generate-config'.format(args.config))
        # Parse config
        parsed_config = {x.replace('-', '_'): y for x, y in yaml.load(open(args.config).read()).iteritems()}
        inputs = argparse.Namespace(**parsed_config)

        require(not (inputs.master_ip and inputs.num_nodes),
            'Only one of master_ip and num_nodes can be provided.')

        if not hasattr(inputs, 'master_ip'):
            require(inputs.num_nodes > 1,
                'num_nodes allocates one Spark/HDFS master and n-1 workers, and '
                'thus must be greater than 1. %d was passed.' % inputs.num_nodes)

        for arg in [inputs.dbsnp, inputs.memory]:
            require(arg, 'Required argument {} missing from config'.format(arg))

        Job.Runner.startToil(Job.wrapJobFn(static_adam_preprocessing_dag, inputs,
                                           args.sample, args.output_dir), args)
Example #31
def parse_samples(path_to_manifest=None, sample_urls=None):
    """
    Parses samples, specified in either a manifest or listed with --samples

    :param str path_to_manifest: Path to configuration file
    :param list[str] sample_urls: Sample URLs
    :return: Samples and their attributes as defined in the manifest
    :rtype: list[list]
    """
    samples = []
    if sample_urls:
        for url in sample_urls:
            samples.append(
                ['tar', 'paired',
                 os.path.basename(url.split('.')[0]), url])
    elif path_to_manifest:
        with open(path_to_manifest, 'r') as f:
            for line in f.readlines():
                if not line.isspace() and not line.startswith('#'):
                    sample = line.strip().split('\t')
                    require(
                        len(sample) == 4, 'Bad manifest format! '
                        'Expected 4 tab separated columns, got: {}'.format(
                            sample))
                    file_type, paired, uuid, url = sample
                    require(
                        file_type == 'tar' or file_type == 'fq',
                        '1st column must be "tar" or "fq": {}'.format(
                            sample[0]))
                    require(
                        paired == 'paired' or paired == 'single',
                        '2nd column must be "paired" or "single": {}'.format(
                            sample[1]))
                    if file_type == 'fq' and paired == 'paired':
                        require(
                            len(url.split(',')) == 2,
                            'Fastq pair requires two URLs separated'
                            ' by a comma: {}'.format(url))
                    samples.append(sample)
    return samples
Example #32
def parse_manifest(path_to_manifest):
    """
    Parses manifest file for Toil Germline Pipeline

    :param str path_to_manifest: Path to sample manifest file
    :return: List of GermlineSample namedtuples
    :rtype: list[GermlineSample]
    """
    bam_re = r"^(?P<uuid>\S+)\s(?P<url>\S+[bsc][r]?am)"
    fq_re = r"^(?P<uuid>\S+)\s(?P<url>\S+)\s(?P<paired_url>\S+)?\s?(?P<rg_line>@RG\S+)"
    samples = []
    with open(path_to_manifest, 'r') as f:
        for line in f.readlines():
            line = line.strip()
            if line.startswith('#'):
                continue
            bam_match = re.match(bam_re, line)
            fastq_match = re.match(fq_re, line)
            if bam_match:
                uuid = bam_match.group('uuid')
                url = bam_match.group('url')
                paired_url = None
                rg_line = None
                require('.bam' in url.lower(),
                        'Expected .bam extension:\n{}:\t{}'.format(uuid, url))
            elif fastq_match:
                uuid = fastq_match.group('uuid')
                url = fastq_match.group('url')
                paired_url = fastq_match.group('paired_url')
                rg_line = fastq_match.group('rg_line')
                require('.fq' in url.lower() or '.fastq' in url.lower(),
                        'Expected .fq extension:\n{}:\t{}'.format(uuid, url))
            else:
                raise ValueError('Could not parse entry in manifest: %s\n%s' %
                                 (f.name, line))
            # Checks that URL has a scheme
            require(
                urlparse(url).scheme, 'Invalid URL passed for {}'.format(url))
            samples.append(GermlineSample(uuid, url, paired_url, rg_line))
    return samples
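
To make the two regular expressions concrete, manifest lines like the following (identifiers and URLs invented) would be matched by bam_re and fq_re respectively; the @RG field is written with literal \t sequences so it contains no whitespace.

# BAM entry: uuid, then a URL ending in .bam/.sam/.cram
#   NA12878    s3://my-bucket/NA12878.bam
# FASTQ entry: uuid, URL, optional paired URL, then an @RG line
#   NA12878    s3://my-bucket/r1.fq.gz    s3://my-bucket/r2.fq.gz    @RG\tID:foo\tSM:NA12878
samples = parse_manifest('manifest-germline.tsv')  # file name is an assumption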
Example #33
def parse_manifest(path_to_manifest):
    """
    Parses manifest file for Toil Germline Pipeline

    :param str path_to_manifest: Path to sample manifest file
    :return: List of GermlineSample namedtuples
    :rtype: list[GermlineSample]
    """
    bam_re = r"^(?P<uuid>\S+)\s(?P<url>\S+[bsc][r]?am)"
    fq_re = r"^(?P<uuid>\S+)\s(?P<url>\S+)\s(?P<paired_url>\S+)?\s?(?P<rg_line>@RG\S+)"
    samples = []
    with open(path_to_manifest, 'r') as f:
        for line in f.readlines():
            line = line.strip()
            if line.startswith('#'):
                continue
            bam_match = re.match(bam_re, line)
            fastq_match = re.match(fq_re, line)
            if bam_match:
                uuid = bam_match.group('uuid')
                url = bam_match.group('url')
                paired_url = None
                rg_line = None
                require('.bam' in url.lower(),
                        'Expected .bam extension:\n{}:\t{}'.format(uuid, url))
            elif fastq_match:
                uuid = fastq_match.group('uuid')
                url = fastq_match.group('url')
                paired_url = fastq_match.group('paired_url')
                rg_line = fastq_match.group('rg_line')
                require('.fq' in url.lower() or '.fastq' in url.lower(),
                        'Expected .fq extension:\n{}:\t{}'.format(uuid, url))
            else:
                raise ValueError('Could not parse entry in manifest: %s\n%s' % (f.name, line))
            # Checks that URL has a scheme
            require(urlparse(url).scheme, 'Invalid URL passed for {}'.format(url))
            samples.append(GermlineSample(uuid, url, paired_url, rg_line))
    return samples
Example #34
def main():
    """toil-signalAlign master script
    """
    def parse_args():
        parser = argparse.ArgumentParser(description=print_help.__doc__,
                                         formatter_class=argparse.RawTextHelpFormatter)
        subparsers = parser.add_subparsers(dest="command")

        # parsers for running the full pipeline
        run_parser = subparsers.add_parser("run", help="runs full workflow on a BAM")
        run_parser.add_argument('--config', default='config-toil-signalAlign.yaml', type=str,
                                help='Path to the (filled in) config file, generated with "generate".')
        run_parser.add_argument('--manifest', default='manifest-toil-signalAlign.tsv', type=str,
                                help='Path to the (filled in) manifest file, generated with "generate". '
                                     '\nDefault value: "%(default)s".')
        subparsers.add_parser("generate", help="generates a config file for your run, do this first")

        # parsers for running the readstore pipeline
        readstore_parser = subparsers.add_parser("run-readstore",
                                                 help="generates a readstore from a tar of .fast5s")
        readstore_parser.add_argument('--config', default='config-toil-signalAlign-readstore.yaml', type=str,
                                      help='Path to the (filled in) config file, generated with "generate".')
        readstore_parser.add_argument('--manifest', default='manifest-toil-signalAlign-readstore.tsv', type=str,
                                      help='Path to the (filled in) manifest file, generated with "generate". '
                                      '\nDefault value: "%(default)s".')
        subparsers.add_parser("generate-readstore", help="generates a config file for making a readstore")

        Job.Runner.addToilOptions(run_parser)
        Job.Runner.addToilOptions(readstore_parser)

        return parser.parse_args()

    def exitBadInput(message=None):
        if message is not None:
            print(message, file=sys.stderr)
        sys.exit(1)

    if len(sys.argv) == 1:
        exitBadInput(print_help())

    cwd = os.getcwd()

    args = parse_args()

    if args.command == "generate" or args.command == "generate-readstore":
        if args.command == "generate":
            config_filename   = "config-toil-signalAlign.yaml"
            manifest_filename = "manifest-toil-signalAlign.tsv"
        else:
            config_filename   = "config-toil-signalAlign-readstore.yaml"
            manifest_filename = "manifest-toil-signalAlign-readstore.tsv"

        configGenerator   = partial(generateConfig, command=args.command)
        manifestGenerator = partial(generateManifest, command=args.command)

        try:
            config_path = os.path.join(cwd, config_filename)
            generate_file(config_path, configGenerator)
        except UserError:
            print("[toil-nanopore]NOTICE using existing config file {}".format(config_path))
            pass
        try:
            manifest_path = os.path.join(cwd, manifest_filename)
            generate_file(manifest_path, manifestGenerator)
        except UserError:
            print("[toil-nanopore]NOTICE using existing manifest {}".format(manifest_path))

    elif args.command == "run":
        require(os.path.exists(args.config), "{config} not found run generate".format(config=args.config))
        # Parse config
        config  = {x.replace('-', '_'): y for x, y in yaml.load(open(args.config).read()).iteritems()}
        samples = parseManifest(args.manifest)
        for sample in samples:
            with Toil(args) as toil:
                if not toil.options.restart:
                    root_job = Job.wrapJobFn(signalAlignCheckInputJobFunction, config, sample)
                    return toil.start(root_job)
                else:
                    toil.restart()
    elif args.command == "run-readstore":
        require(os.path.exists(args.config), "{config} not found run generate-readstore".format(config=args.config))
        # Parse config
        config  = {x.replace('-', '_'): y for x, y in yaml.load(open(args.config).read()).iteritems()}
        samples = parseManifestReadstore(args.manifest)
        with Toil(args) as toil:
            if not toil.options.restart:
                root_job = Job.wrapJobFn(makeReadstoreJobFunction, config, samples)
                return toil.start(root_job)
            else:
                toil.restart()
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Toil exome pipeline

    Perform variant / indel analysis given a pair of tumor/normal BAM files.
    Samples are optionally preprocessed (indel realignment and base quality score recalibration)
    The output of this pipeline is a tarball containing results from MuTect, MuSe, and Pindel.

    General usage:
    1. Type "toil-exome generate" to create an editable manifest and config in the current working directory.
    2. Parameterize the pipeline by editing the config.
    3. Fill in the manifest with information pertaining to your samples.
    4. Type "toil-exome run [jobStore]" to execute the pipeline.

    Please read the README.md located in the source directory or at:
    https://github.com/BD2KGenomics/toil-scripts/tree/master/src/toil_scripts/exome_variant_pipeline

    Structure of variant pipeline (per sample)

           1 2 3 4          14 -------
           | | | |          |        |
        0 --------- 5 ----- 15 -------- 17
                    |       |        |
                   ---      16 -------
                   | |
                   6 7
                   | |
                   8 9
                   | |
                  10 11
                   | |
                  12 13

    0 = Start node
    1 = reference index
    2 = reference dict
    3 = normal bam index
    4 = tumor bam index
    5 = pre-processing node / DAG declaration
    6,7 = RealignerTargetCreator
    8,9 = IndelRealigner
    10,11 = BaseRecalibration
    12,13 = PrintReads
    14 = MuTect
    15 = Pindel
    16 = MuSe
    17 = Consolidate Output and move/upload results
    ==================================================
    Dependencies
    Curl:       apt-get install curl
    Docker:     wget -qO- https://get.docker.com/ | sh
    Toil:       pip install toil
    Boto:       pip install boto (OPTIONAL)
    """
    parser = argparse.ArgumentParser(description=main.__doc__, formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')
    # Generate subparsers
    subparsers.add_parser('generate-config', help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest', help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate', help='Generates a config and manifest in the current working directory.')
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the CGL exome pipeline')
    parser_run.add_argument('--config', default='config-toil-exome.yaml', type=str,
                            help='Path to the (filled in) config file, generated with "generate-config". '
                                 '\nDefault value: "%(default)s"')
    parser_run.add_argument('--manifest', default='manifest-toil-exome.tsv', type=str,
                            help='Path to the (filled in) manifest file, generated with "generate-manifest". '
                                 '\nDefault value: "%(default)s"')
    parser_run.add_argument('--normal', default=None, type=str,
                            help='URL for the normal BAM. URLs can take the form: http://, ftp://, file://, s3://, '
                                 'and gnos://. The UUID for the sample must be given with the "--uuid" flag.')
    parser_run.add_argument('--tumor', default=None, type=str,
                            help='URL for the tumor BAM. URLs can take the form: http://, ftp://, file://, s3://, '
                                 'and gnos://. The UUID for the sample must be given with the "--uuid" flag.')
    parser_run.add_argument('--uuid', default=None, type=str, help='Provide the UUID of a sample when using the'
                                                                   '"--tumor" and "--normal" option')
    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()
    # Parse subparsers related to generation of config and manifest
    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'config-toil-exome.yaml'), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'manifest-toil-exome.tsv'), generate_manifest)
    # Pipeline execution
    elif args.command == 'run':
        require(os.path.exists(args.config), '{} not found. Please run '
                                             '"toil-exome generate-config"'.format(args.config))
        if args.normal or args.tumor or args.uuid:
            require(args.normal and args.tumor and args.uuid, '"--tumor", "--normal" and "--uuid" must all be supplied')
            samples = [[args.uuid, args.normal, args.tumor]]
        else:
            samples = parse_manifest(args.manifest)
        # Parse config
        parsed_config = {x.replace('-', '_'): y for x, y in yaml.load(open(args.config).read()).iteritems()}
        config = argparse.Namespace(**parsed_config)
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxint
        # Exome pipeline sanity checks
        if config.preprocessing:
            require(config.reference and config.phase and config.mills and config.dbsnp,
                    'Missing inputs for preprocessing, check config file.')
        if config.run_mutect:
            require(config.reference and config.dbsnp and config.cosmic,
                    'Missing inputs for MuTect, check config file.')
        if config.run_pindel:
            require(config.reference, 'Missing input (reference) for Pindel.')
        if config.run_muse:
            require(config.reference and config.dbsnp,
                    'Missing inputs for MuSe, check config file.')
        require(config.output_dir, 'No output location specified: {}'.format(config.output_dir))
        # Program checks
        for program in ['curl', 'docker']:
            require(next(which(program), None), '{} must be installed on every node.'.format(program))

        # Launch Pipeline
        Job.Runner.startToil(Job.wrapJobFn(download_shared_files, samples, config), args)
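
A minimal, self-contained sketch of the config-parsing convention used in the run branch above: dashed YAML keys are rewritten with underscores so they can be read as attributes of an argparse.Namespace. The YAML snippet and key names below are illustrative, not part of the pipeline.

import argparse
import yaml

example_yaml = """
output-dir: /data/out
run-mutect: true
"""
parsed = {k.replace('-', '_'): v for k, v in yaml.safe_load(example_yaml).items()}
config = argparse.Namespace(**parsed)
assert config.output_dir == '/data/out'
assert config.run_mutect is True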
Example #36
def setup_and_run_bwakit(job, uuid, url, rg_line, config, paired_url=None):
    """
    Downloads and runs bwakit for BAM or FASTQ files

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str uuid: Unique sample identifier
    :param str url: FASTQ or BAM file URL. BAM alignment URL must have .bam extension.
    :param Namespace config: Input parameters and shared FileStoreIDs
        Requires the following config attributes:
        config.genome_fasta         FileStoreID for reference genome fasta file
        config.genome_fai           FileStoreID for reference genome fasta index file
        config.cores                Number of cores for each job
        config.trim                 If True, trim adapters using bwakit
        config.amb                  FileStoreID for BWA index file prefix.amb
        config.ann                  FileStoreID for BWA index file prefix.ann
        config.bwt                  FileStoreID for BWA index file prefix.bwt
        config.pac                  FileStoreID for BWA index file prefix.pac
        config.sa                   FileStoreID for BWA index file prefix.sa
        config.alt                  FileStoreID for alternate contigs file or None
    :param str|None paired_url: URL to paired FASTQ
    :param str|None rg_line: Read group line (e.g. @RG\tID:foo\tSM:bar)
    :return: BAM FileStoreID
    :rtype: str
    """
    bwa_config = deepcopy(config)
    bwa_config.uuid = uuid
    bwa_config.rg_line = rg_line

    # bwa_alignment uses a different naming convention
    bwa_config.ref = config.genome_fasta
    bwa_config.fai = config.genome_fai

    # Determine if sample is a FASTQ or BAM file using the file extension
    basename, ext = os.path.splitext(url)
    ext = ext.lower()
    if ext == '.gz':
        _, ext = os.path.splitext(basename)
        ext = ext.lower()

    # The pipeline currently supports FASTQ and BAM files
    require(ext in ['.fq', '.fastq', '.bam'],
            'Please use .fq, .fastq, or .bam file extensions:\n%s' % url)

    # Download fastq files
    samples = []
    input1 = job.addChildJobFn(download_url_job,
                               url,
                               name='file1',
                               s3_key_path=config.ssec,
                               disk=config.file_size)

    samples.append(input1.rv())

    # If the extension is for a BAM file, then configure bwakit to realign the BAM file.
    if ext == '.bam':
        bwa_config.bam = input1.rv()
    else:
        bwa_config.r1 = input1.rv()

    # Download the paired FASTQ URL
    if paired_url:
        input2 = job.addChildJobFn(download_url_job,
                                   paired_url,
                                   name='file2',
                                   s3_key_path=config.ssec,
                                   disk=config.file_size)
        samples.append(input2.rv())
        bwa_config.r2 = input2.rv()

    # The bwakit disk requirement depends on the size of the input files and the index
    # Take the sum of the input files and scale it by a factor of 4
    bwa_index_size = sum([getattr(config, index_file).size
                          for index_file in ['amb', 'ann', 'bwt', 'pac', 'sa', 'alt']
                          if getattr(config, index_file, None) is not None])

    bwakit_disk = PromisedRequirement(lambda lst, index_size:
                                      int(4 * sum(x.size for x in lst) + index_size),
                                      samples,
                                      bwa_index_size)

    return job.addFollowOnJobFn(run_bwakit,
                                bwa_config,
                                sort=False,         # BAM files are sorted later in the pipeline
                                trim=config.trim,
                                cores=config.cores,
                                disk=bwakit_disk).rv()
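
As a rough, illustrative check of the promised disk requirement computed above (all sizes below are made up): the requirement resolves to four times the summed input downloads plus the total size of the BWA index files.

fastq_sizes = [6 * 1024 ** 3, 6 * 1024 ** 3]   # two hypothetical 6 GiB FASTQ downloads
bwa_index_size = 5 * 1024 ** 3                 # hypothetical combined size of amb/ann/bwt/pac/sa/alt
bwakit_disk = int(4 * sum(fastq_sizes) + bwa_index_size)
print(bwakit_disk)                             # 56908316672 bytes, roughly 53 GiB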
Example #37
def gatk_germline_pipeline(job, samples, config):
    """
    Runs the GATK best practices pipeline for germline SNP and INDEL discovery.

    Steps in Pipeline
    0: Generate and preprocess BAM
        - Uploads processed BAM to output directory
    1: Call Variants using HaplotypeCaller
        - Uploads GVCF
    2: Genotype VCF
        - Uploads VCF
    3: Filter Variants using either "hard filters" or VQSR
        - Uploads filtered VCF

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param list[GermlineSample] samples: List of GermlineSample namedtuples
    :param Namespace config: Input parameters and reference FileStoreIDs
        Requires the following config attributes:
        config.genome_fasta         FileStoreID for reference genome fasta file
        config.genome_fai           FileStoreID for reference genome fasta index file
        config.genome_dict          FileStoreID for reference genome sequence dictionary file
        config.cores                Number of cores for each job
        config.xmx                  Java heap size in bytes
        config.suffix               Suffix added to output filename
        config.output_dir           URL or local path to output directory
        config.ssec                 Path to key file for SSE-C encryption
        config.joint_genotype       If True, then joint genotype and filter cohort
        config.hc_output            URL or local path to HaplotypeCaller output for testing
    :return: Dictionary of filtered VCF FileStoreIDs
    :rtype: dict
    """
    require(len(samples) > 0, 'No samples were provided!')

    # Get total size of genome reference files. This is used for configuring disk size.
    genome_ref_size = config.genome_fasta.size + config.genome_fai.size + config.genome_dict.size

    # 0: Generate processed BAM and BAI files for each sample
    # group preprocessing and variant calling steps in empty Job instance
    group_bam_jobs = Job()
    gvcfs = {}
    for sample in samples:
        # 0: Generate processed BAM and BAI files for each sample
        get_bam = group_bam_jobs.addChildJobFn(prepare_bam,
                                               sample.uuid,
                                               sample.url,
                                               config,
                                               paired_url=sample.paired_url,
                                               rg_line=sample.rg_line)

        # 1: Generate per sample gvcfs {uuid: gvcf_id}
        # The HaplotypeCaller disk requirement depends on the input bam, bai, the genome reference
        # files, and the output GVCF file. The output GVCF is smaller than the input BAM file.
        hc_disk = PromisedRequirement(lambda bam, bai, ref_size:
                                      2 * bam.size + bai.size + ref_size,
                                      get_bam.rv(0),
                                      get_bam.rv(1),
                                      genome_ref_size)

        get_gvcf = get_bam.addFollowOnJobFn(gatk_haplotype_caller,
                                            get_bam.rv(0),
                                            get_bam.rv(1),
                                            config.genome_fasta, config.genome_fai, config.genome_dict,
                                            annotations=config.annotations,
                                            cores=config.cores,
                                            disk=hc_disk,
                                            memory=config.xmx,
                                            hc_output=config.hc_output)
        # Store cohort GVCFs in dictionary
        gvcfs[sample.uuid] = get_gvcf.rv()

        # Upload individual sample GVCF before genotyping to a sample specific output directory
        gvcf_name = '{}{}.g.vcf'.format(sample.uuid, config.suffix)
        get_gvcf.addChildJobFn(output_file_job,
                               gvcf_name,
                               get_gvcf.rv(),
                               os.path.join(config.output_dir, sample.uuid),
                               s3_key_path=config.ssec,
                               disk=PromisedRequirement(lambda x: x.size, get_gvcf.rv()))

    # VQSR requires many variants in order to train a decent model. GATK recommends a minimum of
    # 30 exomes or one large WGS sample:
    # https://software.broadinstitute.org/gatk/documentation/article?id=3225

    filtered_vcfs = {}
    if config.joint_genotype:
        # Need to configure joint genotype in a separate function to resolve promises
        filtered_vcfs = group_bam_jobs.addFollowOnJobFn(joint_genotype_and_filter,
                                                        gvcfs,
                                                        config).rv()

    # If not joint genotyping, then iterate over cohort and genotype and filter individually.
    else:
        for uuid, gvcf_id in gvcfs.iteritems():
            filtered_vcfs[uuid] = group_bam_jobs.addFollowOnJobFn(genotype_and_filter,
                                                                  {uuid: gvcf_id},
                                                                  config).rv()

    job.addChild(group_bam_jobs)
    return filtered_vcfs
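
A stripped-down sketch of the grouping pattern used above, assuming a working Toil installation: an otherwise empty Job instance collects per-sample children, and a follow-on runs only after every child has finished. prepare_sample and merge_results are hypothetical job functions, not part of this pipeline.

from toil.job import Job

def prepare_sample(job, uuid):
    return uuid + '.processed'

def merge_results(job, results):
    return sorted(results)

def build_group(job, uuids):
    group = Job()  # empty job used purely to group the per-sample subgraph
    results = [group.addChildJobFn(prepare_sample, uuid).rv() for uuid in uuids]
    merged = group.addFollowOnJobFn(merge_results, results).rv()
    job.addChild(group)
    return merged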
Example #38
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Toil BWA pipeline

    Alignment of fastq reads via BWA-kit

    General usage:
    1. Type "toil-bwa generate" to create an editable manifest and config in the current working directory.
    2. Parameterize the pipeline by editing the config.
    3. Fill in the manifest with information pertaining to your samples.
    4. Type "toil-bwa run [jobStore]" to execute the pipeline.

    Please read the README.md located in the source directory or at:
    https://github.com/BD2KGenomics/toil-scripts/tree/master/src/toil_scripts/bwa_alignment

    Structure of the BWA pipeline (per sample)

        0 --> 1

    0 = Download sample
    1 = Run BWA-kit
    ===================================================================
    :Dependencies:
    cURL:       apt-get install curl
    Toil:       pip install toil
    Docker:     wget -qO- https://get.docker.com/ | sh

    Optional:
    S3AM:       pip install s3am (requires ~/.boto config file)
    Boto:       pip install boto
    """
    # Define Parser object and add to Toil
    parser = argparse.ArgumentParser(description=main.__doc__, formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')
    # Generate subparsers
    subparsers.add_parser('generate-config', help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest', help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate', help='Generates a config and manifest in the current working directory.')
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the BWA alignment pipeline')
    group = parser_run.add_mutually_exclusive_group()
    parser_run.add_argument('--config', default='config-toil-bwa.yaml', type=str,
                            help='Path to the (filled in) config file, generated with "generate-config".')
    group.add_argument('--manifest', default='manifest-toil-bwa.tsv', type=str,
                       help='Path to the (filled in) manifest file, generated with "generate-manifest". '
                            '\nDefault value: "%(default)s".')
    group.add_argument('--sample', nargs='+', action=required_length(2, 3),
                       help='Space delimited sample UUID and fastq files in the format: uuid url1 [url2].')
    # Print docstring help if no arguments provided
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()
    # Parse subparsers related to generation of config and manifest
    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'config-toil-bwa.yaml'), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'manifest-toil-bwa.tsv'), generate_manifest)
    # Pipeline execution
    elif args.command == 'run':
        require(os.path.exists(args.config), '{} not found. Please run generate-config'.format(args.config))
        if not args.sample:
            args.sample = None
            require(os.path.exists(args.manifest), '{} not found and no sample provided. '
                                                   'Please run "generate-manifest"'.format(args.manifest))
        # Parse config
        parsed_config = {x.replace('-', '_'): y for x, y in yaml.load(open(args.config).read()).iteritems()}
        config = argparse.Namespace(**parsed_config)
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxint
        samples = [[args.sample[0], args.sample[1:]]] if args.sample else parse_manifest(args.manifest)
        # Sanity checks
        require(config.ref, 'Missing URL for reference file: {}'.format(config.ref))
        require(config.output_dir, 'No output location specified: {}'.format(config.output_dir))
        # Launch Pipeline
        Job.Runner.startToil(Job.wrapJobFn(download_reference_files, config, samples), args)
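
For illustration only, the samples value handed to the workflow has the same shape whether it comes from the manifest or from --sample: a list of [uuid, urls] pairs. The UUIDs and URLs below are hypothetical.

samples = [
    ['sample-1', ['s3://bucket/sample-1_R1.fq.gz', 's3://bucket/sample-1_R2.fq.gz']],
    ['sample-2', ['file:///data/sample-2.fq.gz']],
]
for uuid, urls in samples:
    print('%s: %d FASTQ url(s)' % (uuid, len(urls)))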
Example #39
def archiveBatchAndUploadToFileStore(parent_job, batch, workdir):
    """Tar the files in batch into a temporary archive in workdir and upload it to the
    Toil file store, returning the resulting FileStoreID."""
    tarname = "%s.tmp" % uuid.uuid4().hex
    tarpath = os.path.join(workdir, tarname)
    tarball_files(tar_name=tarname, file_paths=batch, output_dir=workdir)
    require(os.path.exists(tarpath), "[archiveBatchAndUploadToFileStore]Didn't make smaller tar")
    return parent_job.fileStore.writeGlobalFile(tarpath)
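
A hedged usage sketch for the helper above, called from inside a running Toil job function; write_batch_output and batch_paths are hypothetical.

def write_batch_output(job, batch_paths):
    # Archive a batch of local output files and keep only the FileStoreID of the tarball.
    workdir = job.fileStore.getLocalTempDir()
    return archiveBatchAndUploadToFileStore(job, batch_paths, workdir)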
Example #40
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    MarginPhase pipeline

    =======================================
    Dependencies
    Curl:       apt-get install curl
    Docker:     wget -qO- https://get.docker.com/ | sh
    Toil:       pip install toil
    Boto:       pip install boto (OPTIONAL)
    """

    parser = argparse.ArgumentParser(
        description=main.__doc__,
        formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')

    # Generate subparsers
    subparsers.add_parser(
        'generate-config',
        help='Generates an editable config in the current working directory.')
    subparsers.add_parser(
        'generate-manifest',
        help='Generates an editable manifest in the current working directory.'
    )
    subparsers.add_parser(
        'generate',
        help='Generates a config and manifest in the current working directory.'
    )

    # Run subparser
    parser_run = subparsers.add_parser('run',
                                       help='Runs the MarginPhase pipeline')
    group = parser_run.add_mutually_exclusive_group()
    parser_run.add_argument(
        '--config',
        default=DEFAULT_CONFIG_NAME,
        type=str,
        help=
        'Path to the (filled in) config file, generated with "generate-config". '
        '\nDefault value: "%(default)s"')
    group.add_argument(
        '--manifest',
        default=DEFAULT_MANIFEST_NAME,
        type=str,
        help=
        'Path to the (filled in) manifest file, generated with "generate-manifest". '
        '\nDefault value: "%(default)s"')

    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()

    # Parse subparsers related to generation of config and manifest
    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, DEFAULT_CONFIG_NAME), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, DEFAULT_MANIFEST_NAME),
                      generate_manifest)

    # Pipeline execution
    elif args.command == 'run':
        # sanity check
        require(
            os.path.exists(args.config), '{} not found. Please run '
            '"toil-marginphase generate-config"'.format(args.config))
        require(
            os.path.exists(args.manifest),
            '{} not found and no samples provided. Please '
            'run "toil-marginphase generate-manifest"'.format(args.manifest))

        # Parse config
        parsed_config = {
            x.replace('-', '_'): y
            for x, y in yaml.load(open(args.config).read()).iteritems()
        }
        config = argparse.Namespace(**parsed_config)
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxsize
        config.defaultCores = int(min(MP_CPU, config.maxCores))
        config.maxDisk = int(args.maxDisk) if args.maxDisk else sys.maxint
        config.maxMemory = sys.maxint
        # fix parsing of GB to int
        if args.maxMemory:
            args.maxMemory = args.maxMemory.upper()
            if args.maxMemory.endswith('B'):
                args.maxMemory = args.maxMemory.rstrip('B')
            # actual parsing
            if args.maxMemory.endswith('G'):
                config.maxMemory = int(
                    args.maxMemory.rstrip('G')) * 1024 * 1024 * 1024
            elif args.maxMemory.endswith('M'):
                config.maxMemory = int(
                    args.maxMemory.rstrip('M')) * 1024 * 1024
            elif args.maxMemory.endswith('K'):
                config.maxMemory = int(args.maxMemory.rstrip('K')) * 1024
            else:
                config.maxMemory = int(args.maxMemory)

        # Config sanity checks
        require(config.output_dir, 'No output location specified')
        if urlparse(config.output_dir).scheme != "s3":
            config.output_dir = config.output_dir.replace("file://", "", 1)
            mkdir_p(config.output_dir)
        if not config.output_dir.endswith('/'):
            config.output_dir += '/'
        require(config.partition_size,
                "Configuration parameter partition-size is required")
        require(config.partition_margin,
                "Configuration parameter partition-margin is required")

        if 'save_intermediate_files' not in config or not config.save_intermediate_files:
            config.intermediate_file_location = None
        elif urlparse(config.output_dir).scheme == "s3":
            raise UserError(
                "Config parameter 'save_intermediate_files' cannot be used with s3 output directory"
            )
        else:
            intermediate_location = os.path.join(
                config.output_dir, "intermediate",
                datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
            mkdir_p(intermediate_location)
            config.intermediate_file_location = intermediate_location
        if "margin_phase_image" not in config or len(
                config.margin_phase_image) == 0:
            config.margin_phase_image = DOCKER_MARGIN_PHASE_IMG_DEFAULT
        if "margin_phase_tag" not in config or len(
                config.margin_phase_tag) == 0:
            config.margin_phase_tag = DOCKER_MARGIN_PHASE_TAG_DEFAULT
        if "cpecan_image" not in config or len(config.cpecan_image) == 0:
            config.cpecan_image = DOCKER_CPECAN_IMG_DEFAULT
        if "cpecan_tag" not in config or len(config.cpecan_tag) == 0:
            config.cpecan_tag = DOCKER_CPECAN_TAG_DEFAULT
        if "unittest" not in config:
            config.unittest = False
        if "minimal_output" not in config:
            config.minimal_output = False
        if "minimal_cpecan_output" not in config:
            config.minimal_cpecan_output = False
        if "cpecan_probabilities" not in config:
            config.cpecan_probabilities = False

        # get samples
        samples = parse_samples(config, args.manifest)

        # Program checks
        for program in ['docker']:
            require(
                next(which(program), None),
                '{} must be installed on every node.'.format(program))

        # Start the workflow
        Job.Runner.startToil(
            Job.wrapJobFn(map_job, prepare_input, samples, config), args)
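
A compact restatement of the maxMemory parsing above, with a few worked values, assuming the same base-1024 unit conventions; this helper is illustrative and not part of the pipeline.

def parse_mem(text):
    text = text.upper().rstrip('B')
    for suffix, factor in (('G', 1024 ** 3), ('M', 1024 ** 2), ('K', 1024)):
        if text.endswith(suffix):
            return int(text.rstrip(suffix)) * factor
    return int(text)

assert parse_mem('4G') == 4294967296       # 4 GiB
assert parse_mem('512MB') == 536870912     # 512 MiB
assert parse_mem('2048') == 2048           # plain byte count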
Example #41
def main():
    """
    GATK germline pipeline with variant filtering and annotation.
    """
    # Define Parser object and add to jobTree
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawTextHelpFormatter)

    # Generate subparsers
    subparsers = parser.add_subparsers(dest='command')
    subparsers.add_parser('generate-config',
                          help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest',
                          help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate',
                          help='Generates a config and manifest in the current working directory.')

    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the GATK germline pipeline')
    parser_run.add_argument('--config',
                            required=True,
                            type=str,
                            help='Path to the (filled in) config file, generated with '
                                 '"generate-config".')
    parser_run.add_argument('--manifest',
                            type=str,
                            help='Path to the (filled in) manifest file, generated with '
                                 '"generate-manifest".\nDefault value: "%(default)s".')
    parser_run.add_argument('--sample',
                            default=None,
                            nargs=2,
                            type=str,
                            help='Input sample identifier and BAM file URL or local path')
    parser_run.add_argument('--output-dir',
                            default=None,
                            help='Path/URL to output directory')
    parser_run.add_argument('-s', '--suffix',
                            default=None,
                            help='Additional suffix to add to the names of the output files')
    parser_run.add_argument('--preprocess-only',
                            action='store_true',
                            help='Only runs preprocessing steps')

    Job.Runner.addToilOptions(parser_run)
    options = parser.parse_args()

    cwd = os.getcwd()
    if options.command == 'generate-config' or options.command == 'generate':
        generate_file(os.path.join(cwd, 'config-toil-germline.yaml'), generate_config)
    if options.command == 'generate-manifest' or options.command == 'generate':
        generate_file(os.path.join(cwd, 'manifest-toil-germline.tsv'), generate_manifest)
    elif options.command == 'run':
        # Program checks
        for program in ['curl', 'docker']:
            require(next(which(program), None),
                    '{} must be installed on every node.'.format(program))

        require(os.path.exists(options.config), '{} not found. Please run "generate-config"'.format(options.config))

        # Read sample manifest
        samples = []
        if options.manifest:
            samples.extend(parse_manifest(options.manifest))

        # Add BAM sample from command line
        if options.sample:
            uuid, url = options.sample
            # samples tuple: (uuid, url, paired_url, rg_line)
            # BAM samples should not have a paired URL or read group line
            samples.append(GermlineSample(uuid, url, None, None))

        require(len(samples) > 0,
                'No samples were detected in the manifest or on the command line')

        # Parse inputs
        inputs = {x.replace('-', '_'): y for x, y in
                  yaml.load(open(options.config).read()).iteritems()}

        required_fields = {'genome_fasta',
                           'output_dir',
                           'run_bwa',
                           'sorted',
                           'snp_filter_annotations',
                           'indel_filter_annotations',
                           'preprocess',
                           'preprocess_only',
                           'run_vqsr',
                           'joint_genotype',
                           'run_oncotator',
                           'cores',
                           'file_size',
                           'xmx',
                           'suffix'}

        input_fields = set(inputs.keys())
        require(input_fields >= required_fields,
                'Missing config parameters:\n{}'.format(', '.join(required_fields - input_fields)))

        if inputs['output_dir'] is None:
            inputs['output_dir'] = options.output_dir

        require(inputs['output_dir'] is not None,
                'Missing output directory PATH/URL')

        if inputs['suffix'] is None:
            inputs['suffix'] = options.suffix if options.suffix else ''

        if inputs['preprocess_only'] is None:
            inputs['preprocess_only'] = options.preprocess_only

        if inputs['run_vqsr']:
            # Check that essential VQSR parameters are present
            vqsr_fields = {'g1k_snp', 'mills', 'dbsnp', 'hapmap', 'omni'}
            require(input_fields >= vqsr_fields,
                    'Missing parameters for VQSR:\n{}'.format(', '.join(vqsr_fields - input_fields)))

        # Check that hard filtering parameters are present. If only running preprocessing steps, then we do
        # not need filtering information.
        elif not inputs['preprocess_only']:
            hard_filter_fields = {'snp_filter_name', 'snp_filter_expression',
                                  'indel_filter_name', 'indel_filter_expression'}
            require(input_fields >= hard_filter_fields,
                    'Missing parameters for hard filtering:\n{}'.format(', '.join(hard_filter_fields - input_fields)))

            # Check for falsey hard filtering parameters
            for hard_filter_field in hard_filter_fields:
                require(inputs[hard_filter_field], 'Missing %s value for hard filtering, '
                                                   'got %s.' % (hard_filter_field, inputs[hard_filter_field]))

        # Set resource parameters
        inputs['xmx'] = human2bytes(inputs['xmx'])
        inputs['file_size'] = human2bytes(inputs['file_size'])
        inputs['cores'] = int(inputs['cores'])

        inputs['annotations'] = set(inputs['snp_filter_annotations'] + inputs['indel_filter_annotations'])

        # HaplotypeCaller test data for testing
        inputs['hc_output'] = inputs.get('hc_output', None)

        # It is a toil-scripts convention to store input parameters in a Namespace object
        config = argparse.Namespace(**inputs)

        root = Job.wrapJobFn(run_gatk_germline_pipeline, samples, config)
        Job.Runner.startToil(root, options)
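
Why the required-field checks above use issuperset (>=) rather than a strict superset (>): a config that contains exactly the required keys, and nothing more, should still pass. The key names below are illustrative.

required = {'genome_fasta', 'output_dir', 'xmx'}
exact = {'genome_fasta', 'output_dir', 'xmx'}
extra = {'genome_fasta', 'output_dir', 'xmx', 'suffix'}

assert exact >= required           # every required key is present
assert not (exact > required)      # a strict-superset check would wrongly reject this config
assert extra >= required and extra > required
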
def generate_file(file_path, generate_func):
    """Write the output of generate_func to file_path, refusing to overwrite an existing file."""
    require(not os.path.exists(file_path), file_path + ' already exists!')
    with open(file_path, 'w') as f:
        f.write(generate_func())
    print('\t{} has been generated in the current working directory.'.format(os.path.basename(file_path)))
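
A hedged usage sketch for generate_file: write a default config into the current working directory unless one is already there. The generator function and filename are hypothetical.

import os

def generate_example_config():
    return 'output-dir: /data/out\nrun-mutect: true\n'

if not os.path.exists('config-example.yaml'):
    generate_file('config-example.yaml', generate_example_config)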