Example #1
def platypus_single(job, config, name, samples, input_bam):
    """Run Platypus on an an unmatched tumour sample and call somatic variants
    :param config: The configuration dictionary.
    :type config: dict.
    :param sample: sample name.
    :type sample: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns:  str -- The output vcf file name.
    """

    platypus_vcf = "{}.platypus.vcf".format(name)
    platypus_log = "{}.platypus.log".format(name)
    internal_log = "{}.platypus_internal.log".format(name)

    platypus_command = ["{}".format(config['platypus']['bin']),
                        "callVariants",
                        "--refFile={}".format(config['reference']),
                        "--regions={}".format(samples[name]['regions']),
                        "--assemble=1",
                        "--assembleBadReads=1",
                        "--assembleBrokenPairs=1",
                        "--filterDuplicates=0",
                        "--minVarFreq={}".format(config['min_alt_af']),
                        "--nCPU={}".format(config['platypus']['num_cores']),
                        "--logFileName={}".format(internal_log),
                        "--bamFiles={}".format(input_bam),
                        "--output={}".format(platypus_vcf)]

    job.fileStore.logToMaster("Platypus Command: {}\n".format(platypus_command))
    pipeline.run_and_log_command(" ".join(platypus_command), platypus_log)

    return platypus_vcf
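A note on the shared helper: every job function on this page hands its finished command to pipeline.run_and_log_command, which is not shown here. Because many of the commands contain shell pipes ("|") and redirections (">"), the helper presumably executes the joined command string through a shell. A minimal sketch under that assumption (the real ddb-ngsflow helper may differ):

import subprocess

def run_and_log_command(command, logfile):
    """Hypothetical sketch: run `command` through a shell, appending its
    output to `logfile`. Assumed behaviour, not the verbatim helper."""
    with open(logfile, "a") as log:
        # shell=True because callers join pipes ("|") and redirections
        # (">") into a single command string.
        subprocess.check_call(command, shell=True, stdout=log, stderr=log)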
Example #2
def scalpel_single(job, config, name, samples, input_bam):
    """Run Scalpel on an an unmatched tumour sample and call somatic variants
    :param config: The configuration dictionary.
    :type config: dict.
    :param sample: sample name.
    :type sample: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns:  str -- The output vcf file name.
    """

    cwd = os.getcwd()
    output_dir = os.path.join(cwd, "{}-scalpel-output".format(name))
    scalpel_vcf = os.path.join(output_dir, "variants.indel.vcf")
    fixed_vcf = "{}.scalpel.vcf".format(name)
    logfile = "{}.scalpel.log".format(name)
    logfile2 = "{}.scalpel_fix.log".format(name)

    scalpel_command = ["{}".format(config['scalpel']['bin']),
                       "--single",
                       "--intarget",
                       # "--covthr",
                       # "3",
                       # "--lowcov",
                       # "1",
                       "--ref",
                       "{}".format(config['reference']),
                       "--bed",
                       "{}".format(samples[name]['regions']),
                       "--format",
                       "vcf",
                       "--numprocs",
                       "{}".format(config['scalpel']['num_cores']),
                       "--bam",
                       "{}".format(input_bam),
                       "--dir",
                       "{}".format(output_dir)]

    fix_sample_name_command = ["cat",
                               "{}".format(scalpel_vcf),
                               "|",
                               "sed",
                               "'s/sample/{}/g'".format(name),
                               ">",
                               "{}".format(fixed_vcf)]

    job.fileStore.logToMaster("Scalpel Command: {}\n".format(scalpel_command))
    pipeline.run_and_log_command(" ".join(scalpel_command), logfile)

    job.fileStore.logToMaster("Scalpel Fix Command: {}\n".format(fix_sample_name_command))
    pipeline.run_and_log_command(" ".join(fix_sample_name_command), logfile2)

    file_path = os.path.join(cwd, fixed_vcf)
    if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
        return fixed_vcf
    else:
        job.fileStore.logToMaster("Scalpel ran into a problem and no output was generated for file {}. Check logfile "
                                  "{} for details\n".format(scalpel_vcf, logfile))
        raise JobException("Scalpel ran into a problem and no output was generated for file {}. Check logfile "
                           "{} for details\n".format(scalpel_vcf, logfile))
Example #3
def freebayes_single(job, config, name, input_bam):
    """Run FreeBayes without a matched normal sample
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns:  str -- The output vcf file name.
    """

    freebayes_vcf = "{}.freebayes.vcf".format(name)
    logfile = "{}.freebayes.log".format(name)

    command = ["{}".format(config['freebayes']['bin']),
               "--fasta-reference",
               "{}".format(config['reference']),
               "--min-alternate-fraction",
               "{}".format(config['min_alt_af']),
               "--pooled-discrete",
               "--pooled-continuous",
               "--genotype-qualities",
               "--report-genotype-likelihood-max",
               "--allele-balance-priors-off",
               "--use-duplicate-reads",
               "--min-repeat-entropy 1",
               "-v",
               "{}".format(freebayes_vcf),
               "{}".format(input_bam)]

    job.fileStore.logToMaster("FreeBayes Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return freebayes_vcf
Example #4
def scanindel(job, config, name, samples, input_bam):
    """Run ScanIndel caller for Structural Variant Detection
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples configuration dictionary.
    :type samples: dict.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns:  str -- The output vcf file name.
    """

    output_vcf = "{}.scanindel.vcf".format(name)
    logfile = "{}.scanindel.log".format(name)
    sample_config_file = "{}.scanindel_sample_config.txt".format(name)

    with open(sample_config_file, 'w') as sample_config:
        sample_config.write("{id}\t{file}".format(id=name, file=input_bam))

    command = ("{}".format(config['scanindel']['bin']),
               "-i",
               "{}".format(sample_config_file),
               "-p",
               "{}".format(config['scanindel']['config_file']),
               "--bam",
               "-F",
               "{}".format(config['min_alt_af']),
               "-t",
               "{}".format(samples[name]['regions']))

    job.fileStore.logToMaster("ScanIndel Configuration Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return output_vcf
Example #5
def star_unpaired(job, config, name, samples, flags):
    """Align RNA-Seq data to a reference using STAR
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples info and config dictionary.
    :type samples: dict.
    :param flags: Flags for extra parameter settings.
    :type flags: list.
    :returns:  str -- The output BAM file name.
    """

    output = "{}.star.".format(name)
    logfile = "{}.star.log".format(name)
    output_file = "{}Aligned.sortedByCoord.out.bam".format(output)

    command = ["{}".format(config['star']['bin']),
               "--genomeDir {}".format(config['star']['index']),
               "--runThreadN {}".format(config['star']['num_cores']),
               "--readFilesIn {}".format(samples[name]['fastq1']),
               "--outFileNamePrefix {}".format(output),
               "--outReadsUnmapped Fastx",
               "--outSAMtype BAM SortedByCoordinate"
               ]

    command = add_additional_options(command, config, flags)

    job.fileStore.logToMaster("STAR Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return output_file
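Several aligners on this page (star_unpaired, bowtie_paired, hisat_unpaired, stringtie) pass their command list through add_additional_options before running it. That helper is not shown here; a plausible sketch, assuming flags is a list of keys that select extra option strings from the configuration (the real layout may differ):

def add_additional_options(command, config, flags):
    # Hypothetical sketch: append any extra command-line options that the
    # configuration defines for the requested flags, then hand the list back.
    for flag in flags:
        if flag in config:
            command.append("{}".format(config[flag]))
    return command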
Example #6
def sambamba_region_coverage(job, config, name, samples, input_bam):
    """Run SamBambam to calculate the coverage of targeted regions
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample/library name.
    :type name: str.
    :param samples: The samples configuration dictionary.
    :type samples: dict.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns:  str -- The output BED file name.
    """

    output = "{}.sambamba_coverage.bed".format(name)
    logfile = "{}.sambamba_coverage.log".format(name)

    command = ["{}".format(config['sambamba']['bin']),
               "depth region",
               "-L",
               "{}".format(samples[name]['regions']),
               "-t",
               "{}".format(config['sambamba']['num_cores']),
               "-T",
               "{}".format(config['coverage_threshold']),
               "-T",
               "{}".format(config['coverage_threshold2']),
               "{}".format(input_bam),
               ">",
               "{}".format(output)]

    job.fileStore.logToMaster("SamBamba Coverage Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return output
Example #7
def rapmap_quasi_paired(job, config, name, samples, flags):
    """Run RapMap Quasi-Mapping procedure on paired-end sequencing data
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples info and config dictionary.
    :type samples: dict.
    :param flags: Flags for extra parameter settings.
    :type flags: list.
    :returns:  str -- The output SAM file name.
    """

    output = "{}.rapmap.sam".format(name)
    logfile = "{}.rapmap_quasi.log".format(name)

    command = ["{} quasimap".format(config['rapmap']['bin']),
               "-t {}".format(config['rapmap']['num_cores']),
               "-i {}".format(config['rapmap']['index']),
               "-1 {}".format(samples[name]['fastq1']),
               "-2 {}".format(samples[name]['fastq2']),
               "-o {}".format(output)
               ]

    job.fileStore.logToMaster("RapMap Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return output
Example #8
def cuffquant(job, config, name, samples):
    """Run Cuffquant on all samples
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples configuration dictionary.
    :type samples: dict.
    :returns:  str -- The directory name for the cuffquant results.
    """

    outdir = "{}_cuffquant".format(name)
    logfile = "{}.cuffquant.log".format(name)

    command = ["{}".format(config['cuffquant']['bin']),
               "-b {}".format(config['reference']),
               "-p {}".format(config['cuffquant']['num_cores']),
               "-o ./{}_cuffquant".format(name),
               "-u",
               "{}".format(config['merged_transcript_reference']),
               "{}".format(samples[name]['bam'])
               ]

    job.fileStore.logToMaster("Cuffquant Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return outdir
Example #9
def stringtie_merge(job, config, samples, flags, transcripts_list):
    """Perform transcript assembly and quantification with StringTie
    :param config: The configuration dictionary.
    :type config: dict.
    :param samples: The samples info and config dictionary.
    :type samples: dict.
    :param flags: Flags for extra parameter settings.
    :type flags: list.
    :param transcripts_list: The assembled transcript GTF file list to merge.
    :type transcripts_list: str.
    :returns:  str -- The transcript assembly GTF file name.
    """

    logfile = "{}.stringtie_merge.log".format(config["run_id"])
    outfile = "{}.stringtie.merged.gtf".format(config["run_id"])

    command = [
        "{}".format(config["stringtie"]["bin"]),
        "{}".format(transcripts_list),
        "--merge",
        "-p {}".format(config["stringtie"]["num_cores"]),
        "-G {}".format(config["transcript_reference"]),
        "-o {}".format(outfile),
    ]

    command = add_additional_options(command, config, flags)

    job.fileStore.logToMaster("StringTie Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return outfile
Example #10
def bowtie_paired(job, config, name, samples, flags):
    """Align RNA-Seq data to a reference using Bowtie2
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples info and config dictionary.
    :type samples: dict.
    :param flags: Flags for extra parameter settings.
    :type flags: list.
    :returns:  str -- The output SAM file name.
    """

    output = "{}.bowtie.sam".format(name)
    logfile = "{}.bowtie.log".format(name)

    command = ["{}".format(config['bowtie']['bin']),
               "-x {}".format(config['bowtie']['index']),
               "-p {}".format(config['bowtie']['num_cores']),
               "-1 {}".format(samples[name]['fastq1']),
               "-2 {}".format(samples[name]['fastq2']),
               "-S {}".format(output)
               ]

    command = add_additional_options(command, config, flags)

    job.fileStore.logToMaster("Bowtie Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return output
Example #11
def salmonVB_unpaired(job, config, name, samples):
    """Run Salmon Quasi-Mapping with single-end data using the VB optimization algorithm
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples info and config dictionary.
    :type samples: dict.
    :returns:  str -- The output directory name.
    """

    output_dir = "{}.salmon.output".format(name)
    logfile = "{}.salmon.log".format(name)

    command = ["{} quant".format(config['salmon']['bin']),
               "-i {}".format(config['salmon']['index']),
               "-l {}".format(samples[name]['library_type']),
               "-p {}".format(config['salmon']['num_cores']),
               "--useVBOpt",
               "--numBootstraps {}".format(config['salmon']['num_bootstraps']),
               "--biasCorrect",
               "--useFSPD",
               "-r {}".format(samples[name]['fastq1']),
               "-o {}".format(output_dir)
               ]

    job.fileStore.logToMaster("Salmon Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return output_dir
Example #12
def cuffmerge(job, config, name, samples, manifest):
    """Merge assembled cufflinks transcriptomes from all samples
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: Samples config data.
    :type samples: dict.
    :param manifest: Manifest file listing the assembled transcriptomes to merge.
    :type manifest: str.
    :returns:  str -- The merged output transcriptome from cufflinks.
    """

    stats_root = "{}_cuffmerge_stats".format(config['run_id'])
    logfile = "{}.cuffmerge.log".format(config['run_id'])

    command = ["{}".format(config['cuffmerge']['bin']),
               "-g {}".format(config['transcript_reference']),
               "-s {}".format(config['reference']),
               "-p {}".format(config['cuffmerge']['num_cores']),
               "{}".format(manifest)]

    job.fileStore.logToMaster("Cuffmerge Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    pwd = os.getcwd()
    config['merged_transcript_reference'] = os.path.join(pwd, "merged.gtf")

    return stats_root
Example #13
def bedtools_coverage_per_site(job, config, name, input_bam):
    """Run BedTools to calculate the per-site coverage of targeted regions
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns:  str -- The output BED file name.
    """

    output = "{}.bedtools_coverage_per_site.bed".format(name)
    logfile = "{}.bedtools_coverage.log".format(name)

    coverage = [
        "{}".format(config["bedtools"]["bin"]),
        "coverage",
        "-d",
        "-a",
        "{}".format(config["regions"]),
        "-b",
        "{}".format(input_bam),
        ">",
        "{}".format(output),
    ]

    job.fileStore.logToMaster("BedTools Coverage Command: {}\n".format(coverage))
    pipeline.run_and_log_command(" ".join(coverage), logfile)

    return output
Example #14
def run_lowfreq(job, config, name, input_bam):
    """Run LoFreq on an an unmatched tumour sample and call somatic variants
    :param config: The configuration dictionary.
    :type config: dict.
    :param sample: sample name.
    :type sample: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns:  str -- The output vcf file name.
    """

    vcf = "{}.lofreq.vcf".format(name)
    logfile = "{}.lofreq.log".format(name)

    command = [
        "{}".format(config["lofreq"]["bin"]),
        "somatic",
        "-t",
        "{}".format(input_bam),
        "--call-indels" "-f",
        "{}".format(config["reference"]),
        "--threads",
        "{}".format(config["lofreq"]["num_cores"]),
        "-d",
        "{}".format(config["dbsnp"]),
        "-o",
        "{}".format(vcf),
    ]

    job.fileStore.logToMaster("LoFreq Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)
Example #15
def run_pindel(job, config, name, input_bam):
    """Run Pindel caller for InDel Detection
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns:  str -- The output vcf file name.
    """

    pindel_config = "{}.pindel_config.txt".format(name)
    output_dir = "{}_pindel".format(name)
    output_vcf = "{}.pindel.vcf".format(name)

    logfile = "{}.pindel.log".format(name)
    vcf_logfile = "{}.pindel2vcf.log".format(name)

    with open(pindel_config, 'w') as bam_config:
        bam_config.write("%s %s %s\n" % (input_bam, config['insert_size'], name))

    command = ("{}".format(config['pindel']['bin']),
               "-f",
               "{}".format(config['reference']),
               "-c",
               "ALL",
               "-w",
               "{}".format(config['pindel']['window']),
               "-E",
               "{}".format(config['pindel']['sensitivity']),
               "-T",
               "{}".format(config['pindel']['num_cores']),
               "-o",
               "{}".format(output_dir),
               "-i",
               "{}".format(pindel_config))

    pindel2vcf_command = ("{}".format(config['pindel2vcf']['bin']),
                          "-r",
                          "{}".format(config['reference']),
                          "-R",
                          "{}".format(config['snpeff']['reference']),
                          "-d",
                          "{}".format(config['snpeff']['reference']),
                          "-he",
                          "0.01",
                          "-G",
                          "-P",
                          "{}".format(output_dir),
                          "-v",
                          "{}".format(output_vcf))

    job.fileStore.logToMaster("Pindel Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    job.fileStore.logToMaster("Pindel2vcf Command: {}\n".format(pindel2vcf_command))
    pipeline.run_and_log_command(" ".join(pindel2vcf_command), vcf_logfile)

    return output_vcf
Example #16
def run_bwa_mem(job, config, name, samples):
    """Run GATK's DiagnoseTargets against the supplied region

    :param config: The configuration dictionary.
    :type config: dict.
    :param sample: sample name.
    :type sample: str.
    :param fastq1: Input FastQ File.
    :type fastq1: str.
    :param fastq2: Input FastQ File.
    :type fastq2: str.
    :returns:  str -- Aligned and sorted BAM file name.

    """

    job.fileStore.logToMaster("Running BWA for sample {}\n".format(name))

    output_bam = "{}.bwa.sorted.bam".format(name)
    temp = "{}.bwa.sort.temp".format(name)
    logfile = "{}.bwa-align.log".format(name)

    bwa_cmd = [
        "{}".format(config["bwa"]["bin"]),
        "mem",
        "-t",
        "{}".format(config["bwa"]["num_cores"]),
        "-M",
        "-v",
        "2",
        "{}".format(config["reference"]),
        "{}".format(samples[name]["fastq1"]),
        "{}".format(samples[name]["fastq2"]),
    ]

    view_cmd = ["{}".format(config["samtools"]["bin"]), "view", "-u", "-"]

    sort_cmd = [
        "{}".format(config["samtools"]["bin"]),
        "sort",
        "-@",
        "{}".format(config["bwa"]["num_cores"]),
        "-O",
        "bam",
        "-o",
        "{}".format(output_bam),
        "-T",
        "{}".format(temp),
        "-",
    ]

    command = "{} | {} | {}".format(" ".join(bwa_cmd), " ".join(view_cmd), " ".join(sort_cmd))

    job.fileStore.logToMaster("BWA Command: {}\n".format(command))
    pipeline.run_and_log_command(command, logfile)

    return output_bam
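All of these functions are Toil job functions: each takes the job handle first and logs through job.fileStore. A hedged sketch of how two of them might be chained into a workflow with Toil's standard API; the job-store path and the choice of caller here are illustrative, not taken from ddb-ngsflow:

from toil.job import Job

def wire_pipeline(config, name, samples):
    # Align first; align.rv() is a Toil promise for the sorted BAM name
    # that run_bwa_mem returns.
    align = Job.wrapJobFn(run_bwa_mem, config, name, samples)
    call = Job.wrapJobFn(freebayes_single, config, name, align.rv())
    align.addChild(call)
    return align

if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions("./jobstore")
    # Loading of config, name, and samples is omitted in this sketch:
    # Job.Runner.startToil(wire_pipeline(config, name, samples), options)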
Example #17
def vardict_single(job, config, name, samples, input_bam):
    """Run VarDict on an an unmatched tumour sample and call somatic variants
    :param config: The configuration dictionary.
    :type config: dict.
    :param sample: sample name.
    :type sample: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns:  str -- The output vcf file name.
    """

    vardict_vcf = "{}.vardict.vcf".format(name)
    logfile = "{}.vardict.log".format(name)

    vardict = ["{}".format(config['vardict']['bin']),
               "-G",
               "{}".format(config['reference']),
               "-z",
               "-c",
               "1",
               "-S",
               "2",
               "-E",
               "3",
               "-g",
               "4",
               "-B",
               "{}".format(config['vardict']['num_cores']),
               # "-a", the amplicon flag seems to be creating errors
               # "-F 0", Probably don't need this as duplicates aren't marked and ignoring secondary alignment good
               "-f",
               "{}".format(config['min_alt_af']),
               "-N",
               "{}".format(name),
               "-b",
               "{}".format(input_bam),
               "{}".format(samples[name]['regions'])]

    vardict2vcf = ["{}".format(config['vardict2vcf']['bin']),
                   "-E",
                   "-f",
                   "{}".format(config['min_alt_af']),
                   "-N",
                   "{}".format(name)]

    vcfsort = ["{}".format(config['vcftools_sort']['bin']),
               "-c"]

    command = ("{vardict} | {strandbias} | {vardict2vcf} | "
               "{sort} > {vcf}".format(vardict=" ".join(vardict), strandbias=config['vardict_strandbias']['bin'],
                                       vardict2vcf=" ".join(vardict2vcf), sort=" ".join(vcfsort), vcf=vardict_vcf))

    job.fileStore.logToMaster("VarDict Command: {}\n".format(command))
    pipeline.run_and_log_command(command, logfile)

    return vardict_vcf
Example #18
def hisat_unpaired(job, config, name, samples, flags):
    """Align RNA-Seq data to a reference using HiSat2
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples info and config dictionary.
    :type samples: dict.
    :param flags: Flags for extra parameter settings.
    :type flags: list.
    :returns:  str -- The output BAM file name.
    """

    working_dir = os.getcwd()

    logfile = "{}.hisat.log".format(name)
    output = "{}.hisat.sorted.bam".format(name)
    unaligned = os.path.join(working_dir, "{}.unaligned.sam".format(name))
    temp = "{}.hisat.sort.temp".format(name)

    hisat_cmd = ["{}".format(config['hisat']['bin']),
                 "-p {}".format(config['hisat']['num_cores']),
                 "--dta",
                 "-x {}".format(config['hisat']['index']),
                 "-U {}".format(samples[name]['fastq1']),
                 "--un {}".format(unaligned)
                 ]

    hisat_cmd = add_additional_options(hisat_cmd, config, flags)

    view_cmd = ["{}".format(config['samtools']['bin']),
                "view",
                "-u",
                "-"]

    sort_cmd = ["{}".format(config['samtools']['bin']),
                "sort",
                "-@",
                "{}".format(config['hisat']['num_cores']),
                "-O",
                "bam",
                "-o",
                "{}".format(output),
                "-T",
                "{}".format(temp),
                "-"]

    command = "{} | {} | {}".format(" ".join(hisat_cmd), " ".join(view_cmd), " ".join(sort_cmd))

    job.fileStore.logToMaster("HiSat2 Command: {}\n".format(command))
    pipeline.run_and_log_command(command, logfile)

    return output
Example #19
def bcftools_filter_variants_regions(job, config, name, samples, input_vcf):
    """Use bcftools to filter vcf file to only variants found within the specified regions file
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples configuration dictionary.
    :type samples: dict.
    :param input_vcf: The input_vcf file name to process.
    :type input_vcf: str.
    :returns:  str -- The output vcf file name.
    """

    filtered_vcf = "{}.on_target.vcf".format(name)
    sorted_vcf = "{}.on_target_sorted.vcf".format(name)
    bgzipped_vcf = "{}.gz".format(input_vcf)
    logfile = "{}.on_target_filter.log".format(name)
    sort_logfile = "{}.on_target_sorted.log".format(name)

    bgzip_and_tabix_vcf(job, input_vcf)

    filter_command = [
        "{}".format(config["bcftools"]["bin"]),
        "isec",
        "-T",
        "{}".format(samples[name]["regions"]),
        "{}".format(bgzipped_vcf),
        ">",
        "{}".format(filtered_vcf),
    ]

    sort_command = [
        "cat",
        "{}".format(filtered_vcf),
        "|",
        "{}".format(config["vcftools_sort"]["bin"]),
        "-c",
        ">",
        "{}".format(sorted_vcf),
    ]

    job.fileStore.logToMaster("BCFTools isec command for filtering to only target regions: {}\n".format(filter_command))
    pipeline.run_and_log_command(" ".join(filter_command), logfile)

    job.fileStore.logToMaster("VCFTools-sort command for filtering to only target regions: {}\n".format(sort_command))
    pipeline.run_and_log_command(" ".join(sort_command), sort_logfile)

    return sorted_vcf
Example #20
def bgzip_and_tabix_vcf(job, infile):
    """Run BGZip and Tabix on the specified VCF
    :param infile: The input VCF file name to process.
    :type infile: str.
    """

    bgzip_instructions, tabix_instructions = _bgzip_and_tabix_vcf_instructions(infile)

    job.fileStore.logToMaster("BGzip Command: {}\n".format(bgzip_instructions[0]))
    pipeline.run_and_log_command(bgzip_instructions[0], bgzip_instructions[1])

    job.fileStore.logToMaster("Tabix Command: {}\n".format(tabix_instructions[0]))
    pipeline.run_and_log_command(tabix_instructions[0], tabix_instructions[1])
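The (command, logfile) pairs consumed above come from _bgzip_and_tabix_vcf_instructions, which is not shown on this page. A plausible sketch matching that usage (the real helper may differ):

def _bgzip_and_tabix_vcf_instructions(infile):
    # Hypothetical sketch: each instruction is a (command string, logfile
    # name) pair, as unpacked by bgzip_and_tabix_vcf above.
    bgzip_command = "bgzip -f {}".format(infile)
    bgzip_instructions = (bgzip_command, "{}.bgzip.log".format(infile))

    tabix_command = "tabix -f -p vcf {}.gz".format(infile)
    tabix_instructions = (tabix_command, "{}.tabix.log".format(infile))

    return bgzip_instructions, tabix_instructions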
Example #21
def convert2pe(job, row):
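    """Convert a BAM file back to paired-end FastQ files with bedtools bamtofastq.
    The lane and sample IDs are parsed from the dot-separated fields of the
    BAM file name.
    """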
    bamfile = row[0]
    elements = bamfile.split('.')
    lane_id = elements[2]
    sample_id = elements[4]

    outfile1 = "{}.{}.R1.fastq".format(sample_id, lane_id)
    outfile2 = "{}.{}.R2.fastq".format(sample_id, lane_id)

    logfile = "convert_{}.log".format(bamfile)

    command = ("bedtools bamtofastq",
               "-i {}".format(bamfile),
               "-fq {}".format(outfile1),
               "-fq2 {}".format(outfile2))

    job.fileStore.logToMaster("Running command {} and logging to {}\n".format(command, logfile))
    pipeline.run_and_log_command(" ".join(command), logfile)
Example #22
def pisces(job, config, name, input_bam):
    """Run Pisces on a single sample
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns:  str -- The output vcf file name.
    """

    output_vcf = "{}.pisces.vcf".format(name)
    logfile = "{}.pisces.log".format(name)
    command = ["{}".format(config['pisces']['bin']),
               "-B",
               "-t",
               "{}".format(config['pisces']['num_cores']),
               "-ThreadByChr",
               "{}".format(input_bam),
               "-g",
               "{}".format(config['reference']),
               "-f",
               "{}".format(config['min_alt_af']),
               "-b",
               "{}".format(config['min_bq']),
               "-fo",
               "False",
               "-q",
               "{}".format(config['max_var_qscore']),
               "-c",
               "{}".format(config['coverage_threshold']),
               "-s",
               "{}".format(config['sb_threshold']),
               "-a",
               "{}".format(config['min_var_qscore']),
               "-F",
               "{}".format(config['var_qscore_threshold']),
               "-gVCF",
               "True"]

    job.fileStore.logToMaster("Pisces Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return output_vcf
Example #23
def mutect2_single(job, config, name, samples, input_bam):
    """Run MuTect on an an unmatched tumour sample and call somatic variants
    :param config: The configuration dictionary.
    :type config: dict.
    :param sample: sample name.
    :type sample: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns:  str -- The output vcf file name.
    """

    mutect_vcf = "{}.mutect2.vcf".format(name)
    mutect_logfile = "{}.mutect2.log".format(name)

    mutect_command = [
        "{}".format(config["gatk3.5"]["bin"]),
        "-T",
        "MuTect2",
        "-R",
        "{}".format(config["reference"]),
        "--dbsnp",
        "{}".format(config["dbsnp"]),
        "--cosmic",
        "{}".format(config["cosmic"]),
        "-drf DuplicateRead",
        "-ip 100",
        "-L",
        "{}".format(samples[name]["regions"]),
        "-nct",
        "{}".format(config["gatk3.5"]["num_cores"]),
        "-I:tumor",
        "{}".format(input_bam),
        "-o",
        "{}".format(mutect_vcf),
    ]

    job.fileStore.logToMaster("MuTect2 Command: {}\n".format(mutect_command))
    pipeline.run_and_log_command(" ".join(mutect_command), mutect_logfile)

    # job.fileStore.logToMaster("Subset Command: {}\n".format(subset_command))
    # pipeline.run_and_log_command(" ".join(subset_command), subset_log)

    return mutect_vcf
Example #24
def stringtie(job, config, name, samples, flags):
    """Perform transcript assembly and quantification with StringTie
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples info and config dictionary.
    :type samples: dict.
    :param flags: Flags for extra parameter settings.
    :type flags: list.
    :returns:  str -- The transcript assembly GTF file name.
    """

    logfile = "{}.stringtie.log".format(name)
    outfile = "{}.stringtie.gtf".format(name)
    abundances_file = "{}.gene_abundances.txt".format(name)

    outdir = "{}_stringtie_final".format(name)

    working_dir = os.getcwd()
    full_path_outfile = os.path.join(working_dir, outdir, outfile)

    command = [
        "{}".format(config["stringtie"]["bin"]),
        "{}".format(samples[name]["bam"]),
        "-p {}".format(config["stringtie"]["num_cores"]),
        "-G {}".format(config["merged_transcript_reference"]),
        "-A {}".format(abundances_file),
        "-f 0.05",
        "-m 100",
        "-B",
        "-e",
        "-o {}".format(full_path_outfile),
    ]

    command = add_additional_options(command, config, flags)

    job.fileStore.logToMaster("StringTie Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return outfile
Example #25
def run_delly2_single(job, config, name, input_bam):
    """Run delly2 for structural variant detection. As delly2 is parallelized on the level of samples,
    we use a single-threaded version
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns:  str -- The merged Delly output vcf file name.
    """

    delly_vcfs = list()
    delly_command_core = ("{}".format(config['delly']['bin']),
                          "-x",
                          "{}".format(config['delly']['exclude']),
                          "-g",
                          "{}".format(config['reference']))

    for mut_type in ["DEL", "DUP", "TRA", "INV"]:
        output_vcf = "{sample}.{type}.vcf".format(sample=name, type=mut_type)
        logfile = "{sample}.{type}.log".format(sample=name, type=mut_type)

        delly_vcfs.append(output_vcf)

        delly_command = list(delly_command_core)
        delly_command.extend(["-t",
                              "{}".format(mut_type),
                              "-o",
                              "{}".format(output_vcf),
                              "{}".format(input_bam)])

        job.fileStore.logToMaster("Running Delly: {}\n".format(delly_command))
        pipeline.run_and_log_command(" ".join(delly_command), logfile)

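    # merge_command, merge_log, and merged_vcf are presumably defined between
    # the loop and this point in the original source; their construction is
    # not shown in this excerpt.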
    job.fileStore.logToMaster("Merging delly output with command: {}\n".format(merge_command))
    pipeline.run_and_log_command(" ".join(merge_command), merge_log)

    return merged_vcf
Example #26
def cufflinks(job, config, name, samples):
    """Transcriptome assembly with cufflinks
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples info and config dictionary.
    :type samples: dict.
    :returns:  str -- The output transcriptome from cufflinks.
    """

    outdir = "{}_cufflinks".format(name)
    logfile = "{}.cufflinks.log".format(name)

    working_dir = os.getcwd()
    path = os.path.join(working_dir, outdir)
    try:
        os.mkdir(path)
    except OSError:
        sys.stderr.write("Directory {} already exists. Not creating...\n".format(path))

    os.chdir(path)

    command = ["{}".format(config['cufflinks']['bin']),
               "-g {}".format(config['transcript_reference']),
               "-b {}".format(config['reference']),
               "-u",
               "-p {}".format(config['cufflinks']['num_cores']),
               "--library-type {}".format(samples[name]['cufflinks_lib']),
               "{}".format(samples[name]['bam'])]

    if not os.path.isfile("transcripts.gtf"):
        job.fileStore.logToMaster("Cufflinks Command: {}\n".format(command))
        pipeline.run_and_log_command(" ".join(command), logfile)
    else:
        job.fileStore.logToMaster("Cufflinks appears to have already executed for {}. Skipping...\n".format(name))

    os.chdir(working_dir)

    return path
Example #27
def vt_normalization(job, config, sample, caller, input_vcf):
    """Decompose and left normalize variants
    :param config: The configuration dictionary.
    :type config: dict.
    :param sample: sample name.
    :type sample: str.
    :param caller: caller name.
    :type caller: str.
    :param input_vcf: The input_vcf file name to process.
    :type input_vcf: str.
    :returns:  str -- The output vcf file name.
    """

    output_vcf = "{}.{}.normalized.vcf".format(sample, caller)
    logfile = "{}.{}.vt_normalization.log".format(sample, caller)

    normalization = ["zless",
                     "{}".format(input_vcf),
                     "|",
                     "sed",
                     "'s/ID=AD,Number=./ID=AD,Number=R/'",
                     "|",
                     "{}".format(config['vt']['bin']),
                     "decompose",
                     "-s",
                     "-",
                     "|",
                     "{}".format(config['vt']['bin']),
                     "normalize",
                     "-r",
                     "{}".format(config['reference']),
                     "-",
                     ">",
                     "{}".format(output_vcf)]

    job.fileStore.logToMaster("VT Command: {}\n".format(normalization))
    pipeline.run_and_log_command(" ".join(normalization), logfile)

    return output_vcf
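The sed step in the pipeline above rewrites the VCF header's AD declaration from Number=. to Number=R so that vt decompose -s can treat AD as a per-allele field. A hypothetical usage sketch normalizing the output of several callers for one sample (the caller and file names here are illustrative):

# Assumes `job` and `config` come from the surrounding Toil workflow.
normalized_vcfs = []
for caller in ("mutect2", "freebayes", "vardict"):
    input_vcf = "sample1.{}.vcf".format(caller)
    normalized_vcfs.append(
        vt_normalization(job, config, "sample1", caller, input_vcf))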
Example #28
def run_flt3_itdseek(job, config, name):
    """Run ITDseek without a matched normal sample
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :returns:  str -- The output vcf file name.
    """

    itdseek_vcf = "{}.flt3.itdseek.vcf".format(name)
    itdseek_logfile = "{}.flt3.itdseek.log".format(name)

    itdseek_command = ["{}".format(config['itdseek']['bin']),
                       "{}.rg.sorted.bam".format(name),
                       "{}".format(config['reference']),
                       "{}".format(config['samtools-0.19']['bin']),
                       ">",
                       "{}".format(itdseek_vcf)]

    job.fileStore.logToMaster("ITDSeek Command: {}\n".format(itdseek_command))
    pipeline.run_and_log_command(" ".join(itdseek_command), itdseek_logfile)

    return itdseek_vcf
Example #29
File: qc.py  Project: dgaston/ddb-ngsflow
def run_fastqc(job, config, samples):
    """Run FastQC on provided FastQ files
    :param config: The configuration dictionary.
    :type config: dict.
    :param samples: Samples dictionary.
    :type samples: dict.
    """

    job.fileStore.logToMaster("Running FastQC for all samples\n")
    logfile = "fastqc.log"

    fastq_files_list = list()
    for sample in samples:
        fastq_files_list.append(samples[sample]['fastq1'])
        fastq_files_list.append(samples[sample]['fastq2'])

    fastq_files_string = " ".join(fastq_files_list)
    command = ["{}".format(config['fastqc']['bin']),
               "{}".format(fastq_files_string),
               "--extract"]

    job.fileStore.logToMaster("FastQC Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)
Example #30
def joint_variant_calling(job, config, name, samples):
    """Create a cohort VCF file based on joint calling from gVCF files
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: The run/cohort name used for the output file.
    :type name: str.
    :param samples: samples configuration dictionary.
    :type samples: dict.
    :returns:  str -- The output vcf file name.
    """

    vcf = "{}.haplotypecaller.vcf".format(name)
    logfile = "{}.haplotypecaller_gvcf.log".format(name)

    gvcfs = list()
    for sample in samples:
        gvcfs.append("--variant {}.haplotypecaller.g.vcf".format(sample))

    gvcf_string = " ".join(gvcfs)

    command = ["{}".format(config['gatk-jointgenotyper']['bin']),
               "-T",
               "GenotypeGVCFs",
               "-R",
               "{}".format(config['reference']),
               "{}".format(gvcf_string),
               "-nt",
               "{}".format(config['gatk-jointgenotyper']['num_cores']),
               "-o",
               "{}".format(vcf)]

    job.fileStore.logToMaster("GenotypeVCFs Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return vcf
Example #31
def run_fundi(job, root_name):
    """Take the specified VCF and use vcfanno to add additional annotations
       :param config: The configuration dictionary.
       :type config: dict.
       :param sample: sample name.
       :type sample: str.
       :param input_vcf: The input_vcf file name to process.
       :type input_vcf: str.
       :returns:  str -- The output vcf file name.
       """

    logfile = "{}.fundi.log".format(root_name)

    command = ["perl ./FunDi.pl",
               "-a",
               "{}.aa_modified_nodash.phy".format(root_name),
               "-o",
               "{}.aa_modified_nodash_subtree".format(root_name),
               "-m LG+F+G",
               "-s",
               "{}.nh.def".format(root_name),
               "-P iqtree",
               "-r 4",
               "-t",
               "{}.nh.newick".format(root_name),
               "-N 22"]

    mv_fundi_log = "mv FunDi.log {}_FunDi.log".format(root_name)

    job.fileStore.logToMaster("FunDi Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    job.fileStore.logToMaster("Rename file Command: {}\n".format(command))
    pipeline.run_and_log_command(mv_fundi_log, logfile)

    return logfile
Example #32
def subsample_bam(job, addresses, keyspace, auth, name, samples, config, seed,
                  fraction, iteration):
    """Use samtools view to subsample an input file to the specified fraction"""

    library_name = "subsample-{}-{}-{}".format(samples[name]['library_name'],
                                               fraction, iteration)
    sublog = "subsample-{}-{}-{}.log".format(name, fraction, iteration)
    input_bam = "{}.recalibrated.sorted.bam".format(
        samples[name]['library_name'])
    subsampled_bam = "subsample-{}-{}-{}.bam".format(
        samples[name]['library_name'], fraction, iteration)
    samcommand = "samtools view -s {seed}.{fraction} -b {input} > {output}".format(
        seed=seed, fraction=fraction, input=input_bam, output=subsampled_bam)

    index_command = "samtools index {}".format(subsampled_bam)
    index_log = "{}.index.log".format(subsampled_bam)

    output = "{}.sambamba_coverage.bed".format(subsampled_bam)
    logfile = "{}.sambamba_coverage.log".format(subsampled_bam)

    command = ("{}".format(config['sambamba']['bin']), "depth region", "-L",
               "{}".format(samples[name]['regions']), "-t",
               "{}".format(config['sambamba']['num_cores']), "-T",
               "{}".format(config['coverage_threshold']), "-T",
               "{}".format(config['coverage_threshold2']),
               "{}".format(subsampled_bam), ">", "{}".format(output))

    job.fileStore.logToMaster("Samtools ViewCommand: {}\n".format(samcommand))
    pipeline.run_and_log_command(samcommand, sublog)

    job.fileStore.logToMaster(
        "Samtools Index Command: {}\n".format(index_command))
    pipeline.run_and_log_command(index_command, index_log)

    job.fileStore.logToMaster(
        "SamBamba Coverage Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    connection.setup(addresses, keyspace, auth_provider=auth)

    job.fileStore.logToMaster("Adding coverage data: {}\n".format(samcommand))

    num_libs = (float(samples[name]['num_libraries_in_run']) *
                (1 / (float(fraction) / 100.00)))
    with open(output, 'rb') as coverage:
        reader = csv.reader(coverage, delimiter='\t')
        header = next(reader)
        threshold_indices = list()
        thresholds = list()
        for index, element in enumerate(header):
            if element.startswith("percentage"):
                threshold = element.replace('percentage', '')
                threshold_indices.append(index)
                thresholds.append(int(threshold))

        for row in reader:
            threshold_data = defaultdict(float)
            for index, threshold in enumerate(thresholds):
                threshold_data[threshold] = row[threshold_indices[index]]

            sample_data = SampleCoverage.create(
                sample=samples[name]['sample_name'],
                library_name=library_name,
                run_id="subsample-{}".format(fraction),
                num_libraries_in_run=num_libs,
                sequencer_id=samples[name]['sequencer'],
                program_name="sambamba",
                extraction=samples[name]['extraction'],
                panel=samples[name]['panel'],
                target_pool=samples[name]['target_pool'],
                amplicon=row[3],
                num_reads=row[4],
                mean_coverage=row[5],
                thresholds=thresholds,
                perc_bp_cov_at_thresholds=threshold_data)

            amplicon_data = AmpliconCoverage.create(
                amplicon=row[3],
                sample=samples[name]['sample_name'],
                library_name=library_name,
                run_id="subsample-{}".format(fraction),
                num_libraries_in_run=num_libs,
                sequencer_id=samples[name]['sequencer'],
                program_name="sambamba",
                extraction=samples[name]['extraction'],
                panel=samples[name]['panel'],
                target_pool=samples[name]['target_pool'],
                num_reads=row[4],
                mean_coverage=row[5],
                thresholds=thresholds,
                perc_bp_cov_at_thresholds=threshold_data)