def platypus_single(job, config, name, samples, input_bam):
    """Run Platypus on an unmatched tumour sample and call somatic variants
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples configuration dictionary.
    :type samples: dict.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns: str -- The output vcf file name.
    """

    platypus_vcf = "{}.platypus.vcf".format(name)
    platypus_log = "{}.platypus.log".format(name)
    internal_log = "{}.platypus_internal.log".format(name)

    platypus_command = ["{}".format(config['platypus']['bin']),
                        "callVariants",
                        "--refFile={}".format(config['reference']),
                        "--regions={}".format(samples[name]['regions']),
                        "--assemble=1",
                        "--assembleBadReads=1",
                        "--assembleBrokenPairs=1",
                        "--filterDuplicates=0",
                        "--minVarFreq={}".format(config['min_alt_af']),
                        "--nCPU={}".format(config['platypus']['num_cores']),
                        "--logFileName={}".format(internal_log),
                        "--bamFiles={}".format(input_bam),
                        "--output={}".format(platypus_vcf)]

    job.fileStore.logToMaster("Platypus Command: {}\n".format(platypus_command))
    pipeline.run_and_log_command(" ".join(platypus_command), platypus_log)

    return platypus_vcf

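# Every caller in this module shells out through pipeline.run_and_log_command,
# which is not defined in this section. A minimal sketch of the assumed
# behaviour (run the command string through a shell, capture output in a
# logfile, fail loudly on a non-zero exit) -- an assumption, not the pipeline
# module's actual implementation:

import subprocess


def run_and_log_command(command, logfile):
    """Hypothetical sketch of the helper used throughout this module."""
    with open(logfile, 'w') as log:
        # shell=True because these commands rely on pipes and redirection;
        # check_call raises CalledProcessError on failure.
        subprocess.check_call(command, shell=True, stdout=log,
                              stderr=subprocess.STDOUT)
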
def scalpel_single(job, config, name, samples, input_bam):
    """Run Scalpel on an unmatched tumour sample and call somatic variants
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples configuration dictionary.
    :type samples: dict.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns: str -- The output vcf file name.
    """

    cwd = os.getcwd()
    output_dir = os.path.join(cwd, "{}-scalpel-output".format(name))
    scalpel_vcf = os.path.join(output_dir, "variants.indel.vcf")
    fixed_vcf = "{}.scalpel.vcf".format(name)
    logfile = "{}.scalpel.log".format(name)
    logfile2 = "{}.scalpel_fix.log".format(name)

    scalpel_command = ["{}".format(config['scalpel']['bin']),
                       "--single",
                       "--intarget",
                       # "--covthr",
                       # "3",
                       # "--lowcov",
                       # "1",
                       "--ref", "{}".format(config['reference']),
                       "--bed", "{}".format(samples[name]['regions']),
                       "--format", "vcf",
                       "--numprocs", "{}".format(config['scalpel']['num_cores']),
                       "--bam", "{}".format(input_bam),
                       "--dir", "{}".format(output_dir)]

    # Scalpel writes a generic "sample" column name; rewrite it to the real
    # sample name so downstream merging keys on the correct sample.
    fix_sample_name_command = ["cat", "{}".format(scalpel_vcf),
                               "|",
                               "sed", "'s/sample/{}/g'".format(name),
                               ">",
                               "{}".format(fixed_vcf)]

    job.fileStore.logToMaster("Scalpel Command: {}\n".format(scalpel_command))
    pipeline.run_and_log_command(" ".join(scalpel_command), logfile)

    job.fileStore.logToMaster("Scalpel Fix Command: {}\n".format(fix_sample_name_command))
    pipeline.run_and_log_command(" ".join(fix_sample_name_command), logfile2)

    file_path = os.path.join(cwd, fixed_vcf)
    if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
        # The renamed VCF is the caller-facing output.
        return fixed_vcf
    else:
        job.fileStore.logToMaster("Scalpel ran into a problem and no output was generated for file {}. "
                                  "Check logfile {} for details\n".format(scalpel_vcf, logfile))
        # Raising the exception lets Toil mark the job as failed.
        raise JobException("Scalpel ran into a problem and no output was generated for file {}. "
                           "Check logfile {} for details\n".format(scalpel_vcf, logfile))

def freebayes_single(job, config, name, input_bam):
    """Run FreeBayes without a matched normal sample
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns: str -- The output vcf file name.
    """

    freebayes_vcf = "{}.freebayes.vcf".format(name)
    logfile = "{}.freebayes.log".format(name)

    command = ["{}".format(config['freebayes']['bin']),
               "--fasta-reference", "{}".format(config['reference']),
               "--min-alternate-fraction", "{}".format(config['min_alt_af']),
               "--pooled-discrete",
               "--pooled-continuous",
               "--genotype-qualities",
               "--report-genotype-likelihood-max",
               "--allele-balance-priors-off",
               "--use-duplicate-reads",
               "--min-repeat-entropy", "1",
               "-v", "{}".format(freebayes_vcf),
               "{}".format(input_bam)]

    job.fileStore.logToMaster("FreeBayes Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return freebayes_vcf

def scanindel(job, config, name, samples, input_bam):
    """Run the ScanIndel caller for structural variant detection
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples configuration dictionary.
    :type samples: dict.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns: str -- The output vcf file name.
    """

    output_vcf = "{}.scanindel.vcf".format(name)
    logfile = "{}.scanindel.log".format(name)
    sample_config_file = "{}.scanindel_sample_config.txt".format(name)

    with open(sample_config_file, 'w') as sample_config:
        sample_config.write("{id}\t{file}".format(id=name, file=input_bam))

    command = ["{}".format(config['scanindel']['bin']),
               "-i", "{}".format(sample_config_file),
               "-p", "{}".format(config['scanindel']['config_file']),
               "--bam",
               "-F", "{}".format(config['min_alt_af']),
               "-t", "{}".format(samples[name]['regions'])]

    job.fileStore.logToMaster("ScanIndel Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return output_vcf

def star_unpaired(job, config, name, samples, flags):
    """Align RNA-Seq data to a reference using STAR
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples info and config dictionary.
    :type samples: dict.
    :param flags: Flags for extra parameter settings.
    :type flags: list.
    :returns: str -- The output BAM file name.
    """

    output = "{}.star.".format(name)
    logfile = "{}.star.log".format(name)
    output_file = "{}Aligned.sortedByCoord.out.bam".format(output)

    command = ["{}".format(config['star']['bin']),
               "--genomeDir {}".format(config['star']['index']),
               "--runThreadN {}".format(config['star']['num_cores']),
               "--readFilesIn {}".format(samples[name]['fastq1']),
               "--outFileNamePrefix {}".format(output),
               "--outReadsUnmapped Fastx",
               "--outSAMtype BAM SortedByCoordinate"]

    command = add_additional_options(command, config, flags)

    job.fileStore.logToMaster("STAR Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return output_file

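# Several tool wrappers in this module post-process their argument lists with
# add_additional_options, which is not defined in this section. A minimal
# sketch, assuming the flags list names optional settings whose pre-formatted
# option strings live in the config dictionary -- an assumption about the
# helper's contract, not its actual implementation:


def add_additional_options(command, config, flags):
    """Hypothetical sketch: append extra tool options selected by flags."""
    for flag in flags:
        if flag in config:
            # e.g. config['outSAMstrandField'] = "--outSAMstrandField intronMotif"
            command.append("{}".format(config[flag]))
    return command
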
def sambamba_region_coverage(job, config, name, samples, input_bam):
    """Run Sambamba to calculate the coverage of targeted regions
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample/library name.
    :type name: str.
    :param samples: The samples configuration dictionary.
    :type samples: dict.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns: str -- The output BED file name.
    """

    output = "{}.sambamba_coverage.bed".format(name)
    logfile = "{}.sambamba_coverage.log".format(name)

    command = ["{}".format(config['sambamba']['bin']),
               "depth region",
               "-L", "{}".format(samples[name]['regions']),
               "-t", "{}".format(config['sambamba']['num_cores']),
               "-T", "{}".format(config['coverage_threshold']),
               "-T", "{}".format(config['coverage_threshold2']),
               "{}".format(input_bam),
               ">", "{}".format(output)]

    job.fileStore.logToMaster("Sambamba Coverage Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return output

def rapmap_quasi_paired(job, config, name, samples, flags):
    """Run the RapMap quasi-mapping procedure on paired-end sequencing data
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples info and config dictionary.
    :type samples: dict.
    :param flags: Flags for extra parameter settings.
    :type flags: list.
    :returns: str -- The output SAM file name.
    """

    output = "{}.rapmap.sam".format(name)
    logfile = "{}.rapmap_quasi.log".format(name)

    command = ["{} quasimap".format(config['rapmap']['bin']),
               "-t {}".format(config['rapmap']['num_cores']),
               "-i {}".format(config['rapmap']['index']),
               "-1 {}".format(samples[name]['fastq1']),
               "-2 {}".format(samples[name]['fastq2']),
               "-o {}".format(output)]

    job.fileStore.logToMaster("RapMap Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return output

def cuffquant(job, config, name, samples):
    """Run Cuffquant on all samples
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples info and config dictionary.
    :type samples: dict.
    :returns: str -- The directory name for the cuffquant results.
    """

    outdir = "{}_cuffquant".format(name)
    logfile = "{}.cuffquant.log".format(name)

    command = ["{}".format(config['cuffquant']['bin']),
               "-b {}".format(config['reference']),
               "-p {}".format(config['cuffquant']['num_cores']),
               "-o ./{}".format(outdir),
               "-u",
               "{}".format(config['merged_transcript_reference']),
               "{}".format(samples[name]['bam'])]

    job.fileStore.logToMaster("Cuffquant Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return outdir

def stringtie_merge(job, config, samples, flags, transcripts_list):
    """Merge per-sample transcript assemblies with StringTie
    :param config: The configuration dictionary.
    :type config: dict.
    :param samples: The samples info and config dictionary.
    :type samples: dict.
    :param flags: Flags for extra parameter settings.
    :type flags: list.
    :param transcripts_list: File listing the per-sample GTF assemblies to merge.
    :type transcripts_list: str.
    :returns: str -- The merged transcript assembly GTF file name.
    """

    logfile = "{}.stringtie_merge.log".format(config["run_id"])
    outfile = "{}.stringtie.merged.gtf".format(config["run_id"])

    # StringTie's documented merge usage is: stringtie --merge [options] gtf_list
    command = ["{}".format(config["stringtie"]["bin"]),
               "--merge",
               "-p {}".format(config["stringtie"]["num_cores"]),
               "-G {}".format(config["transcript_reference"]),
               "-o {}".format(outfile),
               "{}".format(transcripts_list)]

    command = add_additional_options(command, config, flags)

    job.fileStore.logToMaster("StringTie Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return outfile

def bowtie_paired(job, config, name, samples, flags):
    """Align RNA-Seq data to a reference using Bowtie2
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples info and config dictionary.
    :type samples: dict.
    :param flags: Flags for extra parameter settings.
    :type flags: list.
    :returns: str -- The output SAM file name.
    """

    output = "{}.bowtie.sam".format(name)
    logfile = "{}.bowtie.log".format(name)

    command = ["{}".format(config['bowtie']['bin']),
               "-x {}".format(config['bowtie']['index']),
               "-p {}".format(config['bowtie']['num_cores']),
               "-1 {}".format(samples[name]['fastq1']),
               "-2 {}".format(samples[name]['fastq2']),
               "-S {}".format(output)]

    command = add_additional_options(command, config, flags)

    job.fileStore.logToMaster("Bowtie Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return output

def salmonVB_unpaired(job, config, name, samples):
    """Run Salmon quasi-mapping on single-end data using the VB optimization algorithm
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples info and config dictionary.
    :type samples: dict.
    :returns: str -- The output directory name.
    """

    output_dir = "{}.salmon.output".format(name)
    logfile = "{}.salmon.log".format(name)

    command = ["{} quant".format(config['salmon']['bin']),
               "-i {}".format(config['salmon']['index']),
               "-l {}".format(samples[name]['library_type']),
               "-p {}".format(config['salmon']['num_cores']),
               "--useVBOpt",
               "--numBootstraps {}".format(config['salmon']['num_bootstraps']),
               "--biasCorrect",
               "--useFSPD",
               "-r {}".format(samples[name]['fastq1']),
               "-o {}".format(output_dir)]

    job.fileStore.logToMaster("Salmon Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return output_dir

def cuffmerge(job, config, name, samples, manifest):
    """Merge assembled Cufflinks transcriptomes from all samples
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: Samples config data.
    :type samples: dict.
    :param manifest: Manifest file listing the per-sample assembly GTFs to merge.
    :type manifest: str.
    :returns: str -- The root name for the cuffmerge stats output.
    """

    stats_root = "{}_cuffmerge_stats".format(config['run_id'])
    logfile = "{}.cuffmerge.log".format(config['run_id'])

    command = ["{}".format(config['cuffmerge']['bin']),
               "-g {}".format(config['transcript_reference']),
               "-s {}".format(config['reference']),
               "-p {}".format(config['cuffmerge']['num_cores']),
               "{}".format(manifest)]

    job.fileStore.logToMaster("Cuffmerge Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    # Record the merged transcriptome location for downstream quantification.
    pwd = os.getcwd()
    config['merged_transcript_reference'] = os.path.join(pwd, "merged.gtf")

    return stats_root

def bedtools_coverage_per_site(job, config, name, input_bam):
    """Run BedTools to calculate the per-site coverage of targeted regions
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns: str -- The output BED file name.
    """

    output = "{}.bedtools_coverage_per_site.bed".format(name)
    logfile = "{}.bedtools_coverage.log".format(name)

    coverage = ["{}".format(config["bedtools"]["bin"]),
                "coverage",
                "-d",
                "-a", "{}".format(config["regions"]),
                "-b", "{}".format(input_bam),
                ">", "{}".format(output)]

    job.fileStore.logToMaster("BedTools Coverage Command: {}\n".format(coverage))
    pipeline.run_and_log_command(" ".join(coverage), logfile)

    return output

def run_lowfreq(job, config, name, input_bam):
    """Run LoFreq on an unmatched tumour sample and call somatic variants
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns: str -- The output vcf file name.
    """

    vcf = "{}.lofreq.vcf".format(name)
    logfile = "{}.lofreq.log".format(name)

    command = ["{}".format(config["lofreq"]["bin"]),
               "somatic",
               "-t", "{}".format(input_bam),
               "--call-indels",
               "-f", "{}".format(config["reference"]),
               "--threads", "{}".format(config["lofreq"]["num_cores"]),
               "-d", "{}".format(config["dbsnp"]),
               "-o", "{}".format(vcf)]

    job.fileStore.logToMaster("LoFreq Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return vcf

def run_pindel(job, config, name, input_bam):
    """Run the Pindel caller for indel detection
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns: str -- The output vcf file name.
    """

    pindel_config = "{}.pindel_config.txt".format(name)
    output_dir = "{}_pindel".format(name)
    output_vcf = "{}.pindel.vcf".format(name)
    logfile = "{}.pindel.log".format(name)
    vcf_logfile = "{}.pindel2vcf.log".format(name)

    with open(pindel_config, 'w') as bam_config:
        bam_config.write("%s %s %s\n" % (input_bam, config['insert_size'], name))

    command = ["{}".format(config['pindel']['bin']),
               "-f", "{}".format(config['reference']),
               "-c", "ALL",
               "-w", "{}".format(config['pindel']['window']),
               "-E", "{}".format(config['pindel']['sensitivity']),
               "-T", "{}".format(config['pindel']['num_cores']),
               "-o", "{}".format(output_dir),
               "-i", "{}".format(pindel_config)]

    pindel2vcf_command = ["{}".format(config['pindel2vcf']['bin']),
                          "-r", "{}".format(config['reference']),
                          "-R", "{}".format(config['snpeff']['reference']),
                          "-d", "{}".format(config['snpeff']['reference']),
                          "-he", "0.01",
                          "-G",
                          "-P", "{}".format(output_dir),
                          "-v", "{}".format(output_vcf)]

    job.fileStore.logToMaster("Pindel Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    job.fileStore.logToMaster("Pindel2VCF Command: {}\n".format(pindel2vcf_command))
    pipeline.run_and_log_command(" ".join(pindel2vcf_command), vcf_logfile)

    return output_vcf

def run_bwa_mem(job, config, name, samples):
    """Align paired-end reads with BWA-MEM, piping through samtools to produce a sorted BAM
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples configuration dictionary.
    :type samples: dict.
    :returns: str -- Aligned and sorted BAM file name.
    """

    job.fileStore.logToMaster("Running BWA for sample {}\n".format(name))

    output_bam = "{}.bwa.sorted.bam".format(name)
    temp = "{}.bwa.sort.temp".format(name)
    logfile = "{}.bwa-align.log".format(name)

    bwa_cmd = ["{}".format(config["bwa"]["bin"]),
               "mem",
               "-t", "{}".format(config["bwa"]["num_cores"]),
               "-M",
               "-v", "2",
               "{}".format(config["reference"]),
               "{}".format(samples[name]["fastq1"]),
               "{}".format(samples[name]["fastq2"])]

    view_cmd = ["{}".format(config["samtools"]["bin"]),
                "view",
                "-u",
                "-"]

    sort_cmd = ["{}".format(config["samtools"]["bin"]),
                "sort",
                "-@", "{}".format(config["bwa"]["num_cores"]),
                "-O", "bam",
                "-o", "{}".format(output_bam),
                "-T", "{}".format(temp),
                "-"]

    command = "{} | {} | {}".format(" ".join(bwa_cmd), " ".join(view_cmd), " ".join(sort_cmd))

    job.fileStore.logToMaster("BWA Command: {}\n".format(command))
    pipeline.run_and_log_command(command, logfile)

    return output_bam

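# Design note on run_bwa_mem: streaming bwa mem | samtools view -u |
# samtools sort avoids writing an intermediate SAM file to disk, and view's
# -u flag emits uncompressed BAM so no CPU is spent compressing data that the
# sort step immediately re-reads. The same pattern is reused by the HISAT2
# wrapper below.
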
def vardict_single(job, config, name, samples, input_bam):
    """Run VarDict on an unmatched tumour sample and call somatic variants
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples configuration dictionary.
    :type samples: dict.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns: str -- The output vcf file name.
    """

    vardict_vcf = "{}.vardict.vcf".format(name)
    logfile = "{}.vardict.log".format(name)

    vardict = ["{}".format(config['vardict']['bin']),
               "-G", "{}".format(config['reference']),
               "-z",
               "-c", "1",
               "-S", "2",
               "-E", "3",
               "-g", "4",
               "-B", "{}".format(config['vardict']['num_cores']),
               # "-a", the amplicon flag seems to be creating errors
               # "-F 0", probably unnecessary: duplicates aren't marked and ignoring secondary alignments is fine
               "-f", "{}".format(config['min_alt_af']),
               "-N", "{}".format(name),
               "-b", "{}".format(input_bam),
               "{}".format(samples[name]['regions'])]

    vardict2vcf = ["{}".format(config['vardict2vcf']['bin']),
                   "-E",
                   "-f", "{}".format(config['min_alt_af']),
                   "-N", "{}".format(name)]

    vcfsort = ["{}".format(config['vcftools_sort']['bin']), "-c"]

    command = ("{vardict} | {strandbias} | {vardict2vcf} | "
               "{sort} > {vcf}".format(vardict=" ".join(vardict),
                                       strandbias=config['vardict_strandbias']['bin'],
                                       vardict2vcf=" ".join(vardict2vcf),
                                       sort=" ".join(vcfsort),
                                       vcf=vardict_vcf))

    job.fileStore.logToMaster("VarDict Command: {}\n".format(command))
    pipeline.run_and_log_command(command, logfile)

    return vardict_vcf

def hisat_unpaired(job, config, name, samples, flags):
    """Align RNA-Seq data to a reference using HISAT2
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples info and config dictionary.
    :type samples: dict.
    :param flags: Flags for extra parameter settings.
    :type flags: list.
    :returns: str -- The output bam file name.
    """

    working_dir = os.getcwd()
    logfile = "{}.hisat.log".format(name)
    output = "{}.hisat.sorted.bam".format(name)
    unaligned = os.path.join(working_dir, "{}.unaligned.sam".format(name))
    temp = "{}.hisat.sort.temp".format(name)

    hisat_cmd = ["{}".format(config['hisat']['bin']),
                 "-p {}".format(config['hisat']['num_cores']),
                 "--dta",
                 "-x {}".format(config['hisat']['index']),
                 "-U {}".format(samples[name]['fastq1']),
                 "--un {}".format(unaligned)]

    hisat_cmd = add_additional_options(hisat_cmd, config, flags)

    view_cmd = ["{}".format(config['samtools']['bin']),
                "view",
                "-u",
                "-"]

    sort_cmd = ["{}".format(config['samtools']['bin']),
                "sort",
                "-@", "{}".format(config['hisat']['num_cores']),
                "-O", "bam",
                "-o", "{}".format(output),
                "-T", "{}".format(temp),
                "-"]

    command = "{} | {} | {}".format(" ".join(hisat_cmd), " ".join(view_cmd), " ".join(sort_cmd))

    job.fileStore.logToMaster("HISAT2 Command: {}\n".format(command))
    pipeline.run_and_log_command(command, logfile)

    return output

def bcftools_filter_variants_regions(job, config, name, samples, input_vcf):
    """Use bcftools to filter a VCF file to only variants found within the specified regions file
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples configuration dictionary.
    :type samples: dict.
    :param input_vcf: The input_vcf file name to process.
    :type input_vcf: str.
    :returns: str -- The output vcf file name.
    """

    filtered_vcf = "{}.on_target.vcf".format(name)
    sorted_vcf = "{}.on_target_sorted.vcf".format(name)
    bgzipped_vcf = "{}.gz".format(input_vcf)
    logfile = "{}.on_target_filter.log".format(name)
    sort_logfile = "{}.on_target_sorted.log".format(name)

    bgzip_and_tabix_vcf(job, input_vcf)

    filter_command = ["{}".format(config["bcftools"]["bin"]),
                      "isec",
                      "-T", "{}".format(samples[name]["regions"]),
                      "{}".format(bgzipped_vcf),
                      ">", "{}".format(filtered_vcf)]

    sort_command = ["cat", "{}".format(filtered_vcf),
                    "|",
                    "{}".format(config["vcftools_sort"]["bin"]),
                    "-c",
                    ">", "{}".format(sorted_vcf)]

    job.fileStore.logToMaster("BCFtools isec command for filtering to only target regions: {}\n".format(filter_command))
    pipeline.run_and_log_command(" ".join(filter_command), logfile)

    job.fileStore.logToMaster("VCFtools sort command for filtering to only target regions: {}\n".format(sort_command))
    pipeline.run_and_log_command(" ".join(sort_command), sort_logfile)

    return sorted_vcf

def bgzip_and_tabix_vcf(job, infile):
    """Run bgzip and tabix on the specified VCF
    :param infile: The input VCF file name to process.
    :type infile: str.
    :returns: None -- Creates {infile}.gz and its tabix index as side effects.
    """

    bgzip_instructions, tabix_instructions = _bgzip_and_tabix_vcf_instructions(infile)

    job.fileStore.logToMaster("BGzip Command: {}\n".format(bgzip_instructions[0]))
    pipeline.run_and_log_command(bgzip_instructions[0], bgzip_instructions[1])

    job.fileStore.logToMaster("Tabix Command: {}\n".format(tabix_instructions[0]))
    pipeline.run_and_log_command(tabix_instructions[0], tabix_instructions[1])

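# _bgzip_and_tabix_vcf_instructions is used above but not defined in this
# section. A minimal sketch, assuming each instruction is a (command, logfile)
# pair consumed by run_and_log_command -- an assumption, not the real helper:


def _bgzip_and_tabix_vcf_instructions(infile):
    """Hypothetical sketch: build bgzip and tabix command/logfile pairs."""
    # bgzip -c keeps the original uncompressed VCF alongside the .gz copy.
    bgzip_command = "bgzip -c {} > {}.gz".format(infile, infile)
    bgzip_instructions = (bgzip_command, "{}.bgzip.log".format(infile))

    # tabix -p vcf writes the {infile}.gz.tbi index next to the archive.
    tabix_command = "tabix -p vcf {}.gz".format(infile)
    tabix_instructions = (tabix_command, "{}.tabix.log".format(infile))

    return bgzip_instructions, tabix_instructions
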
def convert2pe(job, row):
    """Convert a BAM file back to paired-end FastQ files with bedtools
    :param row: Manifest row whose first element is the BAM file name.
    :type row: list.
    """

    bamfile = row[0]
    # Assumes dot-delimited file names where field 3 is the lane ID and
    # field 5 is the sample ID (e.g. run.date.L001.barcode.SampleA.bam).
    elements = bamfile.split('.')
    lane_id = elements[2]
    sample_id = elements[4]

    outfile1 = "{}.{}.R1.fastq".format(sample_id, lane_id)
    outfile2 = "{}.{}.R2.fastq".format(sample_id, lane_id)
    logfile = "convert_{}.log".format(bamfile)

    command = ["bedtools bamtofastq",
               "-i {}".format(bamfile),
               "-fq {}".format(outfile1),
               "-fq2 {}".format(outfile2)]

    job.fileStore.logToMaster("Running command {} and logging to {}\n".format(command, logfile))
    pipeline.run_and_log_command(" ".join(command), logfile)

def pisces(job, config, name, input_bam):
    """Run Pisces on a single sample
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns: str -- The output vcf file name.
    """

    output_vcf = "{}.pisces.vcf".format(name)
    logfile = "{}.pisces.log".format(name)

    command = ["{}".format(config['pisces']['bin']),
               "-B", "{}".format(input_bam),
               "-t", "{}".format(config['pisces']['num_cores']),
               "-ThreadByChr",
               "-g", "{}".format(config['reference']),
               "-f", "{}".format(config['min_alt_af']),
               "-b", "{}".format(config['min_bq']),
               "-fo", "False",
               "-q", "{}".format(config['max_var_qscore']),
               "-c", "{}".format(config['coverage_threshold']),
               "-s", "{}".format(config['sb_threshold']),
               "-a", "{}".format(config['min_var_qscore']),
               "-F", "{}".format(config['var_qscore_threshold']),
               "-gVCF", "True"]

    job.fileStore.logToMaster("Pisces Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return output_vcf

def mutect2_single(job, config, name, samples, input_bam):
    """Run MuTect2 on an unmatched tumour sample and call somatic variants
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples configuration dictionary.
    :type samples: dict.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns: str -- The output vcf file name.
    """

    mutect_vcf = "{}.mutect2.vcf".format(name)
    mutect_logfile = "{}.mutect2.log".format(name)

    mutect_command = ["{}".format(config["gatk3.5"]["bin"]),
                      "-T", "MuTect2",
                      "-R", "{}".format(config["reference"]),
                      "--dbsnp", "{}".format(config["dbsnp"]),
                      "--cosmic", "{}".format(config["cosmic"]),
                      "-drf DuplicateRead",
                      "-ip 100",
                      "-L", "{}".format(samples[name]["regions"]),
                      "-nct", "{}".format(config["gatk3.5"]["num_cores"]),
                      "-I:tumor", "{}".format(input_bam),
                      "-o", "{}".format(mutect_vcf)]

    job.fileStore.logToMaster("MuTect2 Command: {}\n".format(mutect_command))
    pipeline.run_and_log_command(" ".join(mutect_command), mutect_logfile)

    # job.fileStore.logToMaster("Subset Command: {}\n".format(subset_command))
    # pipeline.run_and_log_command(" ".join(subset_command), subset_log)

    return mutect_vcf

def stringtie(job, config, name, samples, flags):
    """Perform transcript assembly and quantification with StringTie
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples info and config dictionary.
    :type samples: dict.
    :param flags: Flags for extra parameter settings.
    :type flags: list.
    :returns: str -- The transcript assembly GTF file name.
    """

    logfile = "{}.stringtie.log".format(name)
    outfile = "{}.stringtie.gtf".format(name)
    abundances_file = "{}.gene_abundances.txt".format(name)
    outdir = "{}_stringtie_final".format(name)

    working_dir = os.getcwd()
    full_path_outfile = os.path.join(working_dir, outdir, outfile)

    command = ["{}".format(config["stringtie"]["bin"]),
               "{}".format(samples[name]["bam"]),
               "-p {}".format(config["stringtie"]["num_cores"]),
               "-G {}".format(config["merged_transcript_reference"]),
               "-A {}".format(abundances_file),
               "-f 0.05",
               "-m 100",
               "-B",
               "-e",
               "-o {}".format(full_path_outfile)]

    command = add_additional_options(command, config, flags)

    job.fileStore.logToMaster("StringTie Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return outfile

def run_delly2_single(job, config, name, input_bam):
    """Run Delly2 for structural variant detection. Delly2 parallelizes
    across samples, so each per-type call here runs single-threaded.
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns: str -- The merged Delly output vcf file name.
    """

    delly_vcfs = list()
    delly_command_core = ["{}".format(config['delly']['bin']),
                          "-x", "{}".format(config['delly']['exclude']),
                          "-g", "{}".format(config['reference'])]

    for mut_type in ["DEL", "DUP", "TRA", "INV"]:
        output_vcf = "{sample}.{type}.vcf".format(sample=name, type=mut_type)
        logfile = "{sample}.{type}.log".format(sample=name, type=mut_type)

        delly_vcfs.append(output_vcf)

        # Build the full per-type command from the shared core options.
        delly_command = list(delly_command_core)
        delly_command.extend(["-t", "{}".format(mut_type),
                              "-o", "{}".format(output_vcf),
                              "{}".format(input_bam)])

        job.fileStore.logToMaster("Running Delly: {}\n".format(delly_command))
        pipeline.run_and_log_command(" ".join(delly_command), logfile)

    # Merge the per-type VCFs into a single callset. Sketched here with
    # bcftools concat -- an assumption about the available toolchain; swap in
    # the pipeline's preferred merge tool if it differs.
    merged_vcf = "{}.delly.vcf".format(name)
    merge_log = "{}.delly_merge.log".format(name)
    merge_command = ["{}".format(config['bcftools']['bin']),
                     "concat",
                     "-o", "{}".format(merged_vcf)] + delly_vcfs

    job.fileStore.logToMaster("Merging delly output with command: {}\n".format(merge_command))
    pipeline.run_and_log_command(" ".join(merge_command), merge_log)

    return merged_vcf

def cufflinks(job, config, name, samples):
    """Transcriptome assembly with Cufflinks
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :param samples: The samples info and config dictionary.
    :type samples: dict.
    :returns: str -- The path to the cufflinks output directory.
    """

    outdir = "{}_cufflinks".format(name)
    logfile = "{}.cufflinks.log".format(name)

    working_dir = os.getcwd()
    path = os.path.join(working_dir, outdir)
    try:
        os.mkdir(path)
    except OSError:
        sys.stderr.write("Directory {} already exists. Not creating...\n".format(path))
    os.chdir(path)

    command = ["{}".format(config['cufflinks']['bin']),
               "-g {}".format(config['transcript_reference']),
               "-b {}".format(config['reference']),
               "-u",
               "-p {}".format(config['cufflinks']['num_cores']),
               "--library-type {}".format(samples[name]['cufflinks_lib']),
               "{}".format(samples[name]['bam'])]

    if not os.path.isfile("transcripts.gtf"):
        job.fileStore.logToMaster("Cufflinks Command: {}\n".format(command))
        pipeline.run_and_log_command(" ".join(command), logfile)
    else:
        job.fileStore.logToMaster("Cufflinks appears to have already executed for {}. Skipping...\n".format(name))

    os.chdir(working_dir)

    return path

def vt_normalization(job, config, sample, caller, input_vcf):
    """Decompose and left-normalize variants
    :param config: The configuration dictionary.
    :type config: dict.
    :param sample: sample name.
    :type sample: str.
    :param caller: caller name.
    :type caller: str.
    :param input_vcf: The input_vcf file name to process.
    :type input_vcf: str.
    :returns: str -- The output vcf file name.
    """

    output_vcf = "{}.{}.normalized.vcf".format(sample, caller)
    logfile = "{}.{}.vt_normalization.log".format(sample, caller)

    normalization = ["zless", "{}".format(input_vcf),
                     "|",
                     "sed", "'s/ID=AD,Number=./ID=AD,Number=R/'",
                     "|",
                     "{}".format(config['vt']['bin']), "decompose", "-s", "-",
                     "|",
                     "{}".format(config['vt']['bin']), "normalize",
                     "-r", "{}".format(config['reference']), "-",
                     ">", "{}".format(output_vcf)]

    job.fileStore.logToMaster("VT Command: {}\n".format(normalization))
    pipeline.run_and_log_command(" ".join(normalization), logfile)

    return output_vcf

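# What the vt_normalization pipeline accomplishes, on an illustrative record
# (not taken from this repository): the sed step rewrites the AD FORMAT
# header from Number=. to Number=R so per-allele depths survive
# decomposition; a multi-allelic record such as
#     chr1  100  .  A  G,T  ...
# is then split by "vt decompose -s" into one bi-allelic record per
# alternate allele, and "vt normalize" left-aligns and trims each record
# against the reference FASTA so equivalent indel representations compare
# as equal across callers.
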
def run_flt3_itdseek(job, config, name):
    """Run ITDseek without a matched normal sample
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: sample name.
    :type name: str.
    :returns: str -- The output vcf file name.
    """

    itdseek_vcf = "{}.flt3.itdseek.vcf".format(name)
    itdseek_logfile = "{}.flt3.itdseek.log".format(name)

    itdseek_command = ["{}".format(config['itdseek']['bin']),
                       "{}.rg.sorted.bam".format(name),
                       "{}".format(config['reference']),
                       "{}".format(config['samtools-0.19']['bin']),
                       ">",
                       "{}".format(itdseek_vcf)]

    job.fileStore.logToMaster("ITDseek Command: {}\n".format(itdseek_command))
    pipeline.run_and_log_command(" ".join(itdseek_command), itdseek_logfile)

    return itdseek_vcf

def run_fastqc(job, config, samples):
    """Run FastQC on the provided FastQ files
    :param config: The configuration dictionary.
    :type config: dict.
    :param samples: The samples configuration dictionary.
    :type samples: dict.
    """

    job.fileStore.logToMaster("Running FastQC for all samples\n")
    logfile = "fastqc.log"

    fastq_files_list = list()
    for sample in samples:
        fastq_files_list.append(samples[sample]['fastq1'])
        fastq_files_list.append(samples[sample]['fastq2'])

    fastq_files_string = " ".join(fastq_files_list)

    command = ["{}".format(config['fastqc']['bin']),
               "{}".format(fastq_files_string),
               "--extract"]

    job.fileStore.logToMaster("FastQC Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

def joint_variant_calling(job, config, name, samples):
    """Create a cohort VCF file based on joint calling from gVCF files
    :param config: The configuration dictionary.
    :type config: dict.
    :param name: The name for the output cohort VCF.
    :type name: str.
    :param samples: The samples configuration dictionary.
    :type samples: dict.
    :returns: str -- The output vcf file name.
    """

    vcf = "{}.haplotypecaller.vcf".format(name)
    logfile = "{}.haplotypecaller_gvcf.log".format(name)

    gvcfs = list()
    for sample in samples:
        gvcfs.append("--variant {}.haplotypecaller.g.vcf".format(sample))
    gvcf_string = " ".join(gvcfs)

    command = ["{}".format(config['gatk-jointgenotyper']['bin']),
               "-T", "GenotypeGVCFs",
               "-R", "{}".format(config['reference']),
               "{}".format(gvcf_string),
               "-nt", "{}".format(config['gatk-jointgenotyper']['num_cores']),
               "-o", "{}".format(vcf)]

    job.fileStore.logToMaster("GenotypeGVCFs Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return vcf

def run_fundi(job, root_name):
    """Run FunDi on the specified alignment and tree files
    :param root_name: Root name shared by the input alignment, subtree definition, and tree files.
    :type root_name: str.
    :returns: str -- The log file name.
    """

    logfile = "{}.fundi.log".format(root_name)

    command = ["perl ./FunDi.pl",
               "-a", "{}.aa_modified_nodash.phy".format(root_name),
               "-o", "{}.aa_modified_nodash_subtree".format(root_name),
               "-m LG+F+G",
               "-s", "{}.nh.def".format(root_name),
               "-P iqtree",
               "-r 4",
               "-t", "{}.nh.newick".format(root_name),
               "-N 22"]

    mv_fundi_log = "mv FunDi.log {}_FunDi.log".format(root_name)

    job.fileStore.logToMaster("FunDi Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    job.fileStore.logToMaster("Rename file Command: {}\n".format(mv_fundi_log))
    pipeline.run_and_log_command(mv_fundi_log, logfile)

    return logfile

def subsample_bam(job, addresses, keyspace, auth, name, samples, config, seed, fraction, iteration):
    """Use samtools view to subsample an input BAM to the specified fraction,
    run Sambamba depth on the result, and store the coverage data in Cassandra
    :param addresses: Cassandra cluster contact points.
    :param keyspace: Cassandra keyspace for the coverage tables.
    :param auth: Cassandra authentication provider.
    :param name: sample name.
    :type name: str.
    :param samples: The samples configuration dictionary.
    :type samples: dict.
    :param config: The configuration dictionary.
    :type config: dict.
    :param seed: Random seed for samtools view -s.
    :param fraction: Percentage of reads to keep (e.g. 50 keeps 50%).
    :param iteration: Subsampling iteration number.
    """

    library_name = "subsample-{}-{}-{}".format(samples[name]['library_name'], fraction, iteration)
    sublog = "subsample-{}-{}-{}.log".format(name, fraction, iteration)

    input_bam = "{}.recalibrated.sorted.bam".format(samples[name]['library_name'])
    subsampled_bam = "subsample-{}-{}-{}.bam".format(samples[name]['library_name'], fraction, iteration)

    # samtools view -s takes INT.FRAC: the integer part is the seed and the
    # fractional part the proportion of reads to keep.
    samcommand = "samtools view -s {seed}.{fraction} -b {input} > {output}".format(seed=seed,
                                                                                   fraction=fraction,
                                                                                   input=input_bam,
                                                                                   output=subsampled_bam)

    index_command = "samtools index {}".format(subsampled_bam)
    index_log = "{}.index.log".format(subsampled_bam)

    output = "{}.sambamba_coverage.bed".format(subsampled_bam)
    logfile = "{}.sambamba_coverage.log".format(subsampled_bam)

    command = ["{}".format(config['sambamba']['bin']),
               "depth region",
               "-L", "{}".format(samples[name]['regions']),
               "-t", "{}".format(config['sambamba']['num_cores']),
               "-T", "{}".format(config['coverage_threshold']),
               "-T", "{}".format(config['coverage_threshold2']),
               "{}".format(subsampled_bam),
               ">", "{}".format(output)]

    job.fileStore.logToMaster("Samtools View Command: {}\n".format(samcommand))
    pipeline.run_and_log_command(samcommand, sublog)

    job.fileStore.logToMaster("Samtools Index Command: {}\n".format(index_command))
    pipeline.run_and_log_command(index_command, index_log)

    job.fileStore.logToMaster("Sambamba Coverage Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    connection.setup(addresses, keyspace, auth_provider=auth)

    job.fileStore.logToMaster("Adding coverage data from {}\n".format(output))
    # Scale the library count so coverage is comparable to a full-depth run.
    num_libs = (float(samples[name]['num_libraries_in_run']) * (1 / (float(fraction) / 100.00)))

    with open(output, 'rb') as coverage:
        reader = csv.reader(coverage, delimiter='\t')
        header = reader.next()

        # Sambamba emits one percentage<threshold> column per -T option;
        # record their positions so per-row values can be retrieved below.
        threshold_indices = list()
        thresholds = list()
        index = 0
        for element in header:
            if element.startswith("percentage"):
                threshold = element.replace('percentage', '')
                threshold_indices.append(index)
                thresholds.append(int(threshold))
            index += 1

        for row in reader:
            threshold_data = defaultdict(float)
            index = 0
            for threshold in thresholds:
                threshold_data[threshold] = row[threshold_indices[index]]
                index += 1

            sample_data = SampleCoverage.create(sample=samples[name]['sample_name'],
                                                library_name=library_name,
                                                run_id="subsample-{}".format(fraction),
                                                num_libraries_in_run=num_libs,
                                                sequencer_id=samples[name]['sequencer'],
                                                program_name="sambamba",
                                                extraction=samples[name]['extraction'],
                                                panel=samples[name]['panel'],
                                                target_pool=samples[name]['target_pool'],
                                                amplicon=row[3],
                                                num_reads=row[4],
                                                mean_coverage=row[5],
                                                thresholds=thresholds,
                                                perc_bp_cov_at_thresholds=threshold_data)
            amplicon_data = AmpliconCoverage.create(amplicon=row[3],
                                                    sample=samples[name]['sample_name'],
                                                    library_name=library_name,
                                                    run_id="subsample-{}".format(fraction),
                                                    num_libraries_in_run=num_libs,
                                                    sequencer_id=samples[name]['sequencer'],
                                                    program_name="sambamba",
                                                    extraction=samples[name]['extraction'],
                                                    panel=samples[name]['panel'],
                                                    target_pool=samples[name]['target_pool'],
                                                    num_reads=row[4],
                                                    mean_coverage=row[5],
                                                    thresholds=thresholds,
                                                    perc_bp_cov_at_thresholds=threshold_data)

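# SampleCoverage and AmpliconCoverage above are cassandra-driver cqlengine
# models that are not defined in this section. A minimal sketch of one of
# them, inferred from the .create() keyword arguments; the column types and
# primary-key layout are assumptions, not the real schema:

from cassandra.cqlengine import columns
from cassandra.cqlengine.models import Model


class SampleCoverage(Model):
    """Hypothetical sketch of the per-sample coverage model."""
    sample = columns.Text(primary_key=True)
    run_id = columns.Text(primary_key=True)
    library_name = columns.Text(primary_key=True)
    amplicon = columns.Text(primary_key=True)
    num_libraries_in_run = columns.Float()
    sequencer_id = columns.Text()
    program_name = columns.Text()
    extraction = columns.Text()
    panel = columns.Text()
    target_pool = columns.Text()
    num_reads = columns.Float()
    mean_coverage = columns.Float()
    thresholds = columns.List(columns.Integer())
    perc_bp_cov_at_thresholds = columns.Map(columns.Integer(), columns.Float())

# AmpliconCoverage is assumed to mirror the same columns with amplicon as the
# leading partition key.
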