def run(self, infiles, outfile, params):
    files = " ".join(infiles)
    job_threads = params.job_threads

    # TODO:
    # 1. add header.
    # 2. do batch+merge sort in order to avoid hitting temporary
    #    space limits.
    # 3. remove unnecessary info fields while sorting, add them later.
    tmpdir = P.get_temp_filename()
    retval = P.run(
        "mkdir {tmpdir}; "
        "bcftools view -h {infiles[0]} "
        "| cut -f 1-10 "
        "| bgzip > {outfile}; "
        "zcat {files} "
        "| awk -v OFS='\\t' "
        "'!/^#/ && $5 != \"<NON_REF>\" "
        # clear QUAL ($6), FILTER ($7) and INFO ($8), set FORMAT ($9)
        # to GT and blank the genotype column ($10)
        "{{$6=\".\";$7=\".\";$8=\".\";$9=\"GT\";$10=\".\"; print}}' "
        "2> {outfile}.filter.log "
        "| sort -k1,1V -k2,2n "
        "--parallel {job_threads} "
        "-T {tmpdir} "
        "2> {outfile}.sort.log "
        "| uniq "
        "| bgzip "
        ">> {outfile}; "
        "tabix -p vcf {outfile}; "
        "rm -rf {tmpdir}".format(**locals()))

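# A minimal sketch of the batch+merge sort mentioned in TODO item 2 above,
# under the assumption that sorting each input separately keeps temporary
# space bounded. ``build_batch_sort_statement`` is a hypothetical helper,
# not part of this module; the merge step uses bash process substitution,
# so the resulting statement must be executed with bash.
def build_batch_sort_statement(infiles, outfile, tmpdir, job_threads=1):
    """Sort each input separately, then merge the pre-sorted streams."""
    statements = []
    sorted_files = []
    for i, infile in enumerate(infiles):
        sorted_file = "{}/batch_{}.gz".format(tmpdir, i)
        statements.append(
            "zcat {infile} "
            "| sort -k1,1V -k2,2n --parallel {job_threads} -T {tmpdir} "
            "| gzip > {sorted_file}".format(**locals()))
        sorted_files.append(sorted_file)
    merged = " ".join("<(zcat {})".format(x) for x in sorted_files)
    # sort -m merges already-sorted inputs without a full re-sort
    statements.append(
        "sort -m -k1,1V -k2,2n {merged} | bgzip >> {outfile}".format(
            **locals()))
    return "; ".join(statements)
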
def run(self, infile, outfile, params):
    if params.reference_bed is None:
        raise ValueError("{} requires reference_bed to be set".format(
            self.name))

    # requires a consistent sort order, so sort both files.
    # It also requires the chromosome content to be identical,
    # so restrict output to common sets.
    tmpf = P.get_temp_filename(clear=True)
    tmpf_test, tmpf_truth = tmpf + "_a.bed.gz", tmpf + "_b.bed.gz"
    stmnt = standardise_bed_files(tmpf_test, tmpf_truth,
                                  infile, params.reference_bed)

    statements = [stmnt]
    statements.append("{params.path} intersect "
                      "-a {tmpf_test} "
                      "-b {tmpf_truth} "
                      "-wa "
                      "| bgzip "
                      "> {outfile}.shared.bed.gz")
    statements.append("{params.path} intersect "
                      "-a {tmpf_test} "
                      "-b {tmpf_truth} "
                      "-wa -v "
                      "| bgzip "
                      "> {outfile}.unique_test.bed.gz")
    statements.append("{params.path} intersect "
                      "-b {tmpf_test} "
                      "-a {tmpf_truth} "
                      "-wa -v "
                      "| bgzip "
                      "> {outfile}.unique_truth.bed.gz")
    statements.append("rm -f {tmpf_test} {tmpf_truth}")

    for section in self.sections:
        statements.append(
            "tabix -p bed {outfile}.{section}.bed.gz".format(**locals()))

    statement = "; ".join(statements)
    retval = P.run(statement.format(**locals()))

    # these are small files, so count them here.
    # TODO: implement a tabix count() method.
    counts = dict()
    for section in self.sections:
        # open/close explicitly rather than relying on Tabixfile
        # supporting the context manager protocol in older pysam versions
        inf = pysam.Tabixfile(outfile + "." + section + ".bed.gz")
        counts[section] = len(list(inf.fetch()))
        inf.close()

    with IOTools.open_file(outfile, "w") as outf:
        outf.write("section\tcounts\n")
        outf.write("\n".join(
            ["\t".join(map(str, x)) for x in list(counts.items())]) + "\n")

    return retval

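# The TODO above asks for a tabix count() method; a minimal sketch, assuming
# pysam is available. ``tabix_count`` is a hypothetical name, and the
# explicit close() mirrors the code above.
import pysam

def tabix_count(filename):
    """Return the number of records in a tabix-indexed file."""
    inf = pysam.TabixFile(filename)
    try:
        # iterate instead of materializing all records in a list
        return sum(1 for _ in inf.fetch())
    finally:
        inf.close()
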
def run(self, outfile, params):
    if "--threads" in params.options or "-t " in params.options:
        job_threads = int(re.search(
            r"(-t|--threads)\s*(\d+)", params.options).groups()[1])

    fastq = resolve_argument(params.fastq, ",").split(",")
    if len(fastq) == 1:
        fastq = '-U "{}"'.format(fastq[0])
    else:
        fastq = '-1 "{}" -2 "{}"'.format(*fastq)

    tmpdir = P.get_temp_filename(clear=True)

    if "index" in params._fields:
        index = params.index
    else:
        index = params.reference_fasta

    if params.set_readgroup or params.readgroup_id_regex is not None:
        readgroup_string, readgroup_id, readgroup_sample = build_readgroup_string(
            outfile, params)
        # TODO: pipes.quote needs to become shlex.quote in py3
        readgroup_option = "--rg-id {}".format(readgroup_id)
        # add an additional level of quoting and remove the "ID:{}" field
        readgroup_string = re.sub(r"@RG\tID:\S+\t", "", readgroup_string)
        readgroup_string = " ".join(
            ["--rg {}".format(x) for x in readgroup_string.split("\t")])
    else:
        readgroup_option = ""
        readgroup_string = ""

    return P.run(
        "mkdir {tmpdir}; "
        "{self.path} "
        "{readgroup_option} "
        "{readgroup_string} "
        "{params.options} "
        "-x {index} "
        "{fastq} "
        "2> {outfile}.log "
        "| samtools view -b /dev/stdin "
        "2> {outfile}.view.log "
        "| samtools sort -T {tmpdir} -O bam /dev/stdin "
        "2> {outfile}.sort.log "
        "> {outfile}; "
        "samtools index {outfile}; "
        "rm -rf {tmpdir}".format(**locals()),
        **params._asdict())

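# The read-group rewriting above, shown standalone: tools of this family
# take the ID via --rg-id and each remaining field via a repeated --rg
# option, so the "@RG\tID:..." prefix is stripped first. A minimal sketch;
# the helper name and the example values are made up.
import re

def split_readgroup_options(readgroup_string):
    readgroup_id = re.search(r"ID:(\S+)", readgroup_string).group(1)
    rest = re.sub(r"@RG\tID:\S+\t", "", readgroup_string)
    rg_options = " ".join("--rg {}".format(x) for x in rest.split("\t"))
    return "--rg-id {} {}".format(readgroup_id, rg_options)

# split_readgroup_options("@RG\tID:lane1\tSM:sampleA\tPL:ILLUMINA")
# -> '--rg-id lane1 --rg SM:sampleA --rg PL:ILLUMINA'
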
def run(self, infile, outfile, params):
    if params.annotations_bed is None:
        raise ValueError("{} requires annotations_bed to be set".format(
            self.name))
    if params.workspace_bed is None:
        raise ValueError("{} requires workspace_bed to be set".format(
            self.name))

    retval = run_metric_bedtools_intersection.run(
        self, infile, outfile, params)
    retvals = [retval]

    statements = [
        "mv {outfile} {outfile}.bedtools_intersect_and_annotate_counts.tsv"
        .format(**locals())]

    bed_files = []
    for section in self.sections:
        tmpf = P.get_temp_filename(clear=True) + "-" + section + ".gz"
        statements.append(
            "zcat {outfile}.{section}.bed.gz "
            "| awk -v OFS='\\t' '{{ $4 = \"{section}\"; print }}' "
            "| bgzip > {tmpf}".format(**locals()))
        bed_files.append(tmpf)

    segment_files = " ".join(
        ["--segment-bed-file={}".format(x) for x in bed_files])

    statements.append(
        "{params.gat_path} "
        "{segment_files} "
        "--with-segment-tracks "
        "--annotation-bed-file={params.annotations_bed} "
        "--workspace-bed-file={params.workspace_bed} "
        "--log={outfile} "
        "{params.options} "
        "> {outfile}.bedtools_intersect_and_annotate_enrichment.tsv".format(
            **locals()))

    for f in bed_files:
        statements.append("rm -f {}".format(f))

    statement = "; ".join(statements)
    retvals.append(P.run(statement))
    return retvals

def run(self, outfile, params):
    # the tool defaults to threads=auto, so require an explicit
    # thread count in order to size the cluster job.
    if "threads" in params.options:
        if "threads=auto" in params.options:
            raise ValueError(
                "please specify the number of threads "
                "to use explicitly")
        else:
            job_threads = int(re.search(
                r"threads=(\d+)", params.options).groups()[0])
    else:
        raise ValueError("please specify the number of threads to use")

    job_memory = "32G"

    fastq = resolve_argument(params.fastq, " ")
    tmpdir = P.get_temp_filename(clear=True)

    return P.run(
        "mkdir {tmpdir}; "
        # truncate reads to at most 5999 bases
        "zcat {fastq} "
        "| cut -c -5999 "
        "| gzip > {tmpdir}/in.fastq.gz; "
        "{params.path} "
        "{params.options} "
        "in={tmpdir}/in.fastq.gz "
        "ref={params.reference_fasta} "
        "out={tmpdir}/result.bam "
        ">& {outfile}.log; "
        "samtools sort -o {tmpdir}/sorted.bam {tmpdir}/result.bam; "
        "java -Xmx8000m -jar {params.path_picard} "
        "AddOrReplaceReadGroups "
        "INPUT={tmpdir}/sorted.bam "
        "OUTPUT={outfile} "
        "VALIDATION_STRINGENCY=LENIENT "
        "RGID=1 "
        "RGLB={params.library} "
        "RGPL={params.platform} "
        "RGPU=unknown "
        "RGSM={params.sample} "
        ">& {outfile}.picard.log; "
        "samtools index {outfile} "
        ">& {outfile}.index.log; "
        "rm -rf {tmpdir}".format(**locals()))

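# The option check above as a standalone sketch, assuming key=value style
# "threads=N" options where the tool default is "threads=auto". The helper
# name is hypothetical; pinning an explicit integer lets the scheduler
# reserve the right number of slots.
import re

def parse_threads_option(options):
    """Extract an explicit thread count from a key=value option string."""
    if "threads" not in options:
        raise ValueError("please specify the number of threads to use")
    if "threads=auto" in options:
        raise ValueError(
            "please specify the number of threads to use explicitly")
    return int(re.search(r"threads=(\d+)", options).group(1))

# parse_threads_option("threads=8 minid=0.76") -> 8
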
def run(self, outfile, params):
    if "-t" in params.options:
        job_threads = int(re.search(
            r"-t\s*(\d+)", params.options).groups()[0])
    else:
        job_threads = 1

    # BWA requires at least 6Gb of memory, and memory use also scales
    # with the number of threads, so use 5Gb + 1Gb per thread.
    job_memory = "{}G".format(5.0 + 1.0 * job_threads)

    fastq = resolve_argument(params.fastq, ",")
    fastq = '"{}"'.format('" "'.join(fastq.split(",")))

    tmpdir = P.get_temp_filename(clear=True)

    if params.set_readgroup or params.readgroup_id_regex is not None:
        readgroup_string, readgroup_id, readgroup_sample = build_readgroup_string(
            outfile, params)
        # TODO: pipes.quote needs to become shlex.quote in py3
        readgroup_option = "-R {}".format(pipes.quote(readgroup_string))
        # add an additional level of quoting:
        readgroup_option = re.sub("\\t", "\\\\t", readgroup_option)
    else:
        readgroup_option = ""

    return P.run(
        "mkdir {tmpdir}; "
        "{self.path} mem "
        "{readgroup_option} "
        "{params.options} "
        "{params.reference_fasta} "
        "{fastq} "
        "2> {outfile}.log "
        "| samtools view -bu /dev/stdin "
        "2> {outfile}.view.log "
        "| samtools sort --threads {job_threads} -T {tmpdir} -O bam /dev/stdin "
        "2> {outfile}.sort.log "
        "> {outfile}; "
        "samtools index {outfile} >& {outfile}.index.log; "
        "rm -rf {tmpdir}".format(**locals()),
        **params._asdict())

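# The -R quoting above in isolation: bwa mem expects the read-group line as
# a single shell word containing literal "\t" escape sequences, hence the
# quote-then-escape sequence. A minimal sketch with a made-up example.
import pipes  # TODO: shlex.quote in py3
import re

def build_bwa_readgroup_option(readgroup_string):
    option = "-R {}".format(pipes.quote(readgroup_string))
    # replace real tab characters with the two-character sequence \t
    return re.sub("\\t", "\\\\t", option)

# build_bwa_readgroup_option("@RG\tID:1\tSM:sampleA")
# -> "-R '@RG\\tID:1\\tSM:sampleA'"
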
def run(self, infile, outfile, params):
    tmpf = P.get_temp_filename(clear=True)
    tmpf_test, tmpf_truth = tmpf + "_a.bed.gz", tmpf + "_b.bed.gz"
    stmnt = standardise_bed_files(tmpf_test, tmpf_truth,
                                  infile, params.annotations_bed)

    statements = [stmnt]
    statements.append("{params.path} "
                      "--segment-bed-file={tmpf_test} "
                      "--ignore-segment-tracks "
                      "--annotation-bed-file={tmpf_truth} "
                      "--workspace-bed-file={params.workspace_bed} "
                      "--log={outfile}.log "
                      "{params.options} "
                      "> {outfile}")

    statement = "; ".join(statements)
    return P.run(statement.format(**locals()))

def run(self, infile, outfiles, params):
    # requires a consistent sort order, so sort both files.
    # It also requires the chromosome content to be identical,
    # so restrict output to common sets.
    tmpf = P.get_temp_filename(clear=True)
    outfile_shared, outfile_test, outfile_truth = outfiles
    tmpf_test, tmpf_truth = tmpf + "_a.bed.gz", tmpf + "_b.bed.gz"
    stmnt = standardise_bed_files(tmpf_test, tmpf_truth,
                                  infile, params.reference_bed)

    statements = [stmnt]
    statements.append("{params.path} intersect "
                      "-a {tmpf_test} "
                      "-b {tmpf_truth} "
                      "-wa "
                      "| bgzip "
                      "> {outfile_shared} ")
    statements.append("{params.path} intersect "
                      "-a {tmpf_test} "
                      "-b {tmpf_truth} "
                      "-wa -v "
                      "| bgzip "
                      "> {outfile_test}")
    statements.append("{params.path} intersect "
                      "-b {tmpf_test} "
                      "-a {tmpf_truth} "
                      "-wa -v "
                      "| bgzip "
                      "> {outfile_truth}")
    statements.append("rm -f {tmpf_test} {tmpf_truth}")

    for f in outfiles:
        statements.append("tabix -f -p bed {}".format(f))

    statement = "; ".join(statements)
    retval = P.run(statement.format(**locals()))
    return retval

def run(self, outfile, params):
    if "--nCPU" in params.options:
        job_threads = int(
            re.search(r"--nCPU\s*(\d+)", params.options).groups()[0])

    bam = resolve_argument(params.bam)
    reference_fasta = get_reference(params)
    tmpfile = P.get_temp_filename(clear=True)

    return P.run("{params.path} callVariants "
                 "--bamFiles {bam} "
                 "--refFile {reference_fasta} "
                 "--output {tmpfile} "
                 "{params.options} "
                 ">& {outfile}.log; "
                 "bgzip {tmpfile}; "
                 "tabix -p vcf {tmpfile}.gz; "
                 "mv {tmpfile}.gz {outfile}; "
                 "mv {tmpfile}.gz.tbi {outfile}.tbi".format(**locals()))

def run(self, infile, outfile, params):
    if params.reference_bed is None:
        raise ValueError("{} requires reference_bed to be set".format(
            self.name))

    # jaccard requires a consistent sort order, so sort both
    # bed files:
    tmpf = P.get_temp_filename(clear=True)
    tmpf1, tmpf2 = tmpf + "_a.bed.gz", tmpf + "_b.bed.gz"
    stmnt = standardise_bed_files(tmpf1, tmpf2,
                                  infile, params.reference_bed)

    retval = P.run("{stmnt}; "
                   "{params.path} jaccard "
                   "-a {tmpf1} -b {tmpf2} "
                   "{params.options} "
                   "2> {outfile}.log "
                   ">> {outfile}; "
                   "rm -f {tmpf1} {tmpf2}".format(**locals()))
    return retval

def run(self, outfile, params):
    if "-t" in params.options:
        job_threads = int(re.search(
            r"-t\s*(\d+)", params.options).groups()[0])

    job_memory = "32G"

    fastq = resolve_argument(params.fastq, " ")
    tmpdir = P.get_temp_filename(clear=True)

    return P.run(
        "mkdir {tmpdir}; "
        "{params.path} "
        "{params.options} "
        "-r {params.reference_fasta} "
        "-d {fastq} "
        "-o {tmpdir}/result.sam "
        ">& {outfile}.log; "
        "samtools view -bS {tmpdir}/result.sam "
        "| samtools sort -o {tmpdir}/sorted.bam -; "
        "java -Xmx8000m -jar {params.path_picard} "
        "AddOrReplaceReadGroups "
        "INPUT={tmpdir}/sorted.bam "
        "OUTPUT={outfile} "
        "VALIDATION_STRINGENCY=LENIENT "
        "RGID=1 "
        "RGLB={params.library} "
        "RGPL={params.platform} "
        "RGPU=unknown "
        "RGSM={params.sample} "
        ">& {outfile}.picard.log; "
        "samtools index {outfile} "
        ">& {outfile}.index.log; "
        "rm -rf {tmpdir}".format(**locals()))

def run(self, outfile, params):
    try:
        vcf_target = params.vcf["target"]
        test_fp = params.vcf["test"]["fp"]
        test_fn = params.vcf["test"]["fn"]
        test_tp = params.vcf["test"]["tp"]
        comp_fp = params.vcf["compare"]["fp"]
        comp_fn = params.vcf["compare"]["fn"]
        comp_tp = params.vcf["compare"]["tp"]
    except KeyError as msg:
        raise ValueError("missing input data: {}".format(msg))

    tmpdir = P.get_temp_filename(clear=True)
    outdir = os.path.dirname(outfile)
    bedfile = os.path.join(tmpdir, "annotations.bed.gz")
    bedfile_sorted = os.path.join(outdir, "annotations.bed.gz")
    header = os.path.join(outdir, "header.txt")

    with open(header, "w") as outf:
        outf.write(
            '##INFO=<ID=AS,Number=.,Type=String,'
            'Description="Assessment code. Combination of FP/FN/TP and '
            'U for unique, O for other and S for shared.">')

    statements = ["mkdir {tmpdir}".format(**locals())]

    toprocess = []
    for a, b, label in zip((test_fp, test_fn, test_tp),
                           (comp_fp, comp_fn, comp_tp),
                           ("FP", "FN", "TP")):
        statements.append(
            "{params.path} isec "
            "--output-type z "
            "--prefix {tmpdir}/{label} "
            "{a} {b} "
            "&> {outfile}.isec_{label}.log ".format(**locals()))
        toprocess.append((os.path.join(tmpdir, label, "0000.vcf.gz"),
                          label + "U"))
        toprocess.append((os.path.join(tmpdir, label, "0001.vcf.gz"),
                          label + "O"))
        toprocess.append((os.path.join(tmpdir, label, "0002.vcf.gz"),
                          label + "S"))

    # TPO = FNU
    # TPU = FNO
    toprocess = [x for x in toprocess if x[1] not in ["TPO", "TPU"]]

    # files to keep: these variants will not be in the vcf file
    # that is being annotated.
    keep = ["FNS", "FNU", "FNO"]

    for f, label in toprocess:
        statements.append(
            "zcat {f} "
            "| awk '!/^#/ "
            "{{printf(\"%%s\\t%%i\\t%%i\\t{label}\\n\", $1, $2-1, $2) }}' "
            "| bgzip "
            ">> {bedfile} ".format(**locals()))
        if label in keep:
            statements.append(
                "cp {f} {outfile}.{label}.vcf.gz".format(**locals()))

    statements.append(
        "zcat {bedfile} "
        "| sort -k1,1 -k2,2n "
        "| bedtools merge -i stdin -c 4 -o distinct -delim ',' "
        "2> {bedfile_sorted}.log "
        "| bgzip "
        "> {bedfile_sorted}".format(**locals()))
    statements.append("tabix -p bed {bedfile_sorted}".format(**locals()))

    statements.append("bcftools annotate "
                      "--annotations={bedfile_sorted} "
                      "--columns=CHROM,FROM,TO,AS "
                      "--header-lines {header} "
                      "--output-type z "
                      "{vcf_target} "
                      "2> {outfile}.log "
                      "> {outfile}; "
                      "tabix -p vcf {outfile} ".format(**locals()))
    statements.append("rm -rf {tmpdir}".format(**locals()))

    statement = "; ".join(statements)
    return self.run_with_preprocessing(
        vcf_target, outfile, params, statement)

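# The isec bookkeeping above as a standalone helper (the name is made up):
# bcftools isec with two input files writes 0000.vcf.gz (records unique to
# the first file), 0001.vcf.gz (unique to the second) and 0002.vcf.gz
# (shared), which map onto the U/O/S assessment-code suffixes.
import os

def isec_assessment_files(tmpdir, label):
    """Yield (path, assessment_code) pairs for one bcftools isec run."""
    for fname, suffix in (("0000.vcf.gz", "U"),
                          ("0001.vcf.gz", "O"),
                          ("0002.vcf.gz", "S")):
        yield os.path.join(tmpdir, label, fname), label + suffix

# list(isec_assessment_files("/tmp/work", "FP"))
# -> [('/tmp/work/FP/0000.vcf.gz', 'FPU'),
#     ('/tmp/work/FP/0001.vcf.gz', 'FPO'),
#     ('/tmp/work/FP/0002.vcf.gz', 'FPS')]
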
def run(self, outfile, params):
    min_job_memory = 3
    if "-t" in params.options:
        job_threads = int(re.search(
            r"-t\s*(\d+)", params.options).groups()[0])
    else:
        job_threads = 1

    # memory is requested per thread, so distribute the intended
    # total (base + 1Gb per thread) across the threads
    job_memory = "{}G".format(
        float(min_job_memory + 1.0 * job_threads) / job_threads)

    cram_fasta = params.cram_fasta
    if params.cram_fasta is None:
        cram_fasta = params.reference_fasta

    if params.set_readgroup or params.readgroup_id_regex is not None:
        readgroup_string, readgroup_id, readgroup_sample = build_readgroup_string(
            outfile, params)
        # TODO: pipes.quote needs to become shlex.quote in py3
        readgroup_option = "-R {}".format(pipes.quote(readgroup_string))
        # add an additional level of quoting:
        readgroup_option = re.sub("\\t", "\\\\t", readgroup_option)
    else:
        readgroup_option = ""

    fastq = " ".join(sra_peek(params.sra))
    outfile = os.path.abspath(outfile)

    if params.extract_to_temp:
        tmpdir = P.get_temp_filename(clear=True)
        tmpdir_pre = "mkdir {};".format(tmpdir)
        tmpdir_post = "rm -rf {}".format(tmpdir)
    else:
        tmpdir = os.path.dirname(outfile)
        tmpdir_pre = ""
        tmpdir_post = ""

    # AH: fastq-dump hangs with arv mounts, thus try copying first
    if not IOTools.is_local(params.sra):
        E.warn("copying file {} to temporary directory".format(params.sra))
        temp_sra = os.path.join(tmpdir, os.path.basename(params.sra))
        fastq_dump = (
            "cp {params.sra}* {tmpdir}; "
            "fastq-dump --split-files --gzip {temp_sra} "
            ">& {outfile}.dump.log ".format(**locals()))
        tmpdir_post = "rm -f {}*; {}".format(temp_sra, tmpdir_post)
    else:
        fastq_dump = (
            "fastq-dump --split-files --gzip {params.sra} "
            ">& {outfile}.dump.log ")

    return P.run(
        "{tmpdir_pre} "
        "cd {tmpdir}; "
        "{fastq_dump}; "
        "{self.path} mem -v 3 "
        "{readgroup_option} "
        "{params.options} "
        "{params.reference_fasta} "
        "{fastq} "
        "2> {outfile}.map.log "
        # use cram_fasta: it falls back to reference_fasta when
        # params.cram_fasta is not set
        "| samtools view -O cram --reference {cram_fasta} /dev/stdin "
        "2> {outfile}.view.log "
        "| samtools sort -T {tmpdir} -O cram /dev/stdin "
        "2> {outfile}.sort.log "
        "> {outfile}; "
        "samtools index {outfile} >& {outfile}.index.log; "
        "{tmpdir_post}".format(**locals()))

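# The memory arithmetic above in isolation, under the assumption (suggested
# by the division) that the scheduler interprets job_memory per thread: the
# intended total of base + 1Gb per thread is divided back by the thread
# count. A minimal sketch; the helper name is made up.
def memory_per_thread(min_job_memory, job_threads):
    return "{}G".format(
        float(min_job_memory + 1.0 * job_threads) / job_threads)

# memory_per_thread(3, 4) -> '1.75G'  (total request: 4 threads * 1.75G = 7G)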