def mpileup(self, chr_name):
    """Call variants for one region and index the resulting VCF.

    Builds a shell pipeline ``bcftools mpileup | bcftools call |
    bcftools filter`` over ``{out}/30_bam/WGS.*.bam`` restricted to
    ``chr_name`` and writes ``{out}/60_vcf/raiden.<label>.vcf.gz``, where
    ``<label>`` is ``self.check_chr_name(chr_name)``. The file is then
    indexed with ``tabix``. ``self.args.min_MQ`` is used both for
    ``mpileup -q`` and for the ``INFO/MQ`` filter expression.

    Args:
        chr_name: Region passed to ``bcftools mpileup -r``.

    On a non-zero exit status of either shell command, ``call_log`` is
    invoked and the process exits with status 1.
    """
    # Resolve the file-name label once; it is used by both commands and
    # by the error report.
    label = self.check_chr_name(chr_name)

    cmd1 = 'bcftools mpileup -a AD,ADF,ADR \
                             -B \
                             -q {0}\
                             -Q {1} \
                             -O u \
                             -r {2} \
                             -f {3} \
                             --ignore-RG \
                             {4}/30_bam/WGS.*.bam | \
            bcftools call -vm \
                          -f GQ,GP \
                          -O u | \
            bcftools filter -i "INFO/MQ>={0}" \
                            -O z \
                            -o {4}/60_vcf/raiden.{5}.vcf.gz \
                            > {4}/log/bcftools.{5}.log \
                            2>&1'.format(self.args.min_MQ,
                                         self.args.min_BQ,
                                         chr_name,
                                         self.args.ref,
                                         self.args.out,
                                         label)

    cmd2 = 'tabix -f \
                  -p vcf \
                  {0}/60_vcf/raiden.{1}.vcf.gz \
                  >> {0}/log/tabix.{1}.log \
                  2>&1'.format(self.args.out, label)

    # The literals above embed continuation-line indentation; clean_cmd is
    # applied before execution (presumably to normalize that whitespace --
    # confirm against its definition).
    cmd1 = clean_cmd(cmd1)
    cmd2 = clean_cmd(cmd2)

    # NOTE(review): the logs written above are named bcftools.<label>.log
    # and tabix.<label>.log, while call_log receives 'bcftools_<label>' and
    # 'tabix' -- confirm call_log resolves the intended files.
    try:
        sbp.run(cmd1,
                stdout=sbp.DEVNULL,
                stderr=sbp.DEVNULL,
                shell=True,
                check=True)
    except sbp.CalledProcessError:
        # Fixed: the original spelled this ``.fomrat(...)``, which raised
        # AttributeError here and hid the real bcftools failure.
        call_log(self.args.out, 'bcftools_{}'.format(label), cmd1)
        sys.exit(1)

    try:
        sbp.run(cmd2,
                stdout=sbp.DEVNULL,
                stderr=sbp.DEVNULL,
                shell=True,
                check=True)
    except sbp.CalledProcessError:
        call_log(self.args.out, 'tabix', cmd2)
        sys.exit(1)
def trim_RNAseq(self, fastq1, fastq2, index, N_threads):
    """Trim a paired-end RNA-seq library with FaQCs.

    Args:
        fastq1: Path given to ``FaQCs -1``.
        fastq2: Path given to ``FaQCs -2``.
        index: Label used as the FaQCs prefix and in the output/log names.
        N_threads: Value given to ``FaQCs -t``.

    Returns:
        Tuple of the two trimmed FASTQ paths under
        ``{out}/20_fastq/FaQCs_<index>/``.

    On a non-zero exit status ``call_log`` is invoked and the process
    exits with status 1.
    """
    command = clean_cmd('FaQCs -1 {0} \
                               -2 {1} \
                               --prefix {2} \
                               -d {3}/20_fastq/FaQCs_{2} \
                               -t {4} \
                               -min_L 50 \
                               -avg_q 20 \
                               --polyA \
                               --adapter \
                               -discard 1 \
                               > {3}/log/FaQCs_{2}.log \
                               2>&1'.format(fastq1,
                                            fastq2,
                                            index,
                                            self.args.out,
                                            N_threads))

    try:
        sbp.run(command,
                shell=True,
                check=True,
                stdout=sbp.DEVNULL,
                stderr=sbp.DEVNULL)
    except sbp.CalledProcessError:
        call_log(self.args.out, 'FaQCs_{}'.format(index), command)
        sys.exit(1)

    # Both mates share one path layout; only the mate number differs.
    trimmed = "{0}/20_fastq/FaQCs_{1}/{1}.{2}.trimmed.fastq"
    return (trimmed.format(self.args.out, index, 1),
            trimmed.format(self.args.out, index, 2))
def trim_WGS(self, fastq1, fastq2, index, N_threads):
    """Trim a paired-end WGS library with FaQCs, then run Prinseq.

    Args:
        fastq1: Path given to ``FaQCs -1``.
        fastq2: Path given to ``FaQCs -2``.
        index: Label used as the FaQCs prefix and in the output/log names.
        N_threads: Value given to ``FaQCs -t`` and to ``Prinseq``.

    Returns:
        Tuple of the two FASTQ paths under
        ``{out}/20_fastq/prinseq_<index>/``.

    On a non-zero FaQCs exit status ``call_log`` is invoked and the
    process exits with status 1.
    """
    command = clean_cmd('FaQCs -1 {0} \
                               -2 {1} \
                               --prefix {2} \
                               -d {3}/20_fastq/FaQCs_{2} \
                               -t {4} \
                               --adapter \
                               > {3}/log/FaQCs_{2}.log \
                               2>&1'.format(fastq1,
                                            fastq2,
                                            index,
                                            self.args.out,
                                            N_threads))

    try:
        sbp.run(command,
                shell=True,
                check=True,
                stdout=sbp.DEVNULL,
                stderr=sbp.DEVNULL)
    except sbp.CalledProcessError:
        call_log(self.args.out, 'FaQCs_{}'.format(index), command)
        sys.exit(1)

    Prinseq(self.args, N_threads, index).run()

    # Both mates share one path layout; only the mate number differs.
    filtered = "{0}/20_fastq/prinseq_{1}/{1}_{2}.fastq"
    return (filtered.format(self.args.out, index, 1),
            filtered.format(self.args.out, index, 2))
def align_WGS(self, fastq1, fastq2, index, N_threads):
    """Align a paired-end WGS library with hisat2 and index the BAM.

    The first command pipes ``hisat2`` through ``samtools view -b -F 004``
    into ``samtools sort`` and writes ``{out}/30_bam/<index>.bam``; the
    second runs ``samtools index`` on that file.

    Args:
        fastq1: Path given to ``hisat2 -1``.
        fastq2: Path given to ``hisat2 -2``.
        index: Label used in the BAM and log file names.
        N_threads: Thread count for ``hisat2 -p`` and ``samtools sort -@``.

    On a non-zero exit status of either command ``call_log`` is invoked
    and the process exits with status 1.
    """
    align_cmd = 'hisat2 -1 {0} \
                        -2 {1} \
                        -x {2} \
                        --no-mixed \
                        --no-discordant \
                        --no-spliced-alignment \
                        -k 1 \
                        -p {3} \
                        2> {4}/log/hisat2_{5}.log | \
                 samtools view -b \
                               -F 004 | \
                 samtools sort -@ {3} \
                               -o {4}/30_bam/{5}.bam'.format(fastq1,
                                                             fastq2,
                                                             self.args.ref,
                                                             N_threads,
                                                             self.args.out,
                                                             index)

    index_cmd = 'samtools index {0}/30_bam/{1}.bam \
                 2> {0}/log/samtools_index_{1}.log'.format(self.args.out,
                                                           index)

    steps = (
        ('hisat2_{}'.format(index), clean_cmd(align_cmd)),
        ('samtools_index_{}'.format(index), clean_cmd(index_cmd)),
    )

    # Run the steps in order; the first failure is reported and aborts.
    for log_name, command in steps:
        try:
            sbp.run(command,
                    shell=True,
                    check=True,
                    stdout=sbp.DEVNULL,
                    stderr=sbp.DEVNULL)
        except sbp.CalledProcessError:
            call_log(self.args.out, log_name, command)
            sys.exit(1)
def run(self):
    """Index the reference FASTA with hisat2-build and samtools faidx.

    Progress messages are printed with ``time_stamp()``. On a non-zero
    exit status of either command ``call_log`` is invoked and the process
    exits with status 1.
    """
    print(time_stamp(), 'start to index reference fasta.', flush=True)

    build_cmd = 'hisat2-build -p {0} {1} {1} \
                 > {2}/log/hisat2-build.log \
                 2>&1'.format(self.args.threads,
                              self.args.ref,
                              self.args.out)

    faidx_cmd = 'samtools faidx {} \
                 > {}/log/samtools_faidx.log \
                 2>&1'.format(self.args.ref, self.args.out)

    build_cmd = clean_cmd(build_cmd)
    faidx_cmd = clean_cmd(faidx_cmd)

    quiet = dict(stdout=sbp.DEVNULL, stderr=sbp.DEVNULL)

    print(time_stamp(), 'hisat2-build...', flush=True)

    try:
        sbp.run(build_cmd, shell=True, check=True, **quiet)
    except sbp.CalledProcessError:
        call_log(self.args.out, 'hisat2-build', build_cmd)
        sys.exit(1)

    try:
        sbp.run(faidx_cmd, shell=True, check=True, **quiet)
    except sbp.CalledProcessError:
        call_log(self.args.out, 'samtools_faidx', faidx_cmd)
        sys.exit(1)

    print(time_stamp(),
          'indexing of the reference genome successfully finished.',
          flush=True)
def remove_duplicates(self):
    """Merge the per-source candidate gene files into one de-duplicated file.

    Concatenates ``{out}/candidate_genes_from_*.<ext>``, keeps columns 1-9
    and writes the unique sorted lines to ``{out}/all_candidate_genes.<ext>``,
    where ``<ext>`` is ``self.gff_extension``.

    On a non-zero exit status an error line containing the command is
    printed and the process exits with status 1.
    """
    command = clean_cmd(
        'cat {0}/candidate_genes_from_*.{1} | \
         cut -f 1-9 | \
         sort -u > {0}/all_candidate_genes.{1}'.format(self.args.out,
                                                       self.gff_extension))

    try:
        sbp.run(command,
                shell=True,
                check=True,
                stdout=sbp.DEVNULL,
                stderr=sbp.DEVNULL)
    except sbp.CalledProcessError:
        print(time_stamp(), '!!ERROR!! {}\n'.format(command), flush=True)
        sys.exit(1)
def mkindex(self):
    """Index ``{out}/60_vcf/raiden.vcf.gz`` with tabix.

    Output of tabix is appended to ``{out}/log/tabix.log``. On a non-zero
    exit status ``call_log`` is invoked and the process exits with
    status 1.
    """
    command = clean_cmd('tabix -f \
                               -p vcf \
                               {0}/60_vcf/raiden.vcf.gz \
                               >> {0}/log/tabix.log \
                               2>&1'.format(self.args.out))

    try:
        sbp.run(command,
                shell=True,
                check=True,
                stdout=sbp.DEVNULL,
                stderr=sbp.DEVNULL)
    except sbp.CalledProcessError:
        call_log(self.args.out, 'tabix', command)
        sys.exit(1)
def run_bedtools(self, index):
    """Convert a filtered BAM to a merged BED file.

    Pipes ``bedtools bamtobed`` on ``{out}/40_bed/<index>.no_error.bam``
    into ``bedtools merge`` and writes ``{out}/40_bed/<index>.bed``.

    Args:
        index: Label used in the BAM, BED and log file names.

    On a non-zero exit status ``call_log`` is invoked and the process
    exits with status 1.
    """
    command = clean_cmd(
        'bedtools bamtobed -i {0}/40_bed/{1}.no_error.bam | \
         bedtools merge 1> {0}/40_bed/{1}.bed \
                        2> {0}/log/bedtools_{1}.log'.format(self.args.out,
                                                            index))

    try:
        sbp.run(command,
                shell=True,
                check=True,
                stdout=sbp.DEVNULL,
                stderr=sbp.DEVNULL)
    except sbp.CalledProcessError:
        call_log(self.args.out, 'bedtools_{}'.format(index), command)
        sys.exit(1)
def check_mut_annotation(self):
    """Select transcripts that overlap the filtered markers.

    Runs ``bedtools intersect -wa`` between ``{out}/transcript.<ext>`` and
    ``{out}/filtered_markers.bed`` and writes the unique sorted hits to
    ``{out}/candidate_genes_from_mutations.<ext>``, where ``<ext>`` is
    ``self.gff_extension``.

    On a non-zero exit status ``call_log`` is invoked and the process
    exits with status 1.
    """
    command = clean_cmd(
        'bedtools intersect -wa \
                            -a {0}/transcript.{1} \
                            -b {0}/filtered_markers.bed | \
         sort -u 1> {0}/candidate_genes_from_mutations.{1} \
                 2> {0}/jiji_mut_bedtools.log'.format(self.args.out,
                                                      self.gff_extension))

    try:
        sbp.run(command,
                shell=True,
                check=True,
                stdout=sbp.DEVNULL,
                stderr=sbp.DEVNULL)
    except sbp.CalledProcessError:
        call_log(self.args.out, 'jiji_mut_bedtools', command)
        sys.exit(1)
def run_bamtools(self, index):
    """Filter one BAM with ``bamtools filter``.

    Reads ``{out}/30_bam/<index>.bam``, applies the filter script
    ``{out}/log/bamtools.json`` and writes
    ``{out}/40_bed/<index>.no_error.bam``.

    Args:
        index: Label used in the BAM and log file names.

    On a non-zero exit status ``call_log`` is invoked and the process
    exits with status 1.
    """
    command = clean_cmd(
        'bamtools filter -in {0}/30_bam/{1}.bam \
                         -out {0}/40_bed/{1}.no_error.bam \
                         -script {0}/log/bamtools.json \
                         > {0}/log/bamtools_{1}.log \
                         2>&1'.format(self.args.out, index))

    try:
        sbp.run(command,
                shell=True,
                check=True,
                stdout=sbp.DEVNULL,
                stderr=sbp.DEVNULL)
    except sbp.CalledProcessError:
        call_log(self.args.out, 'bamtools_{}'.format(index), command)
        sys.exit(1)
def make_gff(self):
    """Convert the assembled GTF annotation to GFF with gffread.

    Reads ``{out}/50_annotation/annotation.gtf`` and writes
    ``{out}/50_annotation/annotation.gff``.

    On a non-zero exit status ``call_log`` is invoked and the process
    exits with status 1.
    """
    command = clean_cmd(
        'gffread -F \
                 -o {0}/50_annotation/annotation.gff \
                 {0}/50_annotation/annotation.gtf \
                 > {0}/log/gffread_gff.log \
                 2>&1'.format(self.args.out))

    try:
        sbp.run(command,
                shell=True,
                check=True,
                stdout=sbp.DEVNULL,
                stderr=sbp.DEVNULL)
    except sbp.CalledProcessError:
        call_log(self.args.out, 'gffread_gff', command)
        sys.exit(1)
def run_gffread(self):
    """Extract transcript sequences from the reference with gffread.

    Uses ``self.args.ref`` as the genome (``-g``) and
    ``{out}/50_annotation/annotation.gff`` as the annotation, writing
    ``{out}/50_annotation/annotation.fasta`` (``-w``).

    On a non-zero exit status ``call_log`` is invoked and the process
    exits with status 1.
    """
    command = clean_cmd(
        'gffread -g {0} \
                 -w {1}/50_annotation/annotation.fasta \
                 {1}/50_annotation/annotation.gff \
                 > {1}/log/gffread_fasta.log \
                 2>&1'.format(self.args.ref, self.args.out))

    try:
        sbp.run(command,
                shell=True,
                check=True,
                stdout=sbp.DEVNULL,
                stderr=sbp.DEVNULL)
    except sbp.CalledProcessError:
        call_log(self.args.out, 'gffread_fasta', command)
        sys.exit(1)
def check_PA_coverage(self, index):
    """Compute transcript coverage against one BED file.

    Runs ``bedtools coverage`` with ``{out}/transcript.<ext>`` as ``-a``
    and ``self.bed_files[index]`` as ``-b``, writing
    ``{out}/candidate_genes_from_PA.<index>.bed``.

    Args:
        index: Key into ``self.bed_files``; also used in the output name.

    On a non-zero exit status ``call_log`` is invoked and the process
    exits with status 1.
    """
    command = clean_cmd(
        'bedtools coverage -a {0}/transcript.{1} \
                           -b {2} \
                           1> {0}/candidate_genes_from_PA.{3}.bed \
                           2> {0}/jiji_PA_bedtools.log'.format(
                               self.args.out,
                               self.gff_extension,
                               self.bed_files[index],
                               index))

    try:
        sbp.run(command,
                shell=True,
                check=True,
                stdout=sbp.DEVNULL,
                stderr=sbp.DEVNULL)
    except sbp.CalledProcessError:
        call_log(self.args.out, 'jiji_PA_bedtools', command)
        sys.exit(1)
def seqkit_split2(self):
    """Split the FaQCs-trimmed FASTQ pair into parts with ``seqkit split2``.

    ``self.N_threads`` is used both as the number of parts (``-p``) and
    as the thread count (``-j``). Input and output live in
    ``{out}/20_fastq/FaQCs_<index>/`` with ``<index>`` = ``self.index``.

    On a non-zero exit status ``call_log`` is invoked and the process
    exits with status 1.
    """
    command = clean_cmd(
        'seqkit split2 -p {0} \
                       -j {0} \
                       -1 {1}/20_fastq/FaQCs_{2}/{2}.1.trimmed.fastq \
                       -2 {1}/20_fastq/FaQCs_{2}/{2}.2.trimmed.fastq \
                       -o {1}/20_fastq/FaQCs_{2} \
                       > {1}/log/seqkit_{2}.log \
                       2>&1'.format(self.N_threads,
                                    self.args.out,
                                    self.index))

    try:
        sbp.run(command,
                shell=True,
                check=True,
                stdout=sbp.DEVNULL,
                stderr=sbp.DEVNULL)
    except sbp.CalledProcessError:
        call_log(self.args.out, 'seqkit_{}'.format(self.index), command)
        sys.exit(1)
def filter_candidates(self):
    """Run ``jiji`` on the annotation, BED directory and merged VCF.

    Results go to ``{out}/70_result``. After a successful run the two
    bedtools logs written there are moved into ``{out}/log/``.

    On a non-zero exit status an error line containing the command is
    printed and the process exits with status 1.
    """
    command = clean_cmd(
        'jiji -a {0}/50_annotation/annotation.gff \
              -b {0}/40_bed \
              -v {0}/60_vcf/raiden.vcf.gz \
              -o {0}/70_result'.format(self.args.out))

    try:
        sbp.run(command,
                shell=True,
                check=True,
                stdout=sbp.DEVNULL,
                stderr=sbp.DEVNULL)
    except sbp.CalledProcessError:
        print(time_stamp(), '!!ERROR!! {}\n'.format(command), flush=True)
        sys.exit(1)

    # Relocate the logs from the result directory to the log directory.
    for log_file in ('jiji_PA_bedtools.log', 'jiji_mut_bedtools.log'):
        shutil.move('{0}/70_result/{1}'.format(self.args.out, log_file),
                    '{0}/log/'.format(self.args.out))
def merge_bam(self):
    """Provide ``{out}/50_annotation/RNA-seq.bam`` from the aligned BAMs.

    With at most one entry in ``self.args.rna_seq`` the single BAM
    ``{out}/30_bam/RNA-seq.0000.bam`` is symlinked (by absolute path).
    Otherwise all ``{out}/30_bam/RNA-seq.*.bam`` files are combined with
    ``samtools merge``; on a non-zero exit status ``call_log`` is invoked
    and the process exits with status 1.
    """
    if len(self.args.rna_seq) <= 1:
        # Nothing to merge: link the only BAM into place.
        source = os.path.abspath(
            '{0}/30_bam/RNA-seq.0000.bam'.format(self.args.out))
        target = '{0}/50_annotation/RNA-seq.bam'.format(self.args.out)
        os.symlink(source, target)
        return

    command = clean_cmd(
        'samtools merge {0}/50_annotation/RNA-seq.bam \
                        {0}/30_bam/RNA-seq.*.bam'.format(self.args.out))

    try:
        sbp.run(command,
                shell=True,
                check=True,
                stdout=sbp.DEVNULL,
                stderr=sbp.DEVNULL)
    except sbp.CalledProcessError:
        call_log(self.args.out, 'samtools_merge', command)
        sys.exit(1)
def transciptome_assembly(self):
    """Assemble transcripts from the RNA-seq BAM with stringtie.

    Reads ``{out}/50_annotation/RNA-seq.bam`` and writes
    ``{out}/50_annotation/annotation.gtf``. When ``self.args.strand`` is
    not the string ``'None'`` it is passed to stringtie as ``--<strand>``.

    On a non-zero exit status ``call_log`` is invoked and the process
    exits with status 1.
    """
    if self.args.strand == 'None':
        template = 'stringtie -p {0} \
                              -m {1} \
                              -o {2}/50_annotation/annotation.gtf \
                              -l annotation \
                              -f 0.9 \
                              {2}/50_annotation/RNA-seq.bam \
                              > {2}/log/stringtie.log \
                              2>&1'
    else:
        template = 'stringtie -p {0} \
                              -m {1} \
                              -o {2}/50_annotation/annotation.gtf \
                              -l annotation \
                              --{3} \
                              -f 0.9 \
                              {2}/50_annotation/RNA-seq.bam \
                              > {2}/log/stringtie.log \
                              2>&1'

    # str.format ignores surplus positional arguments, so the strand can
    # be supplied to both templates; only the second one references {3}.
    command = clean_cmd(template.format(self.args.threads,
                                        self.args.minimum_len,
                                        self.args.out,
                                        self.args.strand))

    try:
        sbp.run(command,
                shell=True,
                check=True,
                stdout=sbp.DEVNULL,
                stderr=sbp.DEVNULL)
    except sbp.CalledProcessError:
        call_log(self.args.out, 'stringtie', command)
        sys.exit(1)
def merge_fastq(self):
    """Concatenate the per-part prinseq FASTQ files into single files.

    For each of the four outputs (mate 1, mate 2 and their singletons)
    the files ``<index>.part_*<suffix>.fastq`` in
    ``{out}/20_fastq/prinseq_<index>/`` are joined with ``cat`` into
    ``<index><suffix>.fastq`` in the same directory.

    On a non-zero exit status an error line containing the failing
    command is printed and the process exits with status 1.
    """
    suffixes = ('_1', '_2', '_1_singletons', '_2_singletons')

    # Build and clean every command before running any of them.
    commands = [
        clean_cmd('cat {0}/20_fastq/prinseq_{1}/{1}.part_*{2}.fastq \
                   > {0}/20_fastq/prinseq_{1}/{1}{2}.fastq'.format(
                       self.args.out, self.index, suffix))
        for suffix in suffixes
    ]

    for command in commands:
        try:
            sbp.run(command,
                    shell=True,
                    check=True,
                    stdout=sbp.DEVNULL,
                    stderr=sbp.DEVNULL)
        except sbp.CalledProcessError:
            print(time_stamp(),
                  '!!ERROR!! {}\n'.format(command),
                  flush=True)
            sys.exit(1)
def gzip_FaQCs(self):
    """Compress the FaQCs output FASTQ files with pigz.

    The four candidates are ``<index>.{1,2,discard,unpaired}.trimmed.fastq``
    in ``{out}/20_fastq/FaQCs_<index>/``; a file is compressed only if it
    exists. ``self.N_threads`` is passed to ``pigz -p``.

    On a non-zero exit status an error line containing the failing
    command is printed and the process exits with status 1.
    """
    tags = ('1', '2', 'discard', 'unpaired')

    targets = [
        '{0}/20_fastq/FaQCs_{1}/{1}.{2}.trimmed.fastq'.format(
            self.args.out, self.index, tag)
        for tag in tags
    ]

    # Build and clean every command before running any of them.
    commands = [
        clean_cmd('pigz -p {0} \
                        {1}'.format(self.N_threads, target))
        for target in targets
    ]

    for target, command in zip(targets, commands):
        if not os.path.isfile(target):
            continue

        try:
            sbp.run(command,
                    shell=True,
                    check=True,
                    stdout=sbp.DEVNULL,
                    stderr=sbp.DEVNULL)
        except sbp.CalledProcessError:
            print(time_stamp(),
                  '!!ERROR!! {}\n'.format(command),
                  flush=True)
            sys.exit(1)
def gzip_prinseq(self):
    """Compress the merged prinseq FASTQ files with pigz.

    Compresses ``<index>_1.fastq``, ``<index>_2.fastq`` and the two
    ``*_singletons.fastq`` files in ``{out}/20_fastq/prinseq_<index>/``.
    ``self.N_threads`` is passed to ``pigz -p``.

    On a non-zero exit status an error line containing the failing
    command is printed and the process exits with status 1.
    """
    suffixes = ('_1', '_2', '_1_singletons', '_2_singletons')

    # Build and clean every command before running any of them.
    commands = [
        clean_cmd('pigz -p {0} \
                        {1}/20_fastq/prinseq_{2}/{2}{3}.fastq'.format(
                            self.N_threads,
                            self.args.out,
                            self.index,
                            suffix))
        for suffix in suffixes
    ]

    for command in commands:
        try:
            sbp.run(command,
                    shell=True,
                    check=True,
                    stdout=sbp.DEVNULL,
                    stderr=sbp.DEVNULL)
        except sbp.CalledProcessError:
            print(time_stamp(),
                  '!!ERROR!! {}\n'.format(command),
                  flush=True)
            sys.exit(1)
def concat(self):
    """Merge the per-region VCFs and logs, then remove the pieces.

    Steps, in order:
      1. ``cat`` the ``bcftools.*.log`` and ``tabix.*.log`` files in
         ``{out}/log/`` into ``bcftools.log`` and ``tabix.log``.
      2. ``bcftools concat`` all ``{out}/60_vcf/raiden.*.vcf.gz`` into
         ``{out}/60_vcf/raiden.vcf.gz``.
      3. ``rm -f`` the per-region VCFs, their ``.tbi`` indexes and the
         per-region logs.

    A failing ``bcftools concat`` invokes ``call_log`` and exits with
    status 1. The ``cat`` and ``rm`` calls are not guarded, so a non-zero
    exit status there propagates as ``subprocess.CalledProcessError``.
    """
    out_dir = self.args.out
    quiet = dict(stdout=sbp.DEVNULL, stderr=sbp.DEVNULL)

    gather_logs = [
        'cat {0}/log/bcftools.*.log > {0}/log/bcftools.log'.format(out_dir),
        'cat {0}/log/tabix.*.log > {0}/log/tabix.log'.format(out_dir),
    ]

    concat_cmd = 'bcftools concat -a \
                                  -O z \
                                  -o {0}/60_vcf/raiden.vcf.gz \
                                  {0}/60_vcf/raiden.*.vcf.gz \
                                  >> {0}/log/bcftools.log \
                                  2>&1'.format(out_dir)

    cleanup = [
        'rm -f {}/60_vcf/raiden.*.vcf.gz'.format(out_dir),
        'rm -f {}/60_vcf/raiden.*.vcf.gz.tbi'.format(out_dir),
        'rm -f {}/log/bcftools.*.log'.format(out_dir),
        'rm -f {}/log/tabix.*.log'.format(out_dir),
    ]

    # Clean every command up front, in the same order they were built.
    gather_logs = [clean_cmd(command) for command in gather_logs]
    concat_cmd = clean_cmd(concat_cmd)
    cleanup = [clean_cmd(command) for command in cleanup]

    for command in gather_logs:
        sbp.run(command, shell=True, check=True, **quiet)

    try:
        sbp.run(concat_cmd, shell=True, check=True, **quiet)
    except sbp.CalledProcessError:
        call_log(self.args.out, 'bcftools', concat_cmd)
        sys.exit(1)

    for command in cleanup:
        sbp.run(command, shell=True, check=True, **quiet)