def selectvar(rawvcf, selectvcf, filtervcf, genome, run=True):
    """
    Select SNPs from a raw VCF and filter them with GATK.

    The raw VCF is usually the output of HaplotypeCaller. The command line is
    rendered from the module-level ``selectvar_cmd_script`` template (defined
    outside this function); it is documented as filtering SNPs on INFO and/or
    FORMAT annotations with the criteria
    "QD < 2.0 || FS > 60.0 || MQ < 40.0".

    More details about SelectVariants_ and VariantFiltration_

    .. _SelectVariants: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_variantutils_SelectVariants.php
    .. _VariantFiltration: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_filters_VariantFiltration.php

    Usage:
    ::

        baseq-SNV run_selectvar -r Test.raw.indel.snp.vcf -s Test.raw.snp.vcf -f Test.filtered.snp.vcf -g hg38

    Output files:
    ::

        Test.raw.snp.vcf
        Test.filtered.snp.vcf

    :param rawvcf: input VCF path.
    :param selectvcf: path substituted for the selected-SNP VCF.
    :param filtervcf: path substituted for the filtered VCF.
    :param genome: genome name; the config section ``SNV_ref_<genome>`` is read.
    :param run: when true the command is executed through ``run_cmd``;
        otherwise it is only rendered.
    :return: the rendered command string.
    """
    template_fields = dict(
        gatk=get_config("SNV", "GATK"),
        index=get_config("SNV_ref_" + genome, "bwa_index"),
        rawvcf=rawvcf,
        selectvcf=selectvcf,
        filtervcf=filtervcf,
    )
    selectvar_cmd = selectvar_cmd_script.format(**template_fields)
    if not run:
        return selectvar_cmd
    run_cmd("SelectVariants", "".join(selectvar_cmd))
    return selectvar_cmd
def run_callvar(bqsrbam, rawvcf, genome, disable_dup_filter=False):
    """
    Call germline SNPs and indels via local re-assembly of haplotypes.

    A BAM file recalibrated by BQSR is the recommended input. This function
    only runs single-sample calling. More details: see HaplotypeCaller_

    .. _HaplotypeCaller: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_haplotypecaller_HaplotypeCaller.php

    Usage:
    ::

        baseq-SNV run_callvar -q Test.marked.bqsr.bam -r Test.raw.indel.snp.vcf -g hg38

    Output file:
    ::

        Test.raw.indel.snp.vcf

    :param bqsrbam: input BAM path.
    :param rawvcf: output VCF path.
    :param genome: genome name; the config section ``SNV_ref_<genome>`` is read.
    :param disable_dup_filter: when true,
        ``--disable-read-filter NotDuplicateReadFilter`` is passed to the
        command template as ``extrainfos``.
    :return: the rendered command string (the command is always executed).
    """
    gatk_path = get_config("SNV", "GATK")
    reference = get_config("SNV_ref_" + genome, "bwa_index")
    intervals = get_config("SNV_ref_" + genome, "interval")

    extra_args = (
        "--disable-read-filter NotDuplicateReadFilter" if disable_dup_filter else ""
    )

    callvar_cmd = callvar_cmd_script.format(
        gatk=gatk_path,
        index=reference,
        interval=intervals,
        bqsrbam=bqsrbam,
        rawvcf=rawvcf,
        extrainfos=extra_args,
    )
    run_cmd("call variants", "".join(callvar_cmd))
    return callvar_cmd
def bowtie2_sort(fq1, fq2, bamfile, genome, reads=5*1000*1000, thread=8):
    """
    Align reads with bowtie2, then sort, index and summarise the alignment.

    Three shell commands are issued through ``run_cmd``:

    1. bowtie2 writes ``<bamfile>.sam`` (paired-end when both ``fq1`` and
       ``fq2`` are given, single-end on ``fq1`` otherwise);
    2. ``samtools sort`` then ``samtools index`` produce ``bamfile``, and the
       intermediate SAM is removed;
    3. ``samtools flagstat`` writes ``<bamfile>.stat``.

    :param fq1: first (or only) FASTQ file; required.
    :param fq2: second FASTQ file, or a false value for single-end data.
    :param bamfile: path of the sorted BAM to produce.
    :param genome: passed verbatim to bowtie2 ``-x`` (i.e. the index prefix,
        not a genome name looked up in the config).
    :param reads: passed to bowtie2 ``-u`` (number of reads to align).
    :param thread: thread count for bowtie2 and ``samtools sort``.
    :return: ``bamfile``.
    :raises ValueError: if ``fq1`` is empty or ``None``.
    """
    # Fail early with a clear message instead of a TypeError from str.join
    # (fq1=None) or a malformed "-U  > file" command line (fq1="").
    if not fq1:
        raise ValueError("[error] bowtie2_sort requires at least fq1")

    bowtie2 = get_config("CNV", "bowtie2")
    samtools = get_config("CNV", "samtools")
    samfile = bamfile + ".sam"
    statsfile = bamfile + ".stat"
    print("[info] Bamfile Path : {}".format(bamfile))

    # Run bowtie2
    bowtie_cmd = [bowtie2, '-p', str(thread), '-x', genome, '-u', str(reads)]
    if fq2:
        bowtie_cmd += ['-1', fq1, '-2', fq2]
    else:
        bowtie_cmd += ['-U', fq1]
    bowtie_cmd += ['>', samfile]
    run_cmd("bowtie alignment", " ".join(bowtie_cmd))

    # Sort, index, and only then drop the SAM. The steps are chained with
    # "&&" so a failed sort/index no longer deletes the only copy of the
    # alignment (the previous ";" chaining removed it unconditionally).
    samtools_sort = [
        samtools, 'sort', '-@', str(thread), '-o', bamfile, samfile,
        "&&", samtools, "index", bamfile,
        "&&", "rm", samfile,
    ]
    run_cmd("samtools sort", " ".join(samtools_sort))

    # Alignment statistics
    cmd_stats = [samtools, "flagstat", bamfile, ">", statsfile]
    run_cmd("samtools stats", " ".join(cmd_stats))
    return bamfile
def run_markdup(bamfile, markedbam):
    """
    Run MarkDuplicates of Picard.

    Duplicate reads are tagged in the BAM file. See also MarkDuplicates_ in
    GATK. The command is rendered from the module-level
    ``markdup_cmd_script`` template and executed through ``run_cmd``.

    .. _MarkDuplicates: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/picard_sam_markduplicates_MarkDuplicates.php

    Usage:
    ::

        baseq-SNV run_markdup -b Test.bam -m Test.marked.bam

    Output files (the metrics file reports the numbers of duplicates for both
    single- and paired-end reads):
    ::

        Test.marked.bam
        Test.marked.bam.bai
        Test.marked.bam.metrics

    :param bamfile: input BAM path.
    :param markedbam: output BAM path.
    :return: the rendered command string.
    """
    tools = {
        "java": get_config("SNV", "java"),
        "picard": get_config("SNV", "picard"),
        # NOTE(review): samtools is read from the "RNA" section here, unlike
        # the other SNV tools - confirm this is intended.
        "samtools": get_config("RNA", "samtools"),
    }
    markdup_cmd = markdup_cmd_script.format(
        markedbam=markedbam, bamfile=bamfile, **tools
    )
    run_cmd("Mark duplicates", "".join(markdup_cmd))
    return markdup_cmd
def run_cufflinks(genome, method):
    """
    Build the cufflinks command line for an aligner's sorted BAM.

    The command is only built, never executed; the caller runs it.

    :param genome: genome name; the config section ``RNA_ref_<genome>``
        supplies the annotation passed to ``-G``.
    :param method: aligner that produced the BAM, ``"star"`` or ``"hisat"``.
        It selects the BAM file name expected in the working directory.
    :return: the cufflinks command string.
    :raises ValueError: if ``method`` is not a supported aligner (previously
        this surfaced as an UnboundLocalError on the return statement).
    """
    bam_by_method = {
        "star": "Aligned.sortedByCoord.out.bam",
        "hisat": "hisat2_sorted.bam",
    }
    if method not in bam_by_method:
        raise ValueError(
            "[error] Unsupported method {!r}; expected one of {}".format(
                method, sorted(bam_by_method)))

    cufflinks = get_config("RNA", "cufflinks")
    cufflinks_anno = get_config("RNA_ref_" + genome, "cufflinks_anno")
    return "{0} -q -o ./ -p 8 -G {1} {2}".format(
        cufflinks, cufflinks_anno, bam_by_method[method])
def run_annovar(filtervcf, annovarfile, name, genome, run=True):
    """
    Build (and optionally run) the ANNOVAR command for a filtered VCF.

    The command is rendered from the module-level ``annovar_cmd_script``
    template.

    :param filtervcf: input VCF path.
    :param annovarfile: path substituted as ``annovarfile`` in the template.
    :param name: value substituted as ``name`` in the template.
    :param genome: ``"hg38"``, ``"hg19"`` or ``"hg37"``. ``"hg37"`` is
        treated as ``"hg19"`` (same database, and ``hg19`` is what gets passed
        to the template).
    :param run: when true the command is executed through ``run_cmd``.
    :return: the rendered command string.
    :raises ValueError: for any other genome (previously this surfaced as an
        UnboundLocalError because no database had been selected).
    """
    if genome == "hg38":
        db_key = "annovar_db_hg38"
    elif genome in ("hg37", "hg19"):
        db_key = "annovar_db_hg19"
        genome = "hg19"
    else:
        raise ValueError(
            "[error] Unsupported genome {!r} for annovar; "
            "expected hg38, hg19 or hg37".format(genome))

    annovar = get_config("Annovar", "annovar")
    ref = get_config("Annovar", db_key)
    annovar_cmd = annovar_cmd_script.format(
        annovar=annovar,
        filtervcf=filtervcf,
        annovarfile=annovarfile,
        ref_annovar=ref,
        name=name,
        genome=genome)
    if run:
        run_cmd("convert vcf file to aninput format", annovar_cmd)
    return annovar_cmd
def deseq2(config, tpmfile, countfile, groupfile, comparefile, outpath):
    """
    Run the DESeq2.R script on a TPM/count table pair.

    input:
        config: optional Excel workbook with "sample" and "compare" sheets.
            It is currently only read and printed; the group/compare files
            are not yet generated from it.
        tpmfile: TPM file
        countfile: count file
        groupfile: gives the group name for each sample
            (samplename/groups)
        comparefile: which groups should be compared
            (compare_name/group1/group2)
        outpath: output directory, created if it does not exist
    output:
        results are written by the R script under ``outpath``.

    Exits the process with an error message if the R script cannot be run.
    """
    # NOTE(review): the "XXXXXX" marker looks like leftover debug output.
    print(config, "XXXXXX")
    if config:
        df_cfg = pd.read_excel(config, sheet_name=["sample", "compare"])
        print(df_cfg["sample"])
        print(df_cfg["compare"])
        # TODO: write the sample file and group compare file ...

    Rscript = get_config("RNA", "deseq")
    script = os.path.join(r_script_dir, "DESeq2.R")
    cmd = "{} {} {} {} {} {} {}".format(
        Rscript, script, tpmfile, countfile, groupfile, comparefile, outpath)

    if not os.path.exists(outpath):
        os.mkdir(outpath)
        print("[info] Create OutDir {}".format(outpath))

    try:
        run_cmd("DESeq2", cmd)
    except Exception as err:
        # Only real errors are converted; KeyboardInterrupt/SystemExit now
        # propagate untouched, and the underlying cause is reported.
        sys.exit("[error] Failed to run the DESeq2 Rscript ... ({})".format(err))
def run_createsomatic_pon(path_pon, path):
    """
    Create ``pon.vcf.gz`` (panel of normals) under ``path``.

    :param path_pon: directory handed to ``listofvcf`` to collect the normal
        VCF arguments.
    :param path: directory in which ``pon.vcf.gz`` is written.
    """
    output_vcf = os.path.join(path, "pon.vcf.gz")
    pon_cmd = pon_cmd_script.format(
        gatk=get_config("SNV", "GATK"),
        normalvcfs=listofvcf(path_pon),
        ponvcf=output_vcf,
    )
    run_cmd("create panel of normals", "".join(pon_cmd))
def alignment(fq1, fq2, sample, genome, thread=8):
    """
    Map low-divergent sequences against a reference genome using BWA.

    A ReadGroup (more details about ReadGroup_) is added to the BAM file
    using the input sample name. The output is in BAM format and indexed for
    downstream analysis.

    .. _ReadGroup: https://software.broadinstitute.org/gatk/documentation/article.php?id=6472

    Usage:
    ::

        baseq-SNV run_bwa -1 Reads.1.fq.gz -2 Read.2.fq.gz -g hg38 -n Test

    Output files:
    ::

        Test.bam
        Test.bam.bai

    :param fq1: first (or only) FASTQ file; must exist.
    :param fq2: second FASTQ file. If it is empty or does not exist the
        reads are aligned single-end from ``fq1``.
    :param sample: sample name, used for the output file names.
    :param genome: genome name; the config section ``SNV_ref_<genome>``
        supplies the BWA index.
    :param thread: number of threads passed to the BWA command template.
    :return: the BWA command and the sort/index command, joined by a newline.
    :raises ValueError: if ``fq1`` is missing or does not exist (previously
        this surfaced as an UnboundLocalError when running the command).
    """
    # Validate inputs before touching the config or running anything.
    if not (fq1 and os.path.exists(fq1)):
        raise ValueError(
            "[error] fq1 is missing or does not exist: {}".format(fq1))
    paired = bool(fq2) and os.path.exists(fq2)

    bwa = get_config("SNV", "bwa")
    samtools = get_config("SNV", "samtools")
    genome = get_config("SNV_ref_" + genome, "bwa_index")
    viewedbam = sample + ".view.bam"
    samfile = sample + ".sam"

    if paired:
        bwa_cmd = bwa_cmd_script_p.format(
            bwa=bwa, sample=sample, genome=genome, fq1=fq1, fq2=fq2,
            samfile=samfile, thread=thread)
    else:
        bwa_cmd = bwa_cmd_script_s.format(
            bwa=bwa, sample=sample, genome=genome, fq1=fq1,
            samfile=samfile, thread=thread)

    sort_index_cmd = sort_index_cmd_script.format(
        samtools=samtools, sample=sample, samfile=samfile,
        viewedbam=viewedbam)

    run_cmd("bwa alignment", bwa_cmd)
    run_cmd("samtools sort", sort_index_cmd)
    return bwa_cmd + "\n" + sort_index_cmd
def filter_mutect_vcf(somaticvcf, calcontam_table, filter_call):
    """
    Filter Mutect2 calls using a contamination table.

    The command is rendered from the module-level ``filtercall_cmd_script``
    template and executed through ``run_cmd``.

    :param somaticvcf: value substituted as ``somaticvcf`` in the template.
    :param calcontam_table: value substituted as ``calcontam_table``.
    :param filter_call: value substituted as ``filter_call``.
    """
    template_fields = dict(
        gatk=get_config("SNV", "GATK"),
        somaticvcf=somaticvcf,
        calcontam_table=calcontam_table,
        filter_call=filter_call,
    )
    filtercall_cmd = filtercall_cmd_script.format(**template_fields)
    run_cmd(
        "filter mutect calls using contamination table",
        "".join(filtercall_cmd),
    )
def run_salmon(fq1, fq2, genome, outdir):
    """
    Quantify reads with ``salmon quant``.

    Paired-end mode is used when both FASTQ files are given and exist,
    single-end mode when only ``fq1`` exists; otherwise the process exits.

    :param fq1: first (or only) FASTQ file.
    :param fq2: second FASTQ file, or a false value for single-end data.
    :param genome: genome name; the config section ``RNA_ref_<genome>``
        supplies the salmon index and the gene map.
    :param outdir: output directory passed to ``-o``.
    :return: the command as a list of arguments (it is joined with spaces
        before being executed).
    """
    salmon = get_config("RNA", "salmon")
    salmon_ref = get_config("RNA_ref_" + genome, "salmon_index")
    gene_map = get_config("RNA_ref_" + genome, "gene_map")

    have_fq1 = fq1 and os.path.exists(fq1)
    if fq1 and fq2 and os.path.exists(fq1) and os.path.exists(fq2):
        read_args = ['-1', fq1, '-2', fq2]
    elif have_fq1:
        read_args = ['-r', fq1]
    else:
        sys.exit("[error]")

    leading = [salmon, 'quant', '-i', salmon_ref, '-l A']
    trailing = ['-p 8', '-g', gene_map, '-o', outdir]
    salmon_cmd = leading + read_args + trailing

    run_cmd("Salmon Quantification", " ".join(salmon_cmd))
    return salmon_cmd
def get_filter_table(tumorbam, resource, gps_table, calcontam_table):
    """
    Obtain the table used to filter Mutect2 calls.

    The command is rendered from the module-level ``filtertable_cmd_script``
    template and executed through ``run_cmd``.

    :param tumorbam: value substituted as ``tumorbam`` in the template.
    :param resource: value substituted as ``resource``.
    :param gps_table: value substituted as ``gps_table``.
    :param calcontam_table: value substituted as ``calcontam_table``.
    """
    template_fields = dict(
        gatk=get_config("SNV", "GATK"),
        tumorbam=tumorbam,
        resource=resource,
        gps_table=gps_table,
        calcontam_table=calcontam_table,
    )
    filtertable_cmd = filtertable_cmd_script.format(**template_fields)
    run_cmd("obatin filter table for mutect calls", "".join(filtertable_cmd))
def run_star(fq1, fq2, genome, outdir, run=True):
    """
    Build (and optionally run) the STAR alignment and cufflinks commands.

    Paired-end mode is used when both FASTQ files are given and exist,
    single-end mode when only ``fq1`` exists. ``outdir`` is created if it
    does not exist.

    :param fq1: first (or only) FASTQ file; must exist.
    :param fq2: second FASTQ file. If it is empty or does not exist the
        reads are aligned single-end from ``fq1``.
    :param genome: genome name; the config section ``RNA_ref_<genome>``
        supplies the STAR index.
    :param outdir: output directory.
    :param run: when true both commands are executed through ``run_cmd``.
    :return: the STAR command and the cufflinks command, joined by a newline.
    :raises ValueError: if ``fq1`` is missing or does not exist (previously
        this surfaced as an UnboundLocalError after the output directory had
        already been created).
    """
    # Validate inputs first so nothing is created on bad input.
    if not (fq1 and os.path.exists(fq1)):
        raise ValueError(
            "[error] fq1 is missing or does not exist: {}".format(fq1))
    paired = bool(fq2) and os.path.exists(fq2)

    star = get_config("RNA", "star")
    star_index = get_config("RNA_ref_" + genome, "star_index")
    samtools = get_config("RNA", "samtools")

    if not os.path.exists(outdir):
        os.mkdir(outdir)
        print("[info] Create outdir in: {}".format(outdir))

    if paired:
        star_cmd = script.format(outdir, star, star_index, fq1, fq2, samtools)
    else:
        star_cmd = script1.format(outdir, star, star_index, fq1, samtools)

    cufflinks_cmd = run_cufflinks(genome, method="star")

    if run:
        run_cmd("star analysis", star_cmd)
        run_cmd("cufflinks analysis", cufflinks_cmd)
    return star_cmd + "\n" + cufflinks_cmd
def bqsr(markedbam, bqsrbam, genome, disable_dup_filter=False):
    """
    Run BQSR_ (base quality score recalibration).

    This is a two-step process: the first step generates a recalibration
    table based on various covariates; the second step uses that table to
    correct the systematic bias in the input BAM file. More details about
    BaseRecalibrator_ and ApplyBQSR_ .

    .. _BQSR: https://gatkforums.broadinstitute.org/gatk/discussion/44/base-quality-score-recalibration-bqsr
    .. _BaseRecalibrator: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_BaseRecalibrator.php
    .. _ApplyBQSR: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_ApplyBQSR.php

    Usage:

    * Default mode filters duplicate reads (reads with "markduplicate" tags)
      before applying BQSR
      ::

          baseq-SNV run_bqsr -m Test.marked.bam -g hg38 -q Test.marked.bqsr.bam

    * Disable the read filter before analysis.
      ::

          baseq-SNV run_bqsr -m Test.marked.bam -g hg38 -q Test.marked.bqsr.bam -f Yes

    Output files:
    ::

        Test.marked.bam.table
        Test.marked.bqsr.bai
        Test.marked.bqsr.bam

    :param markedbam: input BAM path.
    :param bqsrbam: output BAM path.
    :param genome: genome name; the config section ``SNV_ref_<genome>`` is read.
    :param disable_dup_filter: when true the ``bqsr_cmd_script_DRF`` template
        is used instead of ``bqsr_cmd_script``.
    """
    ref_section = "SNV_ref_" + genome
    template_fields = dict(
        gatk=get_config("SNV", "GATK"),
        index=get_config(ref_section, "bwa_index"),
        dbsnp=get_config(ref_section, "DBSNP"),
        snp=get_config(ref_section, "SNP"),
        indel=get_config(ref_section, "INDEL"),
        interval=get_config(ref_section, "interval"),
        markedbam=markedbam,
        bqsrbam=bqsrbam,
    )

    # Only the selected template name is evaluated, as before.
    template = bqsr_cmd_script_DRF if disable_dup_filter else bqsr_cmd_script
    bqsr_cmd = template.format(**template_fields)
    run_cmd("BaseRecalibrator", "".join(bqsr_cmd))
def bin_counting(genome, bamfile, out):
    """
    Count reads per dynamic bin, locating each read's bin with bisect.

    The bin table (tab-separated, with at least ``chr``, ``start`` and
    ``end`` columns) is read from the ``dynamic_bin`` entry of the config
    section ``CNV_ref_<genome>``. Reads are streamed from
    ``samtools view <bamfile>``; reads whose fifth SAM field (MAPQ) is below
    40 are skipped. A read is counted in the bin of its chromosome whose
    ``start <= pos <= end``.

    :param genome: genome name used to look up the bin table.
    :param bamfile: BAM file to count.
    :param out: path of the tab-separated output holding the ``counts``
        column, one row per bin in bin-table order.
    :raises RuntimeError: if ``samtools view`` exits with a non-zero status.
    """
    import bisect

    dynamic_bin = get_config("CNV_ref_" + genome, "dynamic_bin")
    df_bin = pd.read_table(dynamic_bin, sep="\t")

    # Per-chromosome bin boundaries plus the row index of each bin in df_bin.
    # bisect assumes the starts of each chromosome appear in ascending order
    # in the bin table - TODO confirm for custom bin files.
    bin_starts = {}
    bin_ends = {}
    bin_rows = {}
    columns = zip(df_bin["chr"].tolist(),
                  df_bin["start"].tolist(),
                  df_bin["end"].tolist())
    for row, (chrom, start, end) in enumerate(columns):
        bin_starts.setdefault(chrom, []).append(start)
        bin_ends.setdefault(chrom, []).append(end)
        bin_rows.setdefault(chrom, []).append(row)

    counts = [0] * len(df_bin)

    # The context manager closes the pipe and waits for samtools, so the
    # child process is always reaped.
    with Popen(["samtools", "view", bamfile], stdout=PIPE,
               bufsize=1000000) as reading:
        for raw in reading.stdout:
            infos = raw.decode('utf8').split()
            if len(infos) < 5:
                # Not a complete alignment record; nothing to count.
                continue

            # Filter on quality
            if int(infos[4]) < 40:
                continue

            chrom = infos[2]
            if chrom not in bin_starts:
                continue
            pos = int(infos[3])

            # bisect_right returns the insertion point, i.e. the index of the
            # first bin starting *after* pos; the bin that can contain pos is
            # the one just before it. (Using the insertion point directly
            # credited every read to the following bin.)
            slot = bisect.bisect_right(bin_starts[chrom], pos) - 1
            if slot >= 0 and bin_ends[chrom][slot] >= pos:
                counts[bin_rows[chrom][slot]] += 1

    if reading.returncode != 0:
        raise RuntimeError(
            "[error] samtools view failed on {} (exit code {})".format(
                bamfile, reading.returncode))

    df_bin["counts"] = counts
    print("[info] Bin Count File: {}".format(out))
    df_bin['counts'].to_csv(out, header=True, sep="\t")
def create_pon(genome, list, path, interval):
    """
    Create the PoN (panel of normals) file needed for Mutect2 calling.

    The PoN captures common artifactual and germline variant sites. Mutect2
    then uses the PoN to filter variants at the site level.

    Example of a samples list (whitespace delimited):

    * The columns are normal sample name, normal BAM file, tumor sample name,
      tumor BAM file (the order must not be changed). Use absolute paths for
      the BAM files if they are not in the analysis directory. Only the first
      two columns are read here; blank lines are ignored.
      ::

          N504    N504_marked_bqsr.bam    T504    T504_marked_bqsr.bam
          N505    N505_marked_bqsr.bam    T505    T505_marked_bqsr.bam
          N506    N506_marked_bqsr.bam    T506    T506_marked_bqsr.bam
          N509    N509_marked_bqsr.bam    T509    T509_marked_bqsr.bam
          N510    N510_marked_bqsr.bam    T510    T510_marked_bqsr.bam

    Usage:

    * The interval list defines the genomic regions the analysis is
      restricted to. For the interval list format and its function, please
      see here_.
      ::

          # designate an intervals.list
          baseq-SNV create_pon -g hg37 -l sample_list.txt -p ./ -L interval.list
          # use the default intervals.list
          baseq-SNV create_pon -g hg37 -l sample_list.txt -p ./

    .. _here: https://software.broadinstitute.org/gatk/documentation/article?id=11009

    :param genome: genome name; the config section ``SNV_ref_<genome>`` is read.
    :param list: path of the samples list file (the name shadows the builtin
        but is kept because it is part of the public signature).
    :param path: existing output directory. Per-sample VCFs go to
        ``<path>/pon`` and the panel to ``<path>/pon.vcf.gz``.
    :param interval: interval list path, or a false value to use the command
        template without an interval.
    :return: ``None``. If ``path`` does not exist an error is printed and
        nothing is run.
    """
    if not os.path.exists(path):
        print("[ERROR] No such file or directory")
        return

    index = get_config("SNV_ref_" + genome, "bwa_index")
    gatk = get_config("SNV", "GATK")

    path_pon = os.path.join(path, "pon")
    if not os.path.exists(path_pon):
        os.mkdir(path_pon)

    # Skip blank lines (e.g. a trailing newline at the end of the file),
    # which previously produced an empty record and an IndexError below.
    with open(list, "r") as handle:
        sample_info = [line.split() for line in handle if line.strip()]

    import multiprocessing as mp
    pool = mp.Pool(processes=6)
    results = []
    for sample in sample_info:
        normalvcf = os.path.join(
            path_pon, "{}_tumor-only.vcf.gz".format(sample[0]))
        if interval:
            normalvcf_cmd = normalvcf_cmd_script.format(
                gatk=gatk, index=index, normalbam=sample[1],
                samplename=sample[0], normalvcf=normalvcf,
                interval=interval)
        else:
            normalvcf_cmd = normalvcf_cmd_script1.format(
                gatk=gatk, index=index, normalbam=sample[1],
                samplename=sample[0], normalvcf=normalvcf)
        results.append(
            pool.apply_async(run_cmd, ("creat normal vcf file", normalvcf_cmd)))
    pool.close()
    pool.join()
    # Collect the results so an exception raised in a worker is re-raised here.
    for result in results:
        result.get()

    normalargs = listofvcf(path_pon)
    ponvcf = os.path.join(path, "pon.vcf.gz")
    pon_cmd = pon_cmd_script.format(
        gatk=gatk, normalvcfs=normalargs, ponvcf=ponvcf)
    run_cmd("create panel of normals", pon_cmd)
def mutect2(genome, normalname, normalbam, tumorname, tumorbam, vcffile, pon, germline):
    """
    Call somatic SNVs and indels with Mutect2 via local assembly of haplotypes.

    This function requires both the tumor BAM file and its matched normal BAM
    file. ``tumorname`` and ``normalname`` should be consistent with the
    ReadGroup (ID) of the tumor and normal BAM files respectively. PoN refers
    to the panel-of-normals callset (for more information about the PoN and
    how to create it, please see PoN_ ). The germline resource, also in VCF
    format, is used to annotate variant alleles. The default germline
    resource is downloaded from here_ .

    .. _here: https://software.broadinstitute.org/gatk/download/bundle
    .. _PoN: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_mutect_CreateSomaticPanelOfNormals.php

    Usage:

    * Simplified Mutect2 command line
      ::

          # single sample
          baseq-SNV run_mutect2 -g hg37 -n normal -N normal_marked_bqsr.bam \\
                                -t tumor -T tumor_marked_bqsr.bam -o ./
          # multiple samples
          baseq-SNV run_mutect2 -g hg37 -l sample_list.txt -o ./

    * Specify the PoN (panel of normals) VCF file and the germline VCF file.
      The default germline VCF file comes from the GATK resource bundle and
      is used if ``germline`` isn't designated.
      ::

          # single sample
          baseq-SNV run_mutect2 -g hg37 -n normal -N normal_marked_bqsr.bam \\
                                -t tumor -T tumor_marked_bqsr.bam -o ./ \\
                                -p pon.vcf.gz -G af-only-gnomad.raw.sites.b37.vcf.gz
          # multiple samples
          baseq-SNV run_mutect2 -g hg37 -l sample_list.txt -o ./ \\
                                -p pon.vcf.gz -G af-only-gnomad.raw.sites.b37.vcf.gz

    Template selection: with a ``pon`` the standard template is used (and
    ``germline`` falls back to the configured default when not given);
    without a ``pon`` the simplified template is used and ``germline`` is
    not passed to it.
    """
    shared_fields = dict(
        gatk=get_config("SNV", "GATK"),
        index=get_config("SNV_ref_" + genome, "bwa_index"),
        normalbam=normalbam,
        normalname=normalname,
        tumorbam=tumorbam,
        tumorname=tumorname,
        vcffile=vcffile,
    )

    if not pon:
        mutect2_cmd = mutect2_cmd_simplified_script.format(**shared_fields)
    else:
        if not germline:
            germline = get_config("SNV_ref_" + genome, "germline")
        mutect2_cmd = mutect2_cmd_stardand_script.format(
            germline=germline, pon=pon, **shared_fields)

    run_cmd("mutect annlysis", "".join(mutect2_cmd))
def Normalize_GC_py(genome, bincount, outpath):
    """Normalize raw bin counts by bin length and GC content.

    Args:
        genome : genome name; the ``dynamic_bin`` table is looked up in the
            config section ``CNV_ref_<genome>``
        bincount : tab-separated file with a ``counts`` column (raw read
            counts per bin)
        outpath : path of the tab-separated output table

    Process:
        Every 10 consecutive bins are aggregated (reported as 500kb);
        Counts are divided by bin length and by their mean;
        The log of the normalized counts is corrected for GC with a Lowess
        regression;
        A ploidy between 1.5 and 5.975 (steps of 0.025) is chosen so that the
        scaled values are closest to integers.

    Output:
        ``outpath``: chr, start, absstart, norm_by_GC, norm_by_GC_Ploidy
        ``Normalize.png``: diagnostic plots, written to the working directory
    """
    bin_table_path = get_config("CNV_ref_"+genome, "dynamic_bin")
    raw_counts = pd.read_table(bincount)
    df = pd.read_table(bin_table_path)
    df['counts'] = raw_counts['counts']

    # First, aggregate the bins (groups of 10 rows)
    print("[info] Aggregate the bins into 500kb...")
    aggregation = {
        "chr": "first",
        "start": "mean",
        "absstart": "mean",
        "GC": "mean",
        "length": "sum",
        "counts": "sum",
    }
    df = df.groupby(df.index // 10).agg(aggregation)

    # Normalize on length, then by the mean
    df['norm_counts'] = df['counts'] / df['length']
    df['norm_counts'] = df['norm_counts'] / df['norm_counts'].mean()
    df['norm_counts_log'] = np.log(df['norm_counts'] + 0.01)

    # Normalize on GC with a lowess fit of log counts against GC
    fitted = sm.nonparametric.lowess(df['norm_counts_log'], df['GC'])
    fitted_columns = list(zip(*fitted))
    gc_trend = interp1d(fitted_columns[0], fitted_columns[1], bounds_error=False)
    df['norm_by_GC'] = np.exp(df['norm_counts_log'] - gc_trend(df['GC'])) - 0.01
    df['norm_by_GC'] = df['norm_by_GC'] / df['norm_by_GC'].mean()

    # Plot the GC correction
    plt.figure(figsize=(10, 10))

    plt.subplot(2, 2, 1)
    plt.title('Raw Normalized Reads (500Kb)')
    plt.scatter(df['GC'], df['norm_counts'], facecolors='none', edgecolors='r')

    plt.subplot(2, 2, 2)
    plt.title('GC corrected (500Kb)')
    plt.scatter(df['GC'], df['norm_by_GC'], facecolors='none', edgecolors='b')

    plt.subplot(2, 2, 3)
    plt.title('Peaks')
    plt.hist(df['norm_by_GC'], bins=300, density=True)

    # Ploidy estimation: sum of squared distances to the nearest integer
    Ploidy_Lists = [step / 40 for step in range(60, 240, 1)]
    SoS = []
    for ploidy in Ploidy_Lists:
        scaled = df['norm_by_GC'] * ploidy
        errors = round(scaled) - scaled
        SoS.append(sum([err * err for err in errors]))
    best_position = SoS.index(min(SoS))
    estimate_ploidy = Ploidy_Lists[best_position]
    print("[info] The estimated plodity is {}".format(estimate_ploidy))

    plt.subplot(2, 2, 4)
    plt.title('Plodity Estimate: {}'.format(estimate_ploidy))
    plt.ylabel("Errors (Should be Minimized)")
    plt.xlabel("Ploidy")
    plt.plot(Ploidy_Lists, SoS)

    df['norm_by_GC_Ploidy'] = df['norm_by_GC'] * estimate_ploidy
    export_columns = ['chr', 'start', 'absstart', 'norm_by_GC', 'norm_by_GC_Ploidy']
    df_export = df[export_columns]
    print("[info] Write to {}".format(outpath))
    df_export.to_csv(outpath, sep="\t", float_format='%.2f')
    plt.savefig("Normalize.png")
    print(df)