def genotype_call(reference, vcf, output_file="./combined.vcf", ploidy=2):
    """Run ``gatk GenotypeGVCFs`` on a VCF and write the result to *output_file*.

    The directory that will hold *output_file* is created with ``mkdir``
    before GATK is started.

    :param reference: path to the reference FASTA, passed as ``-R``
    :param vcf: path to the input VCF, passed as ``-V``
    :param output_file: path handed to GATK as ``-O`` (used as given)
    :param ploidy: value forwarded to ``-ploidy``
    :return: None
    :raises AssertionError: if the output directory, the reference or the
        VCF does not exist
    """
    out_dir = os.path.dirname(os.path.abspath(output_file)) + "/"
    reference = os.path.abspath(reference)
    vcf = os.path.abspath(vcf)

    mkdir(out_dir)
    assert os.path.exists(out_dir), f'{out_dir} could not be created'
    for required in (reference, vcf):
        assert os.path.exists(required), f'{required} does not exist'

    genotype_cmd = f"""gatk GenotypeGVCFs \
        -R "{reference}" -ploidy {ploidy} \
        -V "{vcf}" \
        -O "{output_file}"
    """
    e(genotype_cmd)
def variant_call(wd, reference, alignment, ploidy=2):
    """Call variants with GATK: HaplotypeCaller (GVCF mode), then GenotypeGVCFs.

    Two files are written inside *wd*: ``raw.g.vcf.gz`` (HaplotypeCaller
    output) and ``output.vcf.gz`` (GenotypeGVCFs output).

    :param wd: working directory; created with ``mkdir``
    :param reference: path to the reference FASTA, passed as ``-R``
    :param alignment: path to the alignment file, passed as ``-I``
    :param ploidy: value forwarded to ``-ploidy`` in both GATK calls
    :return: None
    :raises AssertionError: if *wd*, the reference or the alignment does
        not exist
    """
    wd = os.path.abspath(wd) + "/"
    reference = os.path.abspath(reference)
    alignment = os.path.abspath(alignment)

    mkdir(wd)
    assert os.path.exists(wd), f'{wd} could not be created'
    for required in (reference, alignment):
        assert os.path.exists(required), f'{required} does not exist'

    raw_gvcf = f"{wd}raw.g.vcf.gz"
    genotyped_vcf = f"{wd}output.vcf.gz"

    # Step 1: per-sample calling in GVCF mode.
    e(f"""gatk HaplotypeCaller -ERC GVCF \
        -R "{reference}" -ploidy {ploidy} \
        -I "{alignment}" --output-mode EMIT_ALL_CONFIDENT_SITES \
        -O "{raw_gvcf}"
    """)
    # Step 2: genotype the GVCF produced above.
    e(f"""gatk GenotypeGVCFs \
        -R "{reference}" -ploidy {ploidy} \
        -V "{raw_gvcf}" \
        -O "{genotyped_vcf}"
    """)
def init_ref(path):
    """Build the index files for a reference FASTA next to the file itself.

    Runs, from the directory that contains *path*: ``bwa index``, Picard
    ``CreateSequenceDictionary``, ``samtools faidx`` and ``makeblastdb``.

    :param path: path to the reference FASTA; may be relative or a bare
        file name
    :return: None
    :raises AssertionError: if the ``PICARD`` environment variable is not set
    """
    assert "PICARD" in os.environ, "PICARD environment variable not configured"
    # Resolve the path before splitting it: for a bare file name
    # os.path.dirname() returns "", and the resulting "cd ;" would send the
    # shell to $HOME, so the tools would look for the FASTA in the wrong place.
    path = os.path.abspath(path)
    filename = os.path.basename(path)
    workdir = os.path.dirname(path)
    e("cd {workdir};bwa index -a is {record_name}",
      record_name=filename,
      workdir=workdir)
    e("cd {workdir};java -jar $PICARD CreateSequenceDictionary R={record_name} O={record_name}.dict",
      record_name=filename,
      workdir=workdir)
    e("cd {workdir};samtools faidx {record_name}",
      record_name=filename,
      workdir=workdir)
    # bowtie2 indexing is currently disabled:
    # e("bowtie2-build {record_name} {record_name}".format(record_name=filename))
    e("cd {workdir};makeblastdb -dbtype nucl -in {record_name} ",
      record_name=filename,
      workdir=workdir)
def clean_reads(work_dir, read1, read2, trim_left=20, trim_qual_right=25,
                trim_qual_window=25, min_len=35, window_size=5, cpu=1,
                clip_string=""):
    """Trim a read pair with Trimmomatic PE and run FastQC on the paired output.

    Every command is issued through ``e`` with *work_dir* as second argument.
    The paired outputs are named ``trimmed_1.fastq`` and ``trimmed_2.fastq``.
    Any singleton file found in *work_dir* afterwards is appended to
    ``trimmed_s.fastq`` and then deleted.

    :param work_dir: directory where the outputs are expected
    :param read1: path to the forward reads
    :param read2: path to the reverse reads
    :param trim_left: value used for ``HEADCROP``
    :param trim_qual_right: value used for ``TRAILING``
    :param trim_qual_window: quality value of ``SLIDINGWINDOW``
    :param min_len: value used for ``MINLEN``
    :param window_size: window length of ``SLIDINGWINDOW``
    :param cpu: value used for ``-threads``
    :param clip_string: extra step inserted before ``MINLEN``,
        example -> ILLUMINACLIP:TruSeq3-SE:2:30:10
    :return: None
    """
    work_dir = os.path.abspath(work_dir) + "/"
    singleton_files = ("trimmed_1_singletons.fastq",
                       "trimmed_2_singletons.fastq")

    # Quality control
    # "prinseq-lite.pl -fastq {read1_full} -fastq2 {read2_full} -min_qual_mean {min_qual_mean}" +
    # " -trim_left {trim_left} -trim_qual_right {trim_qual_right} -trim_qual_window {trim_qual_window}" +
    # " -min_len {min_len} -out_good trimmed",
    trimmomatic_cmd = (
        'java -jar $TRIMMOMATIC PE -threads {cpu} "{read1_full}" "{read2_full}" '
        ' {pout1} {upout1} {pout2} {upout2} '
        ' HEADCROP:{trim_left} TRAILING:{trim_qual_right} SLIDINGWINDOW:{window_size}:{trim_qual_window} '
        ' {clip_string} MINLEN:{min_len} ')
    placeholders = dict(
        read1_full=os.path.abspath(read1),
        read2_full=os.path.abspath(read2),
        trim_left=trim_left,
        trim_qual_right=trim_qual_right,
        trim_qual_window=trim_qual_window,
        min_len=min_len,
        cpu=cpu,
        pout1="trimmed_1.fastq",
        pout2="trimmed_2.fastq",
        upout1=singleton_files[0],
        upout2=singleton_files[1],
        window_size=window_size,
        clip_string=clip_string,
    )
    e(trimmomatic_cmd, work_dir, **placeholders)

    for paired_output in ("trimmed_1.fastq", "trimmed_2.fastq"):
        e("fastqc " + paired_output, work_dir)

    # Pool the reads that lost their mate into a single file.
    for leftover in singleton_files:
        if os.path.exists(work_dir + leftover):
            e("cat " + leftover + " >> trimmed_s.fastq", work_dir)
            os.remove(work_dir + leftover)
def process_strain(ref_fasta, strain, read_paths, work_dir):
    """
    Map the paired and single reads of one strain and build ``final_bwa.bam``.

    The function changes into *work_dir* (created if missing) for the whole
    run and restores the previous working directory in a ``finally`` clause.
    All intermediate and final files are referenced by bare file name, and
    *ref_fasta* / *read_paths* are used after the ``chdir``.

    Steps, each skipped when its output is already present:
      1. ``bwa mem`` on the paired reads, split into mapped / unmapped BAMs,
         unmapped reads exported to FASTQ, mapped BAM sent to
         ``Mapping.realign``.
      2. The same for the single reads.
      3. Merge both BAMs, name-sort, ``fixmate``, coordinate-sort and index
         into ``final_bwa.bam``; intermediates are removed.
      4. ``samtools flagstat`` written to ``flagstat.txt``.

    :param ref_fasta: reference FASTA handed to ``bwa mem`` and ``Mapping.realign``
    :param strain: used for the ``SM`` and ``LB`` fields of the read group
    :param read_paths: tuple with paths of (path_r1,path_r2,path_singles)
    :param work_dir: directory where everything is written
    :return: None
    """
    out_bwa_bam = "final_bwa.bam"
    out_bwa_bam_idx = out_bwa_bam + ".bai"
    cwd = os.getcwd()
    if not os.path.exists(work_dir):
        os.makedirs(work_dir)
    try:
        os.chdir(work_dir)

        # --- paired-end reads ---
        out_bwa_pe = "bwa_pe.sam"
        out_bwa_pe_bam = "bwa_pe.bam"
        out_unmapped_pe_bam = "unmapped.bam"
        # Skip when the final index exists or the paired BAM is already there.
        if not os.path.exists(out_bwa_bam_idx) and not os.path.exists(
                out_bwa_pe_bam):
            e('bwa mem -R "@RG\\tID:illumina\\tSM:{ncepa}\\tLB:{ncepa}" {ref_fasta} {pe1} {pe2} > '
              + out_bwa_pe,
              ref_fasta=ref_fasta,
              ncepa=strain,
              pe1=read_paths[0],
              pe2=read_paths[1])
            # -F 4 goes to the "mapped" BAM, -f 4 to the "unmapped" one.
            e("samtools view -F 4 -Sbh %s > %s" % (out_bwa_pe, out_bwa_pe_bam))
            e("samtools view -f 4 -Sbh %s > %s" %
              (out_bwa_pe, out_unmapped_pe_bam))
            unmapped_pair_1 = "unmapped_pair_1.fastq"
            unmapped_pair_2 = "unmapped_pair_2.fastq"
            e("bedtools bamtofastq -i {ubam} -fq {upair} -fq2 {upair2}",
              upair2=unmapped_pair_2,
              upair=unmapped_pair_1,
              ubam=out_unmapped_pe_bam)
            # From here on the variable holds the name returned by realign().
            out_bwa_pe_bam = Mapping.realign(out_bwa_pe_bam, ref_fasta)
            # NOTE(review): nesting of this cleanup under the `if` is assumed
            # - confirm against the properly indented source.
            for x in [out_bwa_pe, out_unmapped_pe_bam]:
                if os.path.exists(x):
                    os.remove(x)

        # --- single reads ---
        out_bwa_se = "bwa_se.sam"
        out_bwa_se_bam = "bwa_se.bam"
        out_unmapped_se_bam = "unmapped_se.bam"
        if not os.path.exists(out_bwa_bam_idx) and not os.path.exists(
                out_bwa_se_bam):
            e('bwa mem -R "@RG\\tID:illumina\\tSM:{ncepa}\\tLB:{ncepa}" {ref_fasta} {s1} > '
              + out_bwa_se,
              ref_fasta=ref_fasta,
              ncepa=strain,
              s1=read_paths[2])
            e("samtools view -F 4 -Sbh %s > %s" % (out_bwa_se, out_bwa_se_bam))
            e("samtools view -f 4 -Sbh %s > %s" %
              (out_bwa_se, out_unmapped_se_bam))
            unmapped_single = "unmapped_single.fastq"
            e("bedtools bamtofastq -i {ubam} -fq {upair} ",
              upair=unmapped_single,
              ubam=out_unmapped_se_bam)
            out_bwa_se_bam = Mapping.realign(out_bwa_se_bam, ref_fasta)
            # NOTE(review): nesting assumed, as for the paired-end cleanup.
            for x in [out_bwa_se, out_unmapped_se_bam]:
                if os.path.exists(x):
                    os.remove(x)

        # --- merge paired + single alignments into the final BAM ---
        out_bwa_raw_bam = "bwa_raw.bam"
        out_bwa_fm_bam = "bwa_fm.bam"
        out_bwa_fm_sort_bam = "bwa_fm_sort.bam"
        if not os.path.exists(out_bwa_bam_idx):
            e("samtools merge %s %s %s " %
              (out_bwa_raw_bam, out_bwa_pe_bam, out_bwa_se_bam))
            # Name-sort (-n) before fixmate, then coordinate-sort the result.
            e("samtools sort -n -o %s %s" %
              (out_bwa_fm_sort_bam, out_bwa_raw_bam))
            e("samtools fixmate %s %s" % (out_bwa_fm_sort_bam, out_bwa_fm_bam))
            e("samtools sort -o %s %s" % (out_bwa_bam, out_bwa_fm_bam))
            e("samtools index %s" % out_bwa_bam)
            # NOTE(review): nesting of this cleanup under the `if` is assumed.
            for x in [
                    out_bwa_raw_bam, out_bwa_fm_bam, out_bwa_fm_sort_bam,
                    out_bwa_pe_bam, out_bwa_se_bam, out_bwa_pe_bam + ".bai",
                    out_bwa_se_bam + ".bai"
            ]:
                if os.path.exists(x):
                    os.remove(x)

        if not os.path.exists("flagstat.txt"):
            e("samtools flagstat %s > %s" % (out_bwa_bam, "flagstat.txt"))
    finally:
        # Always return to the directory the caller was in.
        os.chdir(cwd)
def realign(bam_file, ref_fasta):
    """
    Sort a BAM, realign it around indels twice and drop duplicates in between.

    Every output name is built by prefixing or suffixing *bam_file*, so all
    files are addressed relative to the current working directory; a
    *bam_file* that contains a directory part would produce names such as
    ``sorted_dir/x.bam``.

    The input *bam_file* itself is in the list of files removed at the end.
    The ``sorted_`` BAM and the ``duplicates_`` metrics file are not in that
    list.

    :param bam_file: name of the BAM to process
    :param ref_fasta: reference FASTA passed to the GATK commands as ``-R``
    :return: name of the final BAM, ``"realigned2_" + bam_file``
    """
    out_bwa_bam = "sorted_" + bam_file
    # The sort runs unconditionally, even when the final BAM already exists.
    e("samtools sort -o %s %s" % (out_bwa_bam, bam_file))
    out_bwa_final_bam = "realigned2_" + bam_file
    out_bwa_intervals = bam_file + ".intervals"
    # Same file name as out_bwa_intervals: the second pass overwrites the first.
    out_bwa_intervals2 = bam_file + ".intervals"
    bwa_realigned = "realigned_" + bam_file
    duplicates = "duplicates_" + bam_file
    bwa_iter1 = "iter1_" + bam_file
    if not os.path.exists(out_bwa_final_bam):
        e("samtools index %s" % out_bwa_bam)
        # First realignment pass, on the sorted BAM.
        e("gatk -T RealignerTargetCreator -R {ref} -I {input} -o {out}",
          ref=ref_fasta,
          input=out_bwa_bam,
          out=out_bwa_intervals)
        e("gatk -T IndelRealigner -R {ref} -I {input} -targetIntervals {intervals} -o {output}",
          ref=ref_fasta,
          input=out_bwa_bam,
          intervals=out_bwa_intervals,
          output=bwa_realigned)
        # Running GATK's BaseRecalibrator is recommended at this point, but no
        # VCF with common variants is available.
        e("picard MarkDuplicates I={input} REMOVE_DUPLICATES=true O={output} M={duplicates}",
          input=bwa_realigned,
          output=bwa_iter1,
          duplicates=duplicates)
        e("samtools index {input}", input=bwa_iter1)
        # Second realignment pass, on the duplicate-free BAM.
        e("gatk -T RealignerTargetCreator -R {ref} -I {input} -o {intervals}",
          ref=ref_fasta,
          input=bwa_iter1,
          intervals=out_bwa_intervals2)
        e("gatk -T IndelRealigner -R {ref} -I {input} -targetIntervals {intervals} -o {output}",
          ref=ref_fasta,
          input=bwa_iter1,
          intervals=out_bwa_intervals2,
          output=out_bwa_final_bam)
        e("samtools index %s" % out_bwa_final_bam)
        # NOTE(review): nesting of this cleanup under the `if` is assumed
        # - confirm against the properly indented source.
        for x in [
                bam_file, out_bwa_intervals, out_bwa_intervals2, bwa_realigned,
                bwa_iter1, bwa_iter1 + ".bai"
        ]:
            if os.path.exists(x):
                os.remove(x)
    return out_bwa_final_bam
def variant_call(work_dir, record, alignment, strain):
    """Call, hard-filter and merge SNPs and indels with the ``-T`` style GATK CLI.

    Every command is issued through ``e`` with *work_dir* as second argument.
    The files named in the commands are ``raw_variants.vcf``,
    ``raw_snps.vcf``, ``filtered_snps.vcf``, ``raw_indels.vcf``,
    ``filtered_indels.vcf``, ``concatenated.vcf`` and ``<strain>.vcf``.

    HaplotypeCaller is run with a hard-coded ``-ploidy 1``.

    The last step used to write ``final.vcf.vcf`` while the function returned
    ``<strain>.vcf``, a file that was never created. The output now goes to
    the returned name.

    :param work_dir: directory passed to ``e`` for every command
    :param record: path to the reference FASTA, passed as ``-R``
    :param alignment: path to the alignment, passed as ``-I``
    :param strain: sample name; used in the header clean-up and as the base
        name of the final VCF
    :return: file name of the final VCF, ``strain + ".vcf"`` (no directory part)
    """
    work_dir = os.path.abspath(work_dir) + "/"
    record = os.path.abspath(record)
    alignment = os.path.abspath(alignment)
    final_vcf = strain + ".vcf"

    # Call variants in the sequence data
    e("java -jar $GATK -T HaplotypeCaller -R {record_name} -I {alignment} -gt_mode DISCOVERY -ploidy 1 -stand_call_conf 30 -o raw_variants.vcf",
      work_dir,
      record_name=record,
      alignment=alignment)

    # Apply hard filters to a call set: SNPs first ...
    e("java -jar $GATK -T SelectVariants -R {record_name} -V raw_variants.vcf -selectType SNP -o raw_snps.vcf",
      work_dir,
      record_name=record)
    e('java -jar $GATK -T VariantFiltration -R {record_name} -V raw_snps.vcf -filter "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" --filterName "my_snp_filter" -o filtered_snps.vcf',
      work_dir,
      record_name=record)
    # ... then indels, with their own thresholds.
    e("java -jar $GATK -T SelectVariants -R {record_name} -V raw_variants.vcf -selectType INDEL -o raw_indels.vcf",
      work_dir,
      record_name=record)
    e('java -jar $GATK -T VariantFiltration -R {record_name} -V raw_indels.vcf -filter "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0" --filterName "my_indel_filter" -o filtered_indels.vcf',
      work_dir,
      record_name=record)
    e("java -jar $GATK -T CombineVariants --assumeIdenticalSamples -R {record_name} -V filtered_snps.vcf -V filtered_indels.vcf -genotypeMergeOptions UNIQUIFY -o concatenated.vcf",
      work_dir,
      record_name=record)

    # Removes the "<strain>.variant2" column from the VCF header line.
    # The doubled braces are kept exactly as in the original command.
    e("sed '/^#[^#]/ {{s/\\t%s\\.variant2//}}' concatenated.vcf > %s" %
      (strain, final_vcf), work_dir)
    return final_vcf
def alignment(work_dir,
              record,
              trimmed_1="trimmed_1.fastq",
              trimmed_2="trimmed_2.fastq",
              cpus=multiprocessing.cpu_count(),
              strain="sample1",
              species=None):
    """Map a read pair with ``bwa mem`` and produce a duplicate-marked BAM.

    Every command is issued through ``e`` with *work_dir* as second argument.
    After mapping, reads are split into ``mapped_reads.bam`` and
    ``unmapped_reads.bam`` and the SAM file is deleted. The mapped reads are
    exported to FASTQ for FastQC, then sorted, duplicate-marked and indexed
    with Picard.

    :param work_dir: directory holding all intermediate and final files
    :param record: path to the reference FASTA
    :param trimmed_1: forward reads, inserted into the ``bwa mem`` command as given
    :param trimmed_2: reverse reads, inserted into the ``bwa mem`` command as given
    :param cpus: thread count for ``bwa mem -t`` and ``samtools view -@``;
        the default is evaluated once, when the function is defined
    :param strain: ``SM`` field of the read group
    :param species: ``LB`` field of the read group; falls back to *strain*
    :return: ``work_dir + "dedup_reads.bam"``
    """
    species = species or strain
    work_dir = os.path.abspath(work_dir) + "/"
    reference = os.path.abspath(record)

    # Generate a SAM file containing aligned reads
    e("bwa mem -t {cpus} -M -R '@RG\\tID:group1\\tSM:{strain}\\tPL:illumina\\tLB:{species}' {record_name} {trimmed_1} {trimmed_2} > aligned_reads.sam",
      work_dir,
      record_name=reference,
      strain=strain,
      species=species,
      cpus=cpus,
      trimmed_1=trimmed_1,
      trimmed_2=trimmed_2)

    # Split by flag 4 and convert to BAM: -F 4 -> mapped, -f 4 -> unmapped
    for flag_option, bam_name in (("-F", "mapped_reads.bam"),
                                  ("-f", "unmapped_reads.bam")):
        e("samtools view -@ {cpus} " + flag_option +
          " 4 -S -b -h aligned_reads.sam > " + bam_name,
          work_dir,
          cpus=cpus)
    os.remove(work_dir + "aligned_reads.sam")

    # Convert back to FASTQ for quality control
    e("samtools fastq mapped_reads.bam > mapped_reads.fastq", work_dir)
    e("fastqc mapped_reads.fastq", work_dir)

    # Sort, mark duplicates and index
    picard_steps = (
        "SortSam INPUT=mapped_reads.bam OUTPUT=sorted_reads.bam SORT_ORDER=coordinate",
        "MarkDuplicates INPUT=sorted_reads.bam OUTPUT=dedup_reads.bam METRICS_FILE=metrics.txt",
        "BuildBamIndex INPUT=dedup_reads.bam",
    )
    for picard_args in picard_steps:
        e("java -jar $PICARD " + picard_args, work_dir)

    return work_dir + "dedup_reads.bam"
def alignment(wd,
              ref,
              trimmed_1="trimmed_1.fastq",
              trimmed_2="trimmed_2.fastq",
              cpus=multiprocessing.cpu_count(),
              strain="sample1",
              species=None,
              force=False,
              read_group="group1"):
    """Map a read pair with ``bwa mem`` and return a sorted, duplicate-marked BAM.

    Files are written inside *wd*:
      * ``mapped_reads.bam`` (+ index) - the final alignment
      * ``unmapped_1.fastq`` / ``unmapped_2.fastq`` - reads that did not map
      * ``metrics.txt`` - MarkDuplicates metrics
      * ``insert_size_metrics.txt`` / ``insert_size_histogram.pdf``

    Mapping is skipped when ``mapped_reads_raw.bam`` is already in *wd*,
    unless *force* is set.

    :param wd: working directory; created with ``mkdir``
    :param ref: path to the reference FASTA
    :param trimmed_1: forward reads (checked and used as given)
    :param trimmed_2: reverse reads (checked and used as given)
    :param cpus: thread count for ``bwa mem -t`` and ``samtools view -@``;
        the default is evaluated once, when the function is defined
    :param strain: ``SM`` field of the read group
    :param species: ``LB`` field of the read group; falls back to *strain*
    :param force: redo the mapping even if ``mapped_reads_raw.bam`` exists
    :param read_group: ``ID`` field of the read group
    :return: path of the final BAM, ``<wd>mapped_reads.bam``
    :raises AssertionError: if an input is missing or an intermediate file
        is (almost) empty
    """
    if not species:
        species = strain
    mkdir(wd)
    wd = os.path.abspath(wd) + "/"
    ref = os.path.abspath(ref)
    assert os.path.exists(wd), f'{wd} could not be created'
    assert os.path.exists(ref), f'{ref} does not exist'
    assert os.path.exists(trimmed_1), f'{trimmed_1} does not exist'
    assert os.path.exists(trimmed_2), f'{trimmed_2} does not exist'

    sam_path = f"{wd}aligned_reads.sam"
    raw_bam = f"{wd}mapped_reads_raw.bam"
    unmapped_bam = f"{wd}unmapped_reads.bam"
    dedup_bam = f"{wd}dedup.bam"
    final_bam = f"{wd}mapped_reads.bam"

    # Mapping and the mapped/unmapped split were guarded by two separate
    # checks. The second also tested for "dedup.bam " (trailing space), which
    # can never exist, so both reduced to this single condition.
    if force or not os.path.exists(raw_bam):
        # Generate a SAM file containing aligned reads
        tab = "\\t"
        e(f"bwa mem -t {cpus} -M -R '@RG{tab}ID:{read_group}{tab}SM:{strain}{tab}PL:illumina{tab}LB:{species}' {ref} {trimmed_1} {trimmed_2} > {sam_path}")
        assert os.path.getsize(sam_path) > 10, f"{wd}aligned_reads.sam cant be empty"

        # Filter mapped reads and convert to BAM
        e(f"samtools view -@ {cpus} -F 4 -S -b -h {sam_path} | samtools sort - > {raw_bam}")
        e(f"samtools view -@ {cpus} -f 4 -S -b -h {sam_path} > {unmapped_bam}")
        # The input must carry the wd prefix: the BAM was written to wd, not
        # to the current working directory.
        e(f"bedtools bamtofastq -i {unmapped_bam} -fq {wd}unmapped_1.fastq -fq2 {wd}unmapped_2.fastq")

    for leftover in (unmapped_bam, sam_path):
        if os.path.exists(leftover):
            os.remove(leftover)

    # Mark duplicates, then sort and index the result
    e(f"gatk MarkDuplicates -INPUT {raw_bam} -OUTPUT {dedup_bam} -METRICS_FILE {wd}metrics.txt")
    assert os.path.getsize(dedup_bam) > 10, f"{wd}dedup.bam cant be empty"
    os.remove(raw_bam)
    e(f'samtools sort {dedup_bam} > {final_bam}')
    os.remove(dedup_bam)
    e(f'samtools index {final_bam}')
    e(f"gatk CollectInsertSizeMetrics --I {final_bam} --O {wd}insert_size_metrics.txt --H {wd}insert_size_histogram.pdf --M 0.5")
    return final_bam
def init_ref(reference_path):
    """Index a reference FASTA for bwa, samtools and BLAST.

    Runs ``bwa index``, ``samtools dict`` (written next to the reference with
    a ``.dict`` extension), ``samtools faidx`` and ``makeblastdb``. A
    ``.gz`` reference is first re-compressed in place with ``bgzip``.

    :param reference_path: path to the reference; must end in one of
        ``.fna``, ``.fasta``, ``.fa`` or the same followed by ``.gz``
    :return: None
    :raises AssertionError: if the file name does not end in a known
        extension, including names that consist of the extension alone
    """
    known_extensions = ["fna", "fasta", "fa", "fna.gz", "fasta.gz", "fa.gz"]
    parts = reference_path.split(".")
    last = parts[-1]
    if last == "gz" and len(parts) > 1:
        last = parts[-2] + ".gz"
    suffix = "." + last
    # Requiring the dotted suffix rejects names such as "fa". Without it the
    # dictionary path would equal the reference path and "samtools dict >"
    # would overwrite the reference.
    assert last in known_extensions and reference_path.endswith(suffix), \
        f'unknown extension for {reference_path}'
    # Strip only the trailing extension; str.replace() would also rewrite any
    # earlier occurrence of it, e.g. in a directory name.
    dict_path = reference_path[:-len(suffix)] + ".dict"

    e(f"bwa index -a is {reference_path}")
    e(f"samtools dict {reference_path} > {dict_path}")
    if last.endswith(".gz"):
        # Re-compress with bgzip before running samtools faidx.
        e(f"zcat {reference_path}| bgzip > {reference_path}.tmp; cp '{reference_path}.tmp' '{reference_path}' && rm '{reference_path}.tmp'")
    e(f"samtools faidx {reference_path}")
    # bowtie2 indexing is currently disabled:
    # e("bowtie2-build {record_name} {record_name}".format(record_name=filename))
    if reference_path.endswith(".gz"):
        e(f"zcat {reference_path} | makeblastdb -dbtype nucl -title {reference_path} -input_type fasta -out {reference_path} -in -")
    else:
        e(f"makeblastdb -dbtype nucl -in {reference_path} ")