def main(args):
    """Run GEMMA association tests (SNP-level and gene-sum) per phenotype.

    Builds one .pheno file per phenotype, ordered by the VCF sample order,
    writes one gemma pipeline command per phenotype, then either stops after
    preprocessing or executes the commands with GNU parallel.
    """
    vcf = vcf_class(args.vcf)
    vcf.get_mean_genotype()
    vcf.get_genesum()
    geno_file = vcf.prefix + ".geno"
    genesum_file = vcf.prefix + ".genesum"

    # Phenotype metadata keyed by sample id; remember the CSV columns so
    # phenotype names can be validated later.
    meta = {}
    csv_columns = None
    for row in csv.DictReader(open(args.pheno_csv)):
        if args.pheno_id_column not in row:
            quit(bcolors.FAIL + f"\nError: Can't find {args.pheno_id_column} in csv columns, set it with --pheno-id-column\n" + bcolors.ENDC)
        csv_columns = set(row.keys())
        meta[row[args.pheno_id_column]] = row

    if args.pheno_name_file:
        phenos = [l.strip() for l in open(args.pheno_name_file)]
    else:
        phenos = args.pheno_names

    cmd_file = str(uuid4())
    X = open(cmd_file, "w")
    for pheno in phenos:
        pheno_file = f"{pheno}.pheno"
        # Fix: this check previously used the stale `row` loop variable left
        # over from the CSV loop (NameError on an empty CSV); validate the
        # phenotype name against the CSV header columns instead.
        if csv_columns is None or pheno not in csv_columns:
            quit(bcolors.FAIL + f"{pheno} not in CSV file" + bcolors.ENDC)
        P = open(pheno_file, "w")
        # Samples absent from the CSV (or without a value) get "NA" so the
        # phenotype file stays aligned with the genotype matrix sample order.
        P.write("\n".join([meta[s][pheno] if s in meta and pheno in meta[s] else "NA" for s in vcf.samples]))
        P.close()
        X.write("gemma -p %s -g %s -gk 1 -o %s -maf 0.00005 -miss 0.99 && gemma -lmm 1 -p %s -g %s -k output/%s.cXX.txt -o %s -maf 0.00005 -miss 0.99 && gemma -lmm 1 -p %s -g %s -k output/%s.cXX.txt -o %s.genesum -notsnp\n" % (pheno_file, geno_file, pheno, pheno_file, geno_file, pheno, pheno, pheno_file, genesum_file, pheno, pheno))
    X.close()

    if args.preprocess:
        fm.log("Preprocessing finished\n", True)
    else:
        fm.run_cmd("cat %s | parallel -j %s" % (cmd_file, args.threads))
def main(args):
    """Merge several single-run BAMs into one sample BAM with a new read-group.

    Input runs come either from --prefix (underscore-separated run ids under
    args.dir) or from an explicit comma-separated --bams list.
    """
    # Fix: validate mutually-exclusive options first — in the old elif chain
    # the "both given" case was unreachable because --prefix won.
    if (not args.prefix and not args.bams) or (args.prefix and args.bams):
        sys.stderr.write("Need either '--bams' or '--prefix'... Exiting!\n")
        quit()
    if args.prefix:
        individual_bams = ["%s/%s%s" % (args.dir, run, args.suffix) for run in args.prefix.split("_")]
        new_id = args.new_id if args.new_id else args.prefix
    else:
        individual_bams = args.bams.split(",")
        new_id = args.new_id if args.new_id else "_".join([bam.split("/")[-1].replace(args.suffix, "") for bam in individual_bams])
    if len(individual_bams) == 1:
        sys.stderr.write("Need more than one bam... Exiting!\n")
        quit()
    for bam in individual_bams:
        fm.filecheck(bam)

    new_bamfile = "%s/%s%s" % (args.dir, new_id, args.suffix)
    tmp_file = fm.get_random_file()
    # Write the first BAM's header with all @RG lines removed; the new
    # read-group is added by samtools addreplacerg below.
    # Fix: the old loop also rewrote columns 2/3 of every remaining header
    # line (ID:/SM:), corrupting @HD/@SQ/@PG records.
    with open(tmp_file, "w") as O:
        for l in fm.cmd_out("samtools view -H %s" % individual_bams[0]):
            row = l.strip().split("\t")
            if row[0] == "@RG":
                continue
            O.write("%s\n" % "\t".join(row))
    fm.run_cmd("samtools merge -@ %s - %s | samtools reheader -i %s - | samtools addreplacerg -@ %s - -r 'ID:%s\\tSM:%s\\tPL:Illumina' -o %s" % (
        args.threads, " ".join(individual_bams), tmp_file, args.threads, new_id, new_id, new_bamfile))
    fm.run_cmd("samtools index %s" % new_bamfile)
    # (the unused tmp_bamfile scratch file has been dropped)
    fm.rm_files([tmp_file])
def main(args):
    """Locate a query gene in a subject assembly by BLASTing its two end anchors.

    Writes <prefix>.result.txt with OK/Fragmented/NA, and, when both anchors
    hit the same subject sequence on the same strand, the spanned sequence to
    <prefix>.extracted_seq.fa.
    """
    fm.filecheck(args.query)
    fm.filecheck(args.subject)
    ref_gene_seq = list(fm.fasta(args.query).fa_dict.values())[0]
    start_anchor = ref_gene_seq[:args.anchor_size]
    end_anchor = ref_gene_seq[-args.anchor_size:]

    tmp_in = fm.get_random_file()
    tmp_out = fm.get_random_file()

    with open(tmp_in, "w") as O:
        O.write(">tmp\n%s" % start_anchor)
    fm.run_cmd("blastn -task blastn -query %s -subject %s -outfmt 15 > %s" % (tmp_in, args.subject, tmp_out), verbose=0)
    start_hits = parse_blast(tmp_out, args.anchor_size * 0.9)

    with open(tmp_in, "w") as O:
        O.write(">tmp\n%s" % end_anchor)
    fm.run_cmd("blastn -task blastn -query %s -subject %s -outfmt 15 > %s" % (tmp_in, args.subject, tmp_out), verbose=0)
    end_hits = parse_blast(tmp_out, args.anchor_size * 0.9)
    fm.rm_files([tmp_in, tmp_out])

    start_hit = end_hit = None
    if len(start_hits) == 0 or len(end_hits) == 0:
        # Fix: an anchor with no hits previously crashed with IndexError.
        result_type = "NA"
    elif args.strict_one_hit and (len(start_hits) > 1 or len(end_hits) > 1):
        result_type = "NA"
    elif start_hits[0]["subject_seq"] == end_hits[0]["subject_seq"]:
        result_type = "OK"
        start_hit = start_hits[0]
        end_hit = end_hits[0]
    else:
        result_type = "Fragmented"

    with open("%s.result.txt" % args.prefix, "w") as O:
        O.write("%s\t%s\n" % (args.prefix, result_type))
    if result_type != "OK":
        quit()

    subject_seqs = fm.fasta(args.subject).fa_dict
    if start_hit["subject_strand"] == "Plus" and end_hit["subject_strand"] == "Plus":
        hit_seq = subject_seqs[start_hit["subject_seq"]][start_hit["subject_start"] - 1:end_hit["subject_end"]]
    elif start_hit["subject_strand"] == "Minus" and end_hit["subject_strand"] == "Minus":
        hit_seq = revcom(subject_seqs[start_hit["subject_seq"]][end_hit["subject_end"] - 1:start_hit["subject_start"]])
    else:
        # Fix: anchors on opposite strands previously fell through and
        # crashed on an undefined `hit_seq`.
        quit("Anchors map to opposite strands... Exiting!")
    with open("%s.extracted_seq.fa" % args.prefix, "w") as O:
        O.write(">%s\n%s\n" % (args.prefix, hit_seq))
def main_trim(args):
    """Quality-trim reads with Trimmomatic, in SE or PE mode."""
    if args.single:
        template = "trimmomatic SE -phred33 %(read1)s %(prefix)s_trimmed.fq LEADING:3 TRAILING:3 SLIDINGWINDOW:4:20 MINLEN:36"
    else:
        template = "trimmomatic PE -phred33 %(read1)s %(read2)s -baseout %(prefix)s LEADING:3 TRAILING:3 SLIDINGWINDOW:4:20 MINLEN:36"
    fm.run_cmd(template % vars(args))
def main_gatk(args):
    """Run GATK HaplotypeCaller on a BAM and validate the resulting GVCF.

    A <prefix>.g.vcf.gz.validated marker file is created on success.
    """
    if not args.prefix:
        # Fix: str.replace(".bam", "") would also mangle a ".bam" occurring
        # elsewhere in the path; strip only the trailing extension.
        args.prefix = args.bam[:-len(".bam")] if args.bam.endswith(".bam") else args.bam
    fm.run_cmd("gatk HaplotypeCaller -I %(bam)s -R %(ref)s -O %(prefix)s.g.vcf.gz -ERC %(erc)s" % vars(args))
    fm.run_cmd("gatk ValidateVariants -V %(prefix)s.g.vcf.gz -gvcf -R %(ref)s && touch %(prefix)s.g.vcf.gz.validated" % vars(args))
def download_files(directory=None):
    """Download and unpack the NCBI taxdump into *directory* (default ~/.taxonkit/)."""
    sys.stderr.write("Downloading required files\n")
    import urllib.request
    if not directory:
        directory = "%s/.taxonkit/" % os.path.expanduser("~")
    # makedirs creates missing parents and tolerates an existing directory
    # (os.mkdir raised if the parent was absent or the dir already existed).
    os.makedirs(directory, exist_ok=True)
    # Fix: the URL literal previously had a trailing space, which breaks the
    # FTP retrieval.
    urllib.request.urlretrieve(
        'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz',
        '%s/taxdump.tar.gz' % directory)
    fm.run_cmd("tar -C %s -xvf %s/taxdump.tar.gz" % (directory, directory))
def main(args):
    """Compute a pairwise distance matrix from a VCF via plink and reformat it.

    Writes <prefix>.dists: a header row of sample names followed by the plink
    square matrix with every value halved (plink's scaling).
    """
    vcf_obj = vcf_class(args.vcf)
    run_cmd("plink --vcf %(vcf)s --distance square --double-id --allow-extra-chr --vcf-half-call missing --out %(vcf)s" % vars(args))
    with open("%s.dists" % vcf_obj.prefix, "w") as dists:
        dists.write("%s\n" % "\t".join(vcf_obj.samples))
        for line in open("%s.dist" % args.vcf):
            halved = (str(float(value) / 2) for value in line.strip().split())
            dists.write("%s\n" % "\t".join(halved))
    run_cmd("rm %(vcf)s.dist %(vcf)s.log" % vars(args))
def __init__(self, filename, threads=4):
    """Wrap a VCF file: ensure a bcftools index exists and load sample names."""
    self.filename = filename
    self.threads = threads
    self.prefix = get_vcf_prefix(filename)
    self.samples = []
    if nofile(filename + ".csi"):
        run_cmd("bcftools index %(filename)s" % vars(self))
    # Dump the sample list to a scratch file, read it back, then clean up.
    self.temp_file = get_random_file()
    run_cmd("bcftools query -l %(filename)s > %(temp_file)s" % vars(self))
    with open(self.temp_file) as handle:
        self.samples = [line.rstrip() for line in handle]
    os.remove(self.temp_file)
def main(args):
    """Filter fastq reads by kraken2 assignment, keeping or dropping a clade.

    The clade is expanded with `taxonkit list`; reads are then selected from
    the kraken2 per-read output and extracted with seqtk.
    """
    check_programs(["taxonkit", "seqtk"])
    taxonkit_dir = "%s/.taxonkit/" % os.path.expanduser("~")
    if not os.path.isdir(taxonkit_dir) or not os.path.isfile("%s/.taxonkit/nodes.dmp" % os.path.expanduser("~")):
        download_files()

    sys.stderr.write("Loading taxonomy\n")
    nodes = set()
    cmd = "taxonkit list --ids %s" % (args.extract if args.extract else args.exclude)
    for line in fm.cmd_out(cmd):
        if line == "":
            continue
        nodes.add(line.strip().split()[0])

    sys.stderr.write("Extracting read names\n")
    args.tmp_file = str(uuid4())
    total_reads = 0
    kept_reads = 0
    # Extract mode keeps reads whose taxid is inside the clade;
    # exclude mode keeps the complement. One loop covers both.
    keep_members = not args.exclude
    with open(args.tmp_file, "w") as O:
        for line in tqdm(open(fm.filecheck(args.kraken2_output))):
            total_reads += 1
            fields = line.strip().split()
            if (fields[2] in nodes) == keep_members:
                O.write("%s\n" % fields[1])
                kept_reads += 1

    sys.stderr.write("Writing filtered fastq files\n")
    fm.filecheck(args.R1)
    args.R1_filt = args.R1.replace(".fastq.gz", "").replace(".fq.gz", "").replace(".fastq", "") + ".kraken2_filt.fastq.gz"
    fm.run_cmd("seqtk subseq %(R1)s %(tmp_file)s | gzip -c > %(R1_filt)s" % vars(args))
    if args.R2:
        fm.filecheck(args.R2)
        args.R2_filt = args.R2.replace(".fastq.gz", "").replace(".fq.gz", "").replace(".fastq", "") + ".kraken2_filt.fastq.gz"
        fm.run_cmd("seqtk subseq %(R2)s %(tmp_file)s | gzip -c > %(R2_filt)s" % vars(args))
    fm.rm_files([args.tmp_file])
    sys.stderr.write("\nKept %s/%s reads\n" % (kept_reads, total_reads))
def main(args):
    """Filter fastq reads by kraken2 assignment using a taxonomy dump.

    The parent->children map is built from the dump, the target clade(s) are
    expanded recursively, and matching reads are extracted with seqtk.
    """
    sys.stderr.write("Loading taxonomy\n")
    nodes = defaultdict(set)
    for line in tqdm(open(fm.filecheck(args.tax_dump))):
        fields = line.strip().split()
        nodes[fields[2]].add(fields[0])

    def flatten(items):
        # Recursively flatten arbitrarily nested lists into one flat list.
        flat = []
        for item in items:
            if isinstance(item, list):
                flat.extend(flatten(item))
            else:
                flat.append(item)
        return flat

    def get_tax(taxid):
        # A taxon together with all of its descendants.
        if len(nodes[taxid]) == 0:
            return [taxid]
        return [taxid] + flatten([get_tax(child) for child in nodes[taxid]])

    sys.stderr.write("Extracting read names\n")
    args.tmp_file = str(uuid4())
    # Both modes share one expansion + one filtering loop; exclude keeps the
    # complement of the expanded clade, extract keeps the clade itself.
    seed_ids = args.exclude if args.exclude else args.extract
    tax_tree = set(flatten([get_tax(x) for x in seed_ids.split(",")]))
    keep_in_tree = not args.exclude
    with open(args.tmp_file, "w") as O:
        for line in tqdm(open(fm.filecheck(args.kraken2_output))):
            fields = line.strip().split()
            if (fields[2] in tax_tree) == keep_in_tree:
                O.write("%s\n" % fields[1])

    sys.stderr.write("Writing filtered fastq files\n")
    fm.filecheck(args.R1)
    args.R1_filt = args.R1.replace(".fastq.gz", "").replace(".fq.gz", "").replace(".fastq", "") + ".kraken2_filt.fastq.gz"
    fm.run_cmd("seqtk subseq %(R1)s %(tmp_file)s | gzip -c > %(R1_filt)s" % vars(args))
    if args.R2:
        fm.filecheck(args.R2)
        args.R2_filt = args.R2.replace(".fastq.gz", "").replace(".fq.gz", "").replace(".fastq", "") + ".kraken2_filt.fastq.gz"
        fm.run_cmd("seqtk subseq %(R2)s %(tmp_file)s | gzip -c > %(R2_filt)s" % vars(args))
    fm.rm_files([args.tmp_file])
def main(args):
    """Build a fasta alignment from a VCF (whole-genome consensus or SNP
    sites) and optionally run iqtree on it."""
    vcf_obj = vcf_class(args.vcf)
    if args.whole_genome:
        args.sample_file = get_random_file()
        # Fix: the sample list was written through an unclosed handle, so
        # buffered names could be missing when `parallel` read the file.
        with open(args.sample_file, "w") as O:
            O.write("\n".join(vcf_obj.samples) + "\n")
        # One consensus fasta per sample, renamed to the sample id.
        run_cmd('cat %(sample_file)s | parallel --bar -j %(threads)s "bcftools consensus -f %(ref)s -s {} %(vcf)s | sed \'s/^>.*/>{}/\' > {}.tmp.fasta"' % vars(args))
        run_cmd('cat %s > %s.fa' % (" ".join(["%s.tmp.fasta" % s for s in vcf_obj.samples]), vcf_obj.prefix))
        run_cmd('rm %s %s' % (" ".join(["%s.tmp.fasta" % s for s in vcf_obj.samples]), args.sample_file))
        if args.tree:
            run_cmd("iqtree -s %s.fa -m GTR+G+ASC -nt AUTO" % vcf_obj.prefix)
    else:
        fasta_file = vcf_obj.vcf_to_fasta(args.ref, nofilt=args.snps_no_filt)
        if args.tree:
            run_cmd("iqtree -s %s -m GTR+G+ASC -nt AUTO" % fasta_file)
def main(args):
    """Summarise per-chromosome depth from `bedtools genomecov` as a CSV.

    Columns: weighted mean/std of depth plus the percentage of bases with
    depth above 0, 5 and 10 (computed from the cumulative fractions).
    """
    which("bedtools")
    if not os.path.isfile(args.bam + ".genomecov.txt"):
        fm.run_cmd("bedtools genomecov -ibam %(bam)s > %(bam)s.genomecov.txt" % vars(args))
    # dp[chrom][depth] -> {freq: #bases at that depth, fraction: genome fraction}
    dp = defaultdict(dict)
    for l in open(args.bam + ".genomecov.txt"):
        row = l.strip().split()
        dp[row[0]][int(row[1])] = {"freq": int(row[2]), "fraction": float(row[4])}
    with open(args.out, "w") as O:
        writer = csv.DictWriter(O, fieldnames=["chrom", "mean", "std", "dp_0", "dp_5", "dp_10"])
        writer.writeheader()
        # (removed a stray debug print() that emitted a blank line to stdout)
        for chrom in dp:
            # Depth values weighted by the number of bases at each depth.
            d1 = statsmodels.stats.weightstats.DescrStatsW(
                list(dp[chrom].keys()), [x["freq"] for x in dp[chrom].values()])
            # Fix: a depth bin absent from the histogram previously raised
            # KeyError; skip missing bins (their fraction is zero).
            res = {
                "chrom": chrom,
                "mean": d1.mean,
                "std": d1.std,
                "dp_0": (1 - sum([dp[chrom][x]["fraction"] for x in [0] if x in dp[chrom]])) * 100,
                "dp_5": (1 - sum([dp[chrom][x]["fraction"] for x in range(6) if x in dp[chrom]])) * 100,
                "dp_10": (1 - sum([dp[chrom][x]["fraction"] for x in range(11) if x in dp[chrom]])) * 100
            }
            writer.writerow(res)
def main(args):
    """Run a bcftools command over genome windows in parallel, then concat.

    Each chunk is processed independently and the per-chunk VCFs are merged
    into args.out before being cleaned up.
    """
    tag = rand_generator.randint(1, 999999)
    window_cmd = "bedtools makewindows -n %(chunks)s -g %(ref)s.fai | awk '{print $1\":\"$2+1\"-\"$3\" \"$1\"_\"$2+1\"_\"$3}'" % vars(args)
    per_chunk = "\"bcftools view --threads %s -r {1} %s -Ou | %s | bcftools view -Oz -o %s_{2}.vcf.gz\"" % (
        args.compression_threads, args.vcf, args.cmd, tag)
    fm.run_cmd(f"{window_cmd} | parallel --bar -j {args.threads} --col-sep \" \" {per_chunk}", verbose=2)
    fm.run_cmd("bcftools concat -Oz -o %s `%s | awk '{print \"%s_\"$2\".vcf.gz\"}'`" % (args.out, window_cmd, tag))
    fm.run_cmd("rm `%s | awk '{print \"%s_\"$2\".vcf.gz*\"}'`" % (window_cmd, tag))
def main_genotype(args):
    """Joint-genotype per-chunk GenomicsDB workspaces and concatenate the VCFs.

    The chunk count is read from the <prefix>.dbconf.json written at import
    time; each chunk must already have its <prefix>_<chunk>_genomics_db dir.
    """
    conf = json.load(open(filecheck(f"{args.prefix}.dbconf.json")))
    params = vars(args)
    params["num_genome_chunks"] = conf["num_genome_chunks"]
    window_cmd = "bedtools makewindows -n %(num_genome_chunks)s -g %(ref)s.fai | awk '{print $1\":\"$2+1\"-\"$3\" \"$1\"_\"$2+1\"_\"$3}'" % params
    params["window_cmd"] = window_cmd
    # Check the per-chunk GenomicsDB folders exist before launching anything.
    for l in cmd_out(window_cmd):
        row = l.strip().split()
        dirname = "%s_%s_genomics_db" % (args.prefix, row[1])
        # Fix: corrected "direcotry" typo in the progress message.
        sys.stderr.write("Looking for directory named %s..." % dirname)
        foldercheck(dirname)
        sys.stderr.write("OK\n")
    genotype_cmd = "gatk --java-options \"-Xmx40g\" GenotypeGVCFs -R %(ref)s -V gendb://%(prefix)s_{2}_genomics_db -O %(prefix)s.{2}.genotyped.vcf.gz" % params
    run_cmd(f"{window_cmd} | parallel --bar -j {args.threads} --col-sep \" \" {genotype_cmd}", verbose=2)
    run_cmd("bcftools concat -Oz -o %(prefix)s.%(subfix_vcf)s.genotyped.vcf.gz `%(window_cmd)s | awk '{print \"%(prefix)s.\"$2\".genotyped.vcf.gz\"}'`" % params)
    run_cmd("rm `%(window_cmd)s | awk '{print \"%(prefix)s.\"$2\".genotyped.vcf.gz*\"}'`" % params)
def vcf_to_matrix(self, iupacgt=True):
    """Write genotype matrices for this VCF.

    iupacgt: when True encode genotypes with IUPAC codes (.mat); when False
    write collapsed nucleotide strings instead (.noniupac.mat). A 0/0.5/1
    binary matrix (.mat.bin) is always written as well.
    """
    self.matrix_file = self.prefix + ".mat"
    self.binary_matrix_file = self.prefix + ".mat.bin"
    # Fix: this method previously consulted the global `args.no_iupacgt`
    # instead of its own `iupacgt` parameter.
    if not iupacgt:
        self.matrix_file = self.prefix + ".noniupac.mat"
        # Fix: headers were written through unclosed handles, so buffered
        # output could land *after* the shell ">>" append below.
        with open(self.matrix_file, "w") as O:
            O.write("chr\tpos\tref\t%s\n" % ("\t".join(self.samples)))
        run_cmd("bcftools query -f '%%CHROM\\t%%POS\\t%%REF[\\t%%TGT]\\n' %(filename)s | sed 's/\.\/./N/g; s/\([ACTG]\)\///g; s/|//g' | sed -r 's/([ACGT])\\1+/\\1/g' >> %(matrix_file)s" % vars(self))
    else:
        with open(self.matrix_file, "w") as O:
            O.write("chr\tpos\tref\t%s\n" % ("\t".join(self.samples)))
        run_cmd("bcftools query -f '%%CHROM\\t%%POS\\t%%REF[\\t%%IUPACGT]\\n' %(filename)s | tr '|' '/' | sed 's/\.\/\./N/g' >> %(matrix_file)s" % vars(self))
    with open(self.binary_matrix_file, "w") as O:
        O.write("chr\tpos\tref\t%s\n" % ("\t".join(self.samples)))
    run_cmd("bcftools query -f '%%CHROM\\t%%POS\\t%%REF[\\t%%GT]\\n' %(filename)s | tr '|' '/' | sed 's/\.\/\./N/g' | sed 's/0\/1/0.5/g' | sed 's/1\/1/1/g' | sed 's/0\/0/0/g' >> %(binary_matrix_file)s" % vars(self))
def main(args):
    """Filter a VCF in 20 parallel genome windows and concatenate the result.

    Optional stages (indel removal, region exclusion, genotype-annotation
    dropping) are spliced into one bcftools pipeline that is run per window.
    """
    original_vcf = fm.vcf_class(args.vcf, threads=args.threads)
    args.prefix = original_vcf.prefix
    args.filename = original_vcf.filename
    # Each optional stage collapses to "" when disabled.
    args.indels_cmd = "" if args.keep_indels else "bcftools view -V indels | "
    args.exclude_cmd = f"bcftools view -T ^{args.exclude_bed} |" if args.exclude_bed else ""
    args.annotation_drop = "" if args.keep_genotype_info else "bcftools annotate -x ^FORMAT/GT | "
    args.window_cmd = "bedtools makewindows -g %(ref)s.fai -n 20 | awk '{print $1\":\"$2+1\"-\"$3\" \"$1\"_\"$2+1\"_\"$3}'" % vars(args)
    pipeline_template = (
        "%(indels_cmd)s"
        "%(exclude_cmd)s"
        "setGT.py | "
        "%(annotation_drop)s"
        "bcftools view -c 1 -a -Ou | "
        "bcftools filter -e 'GT=\\\"het\\\"' -S . | "
        "bcftools view -i 'F_PASS(GT!=\\\"mis\\\")>%(site_missing)s' | "
        "bcftools view -c 1 | "
        "bcftools +fill-tags | "
        "bcftools view -e 'AF==1 || AF==0' | "
        "bcftools norm -f %(ref)s"
    )
    args.filter_cmd = pipeline_template % vars(args)
    if args.keep_indels:
        args.final_file = "%(prefix)s.filtered.vcf.gz" % vars(args)
    else:
        args.final_file = "%(prefix)s.filtered_no_indels.vcf.gz" % vars(args)
    fm.run_cmd("%(window_cmd)s | parallel -j %(threads)s --col-sep \" \" \"bcftools view %(filename)s -r {1} | %(filter_cmd)s > %(prefix)s.{2}.tmp.txt\"" % vars(args))
    fm.run_cmd("bcftools concat -Oz -o %(final_file)s `%(window_cmd)s | awk '{print \"%(prefix)s.\"$2\".tmp.txt\"}'`" % vars(args))
    fm.run_cmd("rm `%(window_cmd)s | awk '{print \"%(prefix)s.\"$2\".tmp.txt*\"}'`" % vars(args))
def main(args):
    """Collate per-sample QC (depth, kraken2, flagstat, tbprofiler) plus VCF
    variant information into <out>.sample_info.csv and <out>.variant_info.csv.
    """
    # Fix: the original re-read and re-parsed the sample file once per line.
    samples = [x.rstrip() for x in open(args.samples)]
    for s in samples:
        fm.filecheck("per_sample/%s%s" % (s, args.alignment_extension))
    if fm.nofolder("%(dir)s/kraken" % vars(args)):
        # Fix: the original tried to *execute* the directory path as a
        # command; create the kraken output directory instead.
        fm.run_cmd("mkdir %(dir)s/kraken" % vars(args))

    # Queue missing per-sample jobs (median depth + kraken2) and run them in
    # parallel.
    args.cmd_file = fm.get_random_file()
    with open(args.cmd_file, "w") as O:
        for s in samples:
            args.sample = s
            if fm.nofile("%(dir)s/per_sample/%(sample)s.median_dp.txt" % vars(args)):
                O.write("printf %s\"\\t\"$(bedtools genomecov -d -ibam %s/per_sample/%s%s | datamash median 3)\"\\n\" > %s/per_sample/%s.median_dp.txt\n" % (s, args.dir, s, args.alignment_extension, args.dir, s))
            if fm.nofile("%(dir)s/kraken/%(sample)s.done" % vars(args)):
                O.write("kraken2 --db /run/user/506/standard --gzip-compressed --paired %(dir)s/fastq/%(sample)s_1.fastq.gz %(dir)s/fastq/%(sample)s_2.fastq.gz --report %(dir)s/kraken/%(sample)s.report.txt --out %(dir)s/kraken/%(sample)s.out.txt --threads 10 --memory-mapping && touch %(dir)s/kraken/%(sample)s.done\n" % vars(args))
    fm.run_cmd("cat %(cmd_file)s | parallel -j %(io_heavy_threads)s" % vars(args), verbose=2)
    fm.rm_files([args.cmd_file])

    sample_metrics = []
    for s in samples:
        res = {"sample": s}
        args.sample = s
        # Parse the .bqsr.bamstats file; assumes `samtools flagstat` layout
        # where lines 3-5 carry read counts and line 5 also carries the
        # mapped percentage in parentheses — TODO confirm against producer.
        for i, l in enumerate(open("%(dir)s/per_sample/%(sample)s.bqsr.bamstats" % vars(args))):
            row = l.rstrip().split()
            if i in [2, 3]:
                res[row[3]] = int(row[0])
            elif i == 4:
                res[row[3]] = int(row[0])
                res["mapped_percent"] = float(row[4].replace("(", "").replace("%", ""))
        # For each taxonomic rank code keep the highest-percentage entry.
        kraken_results = {}
        for l in open("%(dir)s/kraken/%(sample)s.report.txt" % vars(args)):
            row = l.strip().split()
            if row[3] not in kraken_results:
                kraken_results[row[3]] = (float(row[0]), " ".join(row[5:]))
            if float(row[0]) > kraken_results[row[3]][0]:
                kraken_results[row[3]] = (float(row[0]), " ".join(row[5:]))
        res["kraken_genus"] = "%s (%.2f)" % (kraken_results["G"][1], kraken_results["G"][0])
        res["kraken_genus1"] = "%s (%.2f)" % (kraken_results["G1"][1], kraken_results["G1"][0])
        res["kraken_species"] = "%s (%.2f)" % (kraken_results["S"][1], kraken_results["S"][0])
        tbprofiler_result = json.load(open("%(dir)s/tbprofiler/results/%(sample)s.results.json" % vars(args)))
        res["lineage"] = tbprofiler_result["main_lin"]
        res["sub-lineage"] = tbprofiler_result["sublin"]
        res["drtype"] = tbprofiler_result["drtype"]
        # Group resistance variants per drug as "gene_change (freq)" strings.
        tmp_drugs = defaultdict(list)
        for var in tbprofiler_result["dr_variants"]:
            for d in var["drugs"]:
                tmp_drugs[d["drug"]].append("%s_%s (%.2f)" % (var["gene"], var["change"], var["freq"]))
        for d in drugs:
            res[d] = ", ".join(tmp_drugs[d])
        sample_metrics.append(res)
    with open(args.out + ".sample_info.csv", "w") as O:
        writer = csv.DictWriter(O, fieldnames=list(sample_metrics[0]))
        writer.writeheader()
        writer.writerows(sample_metrics)

    vcf = fm.vcf_class(args.vcf)
    if fm.nofile(args.vcf + ".stats.txt"):
        fm.run_cmd("bcftools norm -m - -f %(ref)s %(vcf)s | bcftools stats -v -s - > %(vcf)s.stats.txt" % (vars(args)))
    vcf_stats = vcf.load_stats()
    # NOTE(review): `results` is built but never used below; kept because the
    # lookups double as a presence check on the stats keys.
    results = {
        "number of samples": vcf_stats["number of samples"],
        "number of records": vcf_stats["number of records"],
        "number of SNPs": vcf_stats["number of SNPs"],
        "number of indels": vcf_stats["number of indels"],
    }
    if fm.nofile(args.vcf + ".csq_info.txt"):
        # SNP consequences first, then indel consequences appended.
        fm.run_cmd("bcftools view -V indels %(vcf)s | bcftools norm -m - -f %(ref)s | bcftools csq -f %(ref)s -g %(gff)s | correct_tb_csq.py | bcftools +fill-tags | bcftools query -f '%%POS\\t%%REF\\t%%ALT\\t%%AF\\t%%AC\\t%%BCSQ\\n' > %(vcf)s.csq_info.txt" % vars(args))
        fm.run_cmd("bcftools view -v indels %(vcf)s | bcftools norm -m - -f %(ref)s | bcftools csq -f %(ref)s -g %(gff)s | correct_tb_csq.py | bcftools +fill-tags | bcftools query -f '%%POS\\t%%REF\\t%%ALT\\t%%AF\\t%%AC\\t%%BCSQ\\n' >> %(vcf)s.csq_info.txt" % vars(args))
    variant_info = vcf.get_variant_data(args.ref, args.gff)
    with open(args.out + ".variant_info.csv", "w") as O:
        writer = csv.DictWriter(O, fieldnames=list(variant_info[0]))
        writer.writeheader()
        writer.writerows(variant_info)
def main(args):
    """Amplicon pipeline: demultiplex, map, call variants per sample, then
    joint-genotype, annotate consequences and tabulate per-position depth."""
    samples = []
    reader = csv.DictReader(open(args.index_file))
    # Re-open with utf-8-sig if the header carried a BOM.
    if "sample" not in reader.fieldnames:
        reader = csv.DictReader(open(args.index_file, encoding='utf-8-sig'))
    for row in reader:
        if row["sample"] == "":
            continue
        samples.append(row["sample"])

    fm.bwa_index(args.ref)
    fm.create_seq_dict(args.ref)
    fm.faidx(args.ref)

    cmd = "demultiplex_fastq.py --R1 %(read1)s --R2 %(read2)s --index %(index_file)s" % vars(args)
    if args.search_flipped_index:
        cmd += " --search-flipped-index"
    run_cmd(cmd)

    for sample in samples:
        args.sample = sample
        run_cmd("fastqc %(sample)s_1.fastq.gz %(sample)s_2.fastq.gz" % vars(args))
        if args.trim:
            run_cmd("trimmomatic PE %(sample)s_1.fastq.gz %(sample)s_2.fastq.gz %(sample)s_1.trimmed.fastq.gz %(sample)s_1.untrimmed.fastq.gz %(sample)s_2.trimmed.fastq.gz %(sample)s_2.untrimmed.fastq.gz LEADING:3 TRAILING:3 SLIDINGWINDOW:4:%(trim_qv)s MINLEN:36 2> %(sample)s.trimlog" % vars(args))
            run_cmd("bwa mem -t 10 -R \"@RG\\tID:%(sample_prefix)s%(sample)s\\tSM:%(sample_prefix)s%(sample)s\\tPL:Illumina\" %(ref)s %(sample)s_1.trimmed.fastq.gz %(sample)s_2.trimmed.fastq.gz | samclip --ref %(ref)s --max 50 | samtools sort -o %(sample)s.bam -" % vars(args))
        else:
            run_cmd("bwa mem -t 10 -R \"@RG\\tID:%(sample_prefix)s%(sample)s\\tSM:%(sample_prefix)s%(sample)s\\tPL:Illumina\" %(ref)s %(sample)s_1.fastq.gz %(sample)s_2.fastq.gz | samclip --ref %(ref)s --max 50 | samtools sort -o %(sample)s.bam -" % vars(args))
        run_cmd("samtools index %(sample)s.bam" % vars(args))
        run_cmd("samtools flagstat %(sample)s.bam > %(sample)s.flagstat.txt" % vars(args))
        run_cmd("mosdepth -x -b %(bed)s %(sample)s --thresholds 1,10,20,30 %(sample)s.bam" % vars(args))
        run_cmd("bedtools coverage -a %(bed)s -b %(sample)s.bam -mean > %(sample)s_region_coverage.txt" % vars(args))
        run_cmd("sambamba depth base %(sample)s.bam > %(sample)s.coverage.txt" % vars(args))
        run_cmd("freebayes -f %(ref)s -t %(bed)s %(sample)s.bam --haplotype-length -1 > %(sample)s.freebayes.vcf" % vars(args))
        run_cmd("gatk HaplotypeCaller -R %(ref)s -L %(bed)s -I %(sample)s.bam -O %(sample)s.gatk.vcf" % vars(args))

    if not args.per_sample_only:
        # Fix: the original wrote the literal strings "%s.freebayes.vcf" /
        # "%s.gatk.vcf" (no sample substitution, no newline), producing a
        # broken vcf_list.txt.
        with open("vcf_list.txt", "w") as O:
            for s in samples:
                O.write("%s.freebayes.vcf\n" % s)
                O.write("%s.gatk.vcf\n" % s)
        for sample in samples:
            args.sample = sample
            run_cmd("naive_variant_caller.py --ref %(ref)s --bam %(sample)s.bam --sample %(sample)s --min-af %(min_sample_af)s --vcf-file-list vcf_list.txt | bcftools view -Oz -o %(sample)s.vcf.gz" % vars(args))
            run_cmd("tabix -f %(sample)s.vcf.gz" % vars(args))
        with open("vcf_list.txt", "w") as O:
            for s in samples:
                O.write("%s.vcf.gz\n" % (s))
        run_cmd("bcftools merge -l vcf_list.txt -Oz -o combined.vcf.gz")
        run_cmd(r"bcftools query -f '%CHROM\t%POS[\t%DP]\n' combined.vcf.gz > tmp.txt")
        run_cmd("bcftools filter -i 'FMT/DP>10' -S . combined.vcf.gz | bcftools sort -Oz -o tmp.vcf.gz" % vars(args))
        run_cmd("bcftools view -v snps tmp.vcf.gz | bcftools csq -p a -f %(ref)s -g %(gff)s -Oz -o snps.vcf.gz" % vars(args))
        run_cmd("tabix snps.vcf.gz" % vars(args))
        run_cmd("bcftools view -v indels tmp.vcf.gz | bcftools csq -p a -f %(ref)s -g %(gff)s -Oz -o indels.vcf.gz" % vars(args))
        run_cmd("tabix indels.vcf.gz" % vars(args))
        run_cmd(r"bcftools query snps.vcf.gz -f '[%SAMPLE\t%CHROM\t%POS\t%REF\t%ALT\t%QUAL\t%GT\t%TGT\t%DP\t%AD\n]' > combined_genotyped_filtered_formatted.snps.txt")
        run_cmd(r"bcftools query snps.vcf.gz -f '[%SAMPLE\t%CHROM\t%POS\t%REF\t%ALT\t%QUAL\t%GT\t%TGT\t%DP\t%AD\t%TBCSQ\n]' > combined_genotyped_filtered_formatted.snps.trans.txt")
        run_cmd(r"bcftools query indels.vcf.gz -f '[%SAMPLE\t%CHROM\t%POS\t%REF\t%ALT\t%QUAL\t%GT\t%TGT\t%DP\t%AD\n]' > combined_genotyped_filtered_formatted.indels.txt")
        run_cmd(r"bcftools query indels.vcf.gz -f '[%SAMPLE\t%CHROM\t%POS\t%REF\t%ALT\t%QUAL\t%GT\t%TGT\t%DP\t%AD\t%TBCSQ\n]' > combined_genotyped_filtered_formatted.indels.trans.txt")

        bedlines = []
        amplicon_positions = []
        for l in open(args.bed):
            row = l.strip().split()
            bedlines.append(row)
            for p in range(int(row[1]), int(row[2])):
                amplicon_positions.append((row[0], p))

        def overlap_bedlines(a, bedlines):
            # Return the portions of bed line `a` covered by `bedlines`.
            overlaps = []
            for b in bedlines:
                if b[0] == a[0]:
                    overlap = max(0, min(int(a[2]), int(b[2])) - max(int(a[1]), int(b[1])))
                    if overlap > 0:
                        overlaps.append([b[0], max(int(a[1]), int(b[1])), min(int(a[2]), int(b[2]))])
            return overlaps

        # Per-sample depth at every amplicon position from mosdepth output.
        dp = defaultdict(dict)
        for s in samples:
            for l in gzip.open(f"{s}.per-base.bed.gz"):
                row = l.decode().strip().split()
                overlaps = overlap_bedlines(row, bedlines)
                if len(overlaps) > 0:
                    for overlap in overlaps:
                        for pos in range(int(overlap[1]), int(overlap[2])):
                            dp[s][(row[0], pos)] = int(row[3])
        pos_info = {}
        for l in open(args.position_info):
            row = l.strip().split()
            pos_info[(row[0], int(row[1]))] = (row[2], row[3])
        with open("depth_info.txt", "w") as O:
            O.write("chrom\tpos\tgene\tcsq\t%s\n" % "\t".join(samples))
            for chrom, pos in amplicon_positions:
                if (chrom, pos) in pos_info:
                    d = pos_info[(chrom, pos)]
                    O.write("%s\t%s\t%s\t%s\t%s\n" % (chrom, pos, d[0], d[1], "\t".join([str(dp[s].get((chrom, pos), 0)) for s in samples])))
def main(args):
    """Variant-call a BAM with the chosen caller, mask low-depth regions and
    write a consensus fasta named after args.out."""
    args.region_arg = ""  # caller-specific region flag, filled in below when --bed is given
    if args.variant_caller == "gatk":
        if args.bed:
            args.region_arg = "-L %s" % args.bed
        fm.run_cmd(
            "gatk HaplotypeCaller -R %(ref)s %(region_arg)s -I %(bam)s -O %(out)s.vcf.gz"
            % vars(args))
    elif args.variant_caller == "bcftools":
        if args.bed:
            args.region_arg = "-R %s" % args.bed
        fm.run_cmd(
            "bcftools mpileup -f %(ref)s %(region_arg)s %(bam)s | bcftools call -mv -Oz -o %(out)s.vcf.gz"
            % vars(args))
    elif args.variant_caller == "freebayes":
        if args.bed:
            args.region_arg = "-t %s" % args.bed
        fm.run_cmd(
            "freebayes -f %(ref)s %(region_arg)s %(bam)s | bgzip -c > %(out)s.vcf.gz"
            % vars(args))
    else:
        quit("Unknown variant caller! Exiting!")
    fm.run_cmd("tabix -f %(out)s.vcf.gz" % vars(args))
    # Build a BED of positions whose depth is below the cutoff (mask input).
    if args.bed:
        fm.run_cmd(
            "bedtools coverage -a %(bed)s -b %(bam)s -d | awk '$NF<%(depth_cutoff)s {print $1\"\\t\"$2+$(NF-1)-2\"\\t\"$2+$(NF-1)-1}' > %(out)s.depth_mask.bed"
            % vars(args))
    else:
        fm.run_cmd(
            "bedtools genomecov -ibam %(bam)s -d | awk '$NF<%(depth_cutoff)s {print $1\"\\t\"$2-1\"\\t\"$2}' > %(out)s.depth_mask.bed"
            % vars(args))
    # Only pass a mask argument to bcftools consensus when the mask is non-empty.
    for l in fm.cmd_out("wc -l %(out)s.depth_mask.bed" % vars(args)):
        num_lines = int(l.strip().split()[0])
    args.mask_arg = "-m %(out)s.depth_mask.bed -M N" % vars(args) if num_lines > 0 else ""
    region_names = {}  # "chrom:start-end" -> optional name from the BED's 4th column
    if args.bed:
        # Restrict the consensus to the BED regions via samtools faidx.
        regions_file = args.out + ".regions.txt"
        with open(regions_file, "w") as O:
            for l in open(args.bed):
                row = l.strip().split()
                r = "%s:%s-%s" % (row[0], row[1], row[2])
                O.write(r + "\n")
                if len(row) > 3:
                    region_names[r] = row[3]
        args.region_arg = "-r %s" % regions_file
        consensus_cmd = "samtools faidx %(ref)s %(region_arg)s | bcftools consensus %(out)s.vcf.gz %(mask_arg)s" % vars(args)
    else:
        consensus_cmd = "bcftools consensus -f %(ref)s %(out)s.vcf.gz %(mask_arg)s" % vars(args)
    with open(args.out + ".consensus.fa", "w") as O:
        for l in fm.cmd_out(consensus_cmd):
            if l[0] == ">":
                # Rename each fasta record to "<out> <region name or region>".
                r = l.strip()[1:]
                O.write(">%s %s\n" % (args.out, region_names.get(r, r)))
            else:
                O.write(l + "\n")
def main(args):
    """Joint-genotype previously called samples from one or more index files,
    annotate consequences and tabulate per-position amplicon depth."""
    samples = []
    for f in args.index_files:
        for row in csv.DictReader(open(f)):
            # Duplicate sample names across index files are allowed but flagged.
            if row["sample"] in samples:
                sys.stderr.write(
                    f"Warning! You have a duplicate sample name: {row['sample']}\n"
                )
            samples.append(row["sample"])
    # List of per-sample freebayes/gatk VCFs consumed by naive_variant_caller.py.
    with open("vcf_files.txt", "w") as O:
        for s in samples:
            O.write(f"{s}.freebayes.vcf\n")
            O.write(f"{s}.gatk.vcf\n")
    for sample in samples:
        args.sample = sample
        run_cmd(
            "naive_variant_caller.py --ref %(ref)s --bam %(sample)s.bam --sample %(sample)s --min-af %(min_sample_af)s --vcf-file-list vcf_files.txt | bcftools view -Oz -o %(sample)s.vcf.gz"
            % vars(args))
        run_cmd("tabix -f %(sample)s.vcf.gz" % vars(args))
    # Merge per-sample VCFs, mask low-depth genotypes, then split SNPs/indels
    # and annotate consequences with bcftools csq.
    with open("vcf_list.txt", "w") as O:
        for s in samples:
            O.write("%s.vcf.gz\n" % (s))
    run_cmd("bcftools merge -l vcf_list.txt -Oz -o combined.vcf.gz")
    run_cmd(
        "bcftools filter -i 'FMT/DP>10' -S . combined.vcf.gz | bcftools sort -Oz -o tmp.vcf.gz"
        % vars(args))
    run_cmd(
        "bcftools view -v snps tmp.vcf.gz | bcftools csq -p a -f %(ref)s -g %(gff)s -Oz -o snps.vcf.gz"
        % vars(args))
    run_cmd("tabix snps.vcf.gz" % vars(args))
    run_cmd(
        "bcftools view -v indels tmp.vcf.gz | bcftools csq -p a -f %(ref)s -g %(gff)s -Oz -o indels.vcf.gz"
        % vars(args))
    run_cmd("tabix indels.vcf.gz" % vars(args))
    # Flat per-sample genotype tables, with and without consequence strings.
    run_cmd(
        r"bcftools query snps.vcf.gz -f '[%SAMPLE\t%CHROM\t%POS\t%REF\t%ALT\t%QUAL\t%GT\t%TGT\t%DP\t%AD\n]' > combined_genotyped_filtered_formatted.snps.txt"
    )
    run_cmd(
        r"bcftools query snps.vcf.gz -f '[%SAMPLE\t%CHROM\t%POS\t%REF\t%ALT\t%QUAL\t%GT\t%TGT\t%DP\t%AD\t%TBCSQ\n]' > combined_genotyped_filtered_formatted.snps.trans.txt"
    )
    run_cmd(
        r"bcftools query indels.vcf.gz -f '[%SAMPLE\t%CHROM\t%POS\t%REF\t%ALT\t%QUAL\t%GT\t%TGT\t%DP\t%AD\n]' > combined_genotyped_filtered_formatted.indels.txt"
    )
    run_cmd(
        r"bcftools query indels.vcf.gz -f '[%SAMPLE\t%CHROM\t%POS\t%REF\t%ALT\t%QUAL\t%GT\t%TGT\t%DP\t%AD\t%TBCSQ\n]' > combined_genotyped_filtered_formatted.indels.trans.txt"
    )
    bedlines = []
    amplicon_positions = []  # every (chrom, pos) covered by the amplicon BED
    for l in open(args.bed):
        row = l.strip().split()
        bedlines.append(row)
        for p in range(int(row[1]), int(row[2])):
            amplicon_positions.append((row[0], p))

    def overlap_bedlines(a, bedlines):
        # Return the portions of bed line `a` that intersect `bedlines`.
        overlaps = []
        for b in bedlines:
            if b[0] == a[0]:
                overlap = max(
                    0,
                    min(int(a[2]), int(b[2])) - max(int(a[1]), int(b[1])))
                if overlap > 0:
                    overlaps.append([
                        b[0],
                        max(int(a[1]), int(b[1])),
                        min(int(a[2]), int(b[2]))
                    ])
        return overlaps

    # Per-sample depth at every amplicon position, read from mosdepth's
    # per-base bed output.
    dp = defaultdict(dict)
    for s in samples:
        for l in gzip.open(f"{s}.per-base.bed.gz"):
            row = l.decode().strip().split()
            overlaps = overlap_bedlines(row, bedlines)
            if len(overlaps) > 0:
                for overlap in overlaps:
                    for pos in range(int(overlap[1]), int(overlap[2])):
                        dp[s][(row[0], pos)] = int(row[3])
    pos_info = {}  # (chrom, pos) -> (gene, csq) from args.position_info
    for l in open(args.position_info):
        row = l.strip().split()
        pos_info[(row[0], int(row[1]))] = (row[2], row[3])
    with open("depth_info.txt", "w") as O:
        O.write("chrom\tpos\tgene\tcsq\t%s\n" % "\t".join(samples))
        for chrom, pos in amplicon_positions:
            if (chrom, pos) in pos_info:
                d = pos_info[(chrom, pos)]
                O.write("%s\t%s\t%s\t%s\t%s\n" % (chrom, pos, d[0], d[1], "\t".join(
                    [str(dp[s].get((chrom, pos), 0)) for s in samples])))
def convert_to_cram(bam_file, ref_file, threads):
    """Convert *bam_file* to CRAM against *ref_file*, index the CRAM, and
    delete the original BAM together with its .bai index."""
    cram_file = bam_file.replace(".bam", ".cram")
    for command in (
        "samtools view -@ %s -C %s -o %s -T %s" % (threads, bam_file, cram_file, ref_file),
        "samtools index %s" % cram_file,
        "rm %s %s.bai" % (bam_file, bam_file),
    ):
        fm.run_cmd(command)
def main_map(args):
    """Map reads with bwa-mem, mark duplicates, and optionally apply GATK BQSR.

    Progress is tracked with get_step_num so completed stages are skipped
    unless --redo is set.
    """
    args.step = get_step_num(args.prefix)
    # Pick input read files: trimmomatic outputs when trimming ran earlier,
    # otherwise the raw reads.
    trimmed = "trimmed" in vars(args)
    if trimmed and args.single:
        args.reads = "%(prefix)s_trimmed.fq" % vars(args)
    elif trimmed and not args.single:
        args.reads = "%(prefix)s_1P %(prefix)s_2P" % vars(args)
    elif not trimmed and args.single:
        # Fix: the untrimmed branches were swapped — single-end got both
        # read files and paired-end got only read1.
        args.reads = "%(read1)s" % vars(args)
    else:
        args.reads = "%(read1)s %(read2)s" % vars(args)
    if args.redo or args.step < 1:
        fm.run_cmd("bwa mem -t %(threads)s -R \"@RG\\tID:%(prefix)s\\tSM:%(prefix)s\\tPL:Illumina\" %(ref)s %(reads)s | samtools view -@ %(threads)s -b - | samtools fixmate -@ %(threads)s -m - - | samtools sort -@ %(threads)s - | samtools markdup -@ %(threads)s - %(prefix)s.mkdup.bam -" % vars(args))
        # Remove intermediate trimmed reads once they are consumed.
        if trimmed and args.single:
            fm.run_cmd("rm %(reads)s" % vars(args))
        if trimmed and not args.single:
            fm.run_cmd("rm %(prefix)s_1P %(prefix)s_2P %(prefix)s_1U %(prefix)s_2U" % vars(args))
        fm.run_cmd("samtools index -@ %(threads)s %(prefix)s.mkdup.bam" % vars(args))
        fm.run_cmd("samtools flagstat -@ %(threads)s %(prefix)s.mkdup.bam > %(prefix)s.mkdup.bamstats" % vars(args))
    if args.bqsr_vcf and (args.redo or args.step < 2):
        for vcf in args.bqsr_vcf.split(","):
            fm.tabix_vcf(vcf)
        args.bqsr_vcf = " ".join(["--known-sites %s" % s for s in args.bqsr_vcf.split(",")])
        fm.run_cmd("gatk BaseRecalibrator -R %(ref)s -I %(prefix)s.mkdup.bam %(bqsr_vcf)s -O %(prefix)s.recal_data.table" % vars(args))
        fm.run_cmd("gatk ApplyBQSR -R %(ref)s -I %(prefix)s.mkdup.bam --bqsr-recal-file %(prefix)s.recal_data.table -O %(prefix)s.bqsr.bam" % vars(args))
        fm.run_cmd("samtools index -@ %(threads)s %(prefix)s.bqsr.bam" % vars(args))
        fm.run_cmd("samtools flagstat -@ %(threads)s %(prefix)s.bqsr.bam > %(prefix)s.bqsr.bamstats" % vars(args))
        fm.run_cmd("rm %(prefix)s.mkdup.bam*" % vars(args))
def main_import(args):
    """Validate per-sample GVCFs and import them into GenomicsDB workspaces.

    Writes ``<prefix>.map`` (sample -> VCF path), logs unusable samples to
    ``<prefix>.failed_samples.log``, makes sure the reference has ``.dict``
    and ``.fai`` indexes, then runs ``gatk GenomicsDBImport`` over genome
    chunks in parallel. A fresh run creates the workspaces and records the
    chunking in ``<prefix>.dbconf.json``; a rerun updates the existing
    workspaces after checking they are present.
    """
    FAILED_SAMPLES = open("%s.failed_samples.log" % args.prefix, "w")
    params = vars(args)
    params["map_file"] = f"{args.prefix}.map"
    with open(params["map_file"], "w") as O:
        # Set up list to hold sample names
        samples = []
        # Loop through sample-file and do (1) append samples to list, (2) write sample to map file and (3) check for VCF index
        for line in open(args.sample_file):
            sample = line.rstrip()
            vcf_file = f"{args.vcf_dir}/{sample}{args.vcf_extension}"
            sys.stderr.write(f"Looking for {vcf_file}")
            if os.path.isfile(vcf_file):
                sys.stderr.write("...OK\n")
            else:
                # Missing VCFs are skipped silently here; --ignore-missing
                # additionally records them in the failed-samples log below.
                sys.stderr.write("...Not found...skipping\n")
                continue
            # filecheck(vcf_file)
            if args.ignore_missing and nofile(vcf_file):
                FAILED_SAMPLES.write("%s\tno_file\n" % sample)
                continue
            # Validate each GVCF once; a ".validated" marker file caches success.
            if nofile(f"{vcf_file}.validated"):
                if nofile(f"{vcf_file}.tbi"):
                    run_cmd(f"tabix {vcf_file}")
                run_cmd(f"gatk ValidateVariants -R {args.ref} -V {vcf_file} -gvcf && touch {vcf_file}.validated")
                if nofile(f"{vcf_file}.validated"):
                    FAILED_SAMPLES.write("%s\tno_validation\n" % sample)
                    continue
            samples.append(sample)
            O.write("%s\t%s\n" % (sample, vcf_file))
            if nofile(f"{vcf_file}.tbi"):
                run_cmd(f"bcftools index --tbi {vcf_file}")
    # Create .dict file (GATK fasta index) has been created for the reference
    if nofile("%s.dict" % args.ref.replace(".fasta", "").replace(".fa", "")):
        run_cmd("gatk CreateSequenceDictionary -R %(ref)s" % params)
    # Create .fai file (SAMtools fasta index) has been created for the reference
    # NOTE(review): samtools faidx writes "<ref>.fai" with the extension kept,
    # so this check may never see the file and re-run faidx — confirm intent.
    if nofile("%s.fai" % args.ref.replace(".fasta", "").replace(".fa", "")):
        run_cmd("samtools faidx %(ref)s" % params)
    # Emit "chrom:start-end chrom_start_end" pairs, one per genome chunk,
    # consumed by parallel as {1} (interval) and {2} (workspace suffix).
    window_cmd = "bedtools makewindows -n %(num_genome_chunks)s -g %(ref)s.fai | awk '{print $1\":\"$2+1\"-\"$3\" \"$1\"_\"$2+1\"_\"$3}'" % params
    if nofile("%(prefix)s.dbconf.json" % params):
        # First run: create one GenomicsDB workspace per chunk.
        import_cmd = "gatk GenomicsDBImport --genomicsdb-workspace-path %(prefix)s_{2}_genomics_db -L {1} --sample-name-map %(map_file)s --reader-threads %(threads)s --batch-size 500" % params
        run_cmd(f"{window_cmd} | parallel --bar -j {args.threads} --col-sep \" \" {import_cmd}", verbose=2)
        json.dump({"num_genome_chunks": args.num_genome_chunks}, open("%(prefix)s.dbconf.json" % params, "w"))
    else:
        # Rerun: the load also validates the config file exists and parses.
        conf = json.load(open(filecheck(f"{args.prefix}.dbconf.json")))
        # Every expected workspace directory must already exist before updating.
        for l in cmd_out(window_cmd):
            row = l.strip().split()
            dirname = "%s_%s_genomics_db" % (args.prefix, row[1])
            # BUGFIX: corrected "direcotry" typo in the progress message.
            sys.stderr.write("Looking for directory named %s..." % dirname)
            foldercheck(dirname)
            sys.stderr.write("OK\n")
        import_cmd = "gatk GenomicsDBImport --genomicsdb-update-workspace-path %(prefix)s_{2}_genomics_db -L {1} --sample-name-map %(map_file)s --reader-threads %(threads)s --batch-size 500" % params
        run_cmd(f"{window_cmd} | parallel --bar -j {args.threads} --col-sep \" \" {import_cmd}", verbose=2)
def main(args):
    """Detect convergent (homoplasic) SNP sites on a phylogeny.

    Builds a SNP fasta from the VCF if needed, runs iqtree ancestral state
    reconstruction (ASR), then counts, per site, how many independent
    state-change origins occur on the tree; sites with >1 origin are written
    to ``args.out`` as "<position>\\t<number of origins>".
    """
    vcf_class = fm.vcf(args.vcf)
    vcf_positions = vcf_class.get_positions()
    # No fasta supplied: generate a SNP alignment from the VCF (needs --ref).
    if not args.fasta:
        if not args.ref:
            sys.stderr.write(
                "\nERROR: Please supply a reference with --ref\n\n")
            quit()
        fm.run_cmd(
            "vcf2fasta.py --vcf %(vcf)s --snps --ref %(ref)s --snps-no-filt" % vars(args))
        args.fasta = "%s.snps.fa" % vcf_class.prefix
    # Run ASR only if the state file is not already present (cached result).
    if fm.nofile("%s.asr.state" % args.fasta):
        fm.run_cmd(
            "iqtree -m %(model)s -te %(tree)s -s %(fasta)s -nt AUTO -asr -pre %(fasta)s.asr" % vars(args))
    tree = ete3.Tree("%s.asr.treefile" % args.fasta, format=1)
    # Internal node labels may carry "/support" suffixes; keep the name part.
    node_names = set([tree.name] + [n.name.split("/")[0] for n in tree.get_descendants()])
    leaf_names = set(tree.get_leaf_names())
    internal_node_names = node_names - leaf_names
    states_file = "%s.asr.state" % args.fasta
    # states[site][node_name] -> reconstructed nucleotide at that node.
    states = defaultdict(dict)
    sys.stderr.write("Loading states\n")
    for l in tqdm(open(states_file)):
        if l[0] == "#":
            continue
        row = l.strip().split()
        if row[0] == "Node":
            continue  # header row
        site = int(row[1])
        if row[0] not in internal_node_names:
            continue
        states[site][row[0]] = row[2]
    # Fill in leaf states directly from the alignment (sites are 1-based).
    seqs = fm.fasta(args.fasta).fa_dict
    for site in tqdm(list(states)):
        for sample in seqs:
            states[site][sample] = seqs[sample][site - 1]
    acgt = set(["A", "C", "G", "T", "a", "c", "g", "t"])
    convergent_sites = []
    for site in tqdm(list(states)):
        nucleotides = set([states[site][n] for n in node_names])
        if len(nucleotides) == 1:
            continue  # monomorphic site, cannot be convergent
        # Set up storage objects
        origins = []
        tree.add_feature("state", states[site][tree.name])
        # Count branches where the child state differs from the parent state
        # and both are unambiguous nucleotides — each is an independent origin.
        # NOTE(review): relies on ete3 traverse() visiting parents before
        # children so the ancestor's "state" feature is already set — confirm.
        for n in tree.traverse():
            if n == tree:
                continue
            node_state = states[site][n.name]
            if node_state != n.get_ancestors(
            )[0].state and node_state in acgt and n.get_ancestors(
            )[0].state in acgt:
                origins.append(n.name)
            n.add_feature("state", node_state)
        if len(origins) > 1:
            convergent_sites.append((site, vcf_positions[site - 1], origins))
    with open(args.out, "w") as O:
        for site in convergent_sites:
            # site = (alignment_site, (chrom, pos), origin_node_names)
            O.write("%s\t%s\n" % (site[1][1], len(site[2])))
def main(args):
    """Post-process a merged genotyped VCF with GATK VQSR and bcftools.

    Pipeline: (optional region subsetting) -> VariantRecalibrator/ApplyVQSR
    for SNPs and indels -> VQSLOD filtering -> sample-missingness filtering
    via plink -> mixed-genotype handling via setGT.py -> biallelic-only
    selection -> consequence annotation with bcftools csq. Each stage reads
    the previous stage's output file, named by progressively prefixed tags.
    """
    # All shell commands below are built from this dict via %-formatting;
    # note that params["merged_file"] is mutated when regions are applied.
    params = {"threads": args.threads, "prefix": args.prefix, "ref": args.ref,
              "map_file": f"{args.prefix}.map", "merged_file": args.merged_file,
              "include": args.include_regions, "vqslod": args.vqslod,
              "miss": args.missing_sample_cutoff, "mix": args.cutoff_mix_GT,
              "gff_file": args.gff_file}
    # Optionally restrict the merged VCF to the supplied regions first.
    if args.include_regions:
        if not os.path.isfile("%(merged_file)s.tbi" % params):
            run_cmd("bcftools index -t %(merged_file)s" % params)
        params["vcf_in"] = params["merged_file"].replace(".genotyped.vcf.gz", ".in.genotyped.vcf.gz")
        run_cmd("bcftools view -R %(include)s -O z -o %(vcf_in)s %(merged_file)s" % params)
        run_cmd("bcftools index -t %(vcf_in)s" % params)
        # All downstream commands operate on the region-restricted file.
        params["merged_file"] = params["vcf_in"]
    if not os.path.isfile(args.ref.replace(".fasta", ".dict")):
        run_cmd("gatk CreateSequenceDictionary -R %s" % args.ref)
    # Ensure every training/truth resource VCF and the input are indexed.
    for s in args.bqsr_vcf.split(","):
        if not os.path.isfile(s + ".tbi"):
            run_cmd("bcftools index -t %s" % s)
    if not os.path.isfile("%(merged_file)s.tbi" % params):
        run_cmd("bcftools index -t %(merged_file)s" % params)
    params["bqsr_vcf_mer"] = " ".join(["--resource:pf_crosses,known=false,training=true,truth=true,prior=15.0 %s " % s for s in args.bqsr_vcf.split(",")])
    params["output"] = params["merged_file"].replace(".genotyped.vcf.gz", ".recal")
    ## Calculating calibration model
    run_cmd("gatk VariantRecalibrator -R %(ref)s -V %(merged_file)s %(bqsr_vcf_mer)s -an QD -an FS -an SOR -an DP --max-gaussians 8 --mq-cap-for-logit-jitter-transform 70 -mode SNP -O %(prefix)s.snps.recal --tranches-file %(prefix)s.snps.tranches --rscript-file %(prefix)s.snps.plots.R" % params)
    run_cmd("gatk VariantRecalibrator -R %(ref)s -V %(merged_file)s %(bqsr_vcf_mer)s -an QD -an DP -an SOR -an FS --max-gaussians 4 --mq-cap-for-logit-jitter-transform 70 -mode INDEL -O %(prefix)s.indel.recal --tranches-file %(prefix)s.indel.tranches --rscript-file %(prefix)s.indel.plots.R" % params)
    ## Applying calibration model and obtaining VQSLOD
    run_cmd("gatk ApplyVQSR -R %(ref)s -V %(merged_file)s -O %(prefix)s.vqslod.snps.vcf.gz --truth-sensitivity-filter-level 99.0 --tranches-file %(prefix)s.snps.tranches --recal-file %(prefix)s.snps.recal -mode SNP" % params)
    run_cmd("gatk ApplyVQSR -R %(ref)s -V %(merged_file)s -O %(prefix)s.vqslod.indel.vcf.gz --truth-sensitivity-filter-level 99.0 --tranches-file %(prefix)s.indel.tranches --recal-file %(prefix)s.indel.recal -mode INDEL" % params)
    ## Filtering based on VQSLOD
    run_cmd("bcftools view -i 'VQSLOD>%(vqslod)s' -O z -o %(prefix)s.vqslod.filt.snps.vcf.gz %(prefix)s.vqslod.snps.vcf.gz" % params)
    run_cmd("bcftools view -i 'VQSLOD>%(vqslod)s' -O z -o %(prefix)s.vqslod.filt.indel.vcf.gz %(prefix)s.vqslod.indel.vcf.gz" % params)
    ## Annotating filtered files VQSLOD
    run_cmd("bcftools index -t %(prefix)s.vqslod.filt.snps.vcf.gz" % params)
    run_cmd("bcftools index -t %(prefix)s.vqslod.filt.indel.vcf.gz" % params)
    ## Add sample filtering by missing
    # A cutoff of "0" means no sample filtering: just rename to the expected file.
    if params["miss"] == "0":
        run_cmd("mv %(prefix)s.vqslod.filt.snps.vcf.gz %(prefix)s.miss%(miss)s.vqslod.filt.snps.vcf.gz" % params)
    else:
        # Use plink --mind to drop high-missingness samples, then extract the
        # surviving sample names from the plink VCF header and subset with bcftools.
        run_cmd("plink --vcf %(prefix)s.vqslod.filt.snps.vcf.gz --mind %(miss)s --recode vcf --allow-extra-chr --out %(prefix)s_plink" % params)
        run_cmd("grep -P \"^#CHROM\" %(prefix)s_plink.vcf | awk '{ $1=\"\"; $2=\"\";$3=\"\"; $4=\"\";$5=\"\"; $6=\"\";$7=\"\"; $8=\"\";$9=\"\"; print}' | sed 's/ /\\n/g' | tail -n+10 > %(prefix)s_new" % params)
        run_cmd("bcftools view -S %(prefix)s_new --threads 20 -O z -o %(prefix)s.miss%(miss)s.vqslod.filt.snps.vcf.gz %(prefix)s.vqslod.filt.snps.vcf.gz" % params)
    ## Add set GT
    run_cmd("bcftools view %(prefix)s.miss%(miss)s.vqslod.filt.snps.vcf.gz | setGT.py --fraction %(mix)s | bcftools view -O z -c 1 -o %(prefix)s.GT.miss%(miss)s.vqslod.filt.snps.vcf.gz" % params)
    ## Select only biallelic
    run_cmd("bcftools view -m2 -M2 %(prefix)s.GT.miss%(miss)s.vqslod.filt.snps.vcf.gz --threads 20 -O z -o %(prefix)s.bi.GT.miss%(miss)s.vqslod.filt.snps.vcf.gz" % params)
    ## Add CSQ annotation
    run_cmd("bcftools csq -p m -f %(ref)s -g %(gff_file)s %(prefix)s.bi.GT.miss%(miss)s.vqslod.filt.snps.vcf.gz -O z -o %(prefix)s.csq.bi.GT.miss%(miss)s.vqslod.filt.snps.vcf.gz" % params)
def main(args):
    """Demultiplex pooled reads, map each sample with bwa, and joint-call
    variants with freebayes, writing a combined VCF plus formatted reports.
    """
    # Read sample names from the index CSV; retry with a BOM-tolerant
    # encoding when the 'sample' header is not visible on the first parse.
    reader = csv.DictReader(open(args.index_file))
    if "sample" not in reader.fieldnames:
        reader = csv.DictReader(open(args.index_file, encoding='utf-8-sig'))
    samples = [row["sample"] for row in reader if row["sample"] != ""]
    fm.bwa_index(args.ref)
    fm.faidx(args.ref)
    demux_cmd = "demultiplex_fastq.py --R1 %(read1)s --R2 %(read2)s --index %(index_file)s" % vars(
        args)
    if args.search_flipped_index:
        demux_cmd += " --search-flipped-index"
    fm.run_cmd(demux_cmd)
    # Map each demultiplexed sample, clip over-soft-clipped reads and sort.
    for sample in samples:
        args.sample = sample
        fm.run_cmd(
            "bwa mem -t 10 -R \"@RG\\tID:%(sample)s\\tSM:%(sample)s\\tPL:Illumina\" %(ref)s %(sample)s_1.fastq.gz %(sample)s_2.fastq.gz | samclip --ref %(ref)s --max 50 | samtools sort -o %(sample)s.bam -" % vars(args))
        fm.run_cmd("samtools index %(sample)s.bam" % vars(args))
    # List every per-sample BAM for the joint freebayes call.
    with open("bam_list.txt", "w") as bam_list:
        bam_list.write("".join("%s.bam\n" % s for s in samples))
    fm.run_cmd(
        "freebayes -f %(ref)s -L bam_list.txt --haplotype-length -1 --min-coverage 50 --min-base-quality %(min_base_qual)s --gvcf --gvcf-dont-use-chunk true | bcftools norm -f %(ref)s | bcftools sort -Oz -o combined.genotyped.vcf.gz" % vars(args))
    # Export per-sample genotype rows and per-site depth tables as text.
    fm.run_cmd(
        r"bcftools view -c 1 combined.genotyped.vcf.gz | bcftools query -f '[%SAMPLE\t%CHROM\t%POS\t%REF\t%ALT\t%QUAL\t%GT\t%TGT\t%DP\t%AD\n]' > combined_genotyped_filtered_formatted.snps.txt"
    )
    fm.run_cmd(
        r"bcftools query -f '%CHROM\t%POS[\t%DP]\n' combined.genotyped.vcf.gz > depth_info.txt"
    )
def main(args):
    """Repair a plink .bim against 1000 Genomes .pvar reference data.

    For every variant in ``<bfile>.bim``: fill in missing alleles, resolve
    I/D indel placeholders to the actual ref/alt sequences, flip strand
    where the complement matches the reference, and write variants that
    cannot be reconciled to an exclude list. Finally copies .bed/.fam to a
    temp prefix and runs plink --exclude to produce ``<out>``.

    Log lines go to ``<out>.fill_bim.log``; the rewritten BIM uses a random
    temp prefix so the original files are never modified in place.
    """
    chromosomes = list(range(1, 23))
    # chromosomes = [1]
    # snp_data: variant ID -> pvar row; pvar columns are assumed to be
    # [chrom, pos, id, ref, alt] so ref_snp_data[3]=REF, [4]=ALT.
    # NOTE(review): column layout inferred from usage — confirm against the
    # .pvar files actually shipped in args.ref_dir.
    snp_data = {}
    for f in tqdm([
            "%s/chr%s.1kg.phase3.v5a.pvar" % (args.ref_dir, x) for x in chromosomes
    ]):
        for l in open(f):
            if l[0] == "#":
                continue
            row = l.strip().split()
            snp_data[row[2]] = row
    tmp_prefix = str(uuid.uuid4())
    exclude_file = "%s.exclude.txt" % args.out
    new_bim_file = "%s.bim" % tmp_prefix
    log_file = "%s.fill_bim.log" % args.out
    EXCLUDE = open(exclude_file, "w")
    BIM = open(new_bim_file, "w")
    LOG = open(log_file, "w")
    # BIM columns: [chrom, id, cM, pos, allele1 (row[4]), allele2 (row[5])].
    # "0" marks a missing allele; "I"/"D" are insertion/deletion placeholders.
    for l in tqdm(open(args.bfile + ".bim")):
        row = l.strip().split()
        rid = row[1]
        ref_snp_data = snp_data[row[1]] if rid in snp_data else None
        # Optionally drop exome-array probes (IDs containing "ex").
        if args.remove_exm and "ex" in rid:
            LOG.write("%s\tExcluded: Variant starts with exm\n" % row[1])
            BIM.write("\t".join(row) + "\n")
            EXCLUDE.write(row[1] + "\n")
            continue
        if row[4] != "0" and row[5] != "0":
            # Both alleles present.
            if row[4] != "I" and row[4] != "D":
                LOG.write("%s\tOK: No change\n" % row[1])
                BIM.write("\t".join(row) + "\n")
            elif (row[5] == "I" or row[5] == "D") and ref_snp_data == None:
                LOG.write("%s\tExcluded: Indel not in ref\n" % row[1])
                BIM.write("\t".join(row) + "\n")
                EXCLUDE.write(row[1] + "\n")
            elif (row[4] == "I" or row[5] == "I") and ("," in ref_snp_data[4]
                                                       or "," in ref_snp_data[3]):
                LOG.write("%s\tExcluded: More than one alt allele in ref\n" % row[1])
                BIM.write("\t".join(row) + "\n")
                EXCLUDE.write(row[1] + "\n")
            # The four cases below resolve I/D placeholders: whichever of
            # ref/alt in the reference is longer than 1bp is the insertion.
            elif row[5] == "I" and len(ref_snp_data[3]) > 1:
                row[5] = ref_snp_data[3]
                row[4] = ref_snp_data[4]
                BIM.write("\t".join(row) + "\n")
                LOG.write("%s\tFilled indel to ref\t\n" % row[1])
            elif row[5] == "I" and len(ref_snp_data[4]) > 1:
                row[5] = ref_snp_data[4]
                row[4] = ref_snp_data[3]
                BIM.write("\t".join(row) + "\n")
                LOG.write("%s\tFilled indel to ref\t\n" % row[1])
            elif row[4] == "I" and len(ref_snp_data[4]) > 1:
                row[5] = ref_snp_data[3]
                row[4] = ref_snp_data[4]
                BIM.write("\t".join(row) + "\n")
                LOG.write("%s\tFilled indel to ref\t\n" % row[1])
            elif row[4] == "I" and len(ref_snp_data[3]) > 1:
                row[5] = ref_snp_data[4]
                row[4] = ref_snp_data[3]
                BIM.write("\t".join(row) + "\n")
                LOG.write("%s\tFilled indel to ref\t\n" % row[1])
            else:
                # NOTE(review): leftover debugging breakpoint — this drops
                # into pdb on any unhandled allele combination. Should be
                # replaced by an explicit error/exclusion before production use.
                import pdb
                pdb.set_trace()
        elif row[4] == "0" and row[5] == "0":
            LOG.write("%s\tExcluded: No ref or alt present\n" % row[1])
            BIM.write("\t".join(row) + "\n")
            EXCLUDE.write(row[1] + "\n")
        elif row[4] == "0":
            # Only allele2 present: recover allele1 from the reference.
            if not ref_snp_data:
                LOG.write("%s\tExcluded: SNP not present in ref\n" % row[1])
                BIM.write("\t".join(row) + "\n")
                EXCLUDE.write(row[1] + "\n")
            elif set([ref_snp_data[3], ref_snp_data[4]]) == set([
                    "A", "T"
            ]) or set([ref_snp_data[3], ref_snp_data[4]]) == set(["C", "G"]):
                # A/T and C/G pairs are strand-ambiguous: cannot orient safely.
                LOG.write("%s\tExcluded: Ambiguous ref/alt strand\n" % row[1])
                BIM.write("\t".join(row) + "\n")
                EXCLUDE.write(row[1] + "\n")
            elif "," in ref_snp_data[4] or "," in ref_snp_data[3]:
                LOG.write("%s\tExcluded: More than one alt allele in ref\n" % row[1])
                BIM.write("\t".join(row) + "\n")
                EXCLUDE.write(row[1] + "\n")
            elif row[5] == ref_snp_data[3]:
                row[4] = ref_snp_data[4]
                BIM.write("\t".join(row) + "\n")
                LOG.write("%s\tOK: All ref\n" % row[1])
            elif row[5] == ref_snp_data[4]:
                row[4] = ref_snp_data[3]
                BIM.write("\t".join(row) + "\n")
                LOG.write("%s\tOK: All alt\n" % row[1])
            # NOTE(review): c() is assumed to return the complement base
            # (strand flip) — it is defined elsewhere in this file; confirm.
            elif c(row[5]) == ref_snp_data[3]:
                row[5] = ref_snp_data[3]
                row[4] = ref_snp_data[4]
                BIM.write("\t".join(row) + "\n")
                LOG.write("%s\tFlipped to be ref\t\n" % row[1])
            elif c(row[5]) == ref_snp_data[4]:
                row[5] = ref_snp_data[4]
                row[4] = ref_snp_data[3]
                BIM.write("\t".join(row) + "\n")
                LOG.write("%s\tFlipped to be alt\t\n" % row[1])
            elif row[5] == "I" and len(ref_snp_data[3]) > 1:
                row[5] = ref_snp_data[3]
                row[4] = ref_snp_data[4]
                BIM.write("\t".join(row) + "\n")
                LOG.write("%s\tFilled indel to ref\t\n" % row[1])
            elif row[5] == "D" and len(ref_snp_data[4]) > 1:
                row[5] = ref_snp_data[3]
                row[4] = ref_snp_data[4]
                BIM.write("\t".join(row) + "\n")
                LOG.write("%s\tFilled indel to ref\t\n" % row[1])
            elif (row[5] != "I" and row[5] != "D") and len(ref_snp_data[3]) > 1:
                LOG.write("%s\tExcluded: Ref says indel but gt is SNP\n" % row[1])
                BIM.write("\t".join(row) + "\n")
                EXCLUDE.write(row[1] + "\n")
            else:
                quit(row)
        else:
            quit(row)
    EXCLUDE.close()
    BIM.close()
    LOG.close()
    # Pair the rewritten BIM with copies of the original bed/fam, then drop
    # the excluded variants to produce the final fileset.
    fm.run_cmd("cp %s.bed %s.bed" % (args.bfile, tmp_prefix))
    fm.run_cmd("cp %s.fam %s.fam" % (args.bfile, tmp_prefix))
    fm.run_cmd("plink --bfile %s --exclude %s --make-bed --out %s" % (tmp_prefix, exclude_file, args.out))