import os
import sys
import json

# nofile, filecheck, foldercheck, run_cmd and cmd_out are assumed to be
# imported from the package's shared utilities (see the sketch below).


def main_import(args):
    FAILED_SAMPLES = open("%s.failed_samples.log" % args.prefix, "w")
    params = vars(args)
    params["map_file"] = f"{args.prefix}.map"

    with open(params["map_file"], "w") as O:
        # Set up list to hold sample names
        samples = []
        # Loop through sample-file and (1) append samples to list,
        # (2) write sample to map file and (3) check for VCF index
        for line in open(args.sample_file):
            sample = line.rstrip()
            vcf_file = f"{args.vcf_dir}/{sample}{args.vcf_extension}"
            sys.stderr.write(f"Looking for {vcf_file}")
            if os.path.isfile(vcf_file):
                sys.stderr.write("...OK\n")
            else:
                sys.stderr.write("...Not found...skipping\n")
                continue
            # filecheck(vcf_file)
            if args.ignore_missing and nofile(vcf_file):
                FAILED_SAMPLES.write("%s\tno_file\n" % sample)
                continue
            # Validate the gVCF with GATK, indexing it first if needed
            if nofile(f"{vcf_file}.validated"):
                if nofile(f"{vcf_file}.tbi"):
                    run_cmd(f"tabix {vcf_file}")
                run_cmd(f"gatk ValidateVariants -R {args.ref} -V {vcf_file} -gvcf && touch {vcf_file}.validated")
                if nofile(f"{vcf_file}.validated"):
                    FAILED_SAMPLES.write("%s\tno_validation\n" % sample)
                    continue
            samples.append(sample)
            O.write("%s\t%s\n" % (sample, vcf_file))
            if nofile(f"{vcf_file}.tbi"):
                run_cmd(f"bcftools index --tbi {vcf_file}")

    # Create the .dict file (GATK sequence dictionary) for the reference if it does not exist
    if nofile("%s.dict" % args.ref.replace(".fasta", "").replace(".fa", "")):
        run_cmd("gatk CreateSequenceDictionary -R %(ref)s" % params)
    # Create the .fai file (SAMtools fasta index) for the reference if it does not exist;
    # samtools writes the index alongside the full file name (e.g. ref.fasta.fai)
    if nofile(f"{args.ref}.fai"):
        run_cmd("samtools faidx %(ref)s" % params)

    # Split the genome into chunks formatted as "chrom:start-end chrom_start_end"
    window_cmd = "bedtools makewindows -n %(num_genome_chunks)s -g %(ref)s.fai | awk '{print $1\":\"$2+1\"-\"$3\" \"$1\"_\"$2+1\"_\"$3}'" % params
    if nofile("%(prefix)s.dbconf.json" % params):
        # First import: create one GenomicsDB workspace per genome chunk
        import_cmd = "gatk GenomicsDBImport --genomicsdb-workspace-path %(prefix)s_{2}_genomics_db -L {1} --sample-name-map %(map_file)s --reader-threads %(threads)s --batch-size 500" % params
        run_cmd(f"{window_cmd} | parallel --bar -j {args.threads} --col-sep \" \" {import_cmd}", verbose=2)
        json.dump({"num_genome_chunks": args.num_genome_chunks}, open("%(prefix)s.dbconf.json" % params, "w"))
    else:
        # Subsequent imports: check all chunk workspaces exist, then update them
        conf = json.load(open(filecheck(f"{args.prefix}.dbconf.json")))
        for l in cmd_out(window_cmd):
            row = l.strip().split()
            dirname = "%s_%s_genomics_db" % (args.prefix, row[1])
            sys.stderr.write("Looking for directory named %s..." % dirname)
            foldercheck(dirname)
            sys.stderr.write("OK\n")
        import_cmd = "gatk GenomicsDBImport --genomicsdb-update-workspace-path %(prefix)s_{2}_genomics_db -L {1} --sample-name-map %(map_file)s --reader-threads %(threads)s --batch-size 500" % params
        run_cmd(f"{window_cmd} | parallel --bar -j {args.threads} --col-sep \" \" {import_cmd}", verbose=2)

    FAILED_SAMPLES.close()
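# ---------------------------------------------------------------------------
# The helpers used above (nofile, nofolder, filecheck, foldercheck, run_cmd,
# cmd_out) are not defined in this file. A minimal sketch of plausible
# implementations is given below, assuming they live in a shared utilities
# module; the real implementations in the package may differ.
# ---------------------------------------------------------------------------
import subprocess


def nofile(path):
    # True if `path` does not exist as a regular file
    return not os.path.isfile(path)


def nofolder(path):
    # True if `path` does not exist as a directory
    return not os.path.isdir(path)


def filecheck(path):
    # Exit with an error if a required file is missing; otherwise return the path
    if not os.path.isfile(path):
        sys.exit("Can't find %s... Exiting!" % path)
    return path


def foldercheck(path):
    # Exit with an error if a required directory is missing; otherwise return the path
    if not os.path.isdir(path):
        sys.exit("Can't find directory %s... Exiting!" % path)
    return path


def run_cmd(cmd, verbose=1):
    # Run a shell command, echoing it to stderr, and raise on failure
    if verbose > 0:
        sys.stderr.write("Running: %s\n" % cmd)
    subprocess.run(cmd, shell=True, check=True)


def cmd_out(cmd):
    # Run a shell command and yield its stdout line by line
    result = subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True)
    for line in result.stdout.splitlines():
        yield line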
def __init__(self, filename, threads=4):
    self.samples = []
    self.filename = filename
    self.threads = threads
    self.prefix = get_vcf_prefix(filename)
    # Index the VCF if no bcftools .csi index is present
    if nofile(filename + ".csi"):
        run_cmd("bcftools index %(filename)s" % vars(self))
    # Extract the sample names to a temporary file, read them back, then clean up
    self.temp_file = get_random_file()
    run_cmd("bcftools query -l %(filename)s > %(temp_file)s" % vars(self))
    for l in open(self.temp_file):
        self.samples.append(l.rstrip())
    os.remove(self.temp_file)
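# Example usage of this constructor (the enclosing class is referred to as
# vcf_class elsewhere in these scripts; "cohort.vcf.gz" is a hypothetical
# file name):
#
#     v = vcf_class("cohort.vcf.gz", threads=8)
#     print("%d samples found" % len(v.samples))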
import sys
from collections import defaultdict

import ete3
from tqdm import tqdm

# `fm` is assumed to be the package's shared helper module
# (e.g. `import fastq2matrix as fm`), providing the vcf/fasta wrappers,
# run_cmd and nofile used below.


def main(args):
    vcf_class = fm.vcf(args.vcf)
    vcf_positions = vcf_class.get_positions()
    if not args.fasta:
        if not args.ref:
            sys.stderr.write("\nERROR: Please supply a reference with --ref\n\n")
            quit()
        fm.run_cmd("vcf2fasta.py --vcf %(vcf)s --snps --ref %(ref)s --snps-no-filt" % vars(args))
        args.fasta = "%s.snps.fa" % vcf_class.prefix

    # Run IQ-TREE ancestral state reconstruction if not already done
    if fm.nofile("%s.asr.state" % args.fasta):
        fm.run_cmd("iqtree -m %(model)s -te %(tree)s -s %(fasta)s -nt AUTO -asr -pre %(fasta)s.asr" % vars(args))

    tree = ete3.Tree("%s.asr.treefile" % args.fasta, format=1)
    node_names = set([tree.name] + [n.name.split("/")[0] for n in tree.get_descendants()])
    leaf_names = set(tree.get_leaf_names())
    internal_node_names = node_names - leaf_names

    # Load the reconstructed states for internal nodes
    states_file = "%s.asr.state" % args.fasta
    states = defaultdict(dict)
    sys.stderr.write("Loading states\n")
    for l in tqdm(open(states_file)):
        if l[0] == "#":
            continue
        row = l.strip().split()
        if row[0] == "Node":
            continue
        site = int(row[1])
        if row[0] not in internal_node_names:
            continue
        states[site][row[0]] = row[2]

    # Add the observed states at the leaves from the alignment
    seqs = fm.fasta(args.fasta).fa_dict
    for site in tqdm(list(states)):
        for sample in seqs:
            states[site][sample] = seqs[sample][site - 1]

    acgt = set(["A", "C", "G", "T", "a", "c", "g", "t"])

    # A site is convergent if the derived state arose independently on more
    # than one branch (i.e. more than one child differs from its parent)
    convergent_sites = []
    for site in tqdm(list(states)):
        nucleotides = set([states[site][n] for n in node_names])
        if len(nucleotides) == 1:
            continue
        origins = []
        tree.add_feature("state", states[site][tree.name])
        for n in tree.traverse():
            if n == tree:
                continue
            node_state = states[site][n.name]
            parent_state = n.get_ancestors()[0].state
            if node_state != parent_state and node_state in acgt and parent_state in acgt:
                origins.append(n.name)
            n.add_feature("state", node_state)
        if len(origins) > 1:
            convergent_sites.append((site, vcf_positions[site - 1], origins))

    with open(args.out, "w") as O:
        for site in convergent_sites:
            O.write("%s\t%s\n" % (site[1][1], len(site[2])))
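# For reference, the IQ-TREE ".asr.state" file parsed above is tab-separated,
# with commented lines ('#') followed by a column-header row, roughly:
#
#     Node    Site    State   p_A    p_C    p_G    p_T
#     Node1   1       A       0.99   ...
#
# which is why lines starting with '#' and rows whose first field is "Node"
# are skipped, and why row[0]/row[1]/row[2] map to node, site and state.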
import csv
import json
from collections import defaultdict

# `fm` is assumed to be the package's shared helper module; `drugs` is a
# module-level list of drug names (see the note after this function).


def main(args):
    # Read sample names from the sample file and check the alignments exist
    samples = [x.rstrip() for x in open(args.samples)]
    for s in samples:
        fm.filecheck("per_sample/%s%s" % (s, args.alignment_extension))
    # Create the kraken output directory if it does not exist
    if fm.nofolder("%(dir)s/kraken" % vars(args)):
        fm.run_cmd("mkdir -p %(dir)s/kraken" % vars(args))

    # Write per-sample depth and kraken2 commands to a temporary file,
    # then run them in parallel
    args.cmd_file = fm.get_random_file()
    with open(args.cmd_file, "w") as O:
        for s in samples:
            args.sample = s
            if fm.nofile("%(dir)s/per_sample/%(sample)s.median_dp.txt" % vars(args)):
                O.write(
                    "printf %s\"\\t\"$(bedtools genomecov -d -ibam %s/per_sample/%s%s | datamash median 3)\"\\n\" > %s/per_sample/%s.median_dp.txt\n"
                    % (s, args.dir, s, args.alignment_extension, args.dir, s))
            if fm.nofile("%(dir)s/kraken/%(sample)s.done" % vars(args)):
                O.write(
                    "kraken2 --db /run/user/506/standard --gzip-compressed --paired %(dir)s/fastq/%(sample)s_1.fastq.gz %(dir)s/fastq/%(sample)s_2.fastq.gz --report %(dir)s/kraken/%(sample)s.report.txt --out %(dir)s/kraken/%(sample)s.out.txt --threads 10 --memory-mapping && touch %(dir)s/kraken/%(sample)s.done\n"
                    % vars(args))
    fm.run_cmd("cat %(cmd_file)s | parallel -j %(io_heavy_threads)s" % vars(args), verbose=2)
    fm.rm_files([args.cmd_file])

    # Collect per-sample metrics: mapping stats, kraken classification and
    # tb-profiler results
    sample_metrics = []
    for s in samples:
        res = {"sample": s}
        args.sample = s
        # Parse read-count metrics from the bamstats file
        # (count in column 1, metric name in column 4)
        for i, l in enumerate(open("%(dir)s/per_sample/%(sample)s.bqsr.bamstats" % vars(args))):
            row = l.rstrip().split()
            if i in [2, 3]:
                res[row[3]] = int(row[0])
            elif i == 4:
                res[row[3]] = int(row[0])
                res["mapped_percent"] = float(row[4].replace("(", "").replace("%", ""))
        # Keep the highest-percentage kraken hit per taxonomic rank code
        kraken_results = {}
        for l in open("%(dir)s/kraken/%(sample)s.report.txt" % vars(args)):
            row = l.strip().split()
            if row[3] not in kraken_results:
                kraken_results[row[3]] = (float(row[0]), " ".join(row[5:]))
            if float(row[0]) > kraken_results[row[3]][0]:
                kraken_results[row[3]] = (float(row[0]), " ".join(row[5:]))
        res["kraken_genus"] = "%s (%.2f)" % (kraken_results["G"][1], kraken_results["G"][0])
        res["kraken_genus1"] = "%s (%.2f)" % (kraken_results["G1"][1], kraken_results["G1"][0])
        res["kraken_species"] = "%s (%.2f)" % (kraken_results["S"][1], kraken_results["S"][0])
        # Pull lineage and drug-resistance calls from the tb-profiler JSON
        tbprofiler_result = json.load(open("%(dir)s/tbprofiler/results/%(sample)s.results.json" % vars(args)))
        res["lineage"] = tbprofiler_result["main_lin"]
        res["sub-lineage"] = tbprofiler_result["sublin"]
        res["drtype"] = tbprofiler_result["drtype"]
        tmp_drugs = defaultdict(list)
        for var in tbprofiler_result["dr_variants"]:
            for d in var["drugs"]:
                tmp_drugs[d["drug"]].append("%s_%s (%.2f)" % (var["gene"], var["change"], var["freq"]))
        for d in drugs:
            res[d] = ", ".join(tmp_drugs[d])
        sample_metrics.append(res)

    with open(args.out + ".sample_info.csv", "w") as O:
        writer = csv.DictWriter(O, fieldnames=list(sample_metrics[0]))
        writer.writeheader()
        writer.writerows(sample_metrics)

    # Whole-cohort VCF statistics
    vcf = fm.vcf_class(args.vcf)
    if fm.nofile(args.vcf + ".stats.txt"):
        fm.run_cmd("bcftools norm -m - -f %(ref)s %(vcf)s | bcftools stats -v -s - > %(vcf)s.stats.txt" % vars(args))
    vcf_stats = vcf.load_stats()
    results = {
        "number of samples": vcf_stats["number of samples"],
        "number of records": vcf_stats["number of records"],
        "number of SNPs": vcf_stats["number of SNPs"],
        "number of indels": vcf_stats["number of indels"],
    }

    snp_results = []
    # Annotate SNPs first, then append indels, into a single csq_info table
    if fm.nofile(args.vcf + ".csq_info.txt"):
        fm.run_cmd(
            "bcftools view -V indels %(vcf)s | bcftools norm -m - -f %(ref)s | bcftools csq -f %(ref)s -g %(gff)s | correct_tb_csq.py | bcftools +fill-tags | bcftools query -f '%%POS\\t%%REF\\t%%ALT\\t%%AF\\t%%AC\\t%%BCSQ\\n' > %(vcf)s.csq_info.txt"
            % vars(args))
        fm.run_cmd(
            "bcftools view -v indels %(vcf)s | bcftools norm -m - -f %(ref)s | bcftools csq -f %(ref)s -g %(gff)s | correct_tb_csq.py | bcftools +fill-tags | bcftools query -f '%%POS\\t%%REF\\t%%ALT\\t%%AF\\t%%AC\\t%%BCSQ\\n' >> %(vcf)s.csq_info.txt"
            % vars(args))
    variant_info = vcf.get_variant_data(args.ref, args.gff)
    with open(args.out + ".variant_info.csv", "w") as O:
        writer = csv.DictWriter(O, fieldnames=list(variant_info[0]))
        writer.writeheader()
        writer.writerows(variant_info)
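# The module-level `drugs` iterable used above is not defined in this file.
# It is assumed to list the drug names found in the tb-profiler JSON output;
# a hypothetical subset for illustration:
#
#     drugs = ["isoniazid", "rifampicin", "ethambutol", "pyrazinamide"]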
def main(args):
    if nofile(args.vcf):
        quit("Can't find %s... Exiting!" % args.vcf)
    vcf = vcf_class(args.vcf)
    vcf.vcf_to_matrix(args.no_iupacgt)
def main(args):
    if nofile(args.vcf):
        quit("Can't find %s... Exiting!" % args.vcf)
    vcf = vcf_class(args.vcf)
    vcf.filter_by_af(args.maf, args.pop_file)
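# These main() entry points are presumably wired up to argparse sub-commands
# elsewhere in the package. A minimal sketch for the allele-frequency filter
# above (every flag other than --vcf is an assumption inferred from the
# attributes accessed on `args`):
#
#     import argparse
#     parser = argparse.ArgumentParser(description="Filter VCF by allele frequency")
#     parser.add_argument("--vcf", required=True, help="Input VCF file")
#     parser.add_argument("--maf", type=float, default=0.01, help="Minor allele frequency cutoff")
#     parser.add_argument("--pop-file", help="Optional population assignment file")
#     parser.set_defaults(func=main)
#     args = parser.parse_args()
#     args.func(args)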