def main(args):
    """Run BAM quality control and write coverage plots and JSON summaries.

    Args:
        args: Parsed command-line arguments. ``bam``, ``ref`` and ``prefix``
            are required; ``gff`` (together with ``gffkey``) and ``bed`` are
            optional and mutually exclusive.

    Files written, all named after ``args.prefix``:
        ``<prefix>.<seq>.cov.png``    one coverage plot per reference sequence
        ``<prefix>.cov.json``         coverage saved by ``save_cov``
        ``<prefix>.bam_stats.json``   ``pct_reads_mapped`` and ``med_dp``
        ``<prefix>.regions.cov.json`` region coverage, ``{}`` when neither a
                                      GFF nor a BED file was given

    On a missing or conflicting argument a message is logged and the process
    ends through ``quit()``.
    NOTE(review): ``quit()`` is called without a status, so these failures
    exit with status 0 - confirm whether callers rely on that.
    """
    # --- argument validation -------------------------------------------
    if not args.bam:
        ps.log("Please provide bam file")
        quit()
    ps.filecheck(args.bam)

    if not args.ref:
        ps.log("Please provide reference")
        quit()
    ps.filecheck(args.ref)
    fasta = ps.fasta(args.ref)

    if not args.prefix:
        ps.log("Please provide prefix")
        quit()

    if args.gff and args.bed:
        ps.log(
            "Please provide either a GFF file or BED file but not both...Exiting!"
        )
        quit()

    if args.gff:
        # A GFF file is only usable together with the key to look up in it.
        if not args.gffkey:
            ps.log(
                "Please provide the key to look for in the GFF file...Exiting!"
            )
            quit()
        ps.filecheck(args.gff)

    if args.bed:
        ps.filecheck(args.bed)

    # --- output names --------------------------------------------------
    cov_json = "%s.cov.json" % args.prefix
    stats_json = "%s.bam_stats.json" % args.prefix
    region_json = "%s.regions.cov.json" % args.prefix

    # --- coverage and statistics ---------------------------------------
    stats = {}
    bamqc = ps.qc_bam(args.bam, args.ref)
    for s in fasta.fa_dict:
        cov_plot = "%s.%s.cov.png" % (args.prefix, s)
        bamqc.plot_cov(s, cov_plot)
    bamqc.save_cov(cov_json)

    stats["pct_reads_mapped"] = bamqc.pct_reads_mapped
    stats["med_dp"] = bamqc.med_dp

    region_cov = {}
    if args.gff:
        region_cov = bamqc.gff_cov(args.gff, args.gffkey)
    elif args.bed:
        region_cov = bamqc.bed_cov(args.bed)

    # Context managers make sure both files are flushed and closed.
    with open(stats_json, "w") as handle:
        json.dump(stats, handle)
    with open(region_json, "w") as handle:
        json.dump(region_cov, handle)
def main(args):
    """Validate the arguments, then produce BAM QC plots and JSON reports.

    Args:
        args: Parsed command-line arguments exposing ``bam``, ``ref``,
            ``prefix`` and the optional ``gff``, ``gffkey`` and ``bed``.

    Side effects:
        Writes ``<prefix>.<seq>.cov.png`` for each sequence of the reference,
        ``<prefix>.cov.json``, ``<prefix>.bam_stats.json`` (keys
        ``pct_reads_mapped`` and ``med_dp``) and ``<prefix>.regions.cov.json``
        (empty object unless a GFF or BED file was supplied).

    If ``bam``, ``ref`` or ``prefix`` is missing, if both ``gff`` and ``bed``
    are given, or if ``gff`` is given without ``gffkey``, a message is logged
    and ``quit()`` ends the process.
    NOTE(review): ``quit()`` carries no status here, so the exit code of
    these failures is 0 - confirm this is intended.
    """
    if not args.bam:
        ps.log("Please provide bam file")
        quit()
    ps.filecheck(args.bam)

    if not args.ref:
        ps.log("Please provide reference")
        quit()
    ps.filecheck(args.ref)
    fasta = ps.fasta(args.ref)

    if not args.prefix:
        ps.log("Please provide prefix")
        quit()

    # GFF and BED are alternative ways of naming regions; accept only one.
    if args.gff and args.bed:
        ps.log("Please provide either a GFF file or BED file but not both...Exiting!")
        quit()

    if args.gff:
        if not args.gffkey:
            ps.log("Please provide the key to look for in the GFF file...Exiting!")
            quit()
        ps.filecheck(args.gff)

    if args.bed:
        ps.filecheck(args.bed)

    cov_json = "%s.cov.json" % args.prefix
    stats_json = "%s.bam_stats.json" % args.prefix
    region_json = "%s.regions.cov.json" % args.prefix

    bamqc = ps.qc_bam(args.bam, args.ref)
    for s in fasta.fa_dict:
        cov_plot = "%s.%s.cov.png" % (args.prefix, s)
        bamqc.plot_cov(s, cov_plot)
    bamqc.save_cov(cov_json)

    stats = {}
    stats["pct_reads_mapped"] = bamqc.pct_reads_mapped
    stats["med_dp"] = bamqc.med_dp

    region_cov = {}
    if args.gff:
        region_cov = bamqc.gff_cov(args.gff, args.gffkey)
    elif args.bed:
        region_cov = bamqc.bed_cov(args.bed)

    # Close each report explicitly instead of leaving it to the garbage
    # collector.
    with open(stats_json, "w") as stats_handle:
        json.dump(stats, stats_handle)
    with open(region_json, "w") as region_handle:
        json.dump(region_cov, region_handle)
def main(args):
    """Run the paired-end Illumina pipeline and write a JSON summary.

    Args:
        args: Parsed command-line arguments. Attributes read here: ``ref``,
            ``r1``, ``r2``, ``prefix``, ``threads``, ``centrifuge``,
            ``mapper``, ``nobamstats``, ``primers``, ``bed_cov``, ``window``
            and ``call_method``.

    Side effects:
        Writes ``<prefix>.stats.json`` with the collected statistics. Unless
        ``args.nobamstats`` is set it also writes one
        ``<prefix>.<seq>.cov.png`` per reference sequence,
        ``<prefix>.gc_skew.json`` and, when ``args.bed_cov`` is given,
        ``<prefix>.regions.cov.json``.
    """
    ref = args.ref
    r1 = args.r1
    r2 = args.r2
    prefix = args.prefix
    threads = args.threads

    stats_file = "%s.stats.json" % prefix
    gc_file = "%s.gc_skew.json" % prefix
    cov_file = "%s.regions.cov.json" % prefix

    # Insertion order is kept so the JSON keys appear in pipeline order.
    stats = OrderedDict()

    # --- read-level QC --------------------------------------------------
    fq = ps.fastq(prefix, ref, r1, r2, threads=threads)
    fq_qc = fq.get_fastq_qc()
    if args.centrifuge:
        t1, t2 = fq_qc.run_centrifuge(args.centrifuge, False, threads)
        stats["centrifuge_top_hit"] = t1
        stats["centrifuge_top_hit_num_reads"] = t2
    stats["mean_read_len"] = fq_qc.mean_read_len
    stats["median_read_len"] = fq_qc.median_read_len
    stats["read_num"] = fq_qc.read_num

    # --- mapping and alignment QC ----------------------------------------
    bam = fq.illumina(mapper=args.mapper)
    if not args.nobamstats:
        # Everything that needs ``bam_qc`` lives in this branch, because the
        # object only exists when BAM statistics were requested.
        bam_qc = bam.get_bam_qc()
        stats["med_dp"] = bam_qc.med_dp
        stats["pct_reads_mapped"] = bam_qc.pct_reads_mapped
        stats["genome_cov_1"] = bam_qc.genome_cov[1]
        stats["genome_cov_10"] = bam_qc.genome_cov[10]
        fasta = ps.fasta(ref).fa_dict
        for seq in fasta:
            cov_png = "%s.%s.cov.png" % (prefix, seq)
            bam_qc.plot_cov(seq, cov_png, primers=args.primers)
        if args.bed_cov:
            bam_qc.save_cov(cov_file, args.bed_cov)
        bam_qc.extract_gc_skew(gc_file)

    # --- variant calling --------------------------------------------------
    variants = bam.gbcf(
        primers=args.primers,
        chunk_size=args.window,
        call_method=args.call_method,
    )
    bcfstats = variants.load_stats()
    # The per-sample entry is looked up by ``prefix``.
    sample_counts = bcfstats["PSC"][prefix]
    stats["hom_variants"] = sample_counts["nNonRefHom"]
    stats["het_variants"] = sample_counts["nHets"]
    stats["hom_ref"] = sample_counts["nRefHom"]

    # Close the report explicitly rather than relying on garbage collection.
    with open(stats_file, "w") as handle:
        json.dump(stats, handle)
def main(args):
    """Run the single-read (minION) pipeline and write a JSON summary.

    Args:
        args: Parsed command-line arguments. Attributes read here: ``ref``,
            ``reads``, ``prefix``, ``threads``, ``centrifuge``, ``primers``,
            ``bed_cov`` and ``window``.

    Side effects:
        Writes ``<prefix>.stats.json``, one ``<prefix>.<seq>.cov.png`` per
        reference sequence, ``<prefix>.gc_skew.json`` and, when
        ``args.bed_cov`` is given, ``<prefix>.regions.cov.json``. The calls to
        ``pileup2vcf``, ``gbcf`` and ``generate_consensus`` presumably write
        further files of their own - see the ``ps`` library.
    """
    ref = args.ref
    r1 = args.reads
    prefix = args.prefix
    threads = args.threads

    stats_file = "%s.stats.json" % prefix
    gc_file = "%s.gc_skew.json" % prefix
    cov_file = "%s.regions.cov.json" % prefix

    # Insertion order is kept so the JSON keys appear in pipeline order.
    stats = OrderedDict()

    # --- read-level QC --------------------------------------------------
    fq = ps.fastq(prefix, ref, r1, threads=threads)
    fq_qc = fq.get_fastq_qc()
    stats["mean_read_len"] = fq_qc.mean_read_len
    stats["median_read_len"] = fq_qc.median_read_len
    stats["read_num"] = fq_qc.read_num
    if args.centrifuge:
        t1, t2 = fq_qc.run_centrifuge(args.centrifuge, False, threads)
        stats["centrifuge_top_hit"] = t1
        stats["centrifuge_top_hit_num_reads"] = t2

    # --- mapping and alignment QC ----------------------------------------
    bam = fq.minION()
    bam_qc = bam.get_bam_qc()
    stats["med_dp"] = bam_qc.med_dp
    stats["pct_reads_mapped"] = bam_qc.pct_reads_mapped
    stats["genome_cov_1"] = bam_qc.genome_cov[1]
    stats["genome_cov_10"] = bam_qc.genome_cov[10]
    fasta = ps.fasta(ref).fa_dict
    for seq in fasta:
        cov_png = "%s.%s.cov.png" % (prefix, seq)
        bam_qc.plot_cov(seq, cov_png, primers=args.primers)
    bam_qc.extract_gc_skew(gc_file)
    if args.bed_cov:
        bam_qc.save_cov(cov_file, args.bed_cov)

    # --- variant calling --------------------------------------------------
    # The result of pileup2vcf is not used below; the call itself is kept
    # because it presumably produces output as a side effect.
    bam.pileup2vcf(indels=False)
    bcf = bam.gbcf(threads=threads, primers=args.primers, chunk_size=args.window)
    bcf.generate_consensus(ref)
    bcfstats = bcf.load_stats()
    # The per-sample entry is looked up by ``prefix``.
    sample_counts = bcfstats["PSC"][prefix]
    stats["hom_variants"] = sample_counts["nNonRefHom"]
    stats["het_variants"] = sample_counts["nHets"]
    stats["hom_ref"] = sample_counts["nRefHom"]

    # Close the report explicitly rather than relying on garbage collection.
    with open(stats_file, "w") as handle:
        json.dump(stats, handle)
def main(args):
    """Map paired-end Illumina reads, call variants and summarise the run.

    Args:
        args: Parsed command-line arguments providing ``ref``, ``r1``,
            ``r2``, ``prefix``, ``threads``, ``centrifuge``, ``mapper``,
            ``nobamstats``, ``primers``, ``bed_cov``, ``window`` and
            ``call_method``.

    Side effects:
        ``<prefix>.stats.json`` is always written. The coverage plots
        (``<prefix>.<seq>.cov.png``), ``<prefix>.gc_skew.json`` and the
        optional ``<prefix>.regions.cov.json`` are only produced when
        ``args.nobamstats`` is not set.
    """
    ref = args.ref
    r1 = args.r1
    r2 = args.r2
    prefix = args.prefix
    threads = args.threads
    stats_file = "%s.stats.json" % prefix
    gc_file = "%s.gc_skew.json" % prefix
    cov_file = "%s.regions.cov.json" % prefix
    stats = OrderedDict()

    fq = ps.fastq(prefix, ref, r1, r2, threads=threads)
    fq_qc = fq.get_fastq_qc()
    if args.centrifuge:
        t1, t2 = fq_qc.run_centrifuge(args.centrifuge, False, threads)
        stats["centrifuge_top_hit"] = t1
        stats["centrifuge_top_hit_num_reads"] = t2
    stats["mean_read_len"] = fq_qc.mean_read_len
    stats["median_read_len"] = fq_qc.median_read_len
    stats["read_num"] = fq_qc.read_num

    bam = fq.illumina(mapper=args.mapper)
    if not args.nobamstats:
        # ``bam_qc`` is created only in this branch, so every statement that
        # uses it has to stay inside it.
        bam_qc = bam.get_bam_qc()
        stats["med_dp"] = bam_qc.med_dp
        stats["pct_reads_mapped"] = bam_qc.pct_reads_mapped
        stats["genome_cov_1"] = bam_qc.genome_cov[1]
        stats["genome_cov_10"] = bam_qc.genome_cov[10]
        fasta = ps.fasta(ref).fa_dict
        for seq in fasta:
            cov_png = "%s.%s.cov.png" % (prefix, seq)
            bam_qc.plot_cov(seq, cov_png, primers=args.primers)
        if args.bed_cov:
            bam_qc.save_cov(cov_file, args.bed_cov)
        bam_qc.extract_gc_skew(gc_file)

    variants = bam.gbcf(
        primers=args.primers,
        chunk_size=args.window,
        call_method=args.call_method,
    )
    bcfstats = variants.load_stats()
    per_sample = bcfstats["PSC"][prefix]  # entry keyed by the run prefix
    stats["hom_variants"] = per_sample["nNonRefHom"]
    stats["het_variants"] = per_sample["nHets"]
    stats["hom_ref"] = per_sample["nRefHom"]

    # The context manager guarantees the report is flushed and closed.
    with open(stats_file, "w") as out:
        json.dump(stats, out)
# Tail of a script that picks a reference sequence from a Centrifuge report.
# ``args`` must already be a dict holding "r1", "r2", "prefix" and
# "centrifuge_db"; those keys are read below but not set here.
args["fasta_db"] = sys.argv[5]
args["threads"] = sys.argv[6]
# Temporary files for the Centrifuge report and its -S output; both are
# removed again further down.
args["fq_report"] = ps.get_random_file()
args["log"] = ps.get_random_file()

cmd = "centrifuge -x %(centrifuge_db)s -1 %(r1)s -2 %(r2)s -S %(log)s --report-file %(fq_report)s -p %(threads)s" % args
ps.run_cmd(cmd)

# Keep the row with the highest read count (column 5) among rows whose third
# column is "leaf", and separately among rows whose third column is
# "species". The comparison is strict, so the first row wins a tie.
best_ref = ""
best_score = 0
best_species = ""
best_species_score = 0
with open(args["fq_report"]) as report:
    for l in report:
        row = l.rstrip().split("\t")
        if row[4] == "numReads":
            # Header line.
            continue
        if row[2] == "leaf" and int(row[4]) > best_score:
            best_ref = row[0]
            best_score = int(row[4])
        elif row[2] == "species" and int(row[4]) > best_species_score:
            best_species = row[0]
            best_species_score = int(row[4])

fasta = ps.fasta(args["fasta_db"]).fa_dict
# Look the sequence up before creating the output file, so a reference that is
# missing from the FASTA database does not leave an empty "<best_ref>.fa".
best_seq = fasta[best_ref]
with open("%s.fa" % best_ref, "w") as fasta_out:
    fasta_out.write(">%s\n%s\n" % (best_ref, best_seq))

ps.rm_files([args["fq_report"], args["log"]])

with open("%(prefix)s.centrifuge.json" % args, "w") as json_out:
    json.dump(
        {
            "sample": args["prefix"],
            "best_ref": best_ref,
            "best_species": best_species
        },
        json_out)
def main(args):
    """Load the FASTA named by ``args.fasta`` and call ``add_meta_data`` on it.

    Args:
        args: Parsed command-line arguments providing ``fasta``,
            ``data_file``, ``outfile`` and ``delimiter``; the last three are
            passed to ``add_meta_data`` in that order.
    """
    ps.fasta(args.fasta).add_meta_data(args.data_file, args.outfile, args.delimiter)
def main(args):
    """Split a BED file, or else the sequences of a FASTA file.

    Args:
        args: Parsed command-line arguments. When ``args.bed`` is set it is
            handed to ``ps.split_bed``; otherwise ``args.fasta`` is loaded and
            its ``splitchr`` method is called. ``args.size`` and
            ``args.reformat`` are forwarded in both cases.
    """
    if not args.bed:
        # No BED file given: work on the FASTA instead.
        ps.fasta(args.fasta).splitchr(args.size, reformat=args.reformat)
        return
    ps.split_bed(args.bed, args.size, reformat=args.reformat)
import sys
import pathogenseq as ps

# Print a tab-separated table of per-sample genotype counts from a BCF file.
#
# Usage: <script> <bcf> [reference.fasta]
#
# When a reference FASTA is given as second argument, an extra nMissing column
# is printed: the summed length of all reference sequences minus the three
# counts.
#
# Every print call takes one parenthesised string, which produces the same
# output under Python 2 and Python 3.

infile = sys.argv[1]
bcf = ps.bcf(infile)
stats = bcf.load_stats()
# Example of the structure read below:
# {'PSC': {'barcode07_run1_batch1': {'nHets': 798,'nNonRefHom': 38,'nRefHom': 276198}}}

have_ref = len(sys.argv) > 2
if have_ref:
    genome_len = sum(len(x) for x in ps.fasta(sys.argv[2]).fa_dict.values())
    print("sample\tnRefHom\tnNonRefHom\tnHets\tnMissing")
else:
    print("sample\tnRefHom\tnNonRefHom\tnHets")

for sample in stats["PSC"]:
    s = stats["PSC"][sample]
    if have_ref:
        tot_sum = s["nRefHom"] + s["nNonRefHom"] + s["nHets"]
        print("%s\t%s\t%s\t%s\t%s" % (sample, s["nRefHom"], s["nNonRefHom"], s["nHets"], genome_len - tot_sum))
    else:
        print("%s\t%s\t%s\t%s" % (sample, s["nRefHom"], s["nNonRefHom"], s["nHets"]))
# Pick the best-matching reference for a read pair with Centrifuge.
#
# Positional arguments: r1, r2, prefix, centrifuge_db, fasta_db, threads.
# ``args`` must already exist as a dict before these statements run.
#
# Outputs: "<best_ref>.fa" holding the chosen sequence taken from fasta_db,
# and "<prefix>.centrifuge.json" naming the sample, best reference and best
# species.
args["r1"] = sys.argv[1]
args["r2"] = sys.argv[2]
args["prefix"] = sys.argv[3]
args["centrifuge_db"] = sys.argv[4]
args["fasta_db"] = sys.argv[5]
args["threads"] = sys.argv[6]
# Temporary files for the Centrifuge report and its -S output; both are
# removed again further down.
args["fq_report"] = ps.get_random_file()
args["log"] = ps.get_random_file()

cmd = "centrifuge -x %(centrifuge_db)s -1 %(r1)s -2 %(r2)s -S %(log)s --report-file %(fq_report)s -p %(threads)s" % args
ps.run_cmd(cmd)

# Keep the row with the highest read count (column 5) among rows whose third
# column is "leaf", and separately among rows whose third column is
# "species". The comparison is strict, so the first row wins a tie.
best_ref = ""
best_score = 0
best_species = ""
best_species_score = 0
with open(args["fq_report"]) as report:
    for l in report:
        row = l.rstrip().split("\t")
        if row[4] == "numReads":
            # Header line.
            continue
        if row[2] == "leaf" and int(row[4]) > best_score:
            best_ref = row[0]
            best_score = int(row[4])
        elif row[2] == "species" and int(row[4]) > best_species_score:
            best_species = row[0]
            best_species_score = int(row[4])

fasta = ps.fasta(args["fasta_db"]).fa_dict
# Look the sequence up before creating the output file, so a reference that is
# missing from the FASTA database does not leave an empty "<best_ref>.fa".
best_seq = fasta[best_ref]
with open("%s.fa" % best_ref, "w") as fasta_out:
    fasta_out.write(">%s\n%s\n" % (best_ref, best_seq))

ps.rm_files([args["fq_report"], args["log"]])

summary = {"sample": args["prefix"], "best_ref": best_ref, "best_species": best_species}
with open("%(prefix)s.centrifuge.json" % args, "w") as json_out:
    json.dump(summary, json_out)
def main(args):
    """Call ``add_meta_data`` on the FASTA file given on the command line.

    Args:
        args: Parsed command-line arguments. ``args.fasta`` is loaded with
            ``ps.fasta``; ``args.data_file``, ``args.outfile`` and
            ``args.delimiter`` are forwarded, in that order, to its
            ``add_meta_data`` method.
    """
    sequences = ps.fasta(args.fasta)
    sequences.add_meta_data(
        args.data_file,
        args.outfile,
        args.delimiter,
    )
#! /usr/bin/env python
import json
import sys

import pathogenseq as ps

# Print per-sample genotype counts from a BCF file and save them as JSON.
#
# Usage: <script> <bcf> <reference.fasta> <out.json>
#
# "missing" is the summed length of all reference sequences minus the
# hom-ref, hom-alt and het counts of the sample.

infile = sys.argv[1]
ref = sys.argv[2]
outfile = sys.argv[3]

bcf = ps.bcf(infile)
stats = bcf.load_stats(convert=True, ref=ref)
genome_len = sum(len(x) for x in ps.fasta(ref).fa_dict.values())

print("sample\tnRefHom\tnNonRefHom\tnHets\tnMissing")
for sample in stats["PSC"]:
    s = stats["PSC"][sample]
    s["id"] = sample
    tot_sum = s["nRefHom"] + s["nNonRefHom"] + s["nHets"]
    s["missing"] = genome_len - tot_sum
    print("%s\t%s\t%s\t%s\t%s" %
          (sample, s["nRefHom"], s["nNonRefHom"], s["nHets"], s["missing"]))
    # NOTE(review): the file is reopened in "w" mode for every sample, so with
    # several samples only the last one remains in ``outfile`` - confirm that
    # single-sample input is the intended use.
    with open(outfile, "w") as handle:
        json.dump(s, handle)
#! /usr/bin/env python
import json
import sys

import pathogenseq as ps

# Summarise the genotype counts of a BCF file: one tab-separated row per
# sample on stdout, plus a JSON dump of the sample's counts.
#
# Usage: <script> <bcf> <reference.fasta> <out.json>

infile = sys.argv[1]
ref = sys.argv[2]
outfile = sys.argv[3]

bcf = ps.bcf(infile)
stats = bcf.load_stats(convert=True, ref=ref)
# Total reference length, used to derive the number of uncalled positions.
genome_len = sum(len(x) for x in ps.fasta(ref).fa_dict.values())

print("sample\tnRefHom\tnNonRefHom\tnHets\tnMissing")
for sample in stats["PSC"]:
    s = stats["PSC"][sample]
    s["id"] = sample
    tot_sum = s["nRefHom"] + s["nNonRefHom"] + s["nHets"]
    s["missing"] = genome_len - tot_sum
    print("%s\t%s\t%s\t%s\t%s" % (sample, s["nRefHom"], s["nNonRefHom"], s["nHets"], s["missing"]))
    # NOTE(review): ``outfile`` is rewritten for each sample, so it ends up
    # holding only the last one - confirm the input is expected to contain a
    # single sample.
    with open(outfile, "w") as out:
        json.dump(s, out)