Пример #1
0
def main(args):
    """Merge several BAM files into one BAM carrying a single read group.

    The input BAMs come either from ``args.prefix`` (run names joined by
    ``_``, resolved as ``<dir>/<run><suffix>``) or from ``args.bams`` (a
    comma-separated list of paths). Exactly one of the two must be given.
    The merged file is written to ``<dir>/<new_id><suffix>`` and indexed.

    Args:
        args: namespace providing ``prefix``, ``bams``, ``new_id``, ``dir``,
            ``suffix`` and ``threads``.

    Raises:
        SystemExit: with status 1 when the input selection is invalid or
            fewer than two BAM files were given.
    """
    # Exactly one input mode is allowed; this also catches the "both given"
    # case, which previously fell through silently and used --prefix.
    if bool(args.prefix) == bool(args.bams):
        sys.stderr.write("Need either '--bams' or '--prefix'... Exiting!\n")
        sys.exit(1)

    if args.prefix:
        individual_bams = [
            "%s/%s%s" % (args.dir, run, args.suffix)
            for run in args.prefix.split("_")
        ]
        new_id = args.new_id if args.new_id else args.prefix
    else:
        individual_bams = args.bams.split(",")
        new_id = args.new_id if args.new_id else "_".join(
            bam.split("/")[-1].replace(args.suffix, "") for bam in individual_bams
        )

    if len(individual_bams) < 2:
        sys.stderr.write("Need more than one bam... Exiting!\n")
        sys.exit(1)

    for bam in individual_bams:
        fm.filecheck(bam)

    new_bamfile = "%s/%s%s" % (args.dir, new_id, args.suffix)
    tmp_file = fm.get_random_file()
    try:
        # Build the replacement header from the first BAM, dropping every
        # @RG line; the new read group is added by `addreplacerg` below.
        with open(tmp_file, "w") as O:
            for l in fm.cmd_out("samtools view -H %s" % individual_bams[0]):
                row = l.strip().split("\t")
                if row[0] == "@RG":
                    continue
                O.write("%s\n" % "\t".join(row))

        fm.run_cmd("samtools merge -@ %s - %s | samtools reheader -i %s - | samtools addreplacerg -@ %s - -r 'ID:%s\\tSM:%s\\tPL:Illumina' -o %s" % (
            args.threads, " ".join(individual_bams), tmp_file, args.threads, new_id, new_id, new_bamfile)
        )
        fm.run_cmd("samtools index %s" % new_bamfile)
    finally:
        # Remove the temporary header even when one of the steps aborts.
        fm.rm_files([tmp_file])
Пример #2
0
def main_import(args):
    """Import per-sample VCFs into per-window GenomicsDB workspaces.

    For every sample listed in ``args.sample_file`` the VCF
    ``<vcf_dir>/<sample><vcf_extension>`` is located, validated with
    ``gatk ValidateVariants`` (once; a ``.validated`` marker file is left
    behind), indexed, and written to the sample map ``<prefix>.map``.
    Samples that are skipped are recorded in ``<prefix>.failed_samples.log``.
    The genome is then split into ``num_genome_chunks`` windows and
    ``gatk GenomicsDBImport`` is run for each window through ``parallel``:
    a new workspace is created when ``<prefix>.dbconf.json`` does not exist,
    otherwise the existing workspaces are updated.

    Args:
        args: namespace providing ``prefix``, ``sample_file``, ``vcf_dir``,
            ``vcf_extension``, ``ref``, ``threads`` and ``num_genome_chunks``.
            ``map_file`` is added to it as a side effect.
    """
    params = vars(args)
    params["map_file"] = f"{args.prefix}.map"

    with open("%s.failed_samples.log" % args.prefix, "w") as FAILED_SAMPLES, \
            open(params["map_file"], "w") as O, \
            open(args.sample_file) as SAMPLES:
        # For each sample: locate the VCF, validate it once, add it to the
        # map file and make sure a tabix index exists.
        for line in SAMPLES:
            sample = line.rstrip()
            vcf_file = f"{args.vcf_dir}/{sample}{args.vcf_extension}"
            sys.stderr.write(f"Looking for {vcf_file}")
            if not os.path.isfile(vcf_file):
                sys.stderr.write("...Not found...skipping\n")
                # Missing files are always skipped; record them so the skip
                # is visible in the failed-samples log.
                FAILED_SAMPLES.write("%s\tno_file\n" % sample)
                continue
            sys.stderr.write("...OK\n")

            if nofile(f"{vcf_file}.validated"):
                if nofile(f"{vcf_file}.tbi"):
                    run_cmd(f"tabix {vcf_file}")
                run_cmd(f"gatk ValidateVariants -R {args.ref} -V {vcf_file} -gvcf && touch {vcf_file}.validated")
                # The marker is only touched when validation succeeded.
                if nofile(f"{vcf_file}.validated"):
                    FAILED_SAMPLES.write("%s\tno_validation\n" % sample)
                    continue

            O.write("%s\t%s\n" % (sample, vcf_file))
            if nofile(f"{vcf_file}.tbi"):
                run_cmd(f"bcftools index --tbi {vcf_file}")

    # Make sure the reference has a sequence dictionary (<ref sans ext>.dict).
    if nofile("%s.dict" % args.ref.replace(".fasta", "").replace(".fa", "")):
        run_cmd("gatk CreateSequenceDictionary -R %(ref)s" % params)
    # Make sure the reference has a faidx index. The window command below
    # reads "<ref>.fai", so that is the path that has to be checked.
    if nofile("%s.fai" % args.ref):
        run_cmd("samtools faidx %(ref)s" % params)

    # Emits one line per window: "<chrom>:<start>-<end> <chrom>_<start>_<end>".
    window_cmd = "bedtools makewindows -n %(num_genome_chunks)s -g %(ref)s.fai | awk '{print $1\":\"$2+1\"-\"$3\" \"$1\"_\"$2+1\"_\"$3}'" % params
    dbconf_file = "%(prefix)s.dbconf.json" % params

    if nofile(dbconf_file):
        # First import: create the workspaces and remember the chunk count.
        import_cmd = "gatk GenomicsDBImport --genomicsdb-workspace-path %(prefix)s_{2}_genomics_db -L {1} --sample-name-map %(map_file)s --reader-threads %(threads)s --batch-size 500" % params
        run_cmd(f"{window_cmd} | parallel --bar -j {args.threads} --col-sep \" \" {import_cmd}", verbose=2)
        with open(dbconf_file, "w") as CONF:
            json.dump({"num_genome_chunks": args.num_genome_chunks}, CONF)
    else:
        # Update: the stored config must be readable JSON.
        # NOTE(review): the stored num_genome_chunks is not applied here; the
        # windows are built from args.num_genome_chunks — confirm intended.
        with open(filecheck(f"{args.prefix}.dbconf.json")) as CONF:
            json.load(CONF)
        for l in cmd_out(window_cmd):
            row = l.strip().split()
            dirname = "%s_%s_genomics_db" % (args.prefix, row[1])
            sys.stderr.write("Looking for direcotry named %s..." % dirname)
            foldercheck(dirname)
            sys.stderr.write("OK\n")
        import_cmd = "gatk GenomicsDBImport --genomicsdb-update-workspace-path %(prefix)s_{2}_genomics_db -L {1} --sample-name-map %(map_file)s --reader-threads %(threads)s --batch-size 500" % params
        run_cmd(f"{window_cmd} | parallel --bar -j {args.threads} --col-sep \" \" {import_cmd}", verbose=2)
def main(args):
    """Filter FASTQ reads by the taxon kraken2 assigned to them.

    ``taxonkit list`` expands the requested taxon id; every first column
    value of its output is collected. Reads in ``args.kraken2_output`` whose
    taxid (third column) is in that set are kept (``--extract``) or dropped
    (``--exclude``). The kept read names (second column) are passed to
    ``seqtk subseq`` to write ``<R1/R2 basename>.kraken2_filt.fastq.gz``.

    Args:
        args: namespace providing ``extract``, ``exclude``,
            ``kraken2_output``, ``R1`` and optionally ``R2``. ``tmp_file``,
            ``R1_filt`` and ``R2_filt`` are set on it as a side effect.

    Raises:
        SystemExit: with status 1 when neither ``extract`` nor ``exclude``
            is given.
    """
    check_programs(["taxonkit", "seqtk"])
    taxonkit_dir = "%s/.taxonkit/" % os.path.expanduser("~")
    if not os.path.isdir(taxonkit_dir) or not os.path.isfile(taxonkit_dir + "nodes.dmp"):
        download_files()

    # Decide the mode once so the taxon list and the filter always agree;
    # --extract takes precedence when both options are supplied.
    extract_mode = bool(args.extract)
    taxid = args.extract if extract_mode else args.exclude
    if not taxid:
        sys.stderr.write("Need either '--extract' or '--exclude'... Exiting!\n")
        sys.exit(1)

    nodes = set()
    sys.stderr.write("Loading taxonomy\n")
    for l in fm.cmd_out("taxonkit list --ids %s" % taxid):
        row = l.split()
        if not row:
            continue
        nodes.add(row[0])

    sys.stderr.write("Extracting read names\n")
    args.tmp_file = str(uuid4())
    total_reads = 0
    kept_reads = 0

    try:
        with open(args.tmp_file, "w") as O, \
                open(fm.filecheck(args.kraken2_output)) as KRAKEN:
            for l in tqdm(KRAKEN):
                total_reads += 1
                row = l.strip().split()
                # Keep listed taxa when extracting, unlisted when excluding.
                if (row[2] in nodes) == extract_mode:
                    O.write("%s\n" % row[1])
                    kept_reads += 1

        sys.stderr.write("Writing filtered fastq files\n")
        fm.filecheck(args.R1)
        args.R1_filt = args.R1.replace(".fastq.gz", "").replace(
            ".fq.gz", "").replace(".fastq", "") + ".kraken2_filt.fastq.gz"
        fm.run_cmd("seqtk subseq %(R1)s %(tmp_file)s | gzip -c > %(R1_filt)s" %
                   vars(args))

        if args.R2:
            fm.filecheck(args.R2)
            args.R2_filt = args.R2.replace(".fastq.gz", "").replace(
                ".fq.gz", "").replace(".fastq", "") + ".kraken2_filt.fastq.gz"
            fm.run_cmd("seqtk subseq %(R2)s %(tmp_file)s | gzip -c > %(R2_filt)s" %
                       vars(args))
    finally:
        # Do not leave the read-name list behind when a step aborts.
        fm.rm_files([args.tmp_file])

    sys.stderr.write("\nKept %s/%s reads\n" % (kept_reads, total_reads))
Пример #4
0
def main_genotype(args):
    """Joint-genotype every GenomicsDB window and concatenate the results.

    The number of genome chunks is read from ``<prefix>.dbconf.json`` so the
    windows are built with the stored chunk count. Each window's workspace
    directory is checked, ``gatk GenotypeGVCFs`` is run per window through
    ``parallel``, the per-window VCFs are concatenated into
    ``<prefix>.<subfix_vcf>.genotyped.vcf.gz`` and then removed.

    Args:
        args: namespace providing ``prefix``, ``ref``, ``threads`` and
            ``subfix_vcf``. ``num_genome_chunks`` and ``window_cmd`` are set
            on it as a side effect.
    """
    with open(filecheck(f"{args.prefix}.dbconf.json")) as CONF:
        conf = json.load(CONF)
    params = vars(args)
    params["num_genome_chunks"] = conf["num_genome_chunks"]

    # Emits one line per window: "<chrom>:<start>-<end> <chrom>_<start>_<end>".
    window_cmd = "bedtools makewindows -n %(num_genome_chunks)s -g %(ref)s.fai | awk '{print $1\":\"$2+1\"-\"$3\" \"$1\"_\"$2+1\"_\"$3}'" % params
    params["window_cmd"] = window_cmd

    # Check that a workspace directory exists for every window.
    for l in cmd_out(window_cmd):
        row = l.strip().split()
        dirname = "%s_%s_genomics_db" % (args.prefix, row[1])
        sys.stderr.write("Looking for direcotry named %s..." % dirname)
        foldercheck(dirname)
        sys.stderr.write("OK\n")

    # {1}/{2} are filled in by `parallel` from the two window columns.
    genotype_cmd = "gatk --java-options \"-Xmx40g\" GenotypeGVCFs -R %(ref)s -V gendb://%(prefix)s_{2}_genomics_db -O %(prefix)s.{2}.genotyped.vcf.gz" % params
    run_cmd(f"{window_cmd} | parallel --bar -j {args.threads} --col-sep \" \" {genotype_cmd}", verbose=2)
    # Concatenate the per-window VCFs in window order, then delete them
    # (the trailing * also matches files sharing the per-window prefix).
    run_cmd("bcftools concat -Oz -o %(prefix)s.%(subfix_vcf)s.genotyped.vcf.gz `%(window_cmd)s | awk '{print \"%(prefix)s.\"$2\".genotyped.vcf.gz\"}'`" % params)
    run_cmd("rm `%(window_cmd)s | awk '{print \"%(prefix)s.\"$2\".genotyped.vcf.gz*\"}'`" % params)
Пример #5
0
	def get_genesum(self,outfile=None):
		"""Write a per-gene, per-sample count of non-synonymous consequences.

		Runs ``bcftools query`` on ``self.filename`` printing sample,
		genotype and translated consequence, and counts for each gene
		(second ``|`` field of the consequence) how often each sample is
		listed. Consequences starting with ``synonymous`` or ``@`` are
		ignored. One line per gene is written as
		``gene NA NA <count per sample in self.samples order>``.

		Args:
			outfile: output path; defaults to ``self.prefix + ".genesum"``.
				The chosen path is stored in ``self.outfile``.
		"""
		self.outfile = outfile
		if not self.is_annotated():
			quit(bcolors.FAIL + "\nError: VCF file has not been annotated with bcftools csq, please do this before running this script\n" + bcolors.ENDC)
		if self.outfile is None:
			self.outfile = self.prefix+".genesum"
		genesum = defaultdict(lambda:defaultdict(int))
		cmd = "bcftools query -f '[%%SAMPLE\\t%%GT\\t%%TBCSQ\\n]' %(filename)s" % vars(self)
		for l in tqdm(fm.cmd_out(cmd)):
			row = l.split()
			# Example line:
			# por4A	1/1	synonymous|Rv0002|gene1|protein_coding|+|109L|2378G>A	synonymous|Rv0002|gene1|protein_coding|+|109L|2378G>A
			if len(row)<3: continue
			info = row[2].split("|")
			# Skip records without a parsable consequence (fewer than two
			# "|" fields) instead of failing with an IndexError.
			if len(info)<2: continue
			if info[0]=="synonymous": continue
			if info[0].startswith("@"): continue
			genesum[info[1]][row[0]]+=1
		# The file is only opened once the counts are complete and is closed
		# even if writing fails.
		with open(self.outfile,"w") as O:
			for gene in genesum:
				O.write("%s\tNA\tNA\t%s\n" % (gene,"\t".join(str(genesum[gene][s]) for s in self.samples)))
Пример #6
0
	def get_mean_genotype(self,outfile=None):
		"""Write one genotype line per alternate allele of every variant.

		Runs ``bcftools query`` on ``self.filename`` printing chromosome,
		position, REF, ALT and the translated genotype of each sample. For
		every ALT allele a line ``<chrom>_<pos>_<alt>, <ref>, <alt>, <codes>``
		is written where each sample is coded ``0`` for ``REF/REF``, ``1``
		for ``ALT/ALT`` and ``NA`` for anything else.

		Args:
			outfile: output path; defaults to ``self.prefix + ".geno"``.
				The chosen path is stored in ``self.outfile``.
		"""
		self.outfile = outfile
		if self.outfile is None:
			self.outfile = self.prefix+".geno"
		cmd = "bcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%TGT]\\n' %(filename)s" % vars(self)
		# The context manager closes the output even if the query loop fails.
		with open(self.outfile,"w") as O:
			for l in tqdm(fm.cmd_out(cmd)):
				row = l.rstrip().split()
				# Ignore blank or truncated lines rather than crashing on them.
				if len(row)<4: continue
				hom_ref = "%s/%s" % (row[2],row[2])
				for alt in row[3].split(","):
					hom_alt = "%s/%s" % (alt,alt)
					genos = []
					for call in row[4:]:
						if call==hom_ref:
							genos.append("0")
						elif call==hom_alt:
							genos.append("1")
						else:
							genos.append("NA")
					O.write("%s, %s, %s, %s\n" % (row[0]+"_"+row[1]+"_"+alt,row[2],alt,", ".join(genos)))
Пример #7
0
def main(args):
    """Call variants from a BAM and build a depth-masked consensus FASTA.

    Steps:
      1. Run the selected variant caller (``gatk``, ``bcftools`` or
         ``freebayes``), restricted to ``args.bed`` when given, writing
         ``<out>.vcf.gz``, then index it with ``tabix``.
      2. Write ``<out>.depth_mask.bed`` with the positions whose depth is
         below ``args.depth_cutoff``.
      3. Run ``bcftools consensus`` (masking those positions with ``N`` when
         there are any) and write ``<out>.consensus.fa``. Sequence headers
         become ``><out> <name>`` where ``<name>`` is the fourth BED column
         when available, otherwise the original header.

    Args:
        args: namespace providing ``variant_caller``, ``ref``, ``bam``,
            ``out``, ``bed`` and ``depth_cutoff``. ``region_arg`` and
            ``mask_arg`` are set on it as a side effect.
    """
    # caller name -> (option restricting calling to a BED file, command)
    callers = {
        "gatk": (
            "-L",
            "gatk HaplotypeCaller -R %(ref)s %(region_arg)s -I %(bam)s -O %(out)s.vcf.gz",
        ),
        "bcftools": (
            "-R",
            "bcftools mpileup -f %(ref)s %(region_arg)s %(bam)s | bcftools call -mv -Oz -o %(out)s.vcf.gz",
        ),
        "freebayes": (
            "-t",
            "freebayes -f %(ref)s %(region_arg)s %(bam)s | bgzip -c > %(out)s.vcf.gz",
        ),
    }

    args.region_arg = ""
    if args.variant_caller not in callers:
        quit("Unknown variant caller! Exiting!")

    region_flag, call_cmd = callers[args.variant_caller]
    if args.bed:
        args.region_arg = "%s %s" % (region_flag, args.bed)
    fm.run_cmd(call_cmd % vars(args))

    fm.run_cmd("tabix -f %(out)s.vcf.gz" % vars(args))

    # Collect every position with depth below the cutoff as BED intervals.
    if args.bed:
        fm.run_cmd(
            "bedtools coverage -a %(bed)s -b %(bam)s -d | awk '$NF<%(depth_cutoff)s {print $1\"\\t\"$2+$(NF-1)-2\"\\t\"$2+$(NF-1)-1}' > %(out)s.depth_mask.bed"
            % vars(args))
    else:
        fm.run_cmd(
            "bedtools genomecov -ibam %(bam)s  -d | awk '$NF<%(depth_cutoff)s {print $1\"\\t\"$2-1\"\\t\"$2}' > %(out)s.depth_mask.bed"
            % vars(args))

    # Only pass a mask to bcftools when the mask file has at least one line.
    with open("%(out)s.depth_mask.bed" % vars(args)) as MASK:
        has_low_depth = any(True for _ in MASK)
    args.mask_arg = "-m %(out)s.depth_mask.bed -M N" % vars(
        args) if has_low_depth else ""

    region_names = {}
    if args.bed:
        # Turn the BED file into a samtools region list, remembering the
        # optional name in column 4 for the FASTA headers.
        regions_file = args.out + ".regions.txt"
        with open(regions_file, "w") as O, open(args.bed) as BED:
            for l in BED:
                row = l.strip().split()
                r = "%s:%s-%s" % (row[0], row[1], row[2])
                O.write(r + "\n")
                if len(row) > 3:
                    region_names[r] = row[3]

        args.region_arg = "-r %s" % regions_file
        consensus_cmd = "samtools faidx %(ref)s %(region_arg)s | bcftools consensus %(out)s.vcf.gz %(mask_arg)s" % vars(
            args)
    else:
        consensus_cmd = "bcftools consensus -f %(ref)s %(out)s.vcf.gz %(mask_arg)s" % vars(
            args)

    with open(args.out + ".consensus.fa", "w") as O:
        for l in fm.cmd_out(consensus_cmd):
            if not l:
                # An empty line would otherwise raise IndexError on l[0].
                continue
            if l.startswith(">"):
                r = l.strip()[1:]
                O.write(">%s %s\n" % (args.out, region_names.get(r, r)))
            else:
                O.write(l + "\n")
Пример #8
0
	def is_annotated(self):
		"""Return True when the VCF header of ``self.filename`` declares the BCSQ INFO tag."""
		header_cmd = f"bcftools view -h {self.filename}"
		# Built as a list (not a generator) so every header line is read.
		matches = ["##INFO=<ID=BCSQ" in line for line in fm.cmd_out(header_cmd)]
		return any(matches)