示例#1
0
def main_import(args):
    """Import per-sample VCFs into per-window GenomicsDB workspaces.

    Steps, in order:

    1. Read sample names from ``args.sample_file`` (one per line) and, for
       each sample whose ``<vcf_dir>/<sample><vcf_extension>`` exists,
       validate it with ``gatk ValidateVariants``, make sure it has a tabix
       index and add it to the sample-name map ``<prefix>.map``.
       Samples that fail validation are recorded in
       ``<prefix>.failed_samples.log``.
    2. Make sure the reference has a ``.dict`` and a ``.fai`` index.
    3. Split the genome into ``args.num_genome_chunks`` windows and run
       ``gatk GenomicsDBImport`` on every window through GNU ``parallel``.
       If ``<prefix>.dbconf.json`` already exists the existing workspaces
       are updated instead of created.

    Args:
        args: argparse-style namespace. Attributes read here: ``prefix``,
            ``sample_file``, ``vcf_dir``, ``vcf_extension``,
            ``ignore_missing``, ``ref``, ``num_genome_chunks``, ``threads``.
            ``map_file`` is added to the namespace as a side effect because
            ``vars(args)`` is the namespace's own dictionary.
    """
    params = vars(args)
    params["map_file"] = f"{args.prefix}.map"

    # Read the sample names up front so the sample file handle is closed
    # before any external command is launched.
    with open(args.sample_file) as sample_fh:
        sample_names = [line.rstrip() for line in sample_fh]

    with open("%s.failed_samples.log" % args.prefix, "w") as FAILED_SAMPLES, \
            open(params["map_file"], "w") as O:
        for sample in sample_names:
            vcf_file = f"{args.vcf_dir}/{sample}{args.vcf_extension}"
            sys.stderr.write(f"Looking for {vcf_file}")
            if os.path.isfile(vcf_file):
                sys.stderr.write("...OK\n")
            else:
                sys.stderr.write("...Not found...skipping\n")
                continue
            # NOTE(review): presumably unreachable, because a missing VCF has
            # already been skipped above; kept so behaviour is unchanged.
            if args.ignore_missing and nofile(vcf_file):
                FAILED_SAMPLES.write("%s\tno_file\n" % sample)
                continue
            # A "<vcf>.validated" marker file means validation already passed
            # on an earlier run, so it is not repeated.
            if nofile(f"{vcf_file}.validated"):
                if nofile(f"{vcf_file}.tbi"):
                    run_cmd(f"tabix {vcf_file}")
                run_cmd(f"gatk ValidateVariants -R {args.ref} -V {vcf_file} -gvcf && touch {vcf_file}.validated")
                # The marker is only touched when ValidateVariants succeeds.
                if nofile(f"{vcf_file}.validated"):
                    FAILED_SAMPLES.write("%s\tno_validation\n" % sample)
                    continue
            O.write("%s\t%s\n" % (sample, vcf_file))
            if nofile(f"{vcf_file}.tbi"):
                run_cmd(f"bcftools index --tbi {vcf_file}")

    # Create the .dict file (GATK sequence dictionary) for the reference if
    # it does not exist yet; it is looked up next to the reference with the
    # fasta extension removed.
    if nofile("%s.dict" % args.ref.replace(".fasta", "").replace(".fa", "")):
        run_cmd("gatk CreateSequenceDictionary -R %(ref)s" % params)
    # Create the .fai file (SAMtools fasta index) if it does not exist yet.
    # samtools writes it as "<ref>.fai" (extension kept), which is also the
    # path window_cmd reads below, so that is the path that must be checked.
    if nofile(f"{args.ref}.fai"):
        run_cmd("samtools faidx %(ref)s" % params)

    # Emits one line per window: "<chrom>:<start>-<end> <chrom>_<start>_<end>".
    # parallel receives these as columns {1} (interval) and {2} (name).
    window_cmd = "bedtools makewindows -n %(num_genome_chunks)s -g %(ref)s.fai | awk '{print $1\":\"$2+1\"-\"$3\" \"$1\"_\"$2+1\"_\"$3}'" % params
    if nofile("%(prefix)s.dbconf.json" % params):
        # First import: create one workspace per window.
        import_cmd = "gatk GenomicsDBImport --genomicsdb-workspace-path %(prefix)s_{2}_genomics_db -L {1} --sample-name-map %(map_file)s --reader-threads %(threads)s --batch-size 500" % params
        run_cmd(f"{window_cmd} | parallel --bar -j {args.threads} --col-sep \" \" {import_cmd}", verbose=2)
        with open("%(prefix)s.dbconf.json" % params, "w") as conf_fh:
            json.dump({"num_genome_chunks": args.num_genome_chunks}, conf_fh)
    else:
        # The stored configuration is parsed but its content is not used;
        # loading it still fails early on an unreadable or corrupt file.
        with open(filecheck(f"{args.prefix}.dbconf.json")) as conf_fh:
            json.load(conf_fh)
        # Every per-window workspace must already exist before updating.
        for l in cmd_out(window_cmd):
            row = l.strip().split()
            dirname = "%s_%s_genomics_db" % (args.prefix, row[1])
            sys.stderr.write("Looking for direcotry named %s..." % dirname)
            foldercheck(dirname)
            sys.stderr.write("OK\n")
        import_cmd = "gatk GenomicsDBImport --genomicsdb-update-workspace-path %(prefix)s_{2}_genomics_db -L {1} --sample-name-map %(map_file)s --reader-threads %(threads)s --batch-size 500" % params
        run_cmd(f"{window_cmd} | parallel --bar -j {args.threads} --col-sep \" \" {import_cmd}", verbose=2)
示例#2
0
 def __init__(self, filename, threads=4):
     """Record the VCF path and load its sample names.

     Indexes the file with ``bcftools index`` when ``<filename>.csi`` is
     missing, then fills ``self.samples`` from ``bcftools query -l``.

     Args:
         filename: path to the VCF file.
         threads: stored on the instance as ``self.threads``; not used in
             this method.
     """
     self.samples = []
     self.filename = filename
     self.threads = threads
     self.prefix = get_vcf_prefix(filename)
     if nofile(filename + ".csi"):
         run_cmd("bcftools index  %(filename)s" % vars(self))
     # bcftools writes one sample name per line into a temporary file.
     self.temp_file = get_random_file()
     run_cmd("bcftools query -l %(filename)s > %(temp_file)s" % vars(self))
     # Close the handle before deleting the file.
     with open(self.temp_file) as sample_fh:
         for l in sample_fh:
             self.samples.append(l.rstrip())
     os.remove(self.temp_file)
def main(args):
    """Report sites whose state changed on more than one branch of the tree.

    Steps:

    1. If ``args.fasta`` is not given, build a SNP alignment from
       ``args.vcf`` with ``vcf2fasta.py`` (this requires ``args.ref``).
    2. Run ``iqtree`` ancestral state reconstruction on ``args.tree`` unless
       ``<fasta>.asr.state`` already exists.
    3. Load the reconstructed states of the internal nodes and the observed
       states of the sequences in the alignment.
    4. For every variable site, walk the tree and collect the nodes whose
       state differs from their parent's (both being A/C/G/T). A site with
       more than one such node is written to ``args.out`` as
       ``<position>\\t<number of origins>``.

    Args:
        args: argparse-style namespace. Attributes read here: ``vcf``,
            ``fasta``, ``ref``, ``model``, ``tree``, ``out``. ``args.fasta``
            is set when the alignment is generated by this function.

    Raises:
        SystemExit: with status 1 when neither ``--fasta`` nor ``--ref`` is
            supplied.
    """
    vcf_class = fm.vcf(args.vcf)
    vcf_positions = vcf_class.get_positions()

    if not args.fasta:
        if not args.ref:
            sys.stderr.write(
                "\nERROR: Please supply a reference with --ref\n\n")
            # Exit with a non-zero status so callers can detect the failure;
            # quit() exits with 0 and depends on the site module.
            sys.exit(1)
        fm.run_cmd(
            "vcf2fasta.py --vcf %(vcf)s --snps --ref %(ref)s --snps-no-filt" %
            vars(args))
        args.fasta = "%s.snps.fa" % vcf_class.prefix
    if fm.nofile("%s.asr.state" % args.fasta):
        fm.run_cmd(
            "iqtree -m %(model)s -te %(tree)s -s %(fasta)s -nt AUTO -asr -pre %(fasta)s.asr"
            % vars(args))

    tree = ete3.Tree("%s.asr.treefile" % args.fasta, format=1)
    # Node names are cut at the first "/" when collecting the name set.
    # NOTE(review): the traversal below looks states up by the full n.name,
    # so a name that really contains "/" would not be found; confirm.
    node_names = set([tree.name] +
                     [n.name.split("/")[0] for n in tree.get_descendants()])
    leaf_names = set(tree.get_leaf_names())
    internal_node_names = node_names - leaf_names

    # states[site][node_or_sample_name] -> state at that site.
    states_file = "%s.asr.state" % args.fasta
    states = defaultdict(dict)
    sys.stderr.write("Loading states\n")
    with open(states_file) as states_fh:
        for l in tqdm(states_fh):
            if l[0] == "#": continue  # comment line
            row = l.strip().split()
            if row[0] == "Node": continue  # column header
            # Columns used: 0 = node name, 1 = 1-based site, 2 = state.
            site = int(row[1])
            if row[0] not in internal_node_names: continue
            states[site][row[0]] = row[2]

    # Add the observed state of every sequence in the alignment.
    seqs = fm.fasta(args.fasta).fa_dict
    for site in tqdm(list(states)):
        for sample in seqs:
            states[site][sample] = seqs[sample][site - 1]

    acgt = set(["A", "C", "G", "T", "a", "c", "g", "t"])
    convergent_sites = []
    for site in tqdm(list(states)):
        nucleotides = set([states[site][n] for n in node_names])
        # A site with a single state across the tree has no changes.
        if len(nucleotides) == 1: continue

        # Names of the nodes at which the state changed.
        origins = []

        tree.add_feature("state", states[site][tree.name])
        for n in tree.traverse():
            if n == tree: continue
            node_state = states[site][n.name]
            # n.up is the direct parent, i.e. the first ancestor. Its
            # "state" is read here, so it must have been set already.
            parent_state = n.up.state
            if (node_state != parent_state and node_state in acgt
                    and parent_state in acgt):
                origins.append(n.name)
            n.add_feature("state", node_state)
        if len(origins) > 1:
            convergent_sites.append((site, vcf_positions[site - 1], origins))

    with open(args.out, "w") as O:
        for site in convergent_sites:
            # site = (alignment index, VCF position entry, origins); element 1
            # of the position entry is written (presumably the coordinate,
            # confirm against get_positions()).
            O.write("%s\t%s\n" % (site[1][1], len(site[2])))
示例#4
0
def main(args):
    """Collect per-sample QC metrics and variant annotations into CSV files.

    Steps:

    1. Read sample names from ``args.samples`` and check that each sample's
       alignment file exists.
    2. Write a command file computing the median depth and a kraken2 report
       for every sample that lacks them, and run it through GNU ``parallel``.
    3. For every sample, gather mapping statistics, the top kraken hits and
       the tbprofiler results, and write them to
       ``<args.out>.sample_info.csv``.
    4. Produce ``<vcf>.stats.txt`` and ``<vcf>.csq_info.txt`` if missing and
       write the variant table to ``<args.out>.variant_info.csv``.

    Args:
        args: argparse-style namespace. Attributes read here: ``samples``,
            ``alignment_extension``, ``dir``, ``io_heavy_threads``, ``out``,
            ``vcf``, ``ref``, ``gff``. ``args.cmd_file`` and ``args.sample``
            are set as scratch values for string formatting.
    """
    # Read the sample list once, closing the file afterwards.
    with open(args.samples) as sample_fh:
        samples = [x.rstrip() for x in sample_fh]

    # NOTE(review): this path has no args.dir prefix, unlike every other
    # per_sample path below; confirm the script is run from args.dir.
    for s in samples:
        fm.filecheck("per_sample/%s%s" % (s, args.alignment_extension))

    # Create the kraken output folder; without "mkdir" the directory path
    # itself would be executed as a command.
    if fm.nofolder("%(dir)s/kraken" % vars(args)):
        fm.run_cmd("mkdir %(dir)s/kraken" % vars(args))

    # One shell command per line; only missing outputs are scheduled.
    args.cmd_file = fm.get_random_file()
    with open(args.cmd_file, "w") as O:
        for s in samples:
            args.sample = s

            if fm.nofile("%(dir)s/per_sample/%(sample)s.median_dp.txt" %
                         vars(args)):
                O.write(
                    "printf %s\"\\t\"$(bedtools genomecov -d -ibam %s/per_sample/%s%s | datamash median 3)\"\\n\" > %s/per_sample/%s.median_dp.txt\n"
                    % (s, args.dir, s, args.alignment_extension, args.dir, s))

            # The ".done" marker is touched only when kraken2 succeeds.
            if fm.nofile("%(dir)s/kraken/%(sample)s.done" % vars(args)):
                O.write(
                    "kraken2 --db /run/user/506/standard --gzip-compressed --paired %(dir)s/fastq/%(sample)s_1.fastq.gz %(dir)s/fastq/%(sample)s_2.fastq.gz --report %(dir)s/kraken/%(sample)s.report.txt --out %(dir)s/kraken/%(sample)s.out.txt --threads 10 --memory-mapping && touch %(dir)s/kraken/%(sample)s.done\n"
                    % vars(args))

    fm.run_cmd("cat %(cmd_file)s | parallel -j %(io_heavy_threads)s" %
               vars(args),
               verbose=2)
    fm.rm_files([args.cmd_file])

    sample_metrics = []
    for s in samples:
        res = {"sample": s}
        args.sample = s
        # Only lines 2-4 (0-based) of the bamstats file are used: the first
        # field is a count and the fourth field is used as the column name.
        with open("%(dir)s/per_sample/%(sample)s.bqsr.bamstats" %
                  vars(args)) as bamstats_fh:
            for i, l in enumerate(bamstats_fh):
                row = l.rstrip().split()
                if i in (2, 3, 4):
                    res[row[3]] = int(row[0])
                if i == 4:
                    # Line 4 also carries the mapped percentage in field 5,
                    # written like "(99.5%".
                    res["mapped_percent"] = float(row[4].replace("(", "").replace(
                        "%", ""))

        # For every rank code (4th column) keep the entry with the highest
        # value in the first column; the first one seen wins ties.
        kraken_results = {}
        with open("%(dir)s/kraken/%(sample)s.report.txt" %
                  vars(args)) as report_fh:
            for l in report_fh:
                row = l.strip().split()
                rank = row[3]
                percent = float(row[0])
                if rank not in kraken_results or percent > kraken_results[rank][0]:
                    kraken_results[rank] = (percent, " ".join(row[5:]))
        res["kraken_genus"] = "%s (%.2f)" % (kraken_results["G"][1],
                                             kraken_results["G"][0])
        res["kraken_genus1"] = "%s (%.2f)" % (kraken_results["G1"][1],
                                              kraken_results["G1"][0])
        res["kraken_species"] = "%s (%.2f)" % (kraken_results["S"][1],
                                               kraken_results["S"][0])

        with open("%(dir)s/tbprofiler/results/%(sample)s.results.json" %
                  vars(args)) as tbprofiler_fh:
            tbprofiler_result = json.load(tbprofiler_fh)
        res["lineage"] = tbprofiler_result["main_lin"]
        res["sub-lineage"] = tbprofiler_result["sublin"]
        res["drtype"] = tbprofiler_result["drtype"]
        # drug name -> list of "gene_change (freq)" strings.
        tmp_drugs = defaultdict(list)
        for var in tbprofiler_result["dr_variants"]:
            for d in var["drugs"]:
                tmp_drugs[d["drug"]].append(
                    "%s_%s (%.2f)" % (var["gene"], var["change"], var["freq"]))
        # `drugs` is not defined in this function; presumably a module-level
        # list of drug names that fixes the column order.
        for d in drugs:
            res[d] = ", ".join(tmp_drugs[d])

        sample_metrics.append(res)

    # The columns are taken from the first sample's keys.
    with open(args.out + ".sample_info.csv", "w") as O:
        writer = csv.DictWriter(O, fieldnames=list(sample_metrics[0]))
        writer.writeheader()
        writer.writerows(sample_metrics)

    vcf = fm.vcf_class(args.vcf)
    if fm.nofile(args.vcf + ".stats.txt"):
        fm.run_cmd(
            "bcftools norm -m - -f %(ref)s %(vcf)s | bcftools stats -v -s - > %(vcf)s.stats.txt"
            % (vars(args)))

    vcf_stats = vcf.load_stats()

    # NOTE(review): `results` is not used afterwards; it is kept because
    # building it fails loudly when an expected stats key is missing.
    results = {
        "number of samples": vcf_stats["number of samples"],
        "number of records": vcf_stats["number of records"],
        "number of SNPs": vcf_stats["number of SNPs"],
        "number of indels": vcf_stats["number of indels"],
    }

    # SNPs (-V indels) are written first, indels (-v indels) are appended.
    if fm.nofile(args.vcf + ".csq_info.txt"):
        fm.run_cmd(
            "bcftools view -V indels %(vcf)s | bcftools norm -m - -f %(ref)s | bcftools csq -f %(ref)s -g %(gff)s | correct_tb_csq.py | bcftools +fill-tags | bcftools query -f '%%POS\\t%%REF\\t%%ALT\\t%%AF\\t%%AC\\t%%BCSQ\\n' > %(vcf)s.csq_info.txt"
            % vars(args))
        fm.run_cmd(
            "bcftools view -v indels %(vcf)s | bcftools norm -m - -f %(ref)s | bcftools csq -f %(ref)s -g %(gff)s | correct_tb_csq.py | bcftools +fill-tags | bcftools query -f '%%POS\\t%%REF\\t%%ALT\\t%%AF\\t%%AC\\t%%BCSQ\\n' >> %(vcf)s.csq_info.txt"
            % vars(args))

    variant_info = vcf.get_variant_data(args.ref, args.gff)
    with open(args.out + ".variant_info.csv", "w") as O:
        writer = csv.DictWriter(O, fieldnames=list(variant_info[0]))
        writer.writeheader()
        writer.writerows(variant_info)
示例#5
0
def main(args):
    """Convert the VCF at ``args.vcf`` to a matrix.

    Args:
        args: namespace providing ``vcf`` (path to the VCF file) and
            ``no_iupacgt`` (passed straight to ``vcf_class.vcf_to_matrix``).

    Raises:
        SystemExit: carrying an error message when the VCF file is missing.
    """
    import sys

    if nofile(args.vcf):
        # sys.exit(message) prints the message to stderr and exits with
        # status 1, exactly like quit(message), but does not depend on the
        # site module having injected `quit`.
        sys.exit("Can't find %s... Exiting!" % args.vcf)
    vcf = vcf_class(args.vcf)
    vcf.vcf_to_matrix(args.no_iupacgt)
def main(args):
    """Filter the VCF at ``args.vcf`` by allele frequency.

    Args:
        args: namespace providing ``vcf`` (path to the VCF file), ``maf``
            and ``pop_file`` (both passed straight to
            ``vcf_class.filter_by_af``).

    Raises:
        SystemExit: carrying an error message when the VCF file is missing.
    """
    import sys

    if nofile(args.vcf):
        # sys.exit(message) prints the message to stderr and exits with
        # status 1, exactly like quit(message), but does not depend on the
        # site module having injected `quit`.
        sys.exit("Can't find %s... Exiting!" % args.vcf)
    vcf = vcf_class(args.vcf)
    vcf.filter_by_af(args.maf, args.pop_file)