예제 #1
0
def main_lineage(args):
    """Derive the lineage for a BCF file and optionally write it to disk.

    The database configuration is read from
    ``<tbp._ROOT>/../<args.db>.config.json``. Genotypes at the barcode
    positions are extracted from ``args.bcf``, converted to a barcode with
    ``pp.barcode`` and handed to ``tbp.barcode2lineage``.

    Args:
        args: Parsed command-line namespace. The attributes ``db``, ``bcf``,
            ``prefix`` and ``outfmt`` are read.

    Returns:
        None. When ``args.prefix`` is truthy the lineage is written to
        ``<prefix>.lineage.<outfmt>``: as JSON for ``outfmt == "json"`` and
        as text for ``outfmt == "txt"``. Any other ``outfmt`` leaves the
        created file empty.
    """
    conf_file = pp.filecheck(tbp._ROOT + "/../" + args.db + ".config.json")
    # Close the config handle deterministically instead of leaving it to GC.
    with open(conf_file) as conf_handle:
        conf = json.load(conf_handle)

    pp.filecheck(args.bcf)
    bcf = pp.bcf(args.bcf)
    mutations = bcf.get_bed_gt(conf["barcode"], conf["ref"])

    results = {"barcode": pp.barcode(mutations, conf["barcode"])}
    # results["lineage"] is read below but never assigned in this function,
    # so barcode2lineage presumably adds it in place -- TODO confirm.
    tbp.barcode2lineage(results)

    if args.prefix:
        outfile = "%s.lineage.%s" % (args.prefix, args.outfmt)
        # The context manager closes the file even if serialisation raises.
        with open(outfile, "w") as out_handle:
            if args.outfmt == "json":
                json.dump(results["lineage"], out_handle)
            elif args.outfmt == "txt":
                out_handle.write(
                    tbp.text.lineagejson2text(results["lineage"]))
예제 #2
0
def main_profile(args):
    """Profile one sample from an assembly or from reads and write a report.

    Variants are obtained either from an assembly (``--fasta``) or by mapping
    reads (``--read1`` / ``--read2``) against the reference named in the
    database configuration and calling variants on the resulting BAM. The
    barcode annotations are joined into a clade string, which is printed to
    stdout together with the sample prefix. The clade and the variant data
    are written to ``<dir>/<prefix>.results.json``.

    Args:
        args: Parsed command-line namespace. The attributes ``dir``, ``db``,
            ``prefix``, ``fasta``, ``read1``, ``read2``, ``no_trim``,
            ``threads``, ``mapper``, ``platform`` and ``caller`` are read.

    Raises:
        SystemExit: With status 1 when both an assembly and reads are given,
            or when neither is given. The original used ``quit()``, which
            exits with status 0 and is only defined when the ``site`` module
            is loaded.
    """
    if pp.nofolder(args.dir):
        os.mkdir(args.dir)
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)

    ### Setup prefix for files ###
    files_prefix = args.dir + "/" + args.prefix

    if args.fasta:
        if args.read1 or args.read2:
            sys.stderr.write(
                "Please use --fasta or --read1/2 but not both... Exiting!\n")
            sys.exit(1)
        fasta_obj = pp.fasta(args.fasta)
        wg_vcf_obj = pp.vcf(
            fasta_obj.get_ref_variants(conf["ref"],
                                       prefix=args.prefix,
                                       file_prefix=files_prefix))
    else:
        if not args.read1:
            sys.stderr.write(
                "Please provide assembly using --fasta or at least one read file using --read1... Exiting!\n"
            )
            sys.exit(1)
        ### Create bam file if fastq has been supplied ###
        if args.read1 and args.read2 and args.no_trim:
            # Paired + no trimming
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read1 and args.read2 and not args.no_trim:
            # Paired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1, args.read2)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        elif args.read1 and not args.read2 and args.no_trim:
            # Unpaired + no trimming (read2 is falsy here and passed through
            # unchanged, exactly as before)
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read1 and not args.read2 and not args.no_trim:
            # Unpaired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        bam_obj = fastq_obj.map_to_ref(ref_file=conf["ref"],
                                       prefix=files_prefix,
                                       sample_name=args.prefix,
                                       aligner=args.mapper,
                                       platform=args.platform,
                                       threads=args.threads)
        wg_vcf_obj = bam_obj.call_variants(conf["ref"],
                                           args.caller,
                                           remove_missing=True)
        cp.vcf2consensus(bam_obj.bam_file, wg_vcf_obj.filename, conf["ref"],
                         wg_vcf_obj.samples[0],
                         wg_vcf_obj.prefix + ".consensus.fasta")
        if not args.no_trim:
            # fastq_obj is the trimmed copy here; its files are intermediates.
            pp.run_cmd("rm -f %s" % " ".join(fastq_obj.files))

    # NOTE(review): refseq / refseqname are not used anywhere below. They are
    # kept because pp.fasta is opaque from here -- confirm and remove.
    refseq = pp.fasta(conf["ref"]).fa_dict
    refseqname = list(refseq.keys())[0]

    results = {}
    barcode_mutations = wg_vcf_obj.get_bed_gt(conf["barcode"], conf["ref"])
    barcode = pp.barcode(barcode_mutations, conf["barcode"])
    clade = ";".join(sorted([d["annotation"] for d in barcode]))
    sys.stdout.write("%s\t%s\n" % (args.prefix, clade))
    results["clade"] = clade

    variant_data = cp.get_variant_data(wg_vcf_obj.filename, conf["ref"],
                                       conf["gff"], conf["proteins"])
    results["variants"] = variant_data

    # Close (and therefore flush) the report deterministically.
    with open("%s.results.json" % files_prefix, "w") as report_handle:
        json.dump(results, report_handle)
예제 #3
0
def profile_vcf(filename, conf):
    """Profile an existing VCF against the database described by ``conf``.

    Steps, in order:

    1. Check the VCF header for a ``##FORMAT=<ID=AD`` line. If it is missing,
       build a tabix-indexed annotation file carrying a placeholder ``0,100``
       AD value and add it with ``bcftools annotate``. Otherwise just copy
       the input through ``bcftools view -a``. Either way the result is the
       temporary ``tmpvcf`` file.
    2. Restrict ``tmpvcf`` to ``conf["bed"]``, run ``bcftools csq`` on it and
       load the consequences with ``conf["ann"]``.
    3. Extract the genotypes at the barcode positions, apply four hard-coded
       genotype overrides, dump them to ``dump.json`` in the current working
       directory and build the barcode.
    4. Pass the collected results to ``pp.db_compare`` and add a
       ``"missing_regions"`` entry mapping every gene in the BED file to
       ``"NA"``.

    Args:
        filename: Path of the input VCF, interpolated into shell commands.
        conf: Database configuration mapping. The keys ``bed``, ``ref``,
            ``gff``, ``ann``, ``barcode`` and ``json_db`` are read. The
            mapping itself is not modified; a shallow copy is used.

    Returns:
        The object returned by ``pp.db_compare`` with ``"missing_regions"``
        added.

    Raises:
        KeyError: If the genotype table has no ``"Chromosome"`` entry or
            lacks one of the hard-coded override positions.
    """
    params = conf.copy()
    params["tmpvcf"] = pp.get_random_file(extension=".vcf.gz")
    params["tmpcsq"] = pp.get_random_file(extension=".vcf.gz")
    params["filename"] = filename
    params["tmphdr"] = pp.get_random_file()
    params["tmptxt"] = pp.get_random_file()

    # Drain the grep output completely; only the last line (if any) matters.
    last_line = ""
    for last_line in pp.cmd_out(
            "bcftools view %(filename)s -h | grep \"^##FORMAT=<ID=AD\"" %
            params):
        pass
    AD_found = last_line != ""

    if not AD_found:
        # The header must be closed (flushed) before bcftools reads it, so do
        # not rely on garbage collection to close the handle.
        with open(params["tmphdr"], "w") as header_handle:
            header_handle.write(
                "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">\n"
            )
        pp.run_cmd(
            "bcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT\\t.[\\t0,100]\\n' %(filename)s > %(tmptxt)s"
            % params)
        pp.run_cmd("bgzip %(tmptxt)s" % params)
        pp.run_cmd("tabix -s 1 -b 2 -p vcf %(tmptxt)s.gz" % params)
        pp.run_cmd(
            "bcftools view -a %(filename)s | bcftools annotate -a %(tmptxt)s.gz -c CHROM,POS,REF,ALT,-,FMT/AD -h %(tmphdr)s -Oz -o %(tmpvcf)s"
            % params)
    else:
        pp.run_cmd("bcftools view -a %(filename)s -Oz -o %(tmpvcf)s" % params)

    pp.run_cmd(
        "bcftools view -T %(bed)s %(tmpvcf)s | bcftools csq -f %(ref)s -g %(gff)s  -Oz -o %(tmpcsq)s -p a"
        % params)
    csq_bcf_obj = pp.bcf(params["tmpcsq"])
    csq = csq_bcf_obj.load_csq(ann_file=conf["ann"])
    results = {
        "variants": [],
        "missing_pos": [],
        "qc": {
            "pct_reads_mapped": "NA",
            "num_reads_mapped": "NA"
        }
    }
    # Each sample overwrites the previous one, so the last sample wins.
    for sample in csq:
        results["variants"] = csq[sample]

    all_bcf_obj = pp.bcf(params["tmpvcf"])
    mutations = all_bcf_obj.get_bed_gt(conf["barcode"], conf["ref"])

    # (position, allele that must be present with value 50, replacement).
    # NOTE(review): the meaning of the 50 / 25 values is not visible here.
    genotype_overrides = (
        (325505, "C", {"T": 25}),
        (599868, "G", {"A": 25}),
        (931123, "C", {"T": 25}),
        (1759252, "T", {"G": 25}),
    )
    for position, allele, replacement in genotype_overrides:
        calls = mutations["Chromosome"][position]
        if allele in calls and calls[allele] == 50:
            # Fresh dict per assignment, as the original literals produced.
            mutations["Chromosome"][position] = dict(replacement)

    # NOTE(review): this looks like leftover debugging output written to the
    # current working directory; kept because removing it changes behaviour.
    with open("dump.json", "w") as dump_handle:
        json.dump(mutations, dump_handle)

    barcode_mutations = pp.barcode(mutations, conf["barcode"])
    results["barcode"] = barcode_mutations
    results = pp.db_compare(db_file=conf["json_db"], mutations=results)
    bed_regions = pp.load_bed(conf["bed"], [4], 4)
    missing_regions = {gene: "NA" for gene in bed_regions}
    results["missing_regions"] = missing_regions

    # The header and text files only exist when AD had to be synthesised.
    # NOTE(review): tmpvcf is never removed on either path -- confirm whether
    # it should be cleaned up here too.
    if AD_found:
        pp.run_cmd("rm %(tmpcsq)s" % params)
    else:
        pp.run_cmd("rm %(tmpcsq)s %(tmphdr)s %(tmptxt)s*" % params)
    return results