Example #1
def main_load_library(args):
    lib_prefix = args.prefix.split("/")[-1]
    files = {
        "gff": ".gff",
        "ref": ".fasta",
        "barcode": ".barcode.bed",
        "version": ".version.json",
        "proteins": ".proteins.csv",
        "non_coding_bed": ".non_coding.bed"
    }
    if pp.nofolder(sys.base_prefix + "/share/covidprofiler"):
        pp.run_cmd("mkdir %s " % (sys.base_prefix + "/share/covidprofiler/"))
    for key in files:
        new_file_location = sys.base_prefix + "/share/covidprofiler/" + lib_prefix + files[
            key]
        pp.run_cmd("cp %s %s" % (args.prefix + files[key], new_file_location))
    pp.run_cmd("samtools faidx %s" % sys.base_prefix +
               "/share/covidprofiler/" + lib_prefix + ".fasta")
    pp.run_cmd("bwa index %s" % sys.base_prefix + "/share/covidprofiler/" +
               lib_prefix + ".fasta")
    if os.path.isfile("%s" % sys.base_prefix + "/share/covidprofiler/" +
                      lib_prefix + ".dict"):
        pp.run_cmd("rm %s" % sys.base_prefix + "/share/covidprofiler/" +
                   lib_prefix + ".dict")
    pp.log("Sucessfully imported library")
Example #2
def run_fuzznuc(seqs, pattern, pmismatch=0):
    tmpfile = pp.get_random_file()
    pp.run_cmd(
        "fuzznuc -sequence %s -pattern %s -outfile %s -complement -pmismatch %s"
        % (seqs, pattern, tmpfile, pmismatch))
    result = parse_fuzznuc_output(tmpfile)
    pp.rm_files([tmpfile])
    return result
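# run_fuzznuc wraps a CLI tool in a write-to-tempfile / parse / clean-up
# pattern. The same pattern with only the standard library; the command string
# in the usage line is the fuzznuc call from above, everything else is a sketch:
import os
import subprocess
import tempfile

def run_to_tmpfile(cmd_template, *args):
    fd, tmpfile = tempfile.mkstemp()
    os.close(fd)
    try:
        subprocess.run(cmd_template % (args + (tmpfile,)),
                       shell=True, check=True)
        with open(tmpfile) as f:
            return f.read()
    finally:
        os.remove(tmpfile)

# usage: run_to_tmpfile("fuzznuc -sequence %s -pattern %s -outfile %s -complement -pmismatch 0", seqs, pattern)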
Example #3
def get_variant_data(vcf_file,ref_file,gff_file,protein_file):
    nsp_data = {}
    gene_info = {}
    for row in csv.DictReader(open(protein_file)):
        row["Start"] = int(row["Start"])
        row["End"] = int(row["End"])
        gene_info[row["Gene"]] = {"function":row["Putative function"],"DOI":row["DOI"]}
        if row["Region"]!="nsp": continue
        for i in range(row["Start"],row["End"]+1):
            nsp_data[i] = row

    pp.run_cmd("samtools faidx %s" % ref_file)
    results = defaultdict(list)
    for l in pp.cmd_out("bcftools view %s | bcftools csq -f %s -g %s -p a  | correct_covid_csq.py | bcftools +fill-tags | bcftools query -f '%%POS\\t%%REF\\t%%ALT\\t%%AF\\t%%BCSQ\\n'" % (vcf_file,ref_file,gff_file)):
        # Replace " " with "N" because if the alt allele contains N then
        # translated consequence will have spaces
        row = l.strip().replace(" ","N").split()
        pos,ref,alts_str,af_str,csq_str = row
        alt_af = sum([float(x) for x in af_str.split(",")])
        csqs = csq_str.split(",")
        types = []
        changes = []
        genes = []
        pos = int(pos)
        for i in range(len(csqs)):
            if csqs[i][0]=="@":
                # results[pos].append(results[int(csqs[i][1:])][0])
                pass
            elif csqs[i]==".":
                results[pos].append({"pos":pos, "alts":alts_str, "alt_af":alt_af, "types":"intergenic","changes":"NA","gene":"NA","gene_function":"NA","gene_reference":"NA"})

            else:
                csq = csqs[i].split("|")
                types.append(csq[0].replace("*",""))
                if csq[1]=="orf1ab":
                    codon_pos = get_codon_num(csq[5])
                    if codon_pos in nsp_data:
                        genes.append(nsp_data[codon_pos]["Gene"])
                        codon_pos = codon_pos-nsp_data[codon_pos]["Start"]+1
                        changes.append(change_codon_number(csq[5],codon_pos))
                    else:
                        genes.append("orf1ab")
                        changes.append(csq[5])
                else:
                    changes.append(csq[5] if len(csq)>5 else "")
                    genes.append(csq[1])
                if len(set(types))==1:
                    types = list(set(types))
                results[pos].append({"pos":pos, "alts":alts_str, "alt_af":alt_af, "types":",".join(types), "changes":",".join(changes),"gene":genes[0], "gene_function":gene_info[genes[0]]["function"], "gene_reference":gene_info[genes[0]]["DOI"]})
    final_results = []
    for res in list(results.values()):
        for r in res:
            final_results.append(r)
        # if len(res)==1:
        #     final_results.append(res[0])
        # else:
        #     quit("ERROR! more than one variant for a position")
    return final_results
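# get_codon_num and change_codon_number are called above but defined elsewhere.
# A plausible reconstruction, assuming the bcftools csq amino-acid change
# string leads with the codon number (e.g. "4715P>4715L"); these are
# illustrative sketches, not the original helpers:
import re

def get_codon_num(change):
    # "4715P>4715L" -> 4715
    return int(re.match(r"(\d+)", change).group(1))

def change_codon_number(change, new_num):
    # renumber to nsp-relative coordinates: "4715P>4715L" -> "924P>924L"
    return re.sub(r"\d+", str(new_num), change)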
Example #4
def index_bcf(bcffile, threads=1, overwrite=False):
    """
    Index a bcf file
    """
    cmd = "bcftools index --threads %s -f %s" % (threads, bcffile)
    if filecheck(bcffile):
        if nofile(bcffile + ".csi"):
            pp.run_cmd(cmd)
        elif os.path.getmtime(bcffile +
                              ".csi") < os.path.getmtime(bcffile) or overwrite:
            pp.run_cmd(cmd)
Example #5
def fasta2vcf(fasta_file, outfile):
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)
    refseq = pp.fasta(conf["ref"]).fa_dict
    seqs = pp.fasta(fasta_file)
    samples = list(seqs.fa_dict.keys())

    for sample in samples:
        fname = pp.get_random_file()
        open(fname, "w").write(">%s\n%s\n" % (sample, seqs.fa_dict[sample]))
        fasta_obj = pp.fasta(fname)
        vcf_obj = pp.vcf(fasta_obj.get_ref_variants(conf["ref"], sample))
        pp.run_cmd("rm %s" % fname)

    sample_chunks = [samples[i:i + 200] for i in range(0, len(samples), 200)]
    tmp_vcfs = []
    for tmp_samps in sample_chunks:
        tmp_list = pp.get_random_file()
        tmp_vcf = pp.get_random_file()
        open(tmp_list,
             "w").write("\n".join(["%s.vcf.gz" % x for x in tmp_samps]))
        pp.run_cmd("bcftools merge -0 -l %s -Oz -o %s" % (tmp_list, tmp_vcf))
        pp.run_cmd("bcftools index %s" % tmp_vcf)
        tmp_vcfs.append(tmp_vcf)
        pp.rm_files([tmp_list])

    pp.run_cmd("bcftools merge -0  %s | bcftools view -V indels -Oz -o %s" %
               (" ".join(tmp_vcfs), outfile))

    vcf_files = ["%s.vcf.gz" % s for s in samples]
    vcf_csi_files = ["%s.vcf.gz.csi" % s for s in samples]
    pp.rm_files(vcf_files + vcf_csi_files + tmp_vcfs)
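# fasta2vcf merges per-sample VCFs in chunks of 200 before the final merge,
# which keeps each bcftools invocation's argument list and open-file count
# bounded. The slicing idiom in isolation:
def chunk(items, size):
    return [items[i:i + size] for i in range(0, len(items), size)]

assert chunk([1, 2, 3, 4, 5], 2) == [[1, 2], [3, 4], [5]]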
Example #6
def main_aln(args):
    """
    mafft --auto --thread -1 --keeplength --addfragments gisaid_hcov-19_2020_07_22_09.meta_filtered.filtered.fasta ~/covid/cvdb.fasta > gisaid_hcov-19_2020_07_22_09.meta_filtered.filtered.aln
python ~/gisaid_scripts/get_fasta_stats.py  --fasta gisaid_hcov-19_2020_07_22_09.meta_filtered.filtered.aln  --bed ~/covid/static/coding.bed --out gisaid_hcov-19_2020_07_22_09.meta_filtered.filtered.stats
awk '$3<=2.5 && $4<=3 && $5<=50' gisaid_hcov-19_2020_07_22_09.meta_filtered.filtered.aln.stats | cut -f1 > seq_filtered_samples.txt
seqtk subseq gisaid_hcov-19_2020_07_22_09.meta_filtered.filtered.aln seq_filtered_samples.txt > gisaid_hcov-19_2020_07_22_09.meta_filtered.filtered.site_filtered.aln
python ~/gisaid_scripts/mask_fasta.py  --fasta gisaid_hcov-19_2020_07_22_09.meta_filtered.filtered.site_filtered.aln --bed ~/covid/static/non_coding_mask.bed --out gisaid_hcov-19_2020_07_22_09.meta_filtered.filtered.site_filtered.bed_masked.aln
python ~/gisaid_scripts/mask_fasta_non_acgt.py --fasta gisaid_hcov-19_2020_07_22_09.meta_filtered.filtered.site_filtered.bed_masked.aln --out gisaid_hcov-19_2020_07_22_09.meta_filtered.filtered.site_filtered.bed_masked.acgt.aln
snp-sites -v gisaid_hcov-19_2020_07_22_09.meta_filtered.filtered.site_filtered.bed_masked.acgt.aln  | python ~/gisaid_scripts/vcf_fix_ref.py --ref ~/covid/cvdb.fasta | python ~/gisaid_scripts/vcf_mask_non_acgt.py  | tqdm | bcftools view -a -Oz -o gisaid_hcov-19_2020_07_22_09.meta_filtered.filtered.site_filtered.bed_masked.acgt.vcf.gz
bcftools norm --threads 4 -m - gisaid_hcov-19_2020_07_22_09.meta_filtered.filtered.site_filtered.bed_masked.acgt.vcf.gz -Oz -o gisaid_hcov-19_2020_07_22_09.meta_filtered.filtered.site_filtered.bed_masked.acgt.multi_split.vcf.gz

    """
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)
    pp.run_cmd(
        "mafft --auto --thread %s --keeplength --addfragments %s %s  > %s.aln"
        % (args.threads, args.fasta, conf["ref"], args.prefix))
    pp.run_cmd(
        "covid_profiler_mask_fasta.py  --fasta %s.aln --bed %s --out %s.bed_masked.aln"
        % (args.prefix, conf["non_coding_bed"], args.prefix))
    pp.run_cmd(
        "covid_profiler_mask_fasta_non_acgt.py --fasta %s.bed_masked.aln --out %s.bed_masked.acgt.aln"
        % (args.prefix, args.prefix))
    pp.run_cmd(
        "iqtree -m GTR+F+R2 -s %s.bed_masked.acgt.aln -nt %s -czb -pre %s" %
        (args.prefix, args.threads, args.prefix))
Example #7
def main(args):
    # Get a dictionary with the database file: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    # Get a dictionary mapping the locus_tags to drugs: {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    # If a list of samples is supplied through the args object, store it in a list, else get the list by looking in the results directory
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [
            x.replace(args.vcf_suffix, "") for x in os.listdir(args.vcf_dir)
            if x[-len(args.vcf_suffix):] == args.vcf_suffix
        ]

    for l in open(conf["gff"]):
        row = l.strip().split()
        if len(row) <= 2: continue
        if row[2] != "gene": continue
        if "Name=%s" % args.gene in l or "gene:%s" % args.gene in l:
            break

    start, end = int(row[3]), int(row[4])
    # Loop through the sample result files
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        if not os.path.isfile("%s/%s%s" % (args.dir, s, args.suffix)): continue
        data = json.load(
            open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))))
        variants_str = json.dumps([
            d for d in data["dr_variants"] + data["other_variants"]
            if d["locus_tag"] == args.gene
        ])
        print(variants_str)
        if not any(k in variants_str for k in
                   ("deletion", "frameshift", "inframe", "stop", "start")):

            revseq = "| revseq  -sequence /dev/stdin  -outseq /dev/stdout" if row[
                6] == "-" else ""
            pp.run_cmd(
                "samtools faidx %s Chromosome:%s-%s | bcftools consensus %s/%s%s %s | sed 's/^>.*/>%s/' > %s.%s.fasta"
                % (conf["ref"], start, end, args.vcf_dir, s, args.vcf_suffix,
                   revseq, s, s, args.gene),
                verbose=1)
Example #8
def tbprofiler(fq1, fq2, uniq_id, db, storage_dir, platform):
    conf = get_conf_dict(sys.base_prefix+"/share/tbprofiler/tbdb")
    drug_order = ["isoniazid","rifampicin","ethambutol","pyrazinamide","streptomycin","ethionamide","fluoroquinolones","amikacin","capreomycin","kanamycin"]

    if fq1 and fq2:
        fastq_obj = pp.fastq(fq1,fq2)
    elif fq1 and fq2 is None:
        fastq_obj = pp.fastq(fq1)
    files_prefix = storage_dir+"/"+uniq_id
    bam_obj = fastq_obj.map_to_ref(
        ref_file=conf["ref"], prefix=files_prefix,sample_name=uniq_id,
        aligner="bwa", platform=platform, threads=4
    )
    bam_file = bam_obj.bam_file

    results = pp.bam_profiler(
        conf=conf, bam_file=bam_file, prefix=files_prefix, platform=platform,
        caller="bcftools", threads=4, no_flagstat=False,
        run_delly = True
    )

    results = tbp.reformat(results, conf, reporting_af=0.1)

    results["id"] = uniq_id
    results["tbprofiler_version"] = tbp._VERSION
    results["pipeline"] = {"mapper":"bcftools","variant_caller":"bcftools"}
    results = tbp.get_summary(results,conf,drug_order=drug_order)
    outfile = "%s.results.json" % (storage_dir+"/"+uniq_id)

    json.dump(results,open(outfile,"w"))



    conn = sqlite3.connect(db)
    c = conn.cursor()
    c.execute("UPDATE results SET result = ?, lineage = ?, drtype = ?, status = 'completed' where id = ?", (open(outfile).readline(),results["sublin"],results["drtype"],uniq_id,))
    c.execute("UPDATE full_results SET main_lineage = ?, sub_lineage = ?, DR_type = ?, MDR = ?, XDR = ?",(results["main_lin"],results["sublin"],results["drtype"],results["MDR"],results["XDR"]))
    for d in results["drug_table"]:
        c.execute("UPDATE full_results SET %s = ? where id = ?" % d["Drug"].lower().replace("-","_"), (d["Mutations"],uniq_id,))
    conn.commit()
    pp.run_cmd("rm %s/%s*" % (storage_dir,uniq_id))

    return True
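# The UPDATE statements above bind values through "?" placeholders rather than
# string formatting, and each row-specific statement needs its "where id = ?"
# clause or it rewrites every row. A self-contained sketch with an in-memory
# database (table and columns are illustrative):
import sqlite3

conn = sqlite3.connect(":memory:")
c = conn.cursor()
c.execute("CREATE TABLE results (id TEXT PRIMARY KEY, status TEXT)")
c.execute("INSERT INTO results VALUES (?, ?)", ("sample1", "pending"))
c.execute("UPDATE results SET status = ? WHERE id = ?", ("completed", "sample1"))
conn.commit()
assert c.execute("SELECT status FROM results").fetchone()[0] == "completed"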
Example #9
def get_sample_meta(samples, debug=False):
    if not debug:
        pp.run_cmd(
            "esearch -db nucleotide -query '%s' | efetch -format gb  > temp.gb"
            % ",".join(samples))
    data = []
    for seq_record in SeqIO.parse(open("temp.gb"), "gb"):
        sample = seq_record.id.split(".")[0]
        source = [
            feat for feat in seq_record.features if feat.type == "source"
        ][0]
        country = "NA"
        date = "NA"
        if "country" in source.qualifiers:
            country = source.qualifiers["country"][0].split(":")[0]
        if "collection_date" in source.qualifiers:
            date = source.qualifiers["collection_date"][0]
        data.append({"id": sample, "country": country, "date": date})

    return data
Example #10
def main(args):
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [
            x.replace(".targets.csq.vcf.gz", "") for x in os.listdir(args.dir)
            if x[-19:] == ".targets.csq.vcf.gz"
        ]
    sample_fastas = defaultdict(list)
    params = {
        "tmp_locations": pp.get_random_file(),
        "tmp_mappings": pp.get_random_file(),
        "ref": conf["ref"]
    }
    pp.run_cmd("awk '{print $1\":\"$2\"-\"$3\"\\t\"$5}' %s > %s" %
               (conf["bed"], params["tmp_mappings"]))
    pp.run_cmd("cut -f1 %s > %s" %
               (params["tmp_mappings"], params["tmp_locations"]))
    FILES = {}
    for l in open(params["tmp_mappings"]):
        row = l.rstrip().split()
        FILES[row[0]] = open("%s.fasta" % row[1], "w")
    for s in samples:
        params["vcf"] = "%s/%s.targets.csq.vcf.gz" % (args.dir, s)
        params["tmp_vcf"] = "%s/%s.targets.csq.tmp.vcf.gz" % (args.dir, s)
        params["sample_fa"] = "%s.targets.fa" % (s)
        pp.run_cmd(
            "bcftools filter -e 'sum(AD)=0' -S . %(vcf)s | bcftools view -a | grep -v NON_REF | bcftools view -Oz -o %(tmp_vcf)s"
            % params)
        pp.run_cmd("bcftools index %(tmp_vcf)s" % params)
        pp.run_cmd(
            "samtools faidx -r %(tmp_locations)s %(ref)s | bcftools consensus -H A %(tmp_vcf)s > %(sample_fa)s"
            % params)
        fa_dict = pp.fasta(params["sample_fa"]).fa_dict
        for locus in fa_dict:
            FILES[locus].write(">%s\n%s\n" % (s, fa_dict[locus]))
        pp.rm_files([params["tmp_vcf"]])
    pp.rm_files([params["tmp_locations"], params["tmp_mappings"]])
    for f in FILES.values():
        f.close()
Example #11
def phylogeny(prefix, conf_file, sample_file=None, base_dir=".", threads=3):
    conf = json.load(open(conf_file))

    if sample_file:
        samples = [x.rstrip() for x in open(sample_file).readlines()]
    else:
        samples = [x.replace(".results.json","") for x in os.listdir("results/") if x[-13:]==".results.json"]

    samples_file = pp.get_random_file()
    OUT = open(samples_file,"w")
    OUT.write("%s\n"%"\n".join(samples))
    OUT.close()
    for s in samples:
        tprefix = s+".genome"
        gbcf_file = "%s.gbcf" % tprefix
        if pp.nofile("%s/vcf/%s.genome.gbcf" % (base_dir,s)):
            bam_file = "%s/bam/%s.bam" % (base_dir,s)
            bam_obj = pp.bam(bam_file,s,conf["ref"])
            bam_obj.gbcf(prefix=tprefix)
            pp.run_cmd("mv %s* %s/vcf" % (gbcf_file,base_dir))
    cmd = "merge_vcfs.py %s %s %s --vcf_dir %s/vcf/ --vcf_ext genome.gbcf" % (samples_file,conf["ref"],prefix,base_dir)
    print(cmd)
Example #12
def run_profile(uniq_id, storage_dir, fasta=None, R1=None, R2=None):
    cp.log("This is the worker. Running %s" % uniq_id)
    if fasta:
        pp.run_cmd(
            "covid-profiler.py profile --fasta %s --prefix %s --dir %s" %
            (fasta, uniq_id, storage_dir))
    elif R1 and not R2:
        pp.run_cmd("covid-profiler.py profile -1 %s --prefix %s --dir %s" %
                   (R1, uniq_id, storage_dir))
    elif R1 and R2:
        pp.run_cmd(
            "covid-profiler.py profile -1 %s -2 %s --prefix %s --dir %s" %
            (R1, R2, uniq_id, storage_dir))
    else:
        sys.stderr.write("ERROR!!! Check file inputs to profile worker!")
        return False
    pp.run_cmd("zip -j %s/%s.zip %s/%s*" %
               (storage_dir, uniq_id, storage_dir, uniq_id))
    results = json.load(open("%s/%s.results.json" % (storage_dir, uniq_id)))

    if R1:
        pp.run_cmd("bcftools view %s/%s.vcf.gz > %s/%s.vcf" %
                   (storage_dir, uniq_id, storage_dir, uniq_id))
        for l in pp.cmd_out(
                "bedtools genomecov -ibam %s/%s.bam -d | datamash mean 3" %
            (storage_dir, uniq_id)):
            cp.log(l)
            results["mean_depth"] = round(float(l.strip()), 2)
    results["num_variants"] = len(results["variants"])

    client = MongoClient()
    db = client.test_database
    db.profiler_results.find_one_and_update(
        {"_id": uniq_id}, {"$set": {
            "results": results,
            "status": "done"
        }})

    return True
Example #13
def main_profile(args):
    if args.external_db:
        conf = get_conf_dict(args.external_db)
    else:
        conf = get_conf_dict(sys.base_prefix +
                             "/share/tbprofiler/%s" % args.db)
    if not args.prefix:
        args.prefix = args.bam.split("/")[-1].replace(".bam",
                                                      "").replace(".cram", "")
    bam_obj = pp.bam(args.bam, args.prefix, platform=args.platform)
    vcf_obj = bam_obj.call_variants(conf["ref"],
                                    caller=args.caller,
                                    bed_file=conf["bed"],
                                    threads=args.threads)
    csq_vcf_obj = vcf_obj.csq(conf["ref"], conf["gff"])
    csq = csq_vcf_obj.load_csq(ann_file=conf["ann"])
    results = {"variants": []}
    for sample in csq:
        results["variants"] = csq[sample]
    outfile = "%s%s" % (args.prefix, args.suffix)
    json.dump(results, open(outfile, "w"))
    pp.run_cmd("rm %(prefix)s.targets.vcf.gz* %(prefix)s.targets.csq.vcf.gz*" %
               vars(args))
Example #14
def main(args):
    vcf = vcf_class(args.vcf)
    # vcf.get_mean_genotype()
    if args.genes:
        vcf.get_genesum()
    geno_file = vcf.prefix + ".geno"
    genesum_file = vcf.prefix + ".genesum"
    meta = {}
    for s in vcf.samples:
        meta[s] = {}
    for row in csv.DictReader(open(args.csv)):
        for pheno in row.keys():
            if pheno == "id": continue
            if row['id'] not in meta: continue
            meta[row["id"]][pheno] = row[pheno]
    phenos = [x.rstrip() for x in open(args.phenos).readlines()]
    cmd_file = pp.get_random_file()
    X = open(cmd_file, "w")
    for pheno in phenos:
        pheno_file = "%s.pheno" % pheno
        if pheno not in row:
            pp.log("%s not in CSV file" % pheno, True)
        P = open(pheno_file, "w")
        P.write("\n".join([
            meta[s][pheno] if pheno in meta[s] else "NA" for s in vcf.samples
        ]))
        P.close()
        X.write(
            "gemma -p %s -g %s -gk 1 -o %s -maf 0.00005 -miss 0.99 && gemma  -lmm 1 -p %s -g %s  -k output/%s.cXX.txt  -o %s -maf 0.00005 -miss 0.99 && gemma  -lmm 1 -p %s -g %s  -k output/%s.cXX.txt  -o %s.genesum -notsnp\n"
            % (pheno_file, geno_file, pheno, pheno_file, geno_file, pheno,
               pheno, pheno_file, genesum_file, pheno, pheno))
    X.close()

    if args.preprocess:
        pp.log("Preprocessing finished\n", True)
    else:
        pp.run_cmd("cat %s | parallel -j %s" % (cmd_file, args.threads))
Example #15
def vcf2consensus(bam, vcf, ref, id, consensus):
    tmp_bed = pp.get_random_file()
    pp.run_cmd(
        'bedtools genomecov -d -ibam %s | awk \'$3<10\' | awk \'{print $1"\\t"$2"\\t"$2}\' > %s'
        % (bam, tmp_bed))
    pp.run_cmd(
        "bcftools consensus -f %s -m %s -M N %s | sed 's/^>.*/>%s/' > %s" %
        (ref, tmp_bed, vcf, id, consensus))
    pp.run_cmd("rm %s" % tmp_bed)
Example #16
def profile_primer(primerF, primerR, probe, uniq_id, save_dir):
    pp.run_cmd(
        "covid-profiler.py primer --primerF %s --primerR %s --probe %s --out %s/%s.csv"
        % (primerF, primerR, probe, save_dir, uniq_id))
    pp.run_cmd("covid_plot_primers.R %s/%s.csv %s %s %s" %
               (save_dir, uniq_id, primerF, primerR, probe))
    pp.run_cmd("rm %s/%s.csv" % (save_dir, uniq_id))
    client = MongoClient()
    db = client.test_database
    db.primer_results.find_one_and_update({"_id": uniq_id},
                                          {"$set": {
                                              "status": "done"
                                          }})

    return True
Example #17
def run_phylogeny(file, uniq_id, working_dir="/tmp/"):
    cp.log("This is the worker. Running %s" % uniq_id)
    pp.run_cmd(
        "covid_profiler_align_fasta.py --fasta %s --working-dir %s --out %s" %
        (file, working_dir, uniq_id))
    return True
Example #18
def profile_vcf(filename, conf):
    params = conf.copy()
    params["tmpvcf"] = pp.get_random_file(extension=".vcf.gz")
    params["tmpcsq"] = pp.get_random_file(extension=".vcf.gz")
    params["filename"] = filename
    params["tmphdr"] = pp.get_random_file()
    params["tmptxt"] = pp.get_random_file()
    l = ""
    for l in pp.cmd_out(
            "bcftools view %(filename)s -h | grep \"^##FORMAT=<ID=AD\"" %
            params):
        pass
    AD_found = l != ""
    if not AD_found:
        open(params["tmphdr"], "w").write(
            "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">\n"
        )
        pp.run_cmd(
            "bcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT\\t.[\\t0,100]\\n' %(filename)s > %(tmptxt)s"
            % params)
        pp.run_cmd("bgzip %(tmptxt)s" % params)
        pp.run_cmd("tabix -s 1 -b 2 -p vcf %(tmptxt)s.gz" % params)
        pp.run_cmd(
            "bcftools view -a %(filename)s | bcftools annotate -a %(tmptxt)s.gz -c CHROM,POS,REF,ALT,-,FMT/AD -h %(tmphdr)s -Oz -o %(tmpvcf)s"
            % params)
    else:
        pp.run_cmd("bcftools view -a %(filename)s -Oz -o %(tmpvcf)s" % params)
    pp.run_cmd(
        "bcftools view -T %(bed)s %(tmpvcf)s | bcftools csq -f %(ref)s -g %(gff)s  -Oz -o %(tmpcsq)s -p a"
        % params)
    csq_bcf_obj = pp.bcf(params["tmpcsq"])
    csq = csq_bcf_obj.load_csq(ann_file=conf["ann"])
    results = {
        "variants": [],
        "missing_pos": [],
        "qc": {
            "pct_reads_mapped": "NA",
            "num_reads_mapped": "NA"
        }
    }
    for sample in csq:
        results["variants"] = csq[sample]
    all_bcf_obj = pp.bcf(params["tmpvcf"])
    mutations = all_bcf_obj.get_bed_gt(conf["barcode"], conf["ref"])
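    # The four hardcoded rewrites below appear to be manual overrides for
    # known problematic barcode positions: a call of exactly 50 reads for the
    # listed base is replaced with the alternate base at depth 25.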
    if "C" in mutations["Chromosome"][325505] and mutations["Chromosome"][
            325505]["C"] == 50:
        mutations["Chromosome"][325505] = {"T": 25}
    if "G" in mutations["Chromosome"][599868] and mutations["Chromosome"][
            599868]["G"] == 50:
        mutations["Chromosome"][599868] = {"A": 25}
    if "C" in mutations["Chromosome"][931123] and mutations["Chromosome"][
            931123]["C"] == 50:
        mutations["Chromosome"][931123] = {"T": 25}
    if "T" in mutations["Chromosome"][1759252] and mutations["Chromosome"][
            1759252]["T"] == 50:
        mutations["Chromosome"][1759252] = {"G": 25}
    json.dump(mutations, open("dump.json", "w"))
    barcode_mutations = pp.barcode(mutations, conf["barcode"])
    results["barcode"] = barcode_mutations
    results = pp.db_compare(db_file=conf["json_db"], mutations=results)
    bed_regions = pp.load_bed(conf["bed"], [4], 4)
    missing_regions = {gene: "NA" for gene in bed_regions}
    results["missing_regions"] = missing_regions
    if AD_found:
        pp.run_cmd("rm %(tmpcsq)s" % params)
    else:
        pp.run_cmd("rm %(tmpcsq)s %(tmphdr)s %(tmptxt)s*" % params)
    return results
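# profile_vcf only synthesises FORMAT/AD values when the tag is missing from
# the VCF header. The header check in isolation, sketched with subprocess
# (assumes bcftools is on PATH):
import subprocess

def vcf_has_format_tag(vcf_file, tag):
    header = subprocess.run(["bcftools", "view", "-h", vcf_file],
                            capture_output=True, text=True,
                            check=True).stdout
    return "##FORMAT=<ID=%s," % tag in header

# usage: vcf_has_format_tag("calls.vcf.gz", "AD")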
Example #19
def main(args):
    # Get a dictionary with the database file: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    # Get a dictionary mapping the locus_tags to drugs: {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    # If a list of samples is supplied through the args object, store it in a list else get the list from looking in the results direcotry
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [x.replace(args.suffix,"") for x in os.listdir(args.results_dir) if x[-len(args.suffix):]==args.suffix]

    # Loop through the sample result files
    samples_with_mutation = []
    variant_position_set = set()
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        data = json.load(open(pp.filecheck("%s/%s%s" % (args.results_dir,s,args.suffix))))
        for var in data["dr_variants"] + data["other_variants"]:
            if (var["gene"]==args.gene or var["locus_tag"]==args.gene) and var["change"]==args.variant:
                samples_with_mutation.append(s)
                variant_position_set.add(var["genome_pos"])

    sys.stderr.write("\nFound %s samples with mutation\n" % len(samples_with_mutation))
    # samples_with_mutation = ["ERR2515541","ERR2510504","ERR2864225","SRR7341698"]
    if len(samples_with_mutation)==0:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene,args.variant,"Mutation_not_found"))
        quit()
    elif len(variant_position_set)>1:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene,args.variant,"Multiple_genome_pos"))
        quit()


    if len(variant_position_set)==1:
        variant_position = int(list(variant_position_set)[0])

    sys.stderr.write("\nGenome position is %s\n" % variant_position)
    sys.stderr.write("\nPerforming ReadPosRankSum test\n")
    # variant_position = 3841662
    params = vars(args)
    params["ref"] = conf["ref"]
    params["pos"] = variant_position
    params["tmp_vcf"] = pp.get_random_file(extension=".vcf.gz")
    read_pos_rank_sums = []
    for s in tqdm(samples_with_mutation):
        params["sample"] = s
        pp.run_cmd("tabix -f %(vcf_dir)s/%(sample)s.targets.csq.vcf.gz" % params,verbose=0)
        pp.run_cmd("bcftools view %(vcf_dir)s/%(sample)s.targets.csq.vcf.gz Chromosome:%(pos)s -Oz -o %(tmp_vcf)s" % params,verbose=0)
        pp.run_cmd("tabix -f %(tmp_vcf)s" % params,verbose=0)
        for l in pp.cmd_out("gatk VariantAnnotator -R %(ref)s -I %(bam_dir)s/%(sample)s%(bam_extension)s -V %(tmp_vcf)s -O /dev/stdout -A ReadPosRankSumTest -OVI false  | bcftools query -f '%%POS\\t%%ReadPosRankSum\\n'" % params,verbose=0):
            row = l.strip().split()
            if row[1]==".": continue
            if int(row[0])==variant_position:
                read_pos_rank_sums.append((s,float(row[1])))

    if len(read_pos_rank_sums)==0:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene,args.variant,"No_values_from_samples"))
    else:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene,args.variant,statistics.median([x[1] for x in read_pos_rank_sums])))
    pp.rm_files([params["tmp_vcf"]])
Example #20
def main(args):

    vcf_class = pp.vcf(args.vcf)
    vcf_positions = vcf_class.get_positions()

    if not args.fasta:
        if not args.ref:
            sys.stderr.write(
                "\nERROR: Please supply a reference with --ref\n\n")
            quit()
        pp.run_cmd(
            "vcf2fasta.py --vcf %(vcf)s --snps --ref %(ref)s --snps-no-filt" %
            vars(args))
        args.fasta = "%s.snps.fa" % vcf_class.prefix
    if pp.nofile("%s.asr.state" % args.fasta):
        pp.run_cmd(
            "iqtree -m %(model)s -te %(tree)s -s %(fasta)s -nt AUTO -asr -pre %(fasta)s.asr"
            % vars(args))

    tree = ete3.Tree("%s.asr.treefile" % args.fasta, format=1)
    node_names = set([tree.name] +
                     [n.name.split("/")[0] for n in tree.get_descendants()])
    leaf_names = set(tree.get_leaf_names())
    internal_node_names = node_names - leaf_names

    states_file = "%s.asr.state" % args.fasta
    states = defaultdict(dict)
    sys.stderr.write("Loading states\n")
    for l in tqdm(open(states_file)):
        if l[0] == "#": continue
        row = l.strip().split()
        if row[0] == "Node": continue
        site = int(row[1])
        if row[0] not in internal_node_names: continue
        states[site][row[0]] = row[2]

    seqs = pp.fasta(args.fasta).fa_dict
    for site in tqdm(list(states)):
        for sample in seqs:
            states[site][sample] = seqs[sample][site - 1]

    acgt = set(["A", "C", "G", "T", "a", "c", "g", "t"])
    convergent_sites = []
    for site in tqdm(list(states)):
        nucleotides = set([states[site][n] for n in node_names])
        if len(nucleotides) == 1: continue

        # Set up storage objects
        origins = []

        tree.add_feature("state", states[site][tree.name])
        for n in tree.traverse():
            if n == tree: continue
            node_state = states[site][n.name]
            parent_state = n.get_ancestors()[0].state
            if (node_state != parent_state and node_state in acgt
                    and parent_state in acgt):
                origins.append(n.name)
            n.add_feature("state", node_state)
        if len(origins) > 1:
            convergent_sites.append((site, vcf_positions[site - 1], origins))

    with open(args.out, "w") as O:
        for site in convergent_sites:
            O.write("%s\t%s\n" % (site[1][1], len(site[2])))
Example #21
def get_variant_data(vcf_file, ref_file, gff_file, protein_file):
    nsp_data = {}
    gene_info = {}
    for row in csv.DictReader(open(protein_file)):
        row["Start"] = int(row["Start"])
        row["End"] = int(row["End"])
        gene_info[row["Gene"]] = {
            "function": row["Putative function"],
            "DOI": row["DOI"]
        }
        if row["Region"] != "nsp": continue
        for i in range(row["Start"], row["End"] + 1):
            nsp_data[i] = row

    pp.run_cmd("samtools faidx %s" % ref_file)
    results = {}
    for l in pp.cmd_out(
            "bcftools view %s | bcftools csq -f %s -g %s  | correct_covid_csq.py | bcftools +fill-tags | bcftools query -f '%%POS\\t%%REF\\t%%ALT\\t%%AF\\t%%BCSQ\\n'"
            % (vcf_file, ref_file, gff_file)):
        pos, ref, alts_str, af_str, csq_str = l.strip().split()
        alt_af = sum([float(x) for x in af_str.split(",")])
        csqs = csq_str.split(",")
        types = []
        changes = []
        genes = []
        pos = int(pos)
        for i in range(len(csqs)):
            if csqs[i][0] == "@":
                results[pos] = results[int(csqs[i][1:])]

            elif csqs[i] == ".":
                results[pos] = {
                    "pos": pos,
                    "alts": alts_str,
                    "alt_af": alt_af,
                    "types": "intergenic",
                    "changes": "NA",
                    "gene": "NA",
                    "gene_function": "NA",
                    "gene_reference": "NA"
                }

            else:
                csq = csqs[i].split("|")
                types.append(csq[0].replace("*", ""))

                if csq[1] == "orf1ab":
                    codon_pos = get_codon_num(csq[5])
                    if codon_pos in nsp_data:
                        genes.append(nsp_data[codon_pos]["Gene"])
                        codon_pos = codon_pos - nsp_data[codon_pos]["Start"] + 1
                        changes.append(change_codon_number(csq[5], codon_pos))
                    else:
                        genes.append("orf1ab")
                        changes.append(csq[5])
                else:
                    changes.append(csq[5])
                    genes.append(csq[1])
                if len(set(types)) == 1:
                    types = list(set(types))
                results[pos] = {
                    "pos": pos,
                    "alts": alts_str,
                    "alt_af": alt_af,
                    "types": ",".join(types),
                    "changes": ",".join(changes),
                    "gene": genes[0],
                    "gene_function": gene_info[genes[0]]["function"],
                    "gene_reference": gene_info[genes[0]]["DOI"]
                }
    return results
Example #22
def create_db(args, extra_files=None):
    variables = json.load(open("variables.json"))
    genome_file = "%s.fasta" % args.prefix
    gff_file = "%s.gff" % args.prefix
    bed_file = "%s.bed" % args.prefix
    json_file = "%s.dr.json" % args.prefix
    version_file = "%s.version.json" % args.prefix

    if not extra_files:
        extra_files = {}

    if args.match_ref:
        chrom_conversion = match_ref_chrom_names(args.match_ref,
                                                 "genome.fasta")
        shutil.copyfile(args.match_ref, genome_file)
    else:
        chrom_conversion = match_ref_chrom_names("genome.fasta",
                                                 "genome.fasta")
        shutil.copyfile("genome.fasta", genome_file)

    with open(gff_file, "w") as O:
        for l in open("genome.gff"):
            if l.strip() == "": continue
            if l[0] == "#":
                O.write(l)
            else:
                row = l.strip().split()
                if row[0] in chrom_conversion:
                    row[0] = chrom_conversion[row[0]]
                    O.write("\t".join(row) + "\n")

    genes = load_gff(gff_file)
    gene_name2gene_id = {g.name: g.locus_tag for g in genes.values()}
    gene_name2gene_id.update(
        {g.locus_tag: g.locus_tag
         for g in genes.values()})
    db = {}
    locus_tag_to_drug_dict = defaultdict(set)
    with open(args.prefix + ".conversion.log", "w") as L:
        if args.csv:
            mutation_lookup = get_snpeff_formated_mutation_list(
                args.csv, "genome.fasta", "genome.gff",
                json.load(open("variables.json"))["snpEff_db"])
            for row in csv.DictReader(open(args.csv)):
                locus_tag = gene_name2gene_id[row["Gene"]]
                drug = row["Drug"].lower()
                mut = mutation_lookup[(row["Gene"], row["Mutation"])]
                if args.include_original_mutation:
                    row["original_mutation"] = row["Mutation"]
                if mut != row["Mutation"]:
                    L.write(
                        f"Converted {row['Gene']} {row['Mutation']} to {mut}\n"
                    )
                locus_tag_to_drug_dict[locus_tag].add(drug)
                if locus_tag not in db:
                    db[locus_tag] = {}
                if mut not in db[locus_tag]:
                    db[locus_tag][mut] = {"annotations": []}

                tmp_annotation = {"type": "drug", "drug": row["Drug"]}
                annotation_columns = set(row.keys()) - set(
                    ["Gene", "Mutation", "Drug"])
                for col in annotation_columns:
                    if row[col] == "": continue
                    tmp_annotation[col.lower()] = row[col]
                db[locus_tag][mut]["annotations"].append(tmp_annotation)
                db[locus_tag][mut]["genome_positions"] = get_genome_position(
                    genes[locus_tag], mut)
                db[locus_tag][mut]["chromosome"] = genes[locus_tag].chrom

        if args.other_annotations:
            mutation_lookup = get_snpeff_formated_mutation_list(
                args.other_annotations, "genome.fasta", "genome.gff",
                json.load(open("variables.json"))["snpEff_db"])
            for row in csv.DictReader(open(args.other_annotations)):
                locus_tag = gene_name2gene_id[row["Gene"]]
                mut = mutation_lookup[(row["Gene"], row["Mutation"])]
                if mut != row["Mutation"]:
                    L.write(
                        f"Converted {row['Gene']} {row['Mutation']} to {mut}\n"
                    )
                if locus_tag not in db:
                    db[locus_tag] = {}
                if mut not in db[locus_tag]:
                    db[locus_tag][mut] = {"annotations": []}
                tmp_annotation = {"type": row["Type"]}
                if args.include_original_mutation:
                    tmp_annotation["original_mutation"] = row["Mutation"]

                for x in row["Info"].split(";"):
                    key, val = x.split("=")
                    tmp_annotation[key.lower()] = val
                    if key == "drug":
                        locus_tag_to_drug_dict[locus_tag].add(val)
                db[locus_tag][mut]["annotations"].append(tmp_annotation)
                db[locus_tag][mut]["genome_positions"] = get_genome_position(
                    genes[locus_tag], mut)
                db[locus_tag][mut]["chromosome"] = genes[locus_tag].chrom

        if args.watchlist:
            for row in csv.DictReader(open(args.watchlist)):
                locus_tag = gene_name2gene_id[row["Gene"]]
                for d in row["Drug"].split(","):
                    drug = d.lower()
                    locus_tag_to_drug_dict[locus_tag].add(drug)

        version = {"name": args.prefix}
        if not args.custom:
            for l in cmd_out("git log | head -4"):
                row = l.strip().split()
                if row == []: continue
                version[row[0].replace(":", "")] = " ".join(row[1:])
            version["commit"] = version["commit"][:7]
        else:
            version["Date"] = str(
                datetime.now()) if not args.db_date else args.db_date
            version["name"] = args.db_name if args.db_name else "NA"
            version["commit"] = args.db_commit if args.db_name else "NA"
            version["Author"] = args.db_author if args.db_author else "NA"

        json.dump(version, open(version_file, "w"))
        json.dump(db, open(json_file, "w"))

        if "barcode" in extra_files:
            barcode_file = f"{args.prefix}.{extra_files['barcode']}"

            with open(barcode_file, "w") as O:
                for l in open("barcode.bed"):
                    if l[0] == "#": continue
                    row = l.strip().split("\t")
                    row[0] = chrom_conversion[row[0]]
                    O.write("\t".join(row) + "\n")

        if "amplicon_primers" in vars(args) and args.amplicon_primers:
            write_amplicon_bed(genome_file, genes, db, args.amplicon_primers,
                               bed_file)
            variables['amplicon'] = True
        else:
            ref_fasta_dict = fa2dict(genome_file)
            write_bed(db, locus_tag_to_drug_dict, genes, ref_fasta_dict,
                      bed_file)
            variables['amplicon'] = False

        for file in extra_files.values():
            target = f"{args.prefix}.{file}"
            shutil.copyfile(file, target)

        if list(chrom_conversion.keys()) != list(chrom_conversion.values()):
            variables["chromosome_conversion"] = {
                "target": list(chrom_conversion.keys()),
                "source": list(chrom_conversion.values())
            }
        variables_file = args.prefix + ".variables.json"
        variables["files"] = {
            "ref": genome_file,
            "gff": gff_file,
            "bed": bed_file,
            "version": version_file,
            "json_db": json_file,
            "variables": variables_file
        }
        if extra_files:
            for key, val in extra_files.items():
                variables["files"][key] = f"{args.prefix}.{val}"
        json.dump(variables, open(variables_file, "w"))

        if os.path.isfile("snpEffectPredictor.bin"):
            snpeff_db_name = json.load(open("variables.json"))["snpEff_db"]
            load_snpEff_db("snpEffectPredictor.bin", snpeff_db_name)

        if args.load:
            load_dir = f"{sys.base_prefix}/share/{args.software_name}"
            if not os.path.isdir(load_dir):
                os.mkdir(load_dir)

            for key, val in variables['files'].items():
                target = f"{load_dir}/{val}"
                infolog(f"Copying file: {val} ---> {target}")
                shutil.copyfile(val, target)
                if key == "ref":
                    pp.run_cmd(f"bwa index {target}")
                    pp.run_cmd(f"samtools faidx {target}")
                    tmp = target.replace(".fasta", "")
                    pp.run_cmd(f"samtools dict {target} -o {tmp}.dict")

            successlog("Sucessfully imported library")
Example #23
def main_profile(args):
    #### Setup conf dictionary ###
    if args.db == "tbdb" and not args.external_db and pp.nofile(
            sys.base_prefix + "/share/tbprofiler/tbdb.fasta"):
        pp.log(
            "Can't find the tbdb file at %s. Please run 'tb-profiler update_tbdb' to load the default library or specify another using the '--external_db' flag"
            % sys.base_prefix,
            ext=True)
    if args.external_db:
        conf = get_conf_dict(args.external_db)
    else:
        conf = get_conf_dict(sys.base_prefix +
                             "/share/tbprofiler/%s" % args.db)

    ### Create folders for results if they don't exist ###
    if pp.nofolder(args.dir):
        os.mkdir(args.dir)

    for x in ["bam", "vcf", "results"]:
        if pp.nofolder(args.dir + "/" + x):
            os.mkdir(args.dir + "/" + x)

    ### Set up platform dependant parameters ###
    if args.platform == "nanopore":
        args.mapper = "minimap2"
        args.caller = "bcftools"
        args.no_trim = True
        run_delly = False
    else:
        if args.no_delly:
            run_delly = False
        else:
            run_delly = True

    ### Setup prefix for files ###
    files_prefix = args.dir + "/" + args.prefix

    ### Create bam file if fastq has been supplied ###
    if args.bam is None:
        if args.read1 and args.read2 and args.no_trim:
            # Paired + no trimming
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read1 and args.read2 and not args.no_trim:
            # Paired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1, args.read2)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        elif args.read1 and not args.read2 and args.no_trim:
            # Unpaired + no trimming
            fastq_obj = pp.fastq(args.read1)
        elif args.read1 and not args.read2 and not args.no_trim:
            # Unpaired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        else:
            exit("\nPlease provide a bam file or a fastq file(s)...Exiting!\n")
        bam_obj = fastq_obj.map_to_ref(ref_file=conf["ref"],
                                       prefix=files_prefix,
                                       sample_name=args.prefix,
                                       aligner=args.mapper,
                                       platform=args.platform,
                                       threads=args.threads)
        bam_file = bam_obj.bam_file
    else:
        bam_file = args.bam

    print(args.delly_bcf_file)
    run_coverage = False if args.no_coverage else True
    ### Run profiling module from pathogen-profiler ###
    results = pp.bam_profiler(
        conf=conf,
        bam_file=bam_file,
        prefix=files_prefix,
        platform=args.platform,
        caller=args.caller,
        threads=args.threads,
        no_flagstat=args.no_flagstat,
        run_delly=run_delly,
        calling_params=args.calling_params,
        coverage_fraction_threshold=args.coverage_fraction_threshold,
        missing_cov_threshold=args.missing_cov_threshold,
        delly_bcf_file=args.delly_bcf_file)
    json.dump(results, open(args.prefix + ".tmp_results.json", "w"))
    ### Reformat the results to TB-Profiler style ###
    results = tbp.reformat(results, conf, reporting_af=args.reporting_af)
    results["id"] = args.prefix
    results["tbprofiler_version"] = tbp._VERSION
    results["pipeline"] = {
        "mapper": args.mapper if not args.bam else "N/A",
        "variant_caller": args.caller
    }

    json_output = args.dir + "/results/" + args.prefix + ".results.json"
    tex_output = args.dir + "/results/" + args.prefix + ".results.tex"
    text_output = args.dir + "/results/" + args.prefix + ".results.txt"
    csv_output = args.dir + "/results/" + args.prefix + ".results.csv"

    json.dump(results, open(json_output, "w"))
    extra_columns = [x.lower() for x in args.add_columns.split(",")
                     ] if args.add_columns else []
    if args.pdf:
        tbp.write_tex(results, conf, tex_output, extra_columns)
        pp.run_cmd("pdflatex %s" % tex_output, verbose=1)
        pp.rm_files([
            tex_output, args.dir + "/" + args.prefix + ".results.aux",
            args.dir + "/" + args.prefix + ".results.log"
        ])
    if args.txt:
        tbp.write_text(results,
                       conf,
                       text_output,
                       extra_columns,
                       reporting_af=args.reporting_af)
    if args.csv:
        tbp.write_csv(results, conf, csv_output, extra_columns)

    ### Move files to respective directories ###
    if not args.bam:
        pp.run_cmd("mv %(dir)s/%(prefix)s.bam* %(dir)s/bam/" % vars(args))
        if not args.no_trim:
            pp.run_cmd("rm -f %s" % " ".join(fastq_obj.files))
    pp.run_cmd("mv -f %(dir)s/%(prefix)s*.vcf.gz* %(dir)s/vcf/" % vars(args))
    if run_delly and results["delly"] == "success" and not args.delly_bcf_file:
        pp.run_cmd("mv -f %(dir)s/%(prefix)s.delly.bcf* %(dir)s/vcf/" %
                   vars(args))

    ### Add meta data to results
    if args.meta:
        for row in csv.DictReader(open(args.meta)):
            if row["id"] == results["id"]:
                for col in row:
                    results["meta_" + col] = row[col]
    pp.log("Profiling finished sucessfully!")
Example #24
def main_profile(args):
    if pp.nofolder(args.dir):
        os.mkdir(args.dir)
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)

    ### Setup prefix for files ###
    files_prefix = args.dir + "/" + args.prefix

    if args.fasta:
        if args.read1 or args.read2:
            sys.stderr.write(
                "Please use --fasta or --read1/2 but not both... Exiting!\n")
            quit()
        fasta_obj = pp.fasta(args.fasta)
        wg_vcf_obj = pp.vcf(
            fasta_obj.get_ref_variants(conf["ref"],
                                       prefix=args.prefix,
                                       file_prefix=files_prefix))
    else:
        if not args.read1:
            sys.stderr.write(
                "Please provide assembly using --fasta or at least one read file using --read1... Exiting!\n"
            )
            quit()
        ### Create bam file if fastq has been supplied ###
        if args.read1 and args.read2 and args.no_trim:
            # Paired + no trimming
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read1 and args.read2 and not args.no_trim:
            # Paired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1, args.read2)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        elif args.read1 and not args.read2 and args.no_trim:
            # Unpaired + no trimming
            fastq_obj = pp.fastq(args.read1)
        elif args.read1 and not args.read2 and not args.no_trim:
            # Unpaired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        bam_obj = fastq_obj.map_to_ref(ref_file=conf["ref"],
                                       prefix=files_prefix,
                                       sample_name=args.prefix,
                                       aligner=args.mapper,
                                       platform=args.platform,
                                       threads=args.threads)
        wg_vcf_obj = bam_obj.call_variants(conf["ref"],
                                           args.caller,
                                           remove_missing=True)
        cp.vcf2consensus(bam_obj.bam_file, wg_vcf_obj.filename, conf["ref"],
                         wg_vcf_obj.samples[0],
                         wg_vcf_obj.prefix + ".consensus.fasta")
        if not args.no_trim:
            pp.run_cmd("rm -f %s" % " ".join(fastq_obj.files))
    refseq = pp.fasta(conf["ref"]).fa_dict
    refseqname = list(refseq.keys())[0]

    results = {}
    barcode_mutations = wg_vcf_obj.get_bed_gt(conf["barcode"], conf["ref"])
    barcode = pp.barcode(barcode_mutations, conf["barcode"])
    clade = ";".join(sorted([d["annotation"] for d in barcode]))
    sys.stdout.write("%s\t%s\n" % (args.prefix, clade))
    results["clade"] = clade

    variant_data = cp.get_variant_data(wg_vcf_obj.filename, conf["ref"],
                                       conf["gff"], conf["proteins"])
    results["variants"] = variant_data

    json.dump(results, open("%s.results.json" % files_prefix, "w"))
Example #25
def main(args):
    if pp.nofolder(args.out_dir):
        pp.run_cmd("mkdir %s" % args.out_dir)
    conf = {
        "ref": args.ref,
        "gff": args.gff,
        "bed": args.bed,
        "ann": args.ann,
    }
    if args.conf:
        conf = json.load(open(args.conf))
    for x in ["ref", "gff", "bed", "ann"]:
        if conf[x] is None:
            pp.log("%s variable is not defined" % x, True)
    bam_obj = pp.bam(args.bam,
                     args.prefix,
                     conf["ref"],
                     platform=args.platform)
    bcf_obj = bam_obj.call_variants(
        prefix=args.prefix + ".targets",
        call_method=args.call_method,
        gff_file=conf["gff"],
        bed_file=conf["bed"],
        mixed_as_missing=False if args.platform == "Illumina" else True,
        threads=args.threads,
        min_dp=args.min_depth,
        af=args.af,
        caller=args.caller)
    csq = bcf_obj.load_csq(ann_file=conf["ann"])
    variants = []
    chr2gene_pos = {}
    for l in open(conf["ann"]):
        row = l.rstrip().split()
        chr2gene_pos[int(row[1])] = int(row[3])
    for var in list(csq.values())[0]:
        var["_internal_change"] = var["change"]
        var["change"] = pp.reformat_mutations(var["change"], var["type"],
                                              var["gene_id"], chr2gene_pos)
        variants.append(var)
    if not args.no_delly:
        delly_bcf = bam_obj.run_delly()
        deletions = delly_bcf.overlap_bed(conf["bed"])
        for deletion in deletions:
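            # NOTE: "var" still holds the last small variant from the loop
            # above; its type and gene_id are reused to format the
            # large-deletion change string.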
            tmp_change = pp.reformat_mutations(
                "%(chr)s_%(start)s_%(end)s" % deletion, var["type"],
                var["gene_id"], chr2gene_pos)
            tmp = {
                "genome_pos": deletion["start"],
                "gene_id": deletion["region"],
                "chr": deletion["chr"],
                "freq": 1,
                "type": "large_deletion",
                "change": tmp_change
            }
            variants.append(tmp)
    json.dump(variants,
              open("%s/%s.pp-results.json" % (args.out_dir, args.prefix), "w"))
    for x in [
            ".targets.bcf", ".targets.csq.bcf", ".targets.csq.bcf.csi",
            ".targets.delly.bcf", ".targets.delly.bcf.csi",
            ".targets.del_pos.bed", ".targets.gvcf.gz", ".targets.gvcf.gz.csi",
            ".targets.missing.bcf"
    ]:
        if args.no_delly and "delly" in x: continue
        pp.run_cmd("rm %s%s" % (args.prefix, x))
Example #26
def run_primer_conservation(primerF, primerR, probe, uniq_id, save_dir):
    pp.run_cmd(
        "primer_analysis.py --fp %s --rp %s --probe %s --dir %s --out %s --write-json"
        % (primerF, primerR, probe, save_dir, uniq_id))

    return True
Example #27
def main(args):

    args.uuid = str(uuid4())
    conf = covid_profiler.get_conf_dict(sys.base_prefix +
                                        "/share/covidprofiler/%s" % args.db)
    vars(args).update(conf)

    args.final_aln = "%s/%s.aln" % (args.working_dir, args.out)
    args.final_vcf = "%s/%s.vcf.gz" % (args.working_dir, args.out)
    args.final_csv = "%s/%s.variant_info.csv" % (args.working_dir, args.out)

    for name, seq in pyfastx.Fasta(args.ref, build_index=False):
        ref_seq = seq
    pp.run_cmd(
        "mafft --auto --thread -1 --keeplength --addfragments %(fasta)s %(ref)s > %(working_dir)s/%(uuid)s.aln"
        % vars(args))

    troublesome_sites = set()
    if args.mask_troublesome_sites:
        from urllib.request import urlopen
        with urlopen(
                'https://raw.githubusercontent.com/W-L/ProblematicSites_SARS-CoV2/master/problematic_sites_sarsCov2.vcf'
        ) as response:
            for l in response:
                row = l.decode().strip().split()
                if row[0][0] == "#": continue
                troublesome_sites.add(int(row[1]))
    print(troublesome_sites)
    with open(args.final_aln, "w") as O:
        for entry in tqdm(
                pyfastx.Fasta("%(working_dir)s/%(uuid)s.aln" % vars(args),
                              full_name=True)):
            masked_seq = list(entry.seq.upper())
            for start, end in [(1, 265), (29675, 29903)]:
                for i in range(start - 1, end):
                    masked_seq[i] = "N"
            for pos in troublesome_sites:
                masked_seq[pos - 1] = "N"
            acgt = set(["A", "C", "G", "T"])
            for pos in [i for i, n in enumerate(masked_seq) if n not in acgt]:
                masked_seq[pos] = "N"
            O.write(">%s\n%s\n" % (entry.name, "".join(masked_seq)))

    pp.run_cmd(
        "snp-sites -v %(final_aln)s  | covid_profiler_vcf_fix_ref.py --ref %(ref)s | covid_profiler_vcf_mask_non_acgt.py  | tqdm | bcftools view -a -Oz -o %(uuid)s.bed_masked.vcf.gz"
        % vars(args))
    pp.run_cmd(
        "bcftools stats -s - %(uuid)s.bed_masked.vcf.gz > %(uuid)s.bed_masked.vcf.gz.stats"
        % vars(args))
    pp.run_cmd(
        "bcftools norm -m -  %(uuid)s.bed_masked.vcf.gz -Oz -o %(final_vcf)s" %
        vars(args))

    variant_data = covid_profiler.get_variant_data(args.final_vcf, conf["ref"],
                                                   conf["gff"],
                                                   conf["proteins"])
    with open(args.final_csv, "w") as O:
        fieldnames = list(variant_data[0].keys())
        writer = csv.DictWriter(O, fieldnames)
        writer.writeheader()
        writer.writerows(variant_data)

    pp.run_cmd("iqtree -s %(final_aln)s -m GTR+F+G4 -nt 1" % vars(args))
    sys.stderr.write("\n\n----------------\n")
    sys.stderr.write("Program complete\n")
    sys.stderr.write("----------------\n")
    sys.stderr.write("Alignment: %s\n" % args.final_aln)
    sys.stderr.write("VCF: %s\n" % args.final_vcf)
    sys.stderr.write("Variant summary csv: %s\n" % args.final_csv)