Python vcf示例，utils.vcf Python示例

示例#1

0

显示文件

def main(debug=None):
    """Rename the samples of a VCF and print the result to stdout.

    Only the ``#CHROM`` column-header line is rewritten; every other line is
    echoed with surrounding whitespace stripped.

    Sample names (columns 10 onward) are transformed in this order:

    1. ``--subst`` pairs (``old:new``, ``old=new`` or ``old,new``), applied
       one after another, so a later pair sees the result of an earlier one.
    2. ``--prefix`` is prepended to every sample name.
    3. ``--suffix`` is appended to every sample name.

    Args:
        debug: Optional argv list handed to docopt instead of ``sys.argv``.
    """
    args = docopt(__doc__, argv=debug, version=__version__)
    if args["<vcf>"] == "":
        # Nothing to read: show usage and stop rather than opening "".
        print(__doc__)
        return
    v = vcf(args["<vcf>"])
    substitutions = [re.split("[:=,]", x) for x in args["--subst"] or []]
    for line in v.output_raw():
        if not line.startswith("#CHROM"):
            print(line.strip())
            continue
        # Strip before splitting: otherwise the last sample name keeps the
        # line terminator, never matches a --subst pair, and --suffix ends
        # up after the newline.
        fields = line.strip().split("\t")
        samples = fields[9:]
        for orig, replacement in substitutions:
            samples = [replacement if name == orig else name
                       for name in samples]
        if args["--prefix"]:
            samples = [args["--prefix"] + name for name in samples]
        if args["--suffix"]:
            samples = [name + args["--suffix"] for name in samples]
        print("\t".join(fields[:9] + samples))

示例#2

0

显示文件

文件： rename.py 项目： AndersenLab/vcf-kit

def main(debug=None):
    """Print a VCF with its sample names substituted, prefixed and/or suffixed.

    The ``#CHROM`` line is the only one modified. ``--subst`` pairs
    (separated by ``:``, ``=`` or ``,``) are applied sequentially, then
    ``--prefix`` and ``--suffix`` are added to every sample name. All other
    lines are printed stripped of surrounding whitespace.

    Args:
        debug: Optional argv list handed to docopt instead of ``sys.argv``.
    """
    args = docopt(__doc__,
                  argv=debug,
                  version=__version__)
    if args["<vcf>"] == "":
        # No input file: print usage and return instead of falling through.
        print(__doc__)
        return
    v = vcf(args["<vcf>"])
    find_replace = [re.split("[:=,]", x) for x in args["--subst"] or []]
    prefix = args["--prefix"] or ""
    suffix = args["--suffix"] or ""
    for line in v.output_raw():
        if line.startswith("#CHROM"):
            # The raw line still carries its terminator; remove it first so
            # the final sample name compares and concatenates correctly.
            columns = line.strip().split("\t")
            fixed, samples = columns[:9], columns[9:]
            for orig, replacement in find_replace:
                samples = [replacement if sample == orig else sample
                           for sample in samples]
            samples = [prefix + sample + suffix for sample in samples]
            print("\t".join(fixed + samples))
        else:
            print(line.strip())

示例#3

0

显示文件

def main(debug=None):
    """Emit an aligned FASTA, or a muscle-built tree, from the SNPs of a VCF.

    For every SNP record the first character of each sample's genotype
    string is taken (``.`` becomes ``N``); concatenating those characters
    per sample gives one sequence per sample.

    * ``fasta``: print the sequences in FASTA format.
    * ``tree``: pipe the FASTA to ``muscle -maketree`` (UPGMA by default,
      neighbour joining with ``nj``) and print the resulting tree. With
      ``--plot`` the tree is also rendered into a temporary HTML file that
      is opened in the web browser.

    Args:
        debug: Optional argv list handed to docopt instead of ``sys.argv``.
    """
    args = docopt(__doc__,
                  argv=debug,
                  options_first=False,
                  version=__version__)

    def first(s):
        # First character of a genotype string; missing calls become "N".
        return s[0].replace(".", "N")

    firstv = np.vectorize(first)

    v = vcf(args["<vcf>"])

    if len(v.samples) <= 1:
        exit(puts_err(
            colored.red("\n\tVCF must have at least two samples.\n")))

    if args["<region>"]:
        variant_set = v(args["<region>"])
    else:
        variant_set = v

    if not (args["fasta"] or args["tree"]):
        return

    # Collect one genotype vector per SNP, then stack and transpose so each
    # sample is paired with its own sequence of bases.
    gt_set = []
    for line in variant_set:
        if line.is_snp:
            gt_set.append(firstv(line.gt_bases))
    if len(gt_set) == 0:
        exit(puts_err("No genotypes"))
    gt_set = np.vstack(gt_set)
    seqs = zip(v.samples, np.transpose(gt_set))

    if args["fasta"]:
        for sample, seq in seqs:
            print(">" + sample)
            print(''.join(seq))
        return

    # Tree mode: build the tree from the aligned FASTA with muscle.
    check_program_exists("muscle")
    with indent(4):
        puts_err(colored.blue("\nGenerating Fasta\n"))
    fasta_records = []
    for sample, seq in seqs:
        fasta_records.append(">" + sample + "\n" + ''.join(seq) + "\n")
    fasta = "".join(fasta_records)
    tree_type = "upgma"  # default is upgma
    if args["nj"]:
        tree_type = "neighborjoining"
    with indent(4):
        puts_err(colored.blue("\nGenerating " + tree_type + " Tree\n"))
    comm = ["muscle", "-maketree", "-in", "-", "-cluster", tree_type]
    tree, err = Popen(comm, stdin=PIPE,
                      stdout=PIPE).communicate(input=fasta)

    # output tree
    print(tree)

    if args["--plot"]:
        from jinja2 import Template
        import webbrowser
        import tempfile
        prefix = os.path.dirname(
            os.path.abspath(
                sys.modules['vcfkit'].__file__)) + "/static"
        with open(prefix + "/tree.html", 'r') as template_file:
            template = template_file.read()
        tree_template = Template(template)
        html_out = tempfile.NamedTemporaryFile(suffix=".html",
                                               delete=False)
        with html_out as f:
            tree = tree.replace("\n", "")
            sample_len = len(v.samples)
            # The template is rendered against every local name, so the
            # local variable names above are part of the template contract.
            f.write(tree_template.render(**locals()))
        # Open the page only after the file has been flushed and closed, so
        # the browser never sees a partially written document.
        webbrowser.open("file://" + html_out.name)

示例#4

0

显示文件

    """ Converts Genotype likelyhoods to phred scaled (PL) genotype likelyhoods. """
    return -int(gl * 10)

# With no command-line arguments, substitute a fixed argv for docopt
# (presumably a development convenience -- confirm before relying on it).
debug = None
if len(sys.argv) == 1:
    debug = ['primer', "--ref=WBcel235", "test.vcf.gz"]


if __name__ == '__main__':
    # Parse the command line, or the fixed argv chosen above.
    args = docopt(__doc__,
                  version='VCF-Toolbox v0.1',
                  argv=debug,
                  options_first=False)
    # Wrap the <vcf> argument in the project's vcf helper.
    v = vcf(args["<vcf>"])
    # Ensures the FT FORMAT declaration below is appended only once.
    format_added = False
    if args["transfer-filter"]:
        for line in v.output_raw():
            line = line.strip()
            if line.startswith("#CHROM"):
                # Column header: fields from the tenth onward are sample names.
                samples = line.strip().split("\t")[9:]
            elif line.startswith("#"):
                # Append the FT (genotype-level filter) FORMAT declaration
                # directly after the first ##FORMAT header line encountered.
                if line.startswith("##FORMAT") and format_added is False:
                    format_added = True
                    line = line + "\n##FORMAT=<ID=FT,Number=1,Type=String,Description=\"Genotype-level filter\">"
            else:
                # Data record: split into columns and read the seventh
                # column (index 6) as the record's FILTER value.
                line = line.split("\t")
                FILTER = line[6]

示例#5

0

显示文件

文件： call.py 项目： AndersenLab/vcf-kit

def _concordance_class(ref, seq_gt, vcf_gt):
    """Classify a sequenced genotype against the VCF genotype as TN/TP/FP/FN."""
    agrees_with_vcf = seq_gt == vcf_gt
    if ref == seq_gt:
        return "TN" if agrees_with_vcf else "FP"
    return "TP" if agrees_with_vcf else "FN"


def _call_variants(args, handle):
    """Blast every record read from ``handle`` and print the selected variants."""
    reference = resolve_reference_genome(args["--ref"])

    # Without a VCF there are no sample names to match against; start from an
    # empty list so the sample lookups below do not hit an unbound name.
    # NOTE(review): assumes resolve_sample_from_line accepts an empty sample
    # list -- confirm against its definition.
    v = None
    samples = []
    if args["<vcf>"]:
        v = vcf(args["<vcf>"])
        samples = v.samples

    if args["--vcf-sites"] and args["<vcf>"] is None:
        with indent(4):
            exit(puts_err(colored.red("\nMust specify <vcf> with --vcf-sites\n")))

    # Setup reference for blast call
    b = blast(reference)

    # Set file type:
    sequence_file_type = seq_type(args["<seq>"])

    # Output header
    print("\t".join(blast_variant.output_order))
    for record in SeqIO.parse(handle, sequence_file_type):
        # Resolve the sample from the file name first, then the record name.
        sample = resolve_sample_from_line(samples, handle.name)
        if not sample:
            sample = resolve_sample_from_line(samples, record.name)
        blast_results = b.blast_call(record)
        # Reset per record so a leading None result can neither leave this
        # unbound nor leak the previous record's VCF variants.
        vcf_variants = []
        for n, variant in enumerate(blast_results):
            if variant is None:
                puts_err(colored.red("No Results for " + (sample or "") +
                                     " " + record.description))
                continue
            if args["<vcf>"]:
                if n == 0:
                    # Load the VCF genotypes for the region once per record.
                    vcf_variants = []
                    for vcf_variant in v(variant.region()):
                        if sample:
                            gt = format_gt(
                                vcf_variant.gt_bases[v.samples.index(sample)])
                            vcf_variants.append([vcf_variant.CHROM,
                                                 vcf_variant.POS,
                                                 gt,
                                                 vcf_variant.REF,
                                                 vcf_variant.ALT])

                chrom_pos = variant.chrom_pos_allele()[0:2]
                vcf_variant_match = [x for x in vcf_variants
                                     if x[0:2] == chrom_pos]
                if vcf_variant_match:
                    vcf_variant_match = vcf_variant_match[0]
                    variant.vcf_gt = vcf_variant_match[2]
                    variant.REF = vcf_variant_match[3]
                    variant.ALT = ','.join(vcf_variant_match[4])
                    variant.fetch_variant_type()
                    variant.classification = _concordance_class(
                        variant.REF, variant.seq_gt, variant.vcf_gt)
                else:
                    variant.REF = ""
                    variant.ALT = ""
                    variant.fetch_variant_type()
                    variant.classification = ""

                if args["--vcf-sites"] and variant.classification != "":
                    output_line = True
                else:
                    output_line = args["--all-sites"] is True
            else:
                output_line = bool(args["--all-sites"] or variant.is_variant)

            if output_line:
                variant.sample = sample
                if record.description:
                    variant.description = record.description
                else:
                    variant.description = os.path.split(handle.name)[1]
                print(str(variant))


def main(debug=None):
    """Call variants from a sequence file by blasting it against a reference.

    Each record of ``<seq>`` is blasted against the genome named by
    ``--ref``. When ``<vcf>`` is given, every variant is additionally
    compared with the VCF genotype of the resolved sample and classified as
    TN/TP/FP/FN. ``--vcf-sites`` restricts output to variants that received
    a classification; ``--all-sites`` prints everything; without a VCF only
    variant sites are printed unless ``--all-sites`` is set.

    Args:
        debug: Optional argv list handed to docopt instead of ``sys.argv``.
    """
    args = docopt(__doc__,
                  version='VCF-Toolbox v0.1',
                  argv=debug,
                  options_first=False)

    # The context manager closes the input file on every exit path.
    with open(args["<seq>"], "rb") as handle:
        _call_variants(args, handle)

示例#6

0

显示文件

文件： vcf2sql.py 项目： AndersenLab/vcf-kit

    tgt = '/'.join([gt_dict[int(x)] for x in re.split("[\|/]", val["GT"])])
    return tgt

# With no command-line arguments, substitute a fixed argv for docopt
# (presumably a development convenience -- confirm before relying on it).
debug = None
if len(sys.argv) == 1:
    debug = ['vcf2sql', "test.vcf.gz"]

if __name__ == '__main__':
    # Parse the command line, or the fixed argv chosen above.
    args = docopt(__doc__,
                  argv=debug,
                  options_first=False)

    timestamp = datetime.datetime.now()

    # Directory containing this script.
    module_path = os.path.split(os.path.realpath(__file__))[0]
    v = vcf(args["<vcf>"])
    # File name with every "." replaced by "_".
    vcf_safe = v.filename.replace(".", "_")
    # Derive the output name: "vcf"/"bcf" become "tsv", any ".gz" is dropped,
    # then ".gz" is appended. str.replace rewrites every occurrence, not just
    # the extension.
    tsv_out = v.filename.replace("vcf", "tsv").replace(
        "bcf", "tsv").replace(".gz", "") + ".gz"

    # One entry per match of r_info / r_format in the raw header: the captured
    # groups passed through autoconvert, tagged with "INFO" or "FORMAT".
    # NOTE: `map(...) + [...]` relies on map returning a list (Python 2).
    info_cols = [map(autoconvert, list(x)) + ["INFO"]
                 for x in r_info.findall(v.raw_header)]
    format_cols = [map(autoconvert, list(x)) + ["FORMAT"]
                   for x in r_format.findall(v.raw_header)]

    # --simple keeps only the columns whose first element is in simple_fields.
    if args["--simple"]:
        info_cols = [x for x in info_cols if x[0] in simple_fields]
        format_cols = [x for x in format_cols if x[0] in simple_fields]

    if args["sqlite"]:
        db = SqliteDatabase(args["--db"])

示例#7

0

显示文件

def _parse_threshold(raw, filter_s):
    """Return ``(key, value, kind)`` for a --min/--max argument.

    A non-integral number is a FREQUENCY threshold on ``"r_" + filter_s``;
    an integral number is a COUNT threshold on ``filter_s``.
    """
    value = float(raw)
    if int(value) != value:
        return "r_" + filter_s, value, "FREQUENCY"
    return filter_s, int(value), "COUNT"


def main(debug=None):
    """Filter VCF records by the count or frequency of a genotype class.

    The genotype class (REF, HET, ALT or MISSING) is taken from the parsed
    arguments. A record is filtered when its count/frequency is below
    ``--min`` or above ``--max``. Without ``--soft-filter`` filtered records
    are dropped; with it, every record is written and filtered ones are
    tagged in the FILTER column (``--mode`` ``+`` appends the tag, ``x``
    replaces the column, resetting unfiltered records to PASS).

    Args:
        debug: Optional argv list handed to docopt instead of ``sys.argv``.
    """
    args = docopt(__doc__,
                  version='VCF-Toolbox v0.1',
                  argv=debug,
                  options_first=False)
    if args["--soft-filter"] and not args["--mode"]:
        exit(message("Must Specify --mode with soft-filter"))
    v = vcf(args["<vcf>"])
    n_samples = len(v.samples) * 1.0
    filter_s = [x for x in args.values()
                if x in ["REF", "HET", "ALT", "MISSING"]][0]

    # Filter by rate or by number? When both bounds are given, the header
    # description reflects --max (it is evaluated last).
    direction = filter_type = filter_value = None
    if args["--min"]:
        filter_key_min, filter_val_min, filter_type = _parse_threshold(
            args["--min"], filter_s)
        direction = "<"
        filter_value = filter_val_min
    if args["--max"]:
        filter_key_max, filter_val_max, filter_type = _parse_threshold(
            args["--max"], filter_s)
        direction = ">"
        filter_value = filter_val_max

    # Output header
    header = v.raw_header.splitlines()
    # With neither bound nothing can be filtered, so there is nothing to
    # declare (and no values to describe the filter with).
    if args["--soft-filter"] and direction is not None:
        filter_line = (
            '##FILTER=<ID={filter_name},Description="Apply filter if '
            '{filter_type}({filter_s}) {direction} {filter_value}">'
        ).format(filter_name=args["--soft-filter"],
                 filter_type=filter_type,
                 filter_s=filter_s,
                 direction=direction,
                 filter_value=filter_value)
        for n, i in enumerate(header):
            if i.startswith("##FILTER"):
                header.insert(n + 1, filter_line)
                break
        else:
            # No existing ##FILTER line: still declare the filter, just
            # before the #CHROM column header when there is one.
            if header and header[-1].startswith("#CHROM"):
                header.insert(len(header) - 1, filter_line)
            else:
                header.append(filter_line)
    header = '\n'.join(header) + "\n"
    sys.stdout.write(header)

    f = {}
    for line in v:
        f["ALT"] = line.num_hom_alt
        f["HET"] = line.num_het
        f["REF"] = line.num_hom_ref
        f["MISSING"] = int(n_samples - line.num_called)
        for key in ("ALT", "HET", "REF", "MISSING"):
            f["r_" + key] = f[key] / n_samples

        filtered = False
        if args["--min"] and f[filter_key_min] < filter_val_min:
            filtered = True
        if args["--max"] and f[filter_key_max] > filter_val_max:
            filtered = True

        if args["--soft-filter"]:
            fields = str(line).split("\t")
            if args["--mode"] == "x":
                fields[6] = "PASS"
            if filtered:
                if args["--mode"] == "+":
                    # PASS and "." (no filters applied) carry no filter
                    # names, so start from an empty list before appending.
                    if fields[6] in ("PASS", "."):
                        fields[6] = ""
                    fields[6] = ';'.join(
                        [fields[6], args["--soft-filter"]]).strip(";")
                elif args["--mode"] == "x":
                    fields[6] = args["--soft-filter"]
            sys.stdout.write('\t'.join(fields))
        elif not filtered:
            sys.stdout.write(str(line))

示例#8

0

显示文件

文件： phylo.py 项目： AndersenLab/vcf-kit

def main(debug=None):
    """Generate an aligned FASTA or a phylogenetic tree from a VCF's SNPs.

    Each sample's sequence is built from the first character of its
    genotype string at every SNP record, with ``.`` written as ``N``.

    * ``fasta``: print one FASTA record per sample.
    * ``tree``: feed the FASTA to ``muscle -maketree`` (``upgma`` unless
      ``nj`` is set) and print the tree; ``--plot`` also renders it into a
      temporary HTML file and opens that file in the web browser.

    Args:
        debug: Optional argv list handed to docopt instead of ``sys.argv``.
    """
    args = docopt(__doc__,
                  argv=debug,
                  options_first=False,
                  version=__version__)

    def first(s):
        # Leading character of the genotype string; "." (missing) -> "N".
        return s[0].replace(".", "N")

    firstv = np.vectorize(first)

    v = vcf(args["<vcf>"])

    if len(v.samples) <= 1:
        exit(puts_err(colored.red("\n\tVCF must have at least two samples.\n")))

    if args["<region>"]:
        variant_set = v(args["<region>"])
    else:
        variant_set = v

    if not (args["fasta"] or args["tree"]):
        return

    # Gather the per-SNP genotype vectors, then stack and transpose them so
    # that zip() pairs every sample with its own base sequence.
    gt_set = []
    for line in variant_set:
        if line.is_snp:
            gt_set.append(firstv(line.gt_bases))
    if len(gt_set) == 0:
        exit(puts_err("No genotypes"))
    gt_set = np.vstack(gt_set)
    seqs = zip(v.samples, np.transpose(gt_set))

    if args["fasta"]:
        for sample, seq in seqs:
            print(">" + sample)
            print(''.join(seq))
        return

    # Tree mode. Check for the muscle dependency first.
    check_program_exists("muscle")
    with indent(4):
        puts_err(colored.blue("\nGenerating Fasta\n"))
    fasta_records = []
    for sample, seq in seqs:
        fasta_records.append(">" + sample + "\n" + ''.join(seq) + "\n")
    fasta = "".join(fasta_records)
    tree_type = "upgma"  # default is upgma
    if args["nj"]:
        tree_type = "neighborjoining"
    with indent(4):
        puts_err(colored.blue("\nGenerating " + tree_type + " Tree\n"))
    comm = ["muscle", "-maketree", "-in", "-", "-cluster", tree_type]
    tree, err = Popen(comm, stdin=PIPE, stdout=PIPE).communicate(input=fasta)

    # output tree
    print(tree)

    if args["--plot"]:
        from jinja2 import Template
        import webbrowser
        import tempfile
        prefix = os.path.dirname(os.path.abspath(sys.modules['vcfkit'].__file__)) + "/static"
        with open(prefix + "/tree.html", 'r') as template_file:
            template = template_file.read()
        tree_template = Template(template)
        html_out = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
        with html_out as f:
            tree = tree.replace("\n", "")
            sample_len = len(v.samples)
            # Rendering uses locals(), so the local variable names in this
            # function are visible to (and may be required by) the template.
            f.write(tree_template.render(**locals()))
        # Launch the browser once the file is closed, so it is fully on disk.
        webbrowser.open("file://" + html_out.name)

示例#9

0

显示文件

文件： phylo.py 项目： snashraf/VCF-kit

def main(debug=None):
    """Generate an aligned FASTA or a phylogenetic tree from a VCF's SNPs.

    For every SNP record the last character of each sample's genotype
    string (with ``.`` replaced by ``-``) is appended to that sample's
    sequence. Non-SNP records contribute nothing.

    * Without ``tree``: print one FASTA record per sample.
    * With ``tree``: feed the FASTA to ``muscle -maketree`` (``upgma``
      unless ``nj`` is set) and print the tree; ``--plot`` also renders it
      into a temporary HTML file and opens that file in the web browser.

    Args:
        debug: Optional argv list handed to docopt instead of ``sys.argv``.
    """
    args = docopt(__doc__,
                  argv=debug,
                  options_first=False,
                  version=__version__)
    # Unused below, but kept: the plot template is rendered with locals().
    module_path = os.path.split(os.path.realpath(__file__))[0]
    v = vcf(args["<vcf>"])
    samples = v.samples
    _ROOT = os.path.split(os.path.dirname(vk.__file__))[0]
    if not (args["fasta"] or args["tree"]):
        return

    # Build one list of bases per sample.
    seqs = {}
    for sample in samples:
        seqs[sample] = []
    for line in v:
        if line.is_snp:
            non_missing = [x.replace(".", "-") for x in line.gt_bases]
            sample_gt = zip(samples, [x[-1] for x in non_missing])
            # Append inside the SNP branch: outside it, a non-SNP record
            # would re-append the previous SNP's genotypes (or fail when the
            # first record is not a SNP).
            for sample, gt in sample_gt:
                seqs[sample].append(gt)

    if not args["tree"]:
        for sample, seq in seqs.items():
            print(">" + sample)
            print(''.join(seq))
        return

    # Tree mode. Check for the muscle dependency first.
    check_program_exists("muscle")
    with indent(4):
        puts_err(colored.blue("\nGenerating Fasta\n"))
    fasta_records = []
    for sample, seq in seqs.items():
        fasta_records.append(">" + sample + "\n" + ''.join(seq) + "\n")
    fasta = "".join(fasta_records)
    tree_type = "upgma"  # default is upgma
    if args["nj"]:
        tree_type = "neighborjoining"
    with indent(4):
        puts_err(colored.blue("\nGenerating " + tree_type + " Tree\n"))
    comm = ["muscle", "-maketree", "-in", "-", "-cluster", tree_type]
    tree, err = Popen(comm, stdin=PIPE,
                      stdout=PIPE).communicate(input=fasta)
    print(tree)
    if args["--plot"]:
        from jinja2 import Template
        import webbrowser
        import tempfile
        prefix = _ROOT + "/static"
        with open(_ROOT + "/static/tree.html", 'r') as template_file:
            tree_template = Template(template_file.read())
        html_out = tempfile.NamedTemporaryFile(suffix=".html",
                                               delete=False)
        with html_out as f:
            tree = tree.replace("\n", "")
            sample_len = len(samples)
            # Rendering uses locals(), so the local variable names in this
            # function are visible to (and may be required by) the template.
            f.write(tree_template.render(**locals()))
        # Launch the browser once the file is closed, so it is fully on disk.
        webbrowser.open("file://" + html_out.name)

示例#10

0

显示文件

文件： call.py 项目： xtmgah/VCF-kit

def _concordance_class(ref, seq_gt, vcf_gt):
    """Classify a sequenced genotype against the VCF genotype as TN/TP/FP/FN."""
    agrees_with_vcf = seq_gt == vcf_gt
    if ref == seq_gt:
        return "TN" if agrees_with_vcf else "FP"
    return "TP" if agrees_with_vcf else "FN"


def _call_variants(args, handle):
    """Blast every record read from ``handle`` and print the selected variants."""
    reference = resolve_reference_genome(args["--ref"])

    # Without a VCF there are no sample names to match against; start from an
    # empty list so the sample lookups below do not hit an unbound name.
    # NOTE(review): assumes resolve_sample_from_line accepts an empty sample
    # list -- confirm against its definition.
    v = None
    samples = []
    if args["<vcf>"]:
        v = vcf(args["<vcf>"])
        samples = v.samples

    if args["--vcf-sites"] and args["<vcf>"] is None:
        with indent(4):
            exit(
                puts_err(
                    colored.red("\nMust specify <vcf> with --vcf-sites\n")))

    # Setup reference for blast call
    b = blast(reference)

    # Set file type:
    sequence_file_type = seq_type(args["<seq>"])

    # Output header
    print("\t".join(blast_variant.output_order))
    for record in SeqIO.parse(handle, sequence_file_type):
        # Resolve the sample from the file name first, then the record name.
        sample = resolve_sample_from_line(samples, handle.name)
        if not sample:
            sample = resolve_sample_from_line(samples, record.name)
        blast_results = b.blast_call(record)
        # Reset per record so a leading None result can neither leave this
        # unbound nor leak the previous record's VCF variants.
        vcf_variants = []
        for n, variant in enumerate(blast_results):
            if variant is None:
                puts_err(
                    colored.red("No Results for " + (sample or "") + " " +
                                record.description))
                continue
            if args["<vcf>"]:
                if n == 0:
                    # Load the VCF genotypes for the region once per record.
                    vcf_variants = []
                    for vcf_variant in v(variant.region()):
                        if sample:
                            gt = format_gt(
                                vcf_variant.gt_bases[v.samples.index(sample)])
                            vcf_variants.append([
                                vcf_variant.CHROM, vcf_variant.POS, gt,
                                vcf_variant.REF, vcf_variant.ALT
                            ])

                chrom_pos = variant.chrom_pos_allele()[0:2]
                vcf_variant_match = [
                    x for x in vcf_variants if x[0:2] == chrom_pos
                ]
                if vcf_variant_match:
                    vcf_variant_match = vcf_variant_match[0]
                    variant.vcf_gt = vcf_variant_match[2]
                    variant.REF = vcf_variant_match[3]
                    variant.ALT = ','.join(vcf_variant_match[4])
                    variant.fetch_variant_type()
                    variant.classification = _concordance_class(
                        variant.REF, variant.seq_gt, variant.vcf_gt)
                else:
                    variant.REF = ""
                    variant.ALT = ""
                    variant.fetch_variant_type()
                    variant.classification = ""

                if args["--vcf-sites"] and variant.classification != "":
                    output_line = True
                else:
                    output_line = args["--all-sites"] is True
            else:
                output_line = bool(args["--all-sites"] or variant.is_variant)

            if output_line:
                variant.sample = sample
                if record.description:
                    variant.description = record.description
                else:
                    variant.description = os.path.split(handle.name)[1]
                print(str(variant))


def main(debug=None):
    """Call variants from a sequence file by blasting it against a reference.

    Each record of ``<seq>`` is blasted against the genome named by
    ``--ref``. When ``<vcf>`` is given, every variant is additionally
    compared with the VCF genotype of the resolved sample and classified as
    TN/TP/FP/FN. ``--vcf-sites`` restricts output to variants that received
    a classification; ``--all-sites`` prints everything; without a VCF only
    variant sites are printed unless ``--all-sites`` is set.

    Args:
        debug: Optional argv list handed to docopt instead of ``sys.argv``.
    """
    args = docopt(__doc__,
                  version='VCF-Toolbox v0.1',
                  argv=debug,
                  options_first=False)

    # The context manager closes the input file on every exit path.
    with open(args["<seq>"], "rb") as handle:
        _call_variants(args, handle)