Пример #1
0
def sort_vcf_file(filename):
    """Reorder the rows of a VCF file by CHROM and POS, in place.

    The file is read with vcflib.  If no two adjacent rows on the same
    chromosome have a decreasing position, the file is considered
    sorted and is left untouched.  Otherwise the rows are reordered
    and the file is overwritten.

    filename   Path of the VCF file to check and, if needed, rewrite.

    Returns None.
    """
    from genomicode import vcflib
    from genomicode import jmath
    from genomicode import AnnotationMatrix

    vcf = vcflib.read(filename)
    chroms = vcf.matrix["#CHROM"]
    positions = [int(pos) for pos in vcf.matrix["POS"]]

    # Only neighbouring rows that share a chromosome are compared.  A
    # change of chromosome between two rows is never treated as a
    # disorder.
    out_of_order = any(
        chroms[i] == chroms[i + 1] and positions[i + 1] < positions[i]
        for i in range(len(chroms) - 1))
    if not out_of_order:
        return

    # Build one "<chrom>:<pos>" key per row and let jmath.order_list
    # (with natural=True) decide the new row order.
    sort_keys = [
        "%s:%d" % (chroms[i], positions[i]) for i in range(len(chroms))]
    row_order = jmath.order_list(sort_keys, natural=True)
    vcf.matrix = AnnotationMatrix.rowslice(vcf.matrix, row_order)
    vcflib.write(filename, vcf)
Пример #2
0
def fix_vcf_file(sample, infile, outfile):
    """Add the FORMAT and <sample> columns to a VCF file lacking them.

    JointSNVMix produces VCF files that don't have FORMAT and
    <SAMPLE> columns.  This reads infile, builds a DP:RD:AD:FREQ
    genotype string for every variant from the call that vcflib
    parses, adds the two columns plus the matching ##FORMAT header
    lines, and writes the result to outfile.

    sample    Name of the sample column to add.
    infile    Path of the VCF file to read.
    outfile   Path of the VCF file to write.

    Raises AssertionError if the file already has a FORMAT column, a
    column named <sample>, or any samples.
    """
    from genomicode import vcflib

    vcf = vcflib.read(infile)
    matrix = vcf.matrix

    genotype_names = ["DP", "RD", "AD", "FREQ"]

    # Get the calls for each variant.
    all_genotypes = []  # one formatted genotype string for each variant
    for i in range(vcf.num_variants()):
        var = vcflib.get_variant(vcf, i)
        # There is no sample column yet, so ask for the call with
        # sample None.
        call = vcflib.get_call(var, None)
        geno_dict = {
            "DP": call.total_reads,
            "RD": call.num_ref,
            "AD": call.num_alt,
            "FREQ": call.vaf,
        }
        all_genotypes.append(
            vcflib._format_genotype(genotype_names, geno_dict))

    # Add FORMAT.
    FORMAT_STRING = ":".join(genotype_names)
    assert "FORMAT" not in matrix
    matrix.headers.append("FORMAT")
    matrix.headers_h.append("FORMAT")
    matrix.header2annots["FORMAT"] = [FORMAT_STRING] * matrix.num_annots()

    # Add the sample.
    assert not vcf.samples
    assert sample not in matrix
    matrix.headers.append(sample)
    matrix.headers_h.append(sample)
    matrix.header2annots[sample] = all_genotypes
    vcf.samples = [sample]

    # Add the proper header lines.  FREQ carries an allele frequency,
    # which is not an integer quantity, so it is declared as Float
    # (assumes call.vaf is a fractional value -- TODO confirm against
    # vcflib).
    lines = [
        '##FORMAT=<ID=RD,Number=1,Type=Integer,Description="Allelic depth for the ref allele in the tumor sample">',
        '##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Allelic depth for the alt allele in the tumor sample">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">',
        '##FORMAT=<ID=FREQ,Number=1,Type=Float,Description="Variant allele frequency">',
    ]
    matrix.headerlines.extend(lines)

    vcflib.write(outfile, vcf)
def filter_by_vartype(vartype, infile, outfile):
    """Write to outfile the variants of infile selected by vartype.

    vartype   One of "all", "snp" or "indel".
    infile    Path of the VCF file to read.
    outfile   Path of the file to write.

    With "all", infile is copied to outfile with shutil.copy2 and is
    never parsed.  Otherwise the file is read with vcflib, passed
    through vcflib.select_variants with the module-level is_snp or
    is_indel function (defined outside this block), and written out.
    Presumably select_variants keeps the variants for which that
    function is true -- confirm against vcflib.

    Raises AssertionError for any other vartype.
    """
    import shutil
    from genomicode import vcflib

    assert vartype in ["all", "snp", "indel"]

    if vartype != "all":
        vcf = vcflib.read(infile)
        selector = is_indel if vartype == "indel" else is_snp
        vcflib.write(outfile, vcflib.select_variants(vcf, selector))
    else:
        # Nothing to filter; a plain file copy is enough.
        shutil.copy2(infile, outfile)
Пример #4
0
def backfill_vcf(in_file, bf_file, out_file):
    """Add read counts from a backfill VCF file to another VCF file.

    For every variant of in_file and every sample present in both
    files, the call found at the same (chrom, pos) in bf_file is used
    to set the BFILL_REF, BFILL_ALT, BFILL_COV and BFILL_VAF genotype
    values.  Variants that have a call in bf_file but no counterpart
    in in_file are appended with their filter set to "BACKFILL".  The
    result is written to out_file; in_file is not modified.

    in_file    Path of the VCF file to annotate.
    bf_file    Path of the backfill VCF file.
    out_file   Path of the VCF file to write.

    Raises AssertionError if bf_file has two calls for the same
    (sample, chrom, pos).
    """
    import copy
    from genomicode import vcflib

    # Genotype key to write -> attribute of the backfill call to read.
    mapping = [
        ("BFILL_REF", "num_ref"),
        ("BFILL_ALT", "num_alt"),
        ("BFILL_COV", "total_reads"),
        ("BFILL_VAF", "vaf"),
    ]

    in_vcf = vcflib.read(in_file)
    bf_vcf = vcflib.read(bf_file)

    # May have multiple samples, e.g. germline and tumor.  Only the
    # samples found in both files can be backfilled.
    SAMPLES = [x for x in in_vcf.samples if x in bf_vcf.samples]

    # Parse out the read counts from the backfill vcf.
    bf_variants = {}  # (sample, chrom, pos) -> (Variant, Call)
    for i in range(bf_vcf.num_variants()):
        var = vcflib.get_variant(bf_vcf, i)
        for sample in SAMPLES:
            call = vcflib.get_call(var, sample)
            # Skip calls that carry no read information at all.
            if call.num_ref is None and call.num_alt is None and \
               call.total_reads is None and call.vaf is None:
                continue
            key = sample, var.chrom, var.pos
            assert key not in bf_variants, "Duplicate: %s %s %s" % key
            bf_variants[key] = var, call

    # Find the variants that can be backfilled.
    # List of (in_var_num, (sample, chrom, pos), bf_call)
    matches = []
    for i in range(in_vcf.num_variants()):
        in_var = vcflib.get_variant(in_vcf, i)
        for sample in SAMPLES:
            key = sample, in_var.chrom, in_var.pos
            # Skip if there is no backfill information.
            if key not in bf_variants:
                continue
            # Don't worry if the variants match (ref/alt).  Just want
            # a rough estimate of the coverage at this location.
            bf_var, bf_call = bf_variants[key]
            matches.append((i, key, bf_call))

    # Update the read counts in a copy of the annotated VCF file.
    out_vcf = copy.deepcopy(in_vcf)
    # Defined elsewhere in this file; presumably declares the BFILL_*
    # genotype fields on out_vcf -- see its definition.
    add_backfill_genotypes(out_vcf)

    seen = set()  # (sample, chrom, pos) keys matched to an input variant
    for var_num, key, bf_call in matches:
        seen.add(key)
        sample = key[0]

        var = vcflib.get_variant(out_vcf, var_num)
        GD = var.sample2genodict[sample]

        changed = False
        for gt_key, call_attr in mapping:
            value = getattr(bf_call, call_attr)
            if value is None:
                continue
            if isinstance(value, list):  # arbitrarily use max
                value = max(value)
            GD[gt_key] = vcflib._format_vcf_value(value)
            changed = True
        if changed:
            vcflib.set_variant(out_vcf, var_num, var)

    # Add the variants that are in bf_file, but not in in_file.  A
    # backfill variant is stored under one key per sample, so remember
    # which Variant objects were already appended; otherwise the same
    # variant would be added once for every sample.
    added = set()  # id() of the backfill Variants already appended
    for key in bf_variants:
        if key in seen:
            continue
        bf_var, bf_call = bf_variants[key]
        if id(bf_var) in added:
            continue
        added.add(id(bf_var))
        # VarScan sets the filter_ to "PASS" for everything.  Get rid
        # of this.
        bf_var.filter_ = ["BACKFILL"]
        vcflib.add_variant(out_vcf, bf_var)

    vcflib.write(out_file, out_vcf)
Пример #5
0
def make_cancer_samples_file(vcf_file, nc_match, outfile):
    """Write a germline/tumor sample pairing file for a VCF file.

    The output is two-column tab-delimited text with no header:
    <germline>  <tumor>

    vcf_file   Path of the VCF file whose sample columns are examined.
    nc_match   Sequence of (germline sample, tumor sample) pairs.
    outfile    Path of the text file to write.

    The sample columns of the VCF file are classified as germline or
    tumor, taken two at a time, and each pair is written with the
    germline sample first.  If the VCF file contains no germline
    sample, nothing is written and None is returned.

    Raises AssertionError on an unrecognized sample name, on duplicate
    tumor samples, or when the samples do not form germline/tumor
    pairs.
    """
    from genomicode import vcflib
    from genomicode import hashlib
    from genomicode import jmath

    # vcf samples (joined with bcftools).
    # PIM005_G   peak1   2:PIM001_G      peak2   3:PIM001_G   [...]

    germline_samples = [x[0] for x in nc_match]
    tumor_samples = [x[1] for x in nc_match]

    # Hopefully should be able to find the samples in the first 1000
    # rows.
    vcf = vcflib.read(vcf_file, nrows=1000)

    # HACK: Radia has calls from RNA.  Ignore them.
    # <tumor_sample>_RNA
    rna = set("%s_RNA" % x for x in tumor_samples)
    samples = [x for x in vcf.samples if x not in rna]

    # Samples may be hashed, e.g.
    # 196B-MG -> X196B_MG
    # Need to compare against hashed samples.
    germline_samples_h = [hashlib.hash_var(x) for x in germline_samples]
    tumor_samples_h = [hashlib.hash_var(x) for x in tumor_samples]
    # Make sure hashing does not make duplicate tumor samples.
    # Germline may be duplicated.
    assert not _dups(tumor_samples)
    assert not _dups(tumor_samples_h)

    # A sample name is accepted in either its raw or its hashed form.
    germline_names = set(germline_samples) | set(germline_samples_h)
    tumor_names = set(tumor_samples) | set(tumor_samples_h)

    # Clean up samples.  The name found in the VCF file is kept as is
    # (hashed names are not unhashed, otherwise snpeff will be
    # confused).
    clean = []  # list of tuples ("G" or "T", sample_name)
    for sample in samples:
        if sample in germline_names:
            x = "G", sample
        elif sample in tumor_names:
            x = "T", sample
        else:
            # <num>:<germline sample name>
            parts = sample.split(":", 1)
            assert len(parts) == 2, \
                "Unknown sample name (%s) in: %s" % (sample, vcf_file)
            assert jmath.is_int(parts[0]), \
                "Unknown sample name (%s) in: %s" % (sample, vcf_file)
            name = parts[1]
            if name not in germline_names:
                raise AssertionError("Unknown sample name: %s" % sample)
            x = "G", name
        clean.append(x)
    samples = clean

    # If there are no germline samples, then don't make a file.
    num_germline = len([x for x in samples if x[0] == "G"])
    num_tumor = len([x for x in samples if x[0] == "T"])
    if not num_germline:
        return None
    # Make sure there are the same number of germline and tumor samples.
    assert num_germline == num_tumor, \
        "Germline/Tumor mismatch: %s" % vcf_file
    assert len(samples) % 2 == 0

    # Consecutive samples form a pair that should contain one "G" and
    # one "T".  Nothing is written until every pair has been checked.
    lines = []
    for i in range(0, len(samples), 2):
        t1, s1 = samples[i]
        t2, s2 = samples[i + 1]
        assert t1 != t2, "Bad Germline/Tumor ordering: %s" % vcf_file
        # Want germline, then tumor.
        if t1 == "T" and t2 == "G":
            t1, s1, t2, s2 = t2, s2, t1, s1
        assert t1 == "G" and t2 == "T"
        lines.append("%s\t%s\n" % (s1, s2))

    # Close the file explicitly so the data is flushed on return.
    with open(outfile, 'w') as handle:
        handle.writelines(lines)
def parse_snpeff_file(vcf_filename, out_filename):
    """Extract the snpEff ANN annotations of a VCF file to a text file.

    vcf_filename   Path of the VCF file to read.
    out_filename   Path of the tab-delimited text file to write.

    The output has a header row and the columns:
    Chrom  Pos  Ref  Alt  <snpEff-specific columns>
    with one row per ANN annotation of each variant.  Variants with no
    ANN entry in INFO are skipped.  If the VCF file has no ANN header
    line, nothing is written and the function returns.

    Raises AssertionError if the VCF file has no header lines, has
    several different ANN header lines, has an ANN header in an
    unexpected format, or has an annotation whose number of fields
    differs from the header.
    """
    from genomicode import vcflib

    # The snpEff-specific columns are read from the ANN header line:
    #
    # ##INFO=<ID=ANN,Number=.,Type=String,
    #     Description="Functional annotations: '
    #     Allele |
    #     Annotation |
    #     Annotation_Impact |
    #     Gene_Name |
    #     Gene_ID |
    #     Feature_Type |
    #     Feature_ID |
    #     Transcript_BioType |
    #     Rank |
    #     HGVS.c |
    #     HGVS.p |
    #     cDNA.pos / cDNA.length |
    #     CDS.pos / CDS.length |
    #     AA.pos / AA.length |
    #     Distance | ERRORS / WARNINGS / INFO' ">

    vcf = vcflib.read(vcf_filename)

    # Figure out the Functional annotations.
    assert vcf.matrix.headerlines, "No header lines"
    ann_lines = [
        x for x in vcf.matrix.headerlines if x.find("<ID=ANN,") >= 0]
    if not ann_lines:
        return
    # No duplicates.
    # The ANN line can end with:
    #   ERRORS / WARNINGS / INFO'">
    #   ERRORS / WARNINGS / INFO' ">
    # I encountered a VCF file that contained two ANN lines differing
    # by this spacing.  Normalize these lines and make sure there are
    # no duplicates.
    ann_lines = [
        x.replace("ERRORS / WARNINGS / INFO' \">",
                  "ERRORS / WARNINGS / INFO'\">") for x in ann_lines
    ]
    ann_lines = sorted(set(ann_lines))
    assert len(ann_lines) == 1, "Multiple ANN headers: %s" % vcf_filename
    x = ann_lines[0].strip()
    TEXT = "Functional annotations:"
    assert TEXT in x
    x = x[x.index(TEXT) + len(TEXT):]  # Get rid of "Functional annotations:"
    assert x.endswith('">')  # No ">
    x = x[:-2].strip()
    assert x.startswith("'") and x.endswith("'")  # No ''
    x = x[1:-1]
    annotations = [name.strip() for name in x.split("|")]

    header = ["Chrom", "Pos", "Ref", "Alt"] + annotations
    # The with statement closes (and flushes) the output file even if
    # an assertion fails part way through.
    with open(out_filename, 'w') as handle:
        handle.write("\t".join(header) + "\n")

        for i in range(vcf.num_variants()):
            var = vcflib.get_variant(vcf, i)
            if "ANN" not in var.infodict:
                continue
            alt = ",".join(var.alt)

            # Can have multiple annotations if there are more than one
            # allele.
            # <ALLELE>|...|...|,<ALLELE>|...|...|
            # If this happens, just add them to the file.
            for annot in var.infodict["ANN"].split(","):
                values = [value.strip() for value in annot.split("|")]
                assert len(values) == len(annotations), \
                    "Mismatch annotations %d %d: %s %s %d" % (
                        len(annotations), len(values), vcf_filename,
                        var.chrom, var.pos)

                row = [var.chrom, var.pos, var.ref, alt] + values
                assert len(row) == len(header)
                handle.write("\t".join(map(str, row)) + "\n")
Пример #7
0
def _append_lines_locked(outfilename, lines, lock):
    # Append each line, newline-terminated, to outfilename while
    # holding lock.  The file is opened (and so created) even when
    # lines is empty, but then nothing is written to it.
    text = "".join([line + "\n" for line in lines])
    lock.acquire()
    try:
        with open(outfilename, 'a') as handle:
            handle.write(text)
    finally:
        # Release the lock even if the file cannot be opened or
        # written, so that other users of the lock are not blocked.
        lock.release()


def summarize_vcf_file(filename, filestem, header, outfilename, lock):
    """Append one tab-delimited row per variant and sample to a file.

    filename      Path of the VCF file to read.
    filestem      Written to the second column of every row.  Also
                  used to undo the hashing of the sample name: a
                  sample equal to hash_var(filestem) is reported as
                  filestem.
    header        Sequence of column names; only its length is used,
                  to check the length of every row.
    outfilename   Path of the text file to append to.
    lock          Object with acquire() and release() methods; it is
                  held while outfilename is being written.

    Each row contains: caller name, filestem, sample, chrom, pos, ref,
    alt, source ("DNA" or "RNA"), num_ref, num_alt, total_reads, vaf,
    filter, call and GQ.  Rows are written in batches of 100000.

    Raises AssertionError if a row does not have len(header) columns.
    """
    from genomicode import hashlib
    from genomicode import vcflib

    vcf = vcflib.read(filename)

    # If the sample begins with an integer, there may be a "X"
    # pre-pended to it.  The hashed form of filestem is used to detect
    # this case; it is the same for every row, so compute it once.
    hashed_filestem = hashlib.hash_var(filestem)

    lines = []
    for i in range(vcf.num_variants()):
        var = vcflib.get_variant(vcf, i)

        caller_name = var.caller.name
        ref = var.ref
        alt = ",".join(var.alt)
        filter_str = vcf.caller.get_filter(var)

        for sample in var.samples:
            clean_sample = sample
            if sample == hashed_filestem:
                clean_sample = filestem

            source = "DNA"
            if caller_name == "Radia":
                # DNA    <clean_sample>       196B-lung
                # RNA    <clean_sample>_RNA   196B-lung_RNA
                # Figure out whether this is RNA and fix it.
                if clean_sample.endswith("_RNA"):
                    clean_sample = clean_sample[:-4]
                    source = "RNA"

            genodict = var.sample2genodict[sample]
            call = vcflib.get_call(var, sample)

            num_ref = vcflib._format_vcf_value(call.num_ref, None_char="")
            num_alt = vcflib._format_vcf_value(call.num_alt, None_char="")
            total_reads = vcflib._format_vcf_value(call.total_reads,
                                                   None_char="")
            vaf = vcflib._format_vcf_value(call.vaf, None_char="")
            call_str = vcflib._format_vcf_value(call.call, None_char="")
            GQ = genodict.get("GQ", "")
            if GQ in [None, "."]:
                GQ = ""

            row = caller_name, filestem, clean_sample, var.chrom, var.pos, \
                ref, alt, source, \
                num_ref, num_alt, total_reads, vaf, filter_str, call_str, GQ
            assert len(row) == len(header)
            lines.append("\t".join(map(str, row)))

            # Write in batches so that the rows of a big file are not
            # all held in memory.
            if len(lines) >= 100000:
                _append_lines_locked(outfilename, lines, lock)
                lines = []

    # Write whatever is left.  When nothing is left, no text is
    # written, so no blank row ends up in the output file.
    _append_lines_locked(outfilename, lines, lock)