def sort_vcf_file(filename):
    """Sort a VCF file in place by chromosome and position.

    The file is read with vcflib.  If no two neighboring rows on the
    same chromosome have a decreasing POS, the file is left untouched.
    Otherwise the rows are reordered using jmath.order_list (with
    natural=True) on "<CHROM>:<POS>" keys and written back to
    `filename`.

    Note that the "already sorted" test only compares neighboring rows
    that share a chromosome; the order of the chromosomes themselves
    is not examined.
    """
    from genomicode import vcflib
    from genomicode import jmath
    from genomicode import AnnotationMatrix

    vcf = vcflib.read(filename)
    chroms = vcf.matrix["#CHROM"]
    positions = [int(pos) for pos in vcf.matrix["POS"]]

    # Nothing to do unless some row is followed by a lower position on
    # the same chromosome.
    needs_sort = False
    for row in range(1, len(chroms)):
        same_chrom = chroms[row - 1] == chroms[row]
        if same_chrom and positions[row] < positions[row - 1]:
            needs_sort = True
            break
    if not needs_sort:
        return

    # Reorder the rows by CHROM and POS.
    sort_keys = ["%s:%d" % pair for pair in zip(chroms, positions)]
    row_order = jmath.order_list(sort_keys, natural=True)
    vcf.matrix = AnnotationMatrix.rowslice(vcf.matrix, row_order)
    vcflib.write(filename, vcf)
def fix_vcf_file(sample, infile, outfile):
    """Add the missing FORMAT and sample columns to a VCF file.

    JointSNVMix produces VCF files that don't have FORMAT and
    <SAMPLE> columns.  This reads `infile`, builds a DP:RD:AD:FREQ
    genotype string for every variant from its call, adds a FORMAT
    column and a column named `sample`, appends the matching ##FORMAT
    header lines, and writes the result to `outfile`.

    Raises AssertionError if the input already has a FORMAT column,
    already has samples, or already has a column named `sample`.
    """
    from genomicode import vcflib

    vcf = vcflib.read(infile)
    matrix = vcf.matrix
    genotype_names = ["DP", "RD", "AD", "FREQ"]

    def genotype_for(variant_num):
        # Format the genotype string for one variant from its call.
        variant = vcflib.get_variant(vcf, variant_num)
        call = vcflib.get_call(variant, None)
        values = {
            "DP": call.total_reads,
            "RD": call.num_ref,
            "AD": call.num_alt,
            "FREQ": call.vaf,
        }
        return vcflib._format_genotype(genotype_names, values)

    # One genotype string per variant.
    all_genotypes = [genotype_for(n) for n in range(vcf.num_variants())]

    # FORMAT column: the same string on every row.
    assert "FORMAT" not in matrix
    format_string = ":".join(genotype_names)
    matrix.headers.append("FORMAT")
    matrix.headers_h.append("FORMAT")
    matrix.header2annots["FORMAT"] = [format_string] * matrix.num_annots()

    # Sample column: the per-variant genotype strings.
    assert not vcf.samples
    assert sample not in matrix
    matrix.headers.append(sample)
    matrix.headers_h.append(sample)
    matrix.header2annots[sample] = all_genotypes
    vcf.samples = [sample]

    # Describe the new genotype fields in the header.
    # NOTE(review): FREQ is declared Type=Integer although it carries
    # call.vaf -- confirm whether Float was intended.
    matrix.headerlines.extend([
        '##FORMAT=<ID=RD,Number=1,Type=Integer,Description="Allelic depth for the ref allele in the tumor sample">',
        '##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Allelic depth for the alt allele in the tumor sample">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">',
        '##FORMAT=<ID=FREQ,Number=1,Type=Integer,Description="Variant allele frequency">',
    ])

    vcflib.write(outfile, vcf)
def filter_by_vartype(vartype, infile, outfile):
    """Write the variants of `infile` selected by `vartype` to `outfile`.

    vartype  "all", "snp", or "indel".
             "all" copies the file unchanged (shutil.copy2).
             "snp" passes is_snp to vcflib.select_variants.
             "indel" passes is_indel to vcflib.select_variants.

    Raises AssertionError for any other `vartype`.
    """
    import shutil
    from genomicode import vcflib

    assert vartype in ["all", "snp", "indel"]

    if vartype != "all":
        vcf = vcflib.read(infile)
        selector = is_indel if vartype == "indel" else is_snp
        selected = vcflib.select_variants(vcf, selector)
        vcflib.write(outfile, selected)
        return

    # No filtering requested; a plain copy is enough.
    shutil.copy2(infile, outfile)
def backfill_vcf(in_file, bf_file, out_file):
    """Backfill read counts from `bf_file` into the variants of `in_file`.

    Only samples present in both VCF files are considered.  For every
    variant of `in_file` that has a backfill call at the same
    (sample, chrom, pos), the BFILL_REF, BFILL_ALT, BFILL_COV and
    BFILL_VAF genotype fields are filled in from that call.  The
    alleles are deliberately not compared: only a rough estimate of
    the coverage at the location is wanted.  Variants found only in
    `bf_file` are added with their filter set to ["BACKFILL"].  The
    result is written to `out_file`.

    Raises AssertionError if `bf_file` has two calls with read counts
    for the same (sample, chrom, pos).
    """
    import copy
    from genomicode import vcflib

    in_vcf = vcflib.read(in_file)
    bf_vcf = vcflib.read(bf_file)

    # May have multiple samples, e.g. germline and tumor.  Only those
    # found in both files can be backfilled.
    SAMPLES = [name for name in in_vcf.samples if name in bf_vcf.samples]

    # Index the backfill calls that carry any read count information.
    # (sample, chrom, pos) -> ref, alt, Variant, Call
    bf_variants = {}
    for bf_num in range(bf_vcf.num_variants()):
        bf_variant = vcflib.get_variant(bf_vcf, bf_num)
        for sample in SAMPLES:
            call = vcflib.get_call(bf_variant, sample)
            has_counts = (
                call.num_ref is not None or call.num_alt is not None or
                call.total_reads is not None or call.vaf is not None)
            if not has_counts:
                continue
            key = sample, bf_variant.chrom, bf_variant.pos
            assert key not in bf_variants, "Duplicate: %s %s %s" % key
            bf_variants[key] = bf_variant.ref, bf_variant.alt, bf_variant, call

    # Collect the input variants that have backfill information.
    # List of (chrom, pos, in_var_num, sample, in_call, bf_var, bf_call)
    matches = []
    for in_num in range(in_vcf.num_variants()):
        in_variant = vcflib.get_variant(in_vcf, in_num)
        for sample in SAMPLES:
            key = sample, in_variant.chrom, in_variant.pos
            if key in bf_variants:
                _, _, bf_variant, bf_call = bf_variants[key]
                in_call = vcflib.get_call(in_variant, sample)
                matches.append((
                    in_variant.chrom, in_variant.pos, in_num, sample,
                    in_call, bf_variant, bf_call))

    # Copy the input and fill in the backfill genotype fields.
    out_vcf = copy.deepcopy(in_vcf)
    add_backfill_genotypes(out_vcf)
    # (genotype field, Call attribute)
    genotype_fields = [
        ("BFILL_REF", "num_ref"),
        ("BFILL_ALT", "num_alt"),
        ("BFILL_COV", "total_reads"),
        ("BFILL_VAF", "vaf"),
    ]
    seen = set()  # (sample, chrom, pos) already present in in_file
    for chrom, pos, var_num, sample, in_call, bf_variant, bf_call in matches:
        seen.add((sample, chrom, pos))
        out_variant = vcflib.get_variant(out_vcf, var_num)
        genodict = out_variant.sample2genodict[sample]
        changed = False
        for gt_key, call_attr in genotype_fields:
            value = getattr(bf_call, call_attr)
            if value is None:
                continue
            if type(value) is list:
                # Several values; arbitrarily use the max.
                value = max(value)
            genodict[gt_key] = vcflib._format_vcf_value(value)
            changed = True
        if changed:
            vcflib.set_variant(out_vcf, var_num, out_variant)

    # Add the variants that are in bf_file, but not in in_file.
    # NOTE(review): a backfill variant with calls for several samples
    # is reached once per sample here -- confirm that add_variant
    # tolerates being handed the same variant more than once.
    for key in bf_variants:
        if key in seen:
            continue
        bf_variant = bf_variants[key][2]
        # VarScan sets the filter_ to "PASS" for everything.  Get rid
        # of this.
        bf_variant.filter_ = ["BACKFILL"]
        vcflib.add_variant(out_vcf, bf_variant)

    vcflib.write(out_file, out_vcf)
def make_cancer_samples_file(vcf_file, nc_match, outfile):
    """Write a germline/tumor sample pairing file for a VCF file.

    The output is two column tab-delimited text with no headers:
        <germline>  <tumor>

    vcf_file  VCF file whose samples are paired up.  Only the first
              1000 rows are read, which hopefully is enough to find
              the samples.
    nc_match  sequence of (germline_sample, tumor_sample) pairs.
    outfile   path of the file to write.

    The VCF samples (joined with bcftools) may look like:
        PIM005_G  peak1  2:PIM001_G  peak2  3:PIM001_G [...]
    Sample names are matched against the names in `nc_match`, either
    as given or hashed with hashlib.hash_var (e.g. 196B-MG ->
    X196B_MG).  A name of the form <num>:<germline sample name> is
    accepted as a germline sample and written without the "<num>:"
    prefix.  Samples named <tumor_sample>_RNA are ignored.

    Returns None.  No file is written if the VCF file contains no
    germline samples.

    Raises AssertionError if the tumor samples are duplicated (before
    or after hashing), if a sample name is not recognized, or if the
    samples do not form adjacent germline/tumor pairs.
    """
    from genomicode import vcflib
    from genomicode import hashlib
    from genomicode import jmath

    germline_samples = [x[0] for x in nc_match]
    tumor_samples = [x[1] for x in nc_match]

    vcf = vcflib.read(vcf_file, nrows=1000)

    # HACK: Radia has calls from RNA.  Ignore them.
    # <tumor_sample>_RNA
    rna_samples = set("%s_RNA" % x for x in tumor_samples)
    samples = [x for x in vcf.samples if x not in rna_samples]

    # Samples may be hashed, so compare against the hashed names too.
    germline_samples_h = [hashlib.hash_var(x) for x in germline_samples]
    tumor_samples_h = [hashlib.hash_var(x) for x in tumor_samples]

    # Make sure hashing does not make duplicate tumor samples.
    # Germline may be duplicated.
    assert not _dups(tumor_samples)
    assert not _dups(tumor_samples_h)

    # Accept either the original or the hashed spelling of a name.
    germline_names = set(germline_samples).union(germline_samples_h)
    tumor_names = set(tumor_samples).union(tumor_samples_h)

    # Classify each sample.  Hashed names are kept as they appear in
    # the VCF file.  Don't unhash them.  Otherwise, snpeff will be
    # confused.
    clean = []  # list of tuples ("G" or "T", sample_name)
    for sample in samples:
        if sample in germline_names:
            clean.append(("G", sample))
            continue
        if sample in tumor_names:
            clean.append(("T", sample))
            continue
        # <num>:<germline sample name>
        parts = sample.split(":", 1)
        assert len(parts) == 2, "Unknown sample name (%s) in: %s" % (
            sample, vcf_file)
        assert jmath.is_int(
            parts[0]), "Unknown sample name (%s) in: %s" % (sample, vcf_file)
        if parts[1] not in germline_names:
            raise AssertionError("Unknown sample name: %s" % sample)
        clean.append(("G", parts[1]))
    samples = clean

    # If there are no germline samples, then don't make a file.
    num_germline = len([x for x in samples if x[0] == "G"])
    num_tumor = len([x for x in samples if x[0] == "T"])
    if not num_germline:
        return None

    # Make sure there are the same number of germline and tumor samples.
    assert num_germline == num_tumor, \
        "Germline/Tumor mismatch: %s" % vcf_file
    assert len(samples) % 2 == 0

    # Each adjacent pair should contain one "G" and one "T".  All
    # pairs are validated before anything is written.
    lines = []
    for i in range(0, len(samples), 2):
        (t1, s1), (t2, s2) = samples[i], samples[i + 1]
        assert t1 != t2, "Bad Germline/Tumor ordering: %s" % vcf_file
        # Want germline, then tumor.
        if t1 == "T":
            s1, s2 = s2, s1
        lines.append("%s\t%s\n" % (s1, s2))

    # Close the file explicitly so the contents are flushed on return.
    with open(outfile, 'w') as handle:
        handle.writelines(lines)
def parse_snpeff_file(vcf_filename, out_filename):
    """Write the snpEff annotations of a VCF file as tab-delimited text.

    The annotations are expected in the ANN field of INFO.  The output
    file has a header row followed by one row per annotation:
        Chrom  Pos  Ref  Alt  <snpEff-specific columns>
    The snpEff-specific column names are taken from the ANN header
    line of the VCF file.  A variant can carry several comma-separated
    annotations (e.g. when there is more than one allele); each one
    becomes its own row.  Variants without ANN are skipped.

    If the VCF file has no ANN header line, nothing is written and
    `out_filename` is not created.

    Raises AssertionError if the VCF file has no header lines, has
    conflicting ANN header lines, has an ANN header line that is not
    in the expected format, or has an annotation whose number of
    fields differs from the ANN header.
    """
    from genomicode import vcflib

    vcf = vcflib.read(vcf_filename)

    # Figure out the Functional annotations.
    assert vcf.matrix.headerlines, "No header lines"
    annotations = _parse_snpeff_ann_fields(
        vcf.matrix.headerlines, vcf_filename)
    if annotations is None:
        return

    header = ["Chrom", "Pos", "Ref", "Alt"] + annotations
    # The with block closes the file even if an assertion fails.
    with open(out_filename, 'w') as handle:
        handle.write("\t".join(header) + "\n")
        for i in range(vcf.num_variants()):
            var = vcflib.get_variant(vcf, i)
            if "ANN" not in var.infodict:
                continue
            alt = ",".join(var.alt)
            # Can have multiple annotations if there are more than one
            # allele.
            # <ALLELE>|...|...|,<ALLELE>|...|...|
            # If this happens, just add them to the file.
            for annot in var.infodict["ANN"].split(","):
                values = [value.strip() for value in annot.split("|")]
                assert len(values) == len(annotations), \
                    "Mismatch annotations %d %d: %s %s %d" % (
                        len(annotations), len(values), vcf_filename,
                        var.chrom, var.pos)
                row = [var.chrom, var.pos, var.ref, alt] + values
                assert len(row) == len(header)
                handle.write("\t".join(map(str, row)) + "\n")


def _parse_snpeff_ann_fields(headerlines, vcf_filename):
    # Return the list of field names declared by the ANN header line,
    # or None if there is no ANN header line.
    #
    # ##INFO=<ID=ANN,Number=.,Type=String,
    #   Description="Functional annotations: '
    #   Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID |
    #   Feature_Type | Feature_ID | Transcript_BioType | Rank |
    #   HGVS.c | HGVS.p | cDNA.pos / cDNA.length |
    #   CDS.pos / CDS.length | AA.pos / AA.length | Distance |
    #   ERRORS / WARNINGS / INFO' ">
    ann_lines = [x for x in headerlines if x.find("<ID=ANN,") >= 0]
    if not ann_lines:
        return None

    # The ANN line can end with:
    #   ERRORS / WARNINGS / INFO'">
    #   ERRORS / WARNINGS / INFO' ">
    # I encountered a VCF file that contained two ANN lines differing
    # by this spacing.  Normalize these lines and make sure there are
    # no duplicates.
    unique_lines = set(
        x.replace(
            "ERRORS / WARNINGS / INFO' \">", "ERRORS / WARNINGS / INFO'\">")
        for x in ann_lines)
    assert len(unique_lines) == 1, "Multiple ANN headers: %s" % vcf_filename

    x = unique_lines.pop().strip()
    TEXT = "Functional annotations:"
    assert TEXT in x
    x = x[x.index(TEXT) + len(TEXT):]  # Get rid of "Functional annotations:"
    assert x.endswith('">')
    x = x[:-2].strip()                 # No ">
    assert x.startswith("'") and x.endswith("'")
    x = x[1:-1]                        # No ''
    return [name.strip() for name in x.split("|")]
def summarize_vcf_file(filename, filestem, header, outfilename, lock):
    """Append a per-sample summary of the calls in a VCF file.

    One tab-delimited line is appended to `outfilename` for every
    (variant, sample) in `filename`, holding:
        caller name, filestem, sample, chrom, pos, ref, alt,
        source ("DNA" or "RNA"), num_ref, num_alt, total_reads, vaf,
        filter, call, GQ

    filename     VCF file to summarize.
    filestem     name recorded in the second column.  A sample equal
                 to the hashed filestem (hashlib.hash_var) is reported
                 as the filestem itself.
    header       column names of the output file; only its length is
                 used, to check each line.
    outfilename  file to append to.
    lock         object with acquire() and release(); it is held
                 whenever `outfilename` is written.

    Lines are written in batches to limit memory use.  Nothing is
    written when there are no lines left to write.

    Raises AssertionError if a line does not have len(header) columns.
    """
    from genomicode import hashlib
    from genomicode import vcflib

    MAX_BUFFERED_LINES = 100000

    def append_lines(buffered):
        # Append the buffered lines to outfilename while holding the
        # lock.  The lock is released even if the write fails, so
        # other holders of the lock are not blocked forever.
        if not buffered:
            return
        text = "\n".join(buffered) + "\n"
        lock.acquire()
        try:
            with open(outfilename, 'a') as handle:
                handle.write(text)
        finally:
            lock.release()

    vcf = vcflib.read(filename)
    hashed_filestem = hashlib.hash_var(filestem)

    lines = []
    for i in range(vcf.num_variants()):
        var = vcflib.get_variant(vcf, i)

        caller_name = var.caller.name
        ref = var.ref
        alt = ",".join(var.alt)
        filter_str = vcf.caller.get_filter(var)

        for sample in var.samples:
            # If sample begins with an integer, there may be a
            # "X" pre-pended to it.  Try to detect this case
            # and fix it.
            clean_sample = sample
            if sample == hashed_filestem:
                clean_sample = filestem

            source = "DNA"
            if caller_name == "Radia":
                # DNA  <clean_sample>       196B-lung
                # RNA  <clean_sample>_RNA   196B-lung_RNA
                # Figure out whether this is RNA and fix it.
                if clean_sample.endswith("_RNA"):
                    clean_sample = clean_sample[:-4]
                    source = "RNA"

            genodict = var.sample2genodict[sample]
            call = vcflib.get_call(var, sample)
            num_ref = vcflib._format_vcf_value(call.num_ref, None_char="")
            num_alt = vcflib._format_vcf_value(call.num_alt, None_char="")
            total_reads = vcflib._format_vcf_value(
                call.total_reads, None_char="")
            vaf = vcflib._format_vcf_value(call.vaf, None_char="")
            call_str = vcflib._format_vcf_value(call.call, None_char="")
            GQ = genodict.get("GQ", "")
            if GQ in [None, "."]:
                GQ = ""

            x = caller_name, filestem, clean_sample, var.chrom, var.pos, \
                ref, alt, source, \
                num_ref, num_alt, total_reads, vaf, filter_str, call_str, GQ
            assert len(x) == len(header)
            lines.append("\t".join(map(str, x)))

        if len(lines) >= MAX_BUFFERED_LINES:
            append_lines(lines)
            lines = []

    append_lines(lines)