def test_by_id_vcf_in_regions():
    """Check id-based matching of VCF records to model-input regions.

    One region of +/- 20 bp is built around every record of the example VCF
    and labelled with that record's generated id.  The matcher must then
    return one record per region, in order, for every sequence field; a
    region set containing an empty id must make it raise.
    """
    from kipoi.postprocessing.variant_effects.snv_predict import get_variants_in_regions_sequential_vcf
    from kipoi.postprocessing.variant_effects.utils.generic import default_vcf_id_gen

    vcf_path = kipoi.postprocessing.variant_effects.ensure_tabixed_vcf(
        "examples/rbp/example_files/variants.vcf")

    # First pass over the VCF: derive one labelled window per record.
    ints1 = {field: [] for field in ("chr", "start", "end", "strand", "id")}
    reader = cyvcf2.VCF(vcf_path, "r")
    for record in reader:
        window = {
            "chr": record.CHROM,
            "start": record.POS - 20,
            "end": record.POS + 20,
            "strand": "*",
            "id": default_vcf_id_gen(record),
        }
        for field, value in window.items():
            ints1[field].append(value)
    reader.close()

    # Second pass: a fresh reader is handed to the function under test.
    reader = cyvcf2.VCF(vcf_path, "r")
    model_input = {"metadata": {"gr_a": ints1, "gr_b": ints1}}
    seq_to_meta = {"seq_a": "gr_a", "seq_a2": "gr_a", "seq_b": "gr_b"}
    matched = get_variants_in_regions_sequential_vcf(
        model_input, seq_to_meta, reader, default_vcf_id_gen)
    vcf_records, process_lines, process_seq_fields, process_ids = matched

    expected_count = len(model_input["metadata"]["gr_a"]["chr"])
    assert len(vcf_records) == expected_count
    assert process_lines == list(range(expected_count))
    expected_fields = set(seq_to_meta.keys())
    for fields in process_seq_fields:
        assert set(fields) == expected_fields

    # Now imitate a bad id in one range: the ids of gr_a and gr_b disagree.
    ints2 = copy.deepcopy(ints1)
    ints2["id"][2] = ""
    model_input = {"metadata": {"gr_a": ints1, "gr_b": ints2}}
    seq_to_meta = {"seq_a": "gr_a", "seq_a2": "gr_a", "seq_b": "gr_b"}
    with pytest.raises(Exception):
        get_variants_in_regions_sequential_vcf(
            model_input, seq_to_meta, reader, default_vcf_id_gen)
def test__overlap_vcf_region():
    """Check `sp._overlap_vcf_region` for dict and GenomicRanges region inputs.

    First a single region is queried and must return the records in file
    order, all attributed to region 0.  Then three regions are queried, with
    and without indel exclusion, and the returned region indices must match
    the expected per-record region list.
    """

    def as_genomic_ranges(regions):
        # Same four columns, positional, as the dict representation.
        return GenomicRanges(regions["chr"], regions["start"],
                             regions["end"], regions["id"])

    vcf_path = kipoi.postprocessing.variant_effects.ensure_tabixed_vcf(
        "examples/rbp/example_files/variants.vcf")

    # Read the full record list once for reference, then reopen for querying.
    vcf_obj = cyvcf2.VCF(vcf_path)
    all_records = [record for record in vcf_obj]
    vcf_obj.close()
    vcf_obj = cyvcf2.VCF(vcf_path)

    # --- one region ---
    regions_dict = {
        "chr": ["chr22"],
        "start": [21541589],
        "end": [36702137],
        "id": [0]
    }
    regions_gr = as_genomic_ranges(regions_dict)
    for regions in [regions_dict, regions_gr]:
        found_vars, overlapping_region = sp._overlap_vcf_region(
            vcf_obj, regions, exclude_indels=False)
        for expected, found in zip(all_records, found_vars):
            assert str(expected) == str(found)
        assert len(overlapping_region) == len(found_vars)
        for region_idx in overlapping_region:
            assert region_idx == 0

    # --- three regions, two of them narrow ---
    regions_dict = {
        "chr": ["chr22", "chr22", "chr22"],
        "start": [21541589, 21541589, 30630220],
        "end": [36702137, 21541590, 30630222],
        "id": [0, 1, 2]
    }
    regions_gr = as_genomic_ranges(regions_dict)

    # Expected records and region indices, with indels included ...
    plus_indel_results = all_records + all_records[:1] + all_records[3:4]
    ref_lines_indel = [0] * len(all_records) + [1] + [2]
    # ... and with indels dropped.
    snv_results = [rec for rec in plus_indel_results if not rec.is_indel]
    snv_ref_lines = [
        line for line, rec in zip(ref_lines_indel, plus_indel_results)
        if not rec.is_indel
    ]

    scenarios = [
        (False, plus_indel_results, ref_lines_indel),
        (True, snv_results, snv_ref_lines),
    ]
    for regions in [regions_dict, regions_gr]:
        for exclude_indels, ref_res, ref_lines in scenarios:
            found_vars, overlapping_region = sp._overlap_vcf_region(
                vcf_obj, regions, exclude_indels)
            for expected, found in zip(ref_res, found_vars):
                if not expected.is_indel:
                    assert str(expected) == str(found)
            assert overlapping_region == ref_lines
def get_snp_genotypes(chromosome, position, samples=None):
    '''Returns a pandas DataFrame of genotypes, along with phasing status,
    for a specific SNP.

    The per-chromosome VCF is opened, queried at ``chromosome:position`` and
    closed again before returning.  Rows of the result follow the sample
    order reported by the VCF reader, not the order of ``samples``.

    Args:
        chromosome: value substituted into the VCF file name and the query.
        position: position used as both start and end of the query region.
        samples: optional list of sample names; all samples when None.

    Raises:
        ValueError: duplicated names in ``samples``, or zero / more than one
            SNP found at the position.
        KeyError: a requested sample is not reported by the VCF reader.

    >>> samples = ['HPSI0516i-pebf_2', 'HPSI0516i-zujs_5', 'HPSI1116pf-peru']
    >>> df = get_snp_genotypes(1,714439,samples)
    >>> df
                      chrA  chrB  phased
    HPSI0516i-pebf_2     0     0    True
    HPSI0516i-zujs_5     0     0    True
    HPSI1116pf-peru      0     0    True
    '''
    vcf_file = '/hps/nobackup/hipsci/scratch/genotypes/imputed/REL-2018-01/Full_Filtered/hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20180102.genotypes.chr.{chromosome}.norm.renamed.recode.vcf.gz'.format(
        chromosome=chromosome)

    if samples is None:
        vcf = cyvcf2.VCF(vcf_file)
    else:
        if len(samples) != len(set(samples)):
            raise ValueError('Duplicated samples in input list')
        vcf = cyvcf2.VCF(vcf_file, samples=samples)

    # The handle used to be leaked on every call (and on every error path).
    try:
        if samples is not None:
            for sample in samples:
                if sample not in vcf.samples:
                    raise KeyError('{} not in vcf'.format(sample))
        # reorder samples to match order in which they'll be given by the
        # vcf object
        samples = vcf.samples

        query_string = '{chromosome}:{position}-{position}'.format(
            chromosome=chromosome, position=position)
        # only keep SNPs
        variants = [x for x in vcf(query_string) if x.is_snp]

        if len(variants) > 1:
            error_message = '''Input vcf file contains more than one SNP at position {chromosome}:{position}'''.format(
                chromosome=chromosome, position=position)
            raise ValueError(error_message)
        if len(variants) == 0:
            error_message = '''Input vcf file has no SNP at position {chromosome}:{position}'''.format(
                chromosome=chromosome, position=position)
            raise ValueError(error_message)

        # Copy the genotypes into plain Python data before closing the file.
        genotypes = variants[0].genotypes
    finally:
        vcf.close()

    genotype_df = pd.DataFrame(index=samples,
                               columns=['chrA', 'chrB', 'phased'],
                               data=genotypes)
    return genotype_df
def compare_vcfs(fpath1, fpath2):
    """Assert that two VCFs carry numerically matching INFO values.

    Records are paired in file order.  For every INFO key of the first file
    (keys containing ':rID' are skipped) both values are rounded to two
    digits fewer than the shorter decimal part and compared for equality.
    Values are handled as strings containing a "." (the code calls
    `.index(".")` on them).
    """

    def decimals(text):
        # Number of characters after the decimal point.
        return len(text) - text.index(".") - 1

    first = cyvcf2.VCF(fpath1)
    second = cyvcf2.VCF(fpath2)
    for first_rec, second_rec in zip(first, second):
        first_info = dict(first_rec.INFO)
        second_info = dict(second_rec.INFO)
        for key, first_val in first_info.items():
            if ':rID' in key:
                continue
            first_decimals = decimals(first_val)
            second_val = second_info[key]
            # -2 for more tolerance
            precision = min(first_decimals, decimals(second_val)) - 2
            assert np.round(float(first_val), precision) == np.round(
                float(second_val), precision)
    second.close()
    first.close()
def main(tumour, filter_germline_het, pass_only, just_best, info_af):
    """Summarise tumour allele frequencies of a VCF read from stdin.

    Writes percentiles (module constant PERCENTILES) of the collected AFs to
    stdout: only the middle value when ``just_best`` is set, otherwise a
    three-column Lower/Best/Upper table.

    Args:
        tumour: name of the tumour sample; the other of the first two
            samples is treated as germline.
        filter_germline_het: skip variants whose germline AF lies strictly
            inside GL_HET (only applied when ``info_af`` is false).
        pass_only: skip variants whose FILTER is set.
        just_best: print only the middle percentile.
        info_af: read AF from INFO instead of the per-sample FORMAT field.
    """
    logging.info('reading from stdin...')

    # we just want to get all the AFs
    afs = []

    vcf = cyvcf2.VCF('-')
    sample_id = vcf.samples.index(tumour)
    germline_id = 1 if sample_id == 0 else 0
    logging.debug('sample_id %i germline_id %i', sample_id, germline_id)

    skipped = 0
    gaf_range = (1.0, 0.0)
    # Iterate the reader opened above: stdin is a stream and must not be
    # opened a second time after the header has been consumed.
    for v in vcf:
        if pass_only and v.FILTER is not None:
            logging.debug('skipping non-pass at %s:%s', v.CHROM, v.POS)
            skipped += 1
            continue

        # gl af
        if not info_af:
            gaf = v.format('AF')[germline_id][0]  # mutect2 af
            gaf_range = (min([gaf_range[0], gaf]), max([gaf_range[1], gaf]))
            logging.debug('gaf %s range %s', gaf, gaf_range)
            if filter_germline_het and GL_HET[0] < gaf < GL_HET[1]:
                logging.debug('skipping germline het at %s:%s', v.CHROM,
                              v.POS)
                skipped += 1
                continue

        # tumour af
        if info_af:
            af = v.INFO['AF']  # calculated af
        else:
            af = v.format('AF')[sample_id][0]  # mutect2 af
        logging.debug('appending %s to afs', af)
        afs.append(af)

    if not afs:
        logging.warning('No afs')
        answer = [0.0, 0.5, 1.0]
    else:
        logging.debug('%i afs: %s', len(afs), afs)
        answer = numpy.percentile(afs, PERCENTILES)

    if just_best:
        sys.stdout.write('{:.2f}'.format(answer[1]))
    else:
        sys.stdout.write('Lower\tBest\tUpper\n')
        sys.stdout.write('{:.2f}\t{:.2f}\t{:.2f}\n'.format(
            answer[0], answer[1], answer[2]))

    logging.info('done. skipped %i included %i. gaf range %s', skipped,
                 len(afs), gaf_range)
def main():
    """Write the variants whose INFO PON_VAF is below the threshold.

    Input path, threshold and output directory come from
    `argument_parser()`.  The output file is the input basename with a
    trailing ``.vcf`` replaced by ``.filtered.vcf``, placed in the output
    directory.
    """
    # `input_vcf` rather than `input`, to avoid shadowing the builtin.
    input_vcf, vaf_threshold, output_dir = argument_parser()
    # The dot is escaped so only a literal ".vcf" suffix is rewritten.
    output_name = re.sub(r'\.vcf$', '.filtered.vcf',
                         os.path.basename(input_vcf))
    outputfile = os.path.join(output_dir, output_name)

    vcf_handle = cyvcf2.VCF(input_vcf)
    try:
        print(vcf_handle)
        writer = cyvcf2.Writer(outputfile, vcf_handle)
        try:
            # Reuse the handle that served as header template instead of
            # opening the input a second time.
            for variant in vcf_handle:
                if variant.INFO['PON_VAF'] < vaf_threshold:
                    writer.write_record(variant)
        finally:
            writer.close()
    finally:
        vcf_handle.close()
def main():
    """Write the variants matching the trinucleotide/alt-base selection.

    Input VCF, reference and output directory come from `argument_parser()`.
    A variant is kept when `get_trinucleotide` reports a context of TTT, TTA
    or CTT together with an alternate base of G.  The output file is the
    input basename with a trailing ``.vcf`` replaced by ``.sig9.vcf``.
    """
    input_vcf, reference, output_dir = argument_parser()
    # The dot is escaped so only a literal ".vcf" suffix is rewritten.
    output_name = re.sub(r'\.vcf$', '.sig9.vcf', os.path.basename(input_vcf))
    output_vcf = os.path.join(output_dir, output_name)

    selected_contexts = ('TTT', 'TTA', 'CTT')
    vcf_handle = cyvcf2.VCF(input_vcf)
    try:
        output_vcf_handle = cyvcf2.Writer(output_vcf, vcf_handle)
        try:
            # Reuse the template handle rather than reopening the input.
            for variant in vcf_handle:
                var_position = Position(variant.CHROM, variant.POS,
                                        variant.POS)
                _refbase, altbase, var_trinucleotide = get_trinucleotide(
                    var_position, variant.REF, variant.ALT[0], reference)
                if var_trinucleotide in selected_contexts and altbase == 'G':
                    output_vcf_handle.write_record(variant)
        finally:
            # The writer was previously never closed, so buffered records
            # were not guaranteed to reach the file.
            output_vcf_handle.close()
    finally:
        vcf_handle.close()
def unphase(inVcf, outVcf):
    """Write a copy of ``inVcf`` with the two alleles randomly swapped.

    The file is first read with scikit-allel to count records and to assert
    that every site has exactly one ALT allele.  A random 0/1 matrix
    (individuals x records) then decides, per record and individual, whether
    the first two genotype entries are exchanged before the record is
    written to ``outVcf``.

    Args:
        inVcf: path of the VCF to read.
        outVcf: path of the VCF to write.
    """
    # read the vcf with scikit-allel, just to get number of snps
    print("[GET_NR_SNPS]")
    print(f"Reading: {inVcf}")
    startTime = time.perf_counter()
    callset = allel.read_vcf(inVcf)
    print(f"Took {(time.perf_counter() - startTime):.2f} seconds.")

    # no tri-allelic?
    assert (sum(callset["variants/ALT"][:, 2] != '') == 0)
    assert (sum(callset["variants/ALT"][:, 1] != '') == 0)
    assert (sum(callset["variants/ALT"][:, 0] == '') == 0)

    snpsInFile = callset["calldata/GT"].shape[0]
    print(snpsInFile)
    print("[DONE]")

    print("[UNPHASE]")
    print(f"File to unphase: {inVcf}")
    print(f"Unphased output written to: {outVcf}")

    # go through the vcf
    vcfIFS = cyvcf2.VCF(inVcf)
    try:
        # get some randomness
        numIndividuals = len(vcfIFS.samples)
        randomness = numpy.random.randint(2,
                                          size=(numIndividuals, snpsInFile))

        # create a new vcf Writer using the input vcf as a template.
        vcfOFS = cyvcf2.Writer(outVcf, vcfIFS)
        try:
            count = 0
            allIdxs = numpy.arange(numIndividuals)
            for v in vcfIFS:
                # what are the indices to be flipped?
                toFlip = allIdxs[randomness[:, count] == 1]

                # Fetch the genotype list once, edit that list, then assign
                # it back.  The previous code swapped on the result of
                # separate `v.genotypes` accesses, which only has an effect
                # if the property hands out the same list every time.
                # NOTE(review): confirm against the cyvcf2 version in use.
                genotypes = v.genotypes
                for idx in toFlip:
                    call = genotypes[idx]
                    call[0], call[1] = call[1], call[0]
                v.genotypes = genotypes

                # and write it
                vcfOFS.write_record(v)

                # increase count
                count += 1
                if (count % 100000 == 0):
                    print(count)
        finally:
            vcfOFS.close()
    finally:
        vcfIFS.close()

    print("[DONE]")
def filter_to_pass_and_reject(in_file, paired, out_dir=None):
    """Keep only calls whose filters are limited to PASS / "." / REJECT.

    Records carrying any other filter are dropped unless flagged germline in
    INFO.  REJECT or germline records are written (as PASS, with the DB flag
    set) only when the LOH check accepts them; remaining records are written
    unchanged.  The result is bgzipped and indexed; the ``-prfilter.vcf.gz``
    path is returned.
    """
    from bcbio.heterogeneity import bubbletree

    out_file = "%s-prfilter.vcf.gz" % utils.splitext_plus(in_file)[0]
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if utils.file_uptodate(out_file, in_file):
        return out_file

    allowed = ["PASS", ".", "REJECT"]
    with file_transaction(paired.tumor_data, out_file) as tx_out_file:
        max_depth = bubbletree.max_normal_germline_depth(
            in_file, bubbletree.PARAMS, paired)
        tx_out_plain = tx_out_file.replace(".vcf.gz", ".vcf")
        with contextlib.closing(cyvcf2.VCF(in_file)) as raw_reader:
            reader = _add_db_to_header(raw_reader)
            with contextlib.closing(
                    cyvcf2.Writer(tx_out_plain, reader)) as writer:
                for record in reader:
                    filters = record.FILTER.split(";") if record.FILTER else []
                    extra = [name for name in filters if name not in allowed]
                    # Other filters present and not germline: drop the call.
                    if len(extra) != 0 and not bubbletree.is_info_germline(record):
                        continue
                    if "REJECT" in filters or bubbletree.is_info_germline(record):
                        # Germline, check if we should include based on
                        # frequencies
                        stats = bubbletree._is_possible_loh(
                            record, reader, bubbletree.PARAMS, paired,
                            use_status=True, max_normal_depth=max_depth)
                        if stats:
                            record.FILTER = "PASS"
                            record.INFO["DB"] = True
                            writer.write_record(record)
                    else:
                        # Somatic, always include
                        writer.write_record(record)
        vcfutils.bgzip_and_index(tx_out_plain, paired.tumor_data["config"])
    return out_file
def main(in_file):
    """Print a CSV summary of prioritised structural variants to stdout.

    For each record the SIMPLE_ANN INFO entries (comma separated, fields
    separated by "|") are scanned; entries whose detail is "NOT_PRIORITISED"
    are ignored.  Records with at least one remaining entry produce one row.

    Args:
        in_file: path of the annotated VCF.
    """
    # Characters stripped from the SV type field.
    strip_table = str.maketrans("", "", "'[] ")

    in_cyvcf = cyvcf2.VCF(in_file)
    try:
        writer = csv.writer(sys.stdout)
        writer.writerow(["chrom", "start", "end", "svtype", "samples", "size",
                         "gene", "annotation", "detail"])
        for rec in in_cyvcf:
            calls = [parse_name(s)
                     for s, gt in zip(in_cyvcf.samples, rec.gt_bases)
                     if _has_call(gt)]
            anns = [x.split("|")
                    for x in rec.INFO.get("SIMPLE_ANN", "").split(",")]
            svtypes = set()
            all_genes = set()
            annotations = set()
            details = set()
            for svtype, annotation, genes, _, detail, _ in (
                    x for x in anns if x and len(x) > 1):
                if detail != "NOT_PRIORITISED":
                    svtypes.add(svtype.translate(strip_table))
                    all_genes.add(genes)
                    annotations.add(annotation)
                    details.add(detail)
            if not svtypes:
                continue

            start = int(rec.POS)
            end = rec.INFO.get("END")
            # Use SVLEN when present; only fall back to end - start when it
            # is missing (the fallback used to be computed unconditionally
            # and failed for records without END).
            svlen = rec.INFO.get("SVLEN")
            if svlen is None and end is not None:
                svlen = end - start
            size = abs(svlen) if svlen is not None else ""

            # Column order follows the header row: samples before size.
            writer.writerow([rec.CHROM, start, end, _combine(svtypes),
                             ";".join(calls), size, _combine(all_genes),
                             _combine(annotations), _combine(details)])
    finally:
        in_cyvcf.close()
def main():
    """Build a tsinfer sample file from a VCF and a metadata table.

    Command-line arguments are parsed with `parse_args`; individual metadata
    and diploid sites are then added to a new `tsinfer.SampleData` file
    named ``<outfile>.samples`` and a one-line summary is printed.
    """
    args = parse_args(sys.argv[1:])

    # ---- inputs -------------------------------------------------------------
    vcf_path, outfile = args.vcf, args.outfile
    threads, label_by = args.threads, args.pops_header
    meta = pd.read_csv(args.meta, sep="\t", index_col="sampleID",
                       dtype=object)

    # ---- build the sample file ----------------------------------------------
    vcf = cyvcf2.VCF(vcf_path)
    sample_data = tsinfer.SampleData(
        path=f"{outfile}.samples",
        sequence_length=chrom_len(vcf),
        num_flush_threads=threads,
        max_file_size=2**37,
    )
    with sample_data as samples:
        add_metadata(vcf, samples, meta, label_by)
        add_diploid_sites(vcf, samples)

    print(
        f"Sample file created for {samples.num_samples} samples ({samples.num_individuals}) with {samples.num_sites} variable sites.",
        flush=True)
def load_vcf(input_vcf, threads=1, aaf_thresh=0.0):
    """Load genotypes of all variants above an alternate-allele-frequency cut.

    Args:
        input_vcf: path of the VCF, opened with ``gts012=True``.
        threads: passed through to the cyvcf2 reader.
        aaf_thresh: a variant is kept only if its ``aaf`` is strictly
            greater than this value (default 0.0).

    Returns:
        ``(aaf, [chr_pos, samples, gt_array])`` where ``aaf`` and
        ``chr_pos`` are per-variant lists, ``samples`` is the reader's
        sample list and ``gt_array`` stacks the per-variant integer
        genotype arrays (one row per kept variant).
    """
    vcf = cyvcf2.VCF(input_vcf, gts012=True, threads=threads)

    # One (aaf, position, genotypes) triple per retained variant.
    kept = [
        (variant.aaf, variant.POS, variant.gt_types.astype(int))
        for variant in vcf
        if variant.aaf > aaf_thresh
    ]
    aaf = [entry[0] for entry in kept]
    chr_pos = [entry[1] for entry in kept]
    gt_array = np.array([entry[2] for entry in kept])

    samples = vcf.samples
    return aaf, [chr_pos, samples, gt_array]
def findEmptyRegions(vcfFN, bedFN, filteredFN):
    """Copy the BED lines that overlap at least one VCF record.

    Each line of ``bedFN`` is queried against ``vcfFN`` as
    ``chrom:start-end``; lines with a hit are written unchanged to
    ``filteredFN``.  The numbers of regions with and without a hit are
    printed, tab separated.

    Args:
        vcfFN: path of the VCF queried by region.
        bedFN: tab-separated BED file; only the first three columns are
            used, blank lines are skipped.
        filteredFN: output path, overwritten.
    """
    print(vcfFN)
    print(bedFN)

    found = 0
    empty = 0
    vcf = cyvcf2.VCF(vcfFN)
    try:
        with open(bedFN, 'r') as fp, open(filteredFN, 'w') as fpo:
            for l in fp:
                if not l.strip():
                    continue
                # Extra BED columns (name, score, ...) are ignored instead
                # of breaking the three-way unpacking.
                chrom, start, end = l.rstrip().split('\t')[:3]
                # any() stops at the first record of the region.
                if any(True for _ in vcf(f'{chrom}:{start}-{end}')):
                    found += 1
                    fpo.write(l)
                else:
                    empty += 1
    finally:
        vcf.close()

    print(found, empty, sep='\t')
def handle(self, *args, **options):
    """Insert the VCF arriving on stdin into a variant collection.

    Every record is fed to a `BulkVCFCountInserter`; afterwards the
    collection's count is set to the number of processed rows and saved.
    On any failure the traceback is logged, the associated node (if
    reachable) is marked as errored, and an event is created.
    """
    collection_pk = options['variant_collection_id']
    collection = VariantCollection.objects.get(pk=collection_pk)
    logging.debug("Inserting variant_collection_id = %d", collection_pk)

    try:
        reader = cyvcf2.VCF("/dev/stdin")  # Must take a filename..
        inserter = BulkVCFCountInserter(collection)
        for record in reader:
            inserter.process_entry(record)
        inserter.finish()  # Any leftovers

        collection.count = inserter.rows_processed
        collection.save()
    except Exception:
        details = get_traceback()
        logging.error(details)

        # Best effort: surface the failure on the node that owns the cache.
        try:
            node = collection.intersectioncache.node_version.node
            node.status = NodeStatus.ERROR
            errors = "Error inserting variants after bed intersection:\n" + details
            logging.error(errors)
            node.errors = errors
            node.save()
        except Exception as node_error:
            logging.error(node_error)

        create_event(None, name="stdin_to_variant_collection",
                     details=details)
def vcf2SVPosition(vcf_file):
    '''
    Collect the break-point pairs of every record in a structural-variant
    VCF, grouped by SV type.

    Added Manta/Lumpy support Dec 21 2017.
    March 3 2018 edit: a dictionary is built so that each SV type can be
    handled separately.

    Returns:
        dict mapping the INFO SVTYPE value to a list of ``(bp1, bp2)``
        Position tuples.  The keys 'BND', 'DUP', 'INS', 'DEL' and 'INV' are
        always present; any other type met in the file (for example 'TRA')
        gets its own key.

    Raises:
        ValueError: a BND record whose first ALT has no ``chrom:pos`` part.
    '''
    mate_pattern = re.compile(r'[a-zA-Z]*[0-9]*:[0-9]+')
    bp_dict = {'BND': [], 'DUP': [], 'INS': [], 'DEL': [], 'INV': []}

    for variant in cyvcf2.VCF(vcf_file):
        variant_type = variant.INFO.get('SVTYPE')

        if variant_type == "BND":
            # this one can be used for any SV VCF with BND type
            match = mate_pattern.search(variant.ALT[0])
            if match is None:
                raise ValueError(
                    f'cannot parse mate position from BND ALT '
                    f'{variant.ALT[0]!r} at {variant.CHROM}:{variant.POS}')
            bnd_chrom, bnd_pos = match.group(0).split(':')
            bp2String = f'{bnd_chrom}:{bnd_pos}-{int(bnd_pos) + 1}'
        elif variant_type == "TRA":
            # this one is specific for Delly v0.7.6 annotation
            bp2String = f"{variant.INFO.get('CHR2')}:{variant.INFO.get('END')}-{variant.INFO.get('END') + 1}"
        else:
            bp2String = f'{variant.CHROM}:{variant.INFO.get("END")}-{variant.INFO.get("END") + 1}'

        bp1 = Position.fromstring(
            f'{variant.CHROM}:{variant.POS}-{variant.POS + 1}')
        bp2 = Position.fromstring(bp2String)
        # setdefault: 'TRA' is handled above but was missing from the
        # initial keys, so appending used to fail with KeyError.
        bp_dict.setdefault(variant_type, []).append((bp1, bp2))

    return bp_dict
def main(out):
    """Tabulate length changes of the variants read from stdin.

    The change is ``len(ALT[0]) - len(REF)``.  Counts are kept for all
    variants, for those with INFO ``msi_exon``, for those with INFO
    ``msi_oncogene`` and for those with both, then written to ``out`` as a
    tab-separated table sorted by change.
    """
    logging.info('reading from stdin...')

    stats_all = collections.defaultdict(int)
    stats_exon = collections.defaultdict(int)
    stats_onco = collections.defaultdict(int)
    stats_exon_onco = collections.defaultdict(int)

    for record in cyvcf2.VCF('-'):
        change = len(record.ALT[0]) - len(record.REF)
        in_exon = record.INFO.get('msi_exon') is not None
        in_onco = record.INFO.get('msi_oncogene') is not None

        stats_all[change] += 1
        if in_exon:
            stats_exon[change] += 1
        if in_onco:
            stats_onco[change] += 1
        if in_exon and in_onco:
            stats_exon_onco[change] += 1

    out.write('Change\tAll\tExon\tOnco\tExonOnco\n')
    row = '{change}\t{total}\t{exon}\t{onco}\t{exon_onco}\n'
    for change in sorted(stats_all.keys()):
        out.write(row.format(change=change,
                             total=stats_all[change],
                             exon=stats_exon[change],
                             onco=stats_onco[change],
                             exon_onco=stats_exon_onco[change]))

    logging.info('done')
def __init__(self, vcf_path, db_path, ped_path=None, blobber=pack_blob,
             black_list=None, expand=None):
    """Open the VCF and the database, then build, load and index the tables.

    Construction does all the work: after the attributes are set up the
    columns and samples are created, the data is loaded and the indexes are
    built, in that order.

    Args:
        vcf_path: path of the VCF to import.
        db_path: database location, converted with `get_dburl`.
        ped_path: optional pedigree file path (stored only here).
        blobber: callable stored as ``self.blobber``.
        black_list: extra entries appended to the class-level black list
            and effect list.
        expand: stored as ``self.expand`` (empty list when falsy).
    """
    # Plain configuration.
    self.vcf_path = vcf_path
    self.ped_path = ped_path
    self.blobber = blobber
    self.expand = expand or []
    self.black_list = (list(VCFDB._black_list) + list(VCFDB.effect_list)
                       + (black_list or []))

    # Book-keeping containers filled in while the columns are created.
    self.impacts_headers = {}
    self.stringers = []
    self.af_cols = []  # track these to set to -1
    self.extra_columns = []

    # Database connection.
    self.db_path = get_dburl(db_path)
    self.engine = sql.create_engine(self.db_path,
                                    poolclass=sql.pool.NullPool)
    self.metadata = sql.MetaData(bind=self.engine)

    # VCF reader; we use the cache to infer the lengths of string fields.
    self.vcf = cyvcf2.VCF(vcf_path)
    self.cache = it.islice(self.vcf, 10000)

    # Import pipeline.
    self.create_columns()
    self.samples = self.create_samples()
    self.load()
    self.index()
def main(threshold, common_in, out, position_only):
    """Drop variants at common positions from the VCF read on stdin.

    Args:
        threshold: a position is common when column 4 of its row in
            ``common_in`` is >= this value.
        common_in: tab-separated file with one header line; columns 1 and 2
            are chromosome and position, column 4 the proportion.
        out: writable text stream receiving the header and kept records.
        position_only: currently unused (see TODO).
    """
    # TODO position_only = true only supported
    common = set()
    logging.info('reading common variants from %s', common_in)
    with open(common_in, 'r') as handle:
        next(handle, None)  # skip the header line
        for line in handle:
            fields = line.strip('\n').split('\t')
            if float(fields[3]) >= threshold:
                common.add((fields[0], fields[1]))

    logging.info('reading vcf from stdin')
    vcf = cyvcf2.VCF('-')
    filtered = total = 0
    out.write(vcf.raw_header)
    # Counting from 1 keeps the reported total at 0 for an empty VCF
    # (it used to be logged as 1).
    for total, variant in enumerate(vcf, start=1):
        if (variant.CHROM, str(variant.POS)) in common:
            filtered += 1
        else:
            out.write(str(variant))

    logging.info('filtered %i of %i', filtered, total)
def readVCF(self):
    """Open the VCF at ``self.fvcf`` and return the cyvcf2 reader.

    Only the reader object is created here; no records are consumed and no
    dictionary is built by this method.

    NOTE(review): the previous docstring described loading the whole file
    into memory and building a dictionary of mutation loci; that happens, if
    at all, in the caller. Confirm.
    """
    reader = cyvcf2.VCF(self.fvcf)
    return reader
def main(sample, chrom, pos, nofilter):
    """Report whether ``sample`` carries a variant at ``chrom:pos``.

    The VCF is read from stdin.  At the first record on that position whose
    genotype code for the sample is 1 or 3, a tab-separated line with the
    allele depths, genotype string and "1" is written and the process exits
    with status 0.  If no such record exists a line with NA, NA and "0" is
    written.  Records with a FILTER are ignored unless ``nofilter`` is set.
    """
    logging.info('reading from stdin...')
    reader = cyvcf2.VCF('-')
    sample_idx = reader.samples.index(sample)

    row = '{}\t{}\t{}\t{}\t{}\t{}\n'
    # check gt 0,1,2,3==HOM_REF, HET, UNKNOWN, HOM_ALT
    genotype_names = ['0/0', '0/1', './.', '1/1']

    for record in reader:
        if not nofilter and record.FILTER is not None:
            continue
        if not (record.POS == pos and record.CHROM == chrom):
            continue
        code = record.gt_types[sample_idx]
        if code == 1 or code == 3:
            depths = record.format('AD')[sample_idx]
            depth_text = ', '.join([str(x) for x in depths])
            sys.stdout.write(row.format(sample, chrom, pos, depth_text,
                                        genotype_names[code], '1'))
            logging.info('done')
            sys.exit(0)

    # not found
    sys.stdout.write(row.format(sample, chrom, pos, 'NA', 'NA', '0'))
    logging.info('done')
def get_vep_scores(vcf_name, vep_vcf_key="CSQ", sel_vep_keys=["phyloP46way_placental", "phyloP46way_primate", "CADD_phred", "CADD_raw"]):
    """Extract selected VEP annotation scores as a float DataFrame.

    The sub-field names are taken from the header entry whose ID equals
    ``vep_vcf_key`` (text after the last ": " of its Description, split on
    "|").  For every record carrying that INFO field, the first
    comma-separated annotation is used.

    Args:
        vcf_name: path of the annotated VCF.
        vep_vcf_key: INFO key holding the annotation.
        sel_vep_keys: sub-field names to extract; never modified here.

    Returns:
        DataFrame indexed by ``CHROM:POS:REF:ALT`` with one float column per
        selected key; empty strings become NaN and only the first row of a
        repeated index is kept.

    Raises:
        ValueError: the header has no entry for ``vep_vcf_key``, or a
            selected key is not among its sub-fields.
    """
    vcf_fh = cyvcf2.VCF(vcf_name)
    try:
        # get the correct elements
        vep_keys = None
        for hdr in vcf_fh.header_iter():
            hdr_info = hdr.info()
            if 'ID' in hdr_info and hdr_info['ID'] == vep_vcf_key:
                vep_keys = hdr_info['Description'].split(": ")[-1].rstrip('"').split("|")
                break
        if vep_keys is None:
            # Used to surface as an unbound-variable error further down.
            raise ValueError(
                "no header entry with ID {!r} found in {}".format(
                    vep_vcf_key, vcf_name))

        sel_vep_elms = [vep_keys.index(k) for k in sel_vep_keys]

        entries = []
        for rec in vcf_fh:
            info_dict = dict(rec.INFO)
            if vep_vcf_key not in info_dict:
                continue
            vep_entries = info_dict[vep_vcf_key].split(",")[0].split("|")
            variant_uid = ":".join(
                [rec.CHROM, str(rec.POS), rec.REF, rec.ALT[0]])
            entries.append(pd.Series([vep_entries[i] for i in sel_vep_elms],
                                     name=variant_uid,
                                     index=sel_vep_keys))
    finally:
        vcf_fh.close()

    # Turn into a data frame
    df = pd.DataFrame(entries)
    df = df.replace("", "nan").astype(float)
    # dedup
    df = df.loc[~pd.Series(df.index.values).duplicated().values, :]
    return df
def ksfs(args):
    """Subroutine for the ksfs subcommand.

    Counts, per INFO ``mutation_type``, how many sites have each INFO ``AC``
    value, and prints the table for AC = 1 .. AN-1 as tab-separated text
    with one column per mutation type.

    Raises:
        ValueError: two sites report different INFO ``AN`` values.
    """
    reader = cyvcf2.VCF(args.vcf)

    ksfs_data = defaultdict(Counter)
    AN = None
    for site in reader:
        site_an = site.INFO['AN']
        # AN must be the same for all sites (no missing genotypes)
        if AN is not None and site_an != AN:
            raise ValueError(
                f'different AN {site_an} and {AN} indicates missing genotypes')
        AN = site_an
        ksfs_data[site.INFO['mutation_type']][site.INFO['AC']] += 1

    # exclude fixed sites AC=0, AC=AN
    index = range(1, AN)
    mutation_types = sorted(ksfs_data)
    for mutation_type in mutation_types:
        counts = ksfs_data[mutation_type]
        ksfs_data[mutation_type] = [counts[ac] for ac in index]

    table = pd.DataFrame(ksfs_data, index)
    table = table.reindex(sorted(ksfs_data), axis='columns')
    try:
        print(table.to_csv(sep='\t', index=True,
                           index_label='sample_frequency'))
    except BrokenPipeError:
        pass
def variants(vcf_path, show_progress=False):
    """Yield a `Variant` for each usable SNP of ``vcf_path``.

    A row is used when every sample is called, the INFO ``AA`` field has
    four "|"-separated parts whose first part is a single A/C/T/G base, and
    the row is a SNP with exactly one ALT.  The yielded genotypes array has
    two entries per sample, each 1 where the haplotype base differs from the
    ancestral base.

    The record count for the progress bar comes from
    ``bcftools index --nrecords``.
    """
    nrecords_out = subprocess.check_output(
        ["bcftools", "index", "--nrecords", vcf_path])
    progress = tqdm.tqdm(total=int(nrecords_out), disable=not show_progress)

    vcf = cyvcf2.VCF(vcf_path)
    num_diploids = len(vcf.samples)
    num_samples = 2 * num_diploids

    for row in filter_duplicates(vcf):
        progress.update()

        try:
            aa = row.INFO["AA"]
        except KeyError:
            aa = None

        ancestral_state = None
        if aa is not None:
            # Format = AA|REF|ALT|IndelType
            parts = aa.split("|")
            if len(parts) == 4 and len(parts[0]) == 1:
                candidate = parts[0].upper()
                if candidate in "ACTG":
                    ancestral_state = candidate

        if row.num_called != num_diploids or ancestral_state is None:
            continue
        if not (row.is_snp and len(row.ALT) == 1):
            continue

        # Fill in the haplotype array from the genotype strings.
        a = np.zeros(num_samples, dtype=np.uint8)
        bases = row.gt_bases
        for sample_idx in range(num_diploids):
            call = bases[sample_idx]
            a[2 * sample_idx] = call[0] != ancestral_state
            a[2 * sample_idx + 1] = call[2] != ancestral_state
        yield Variant(position=row.POS, genotypes=a)

    vcf.close()
def read_vcf(fn, pass_only, dp_threshold, info_af):
    """Collect one allele-frequency value per retained variant of ``fn``.

    Args:
        fn: path handed to cyvcf2.
        pass_only: skip variants whose FILTER is neither unset nor
            'alleleBias'.
        dp_threshold: skip variants whose INFO DP is below this value.
        info_af: take the value from INFO AF; otherwise compute
            alt / (ref + alt) from the FORMAT AD field (0 when both are 0).

    Returns:
        list of values in file order.
    """
    logging.info('reading vcf from %s...', fn)

    skipped_dp = skipped_pass = 0
    vcf_in = cyvcf2.VCF(fn)
    values = []
    for variant_count, variant in enumerate(vcf_in):
        # calculate vaf
        if len(variant.ALT) > 1:
            logging.warning('variant %i is multi-allelic', variant_count + 1)

        is_pass = variant.FILTER is None or variant.FILTER == 'alleleBias'
        if pass_only and not is_pass:
            skipped_pass += 1
            continue
        if variant.INFO["DP"] < dp_threshold:  # somatic + germline
            skipped_dp += 1
            continue

        if info_af:
            value = variant.INFO["AF"]
        else:
            # NOTE(review): `sample_id` is not defined in this function; it
            # must be a module-level name for this branch to work - confirm.
            ad = variant.format("AD")[sample_id]
            ref = ad[0]
            alt = ad[1]
            if ref + alt > 0:
                value = alt / (ref + alt)
            else:
                value = 0
        values.append(value)

    # The skip counters were maintained but never reported.
    logging.info('kept %i variants. skipped %i non-pass, %i low depth',
                 len(values), skipped_pass, skipped_dp)
    return values
def main(qual, af, dp):
    """Filter the VCF on stdin by QUAL, INFO AF and INFO DP; write to stdout.

    A variant is written when its QUAL is missing or >= ``qual``, and its
    INFO AF is >= ``af``, and its INFO DP is >= ``dp``.  Progress is logged
    at debug level every 100000 variants.
    """
    logging.info(
        'reading vcf from stdin. qual filter %i af filter %f dp filter %i',
        qual, af, dp)

    def passes(variant):
        # Same evaluation order as a chained `and`: QUAL, then AF, then DP.
        if variant.QUAL is not None and not (variant.QUAL >= qual):
            return False
        return variant.INFO["AF"] >= af and variant.INFO["DP"] >= dp

    vcf_in = cyvcf2.VCF('-')
    sys.stdout.write(vcf_in.raw_header)

    allowed = 0
    denied = 0
    for processed, variant in enumerate(vcf_in, start=1):
        if passes(variant):
            sys.stdout.write(str(variant))
            allowed += 1
        else:
            denied += 1
        if processed % 100000 == 0:
            logging.debug('%i processed. %i allowed.', processed, allowed)

    logging.info('done. wrote {}. skipped {}. total {}'.format(
        allowed, denied, allowed + denied))
def _remove_prioritization(in_file, data, out_dir=None):
    """Rewrite a tumor-only VCF with its prioritization filters updated.

    Every record is passed through `_update_prioritization_filters` and
    written to ``<base>-germline.vcf`` (placed in ``out_dir`` when given),
    with a 'Somatic' FILTER definition added to the header.  Nothing is
    done when that file, or its ``.gz`` counterpart, is already up to date.

    Returns:
        the (uncompressed) output path.
    """
    out_file = "%s-germline.vcf" % utils.splitext_plus(in_file)[0]
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))

    if utils.file_uptodate(out_file, in_file) or utils.file_uptodate(
            out_file + ".gz", in_file):
        return out_file

    somatic_filter = {
        'ID': 'Somatic',
        'Description': 'Variant called as Somatic'
    }
    with file_transaction(data, out_file) as tx_out_file:
        reader = cyvcf2.VCF(str(in_file))
        reader.add_filter_to_header(somatic_filter)
        with contextlib.closing(
                cyvcf2.Writer(tx_out_file, reader)) as writer:
            for record in reader:
                writer.write_record(_update_prioritization_filters(record))
    return out_file
def process_metadata(self, metadata_file, show_progress=False):
    """
    Adds the Max Planck metadata.

    The second line of ``metadata_file`` is split on single spaces into
    name, population and age; the age is divided by GENERATION_TIME.  One
    population and one diploid individual are added to ``self.samples``,
    and ``self.num_samples`` is set to twice the number of samples in the
    VCF at ``self.data_file``.  ``show_progress`` is not used.
    """
    with open(metadata_file, "r") as max_planck_metadata:
        lines = max_planck_metadata.read().splitlines()

    # Parse the individual metadata out of the file.
    row = lines[1].split(" ")
    metadata = {
        "name": row[0],
        "age": int(row[2]) / GENERATION_TIME,
    }
    population = row[1]

    # Only the sample names are needed from the VCF.
    vcf = cyvcf2.VCF(self.data_file)
    individual_names = list(vcf.samples)
    vcf.close()
    self.num_samples = 2 * len(individual_names)

    population_record = {
        "name": population,
        "super_population": "Max Planck"
    }
    pop_id = self.samples.add_population(population_record)
    self.samples.add_individual(time=metadata["age"],
                                metadata=metadata,
                                population=pop_id,
                                ploidy=2)
def main(gene_filter):
    """Count ClinVar significance classes of the VCF read from stdin.

    Writes a tab-separated table (CLNSIG, Count, Pct) to stdout.  Records
    lacking GENEINFO or CLNSIG are counted under 'notspecified'.

    Args:
        gene_filter: when not None, only records whose gene name (the part
            of GENEINFO before the first ':') equals this value are counted.
    """
    logging.info('starting...')
    # 1 45794974 479982 G C . . AF_EXAC=0.00001;ALLELEID=472328;CLNDISDB=MedGen:C0027672,SNOMED_CT:699346009|MedGen:CN517202;CLNDN=Hereditary_cancer-predisposing_syndrome|not_provided;CLNHGVS=NC_000001.10:g.45794974G>C;CLNREVSTAT=criteria_provided,_multiple_submitters,_no_conflicts;CLNSIG=Uncertain_significance;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=MUTYH:4595;MC=SO:0001619|non-coding_transcript_variant,SO:0001624|3_prime_UTR_variant;ORIGIN=1;RS=758118037
    stat = collections.defaultdict(int)
    genes = collections.defaultdict(int)

    for count, variant in enumerate(cyvcf2.VCF('-')):
        try:
            gene = variant.INFO['GENEINFO']
        except KeyError:
            gene = 'notspecified'
        genes[gene] += 1

        gene_name = gene.split(':')[0]
        if gene_filter is not None and gene_name != gene_filter:
            continue

        # A record without CLNSIG used to abort the run with KeyError;
        # treat it like a missing GENEINFO instead.
        try:
            significance = variant.INFO['CLNSIG']
        except KeyError:
            significance = 'notspecified'
        stat[significance] += 1

        if count % 10000 == 0:
            logging.debug('%s processed...', count)

    sys.stdout.write('CLNSIG\tCount\tPct\n')
    total = sum(stat.values())
    for c in stat:
        sys.stdout.write('{}\t{}\t{:.3f}\n'.format(c, stat[c],
                                                   stat[c] / total))

    #logging.info(genes)
    logging.info('done')
def main(manta_vcf, truth_bed):
    """Report, per SV type, how many truth regions are hit by Manta calls.

    Unfiltered calls whose first sample has genotype code 1 or 3 are written
    to a BED file (``.vcf`` replaced by ``.bed`` in the input name), padded
    by 1000 bp on each side.  ``bedtools intersect -c -wa`` then counts the
    calls overlapping each truth region, and one summary line per SV type
    is printed.

    Args:
        manta_vcf: path of the Manta VCF.
        truth_bed: truth BED; after intersecting, each line must have
            exactly six tab-separated columns, the fourth being the SV type.
    """
    BUFFER = 1000
    called_genotypes = (1, 3)

    manta_bed = manta_vcf.replace(".vcf", ".bed")
    with open(manta_bed, "w") as out_handle:
        for rec in cyvcf2.VCF(manta_vcf):
            if not rec.FILTER and rec.gt_types[0] in called_genotypes:
                out_handle.write(
                    "%s\t%s\t%s\n" %
                    (rec.CHROM, max(0, rec.start - BUFFER), rec.end + BUFFER))

    manta_compare = "%s-giab.bed" % (os.path.splitext(manta_bed)[0])
    # Argument list plus stdout redirection instead of a formatted shell
    # string: paths with spaces or shell metacharacters are passed safely.
    with open(manta_compare, "w") as compare_handle:
        subprocess.check_call(
            ["bedtools", "intersect", "-c", "-wa",
             "-a", truth_bed, "-b", manta_bed],
            stdout=compare_handle)

    counts = collections.defaultdict(int)
    totals = collections.defaultdict(int)
    with open(manta_compare) as in_handle:
        for line in in_handle:
            chrom, start, end, svtype, info, matches = line.strip().split("\t")
            totals[svtype] += 1
            if int(matches) > 0:
                counts[svtype] += 1

    for svtype, total in totals.items():
        print("| %s | %s (%.1f%%) |" %
              (svtype, counts[svtype],
               float(counts[svtype]) / total * 100.0))
def add_to_database(self):
    """Import the VCF at ``self.f`` into the database.

    Opens the file with ``gts012=True`` as ``self.vcf_reader``, then creates
    the variant set, its metadata, the call sets and finally the variants
    and calls.  The reader is closed afterwards, also when a step fails.
    """
    self.vcf_reader = cyvcf2.VCF(self.f, gts012=True)
    try:
        self._create_new_variant_set()
        self._create_variant_set_meta_data()
        self._create_call_sets()
        self._create_variants_and_calls()
    finally:
        # Previously skipped whenever one of the steps above raised.
        self.vcf_reader.close()