def filter_on_reciprocal_overlap(single_sample_vcf_file, ref_vcf_file, svtype,
                                 case_sample, overlap_frac,
                                 variant_gts_allowed):
    """
    List single-sample variants with no reciprocal-overlap match in the
    (filtered) reference panel.

    Parameters
    ----------
    single_sample_vcf_file
        Single-sample VCF, passed to svu.vcf2bedtool.
    ref_vcf_file
        Reference panel VCF, passed to svu.vcf2bedtool.
    svtype : str
        Only variants of this SV type are converted and compared.
    case_sample
        Forwarded to ac_filter as `sample_to_exclude`.
    overlap_frac : float
        Minimum reciprocal overlap fraction (bedtools `-f` together with `-r`).
    variant_gts_allowed
        Forwarded to ac_filter as `variant_gts_allowed`.

    Returns
    -------
    list
        Names of the single-sample intervals that bedtools reports with `-v`,
        i.e. those without a qualifying overlap in the reference panel.
    """
    single_sample_bed = svu.vcf2bedtool(single_sample_vcf_file,
                                        annotate_ins=False,
                                        include_samples=True,
                                        svtypes=[svtype])
    ref_bed = svu.vcf2bedtool(ref_vcf_file,
                              annotate_ins=False,
                              include_samples=True,
                              svtypes=[svtype])

    # BedTool.filter() does not modify the BedTool in place; it returns a new
    # one. The result must be kept (and materialised with saveas(), as done
    # elsewhere in this code base) or the filter has no effect at all.
    ref_bed = ref_bed.filter(ac_filter,
                             variant_gts_allowed=variant_gts_allowed,
                             sample_to_exclude=case_sample).saveas()

    intersection = single_sample_bed.intersect(ref_bed,
                                               wa=True,
                                               f=overlap_frac,
                                               r=True,
                                               v=True)
    return [intx.name for intx in intersection]
def integrate_melt(cxsv, melt, fout, window=100):
    """
    Merge the records of two VCFs into `fout`, skipping flagged duplicates.

    An ID from `cxsv` is flagged when a `melt` interval is reported by
    `bedtools window` within `window` bp of it and all of the following hold:
    the called samples overlap (per `samples_overlap`), start and end
    coordinates each differ by less than `window`, and the svtype column of
    the `cxsv` interval is 'INS'.

    Parameters
    ----------
    cxsv, melt
        Input VCFs. Both are converted to BedTools, then `reset()` and
        iterated again for output.
    fout
        Output handle; every retained record is passed to `fout.write`.
    window : int
        Window size in bp, used for both the bedtools search and the
        breakpoint distance check.
    """
    cxsv_bed = svu.vcf2bedtool(cxsv, annotate_ins=False, include_samples=True)
    melt_bed = svu.vcf2bedtool(melt, annotate_ins=False, include_samples=True)

    sect = cxsv_bed.window(melt_bed, w=window)

    # Check breakpoints are within window.
    # NOTE(review): the column indices below assume each BED row has seven
    # columns (samples last), so the second interval starts at column 7.
    def close_enough(interval):
        startA, endA = [int(x) for x in interval.fields[1:3]]
        startB, endB = [int(x) for x in interval.fields[8:10]]
        return abs(startA - startB) < window and abs(endA - endB) < window

    # A set gives O(1) membership tests in the output loop below.
    excluded_cxsv = set()
    for interval in sect.intervals:
        samplesA = interval.fields[6].split(',')
        samplesB = interval.fields[13].split(',')
        if (samples_overlap(samplesA, samplesB) and
                close_enough(interval) and
                interval.fields[4] == 'INS'):
            excluded_cxsv.add(interval.fields[3])

    # Rewind both inputs; the BedTool conversion consumed them.
    cxsv.reset()
    melt.reset()

    # NOTE(review): records are merged on `pos` alone, so ordering across
    # contigs is not handled here.
    for record in heapq.merge(cxsv, melt, key=lambda record: record.pos):
        if record.id in excluded_cxsv:
            continue
        fout.write(record)
def link_inv(vcf, bkpt_window=300, cpx_dist=2000):
    """
    Cluster breakpoints that lie within `bkpt_window` of one another.

    Two breakpoints are linked when `bedtools window` pairs them, they are not
    both DEL or both DUP, their samples overlap (per `samples_overlap`) and
    `ro_calu` of the pair is positive. Connected components of the resulting
    graph are returned.

    Parameters
    ----------
    vcf
        Breakpoint VCF; `vcf.filename` is converted to a BedTool and `vcf`
        itself is handed to `extract_breakpoints`.
    bkpt_window : int
        Window (bp) passed to `bedtools window`.
    cpx_dist : int
        Not used by this function.

    Returns
    -------
    list of deque
        One deque of breakpoint records per connected component.
    """
    def _not_both(hit, svtype):
        # Keep the hit unless both intervals carry the given svtype
        return not (hit.fields[4] == svtype and hit.fields[10] == svtype)

    bed = svu.vcf2bedtool(vcf.filename, annotate_ins=False)
    hits = bed.window(bed, w=bkpt_window).saveas()
    for cnv_type in ("DEL", "DUP"):
        hits = hits.filter(_not_both, cnv_type).saveas()

    # Pairs of variant IDs, then the naturally-sorted unique IDs
    id_pairs = [(hit[3], hit[9]) for hit in hits.intervals]
    unique_ids = set(itertools.chain.from_iterable(id_pairs))
    linked_IDs = np.array(natsort.natsorted(unique_ids))

    # Variant ID -> row/column index in the graph
    bkpt_idxs = {}
    for position, variant_id in enumerate(linked_IDs):
        bkpt_idxs[variant_id] = position
    indexed_links = np.array(
        [(bkpt_idxs[left], bkpt_idxs[right]) for left, right in id_pairs])

    bkpts = extract_breakpoints(vcf, bkpt_idxs)

    # Identity matrix: every breakpoint is at least linked to itself
    graph = sps.eye(len(linked_IDs), dtype=np.uint16, format='lil')
    for src, dst in indexed_links:
        if not samples_overlap(bkpts[src], bkpts[dst]):
            continue
        if ro_calu(bkpts[src], bkpts[dst]) > 0:
            graph[src, dst] = 1

    # One deque per connected component
    n_comp, labels = sps.csgraph.connected_components(graph)
    clusters = [deque() for _ in range(n_comp)]
    for idx, label in enumerate(labels):
        clusters[label].append(bkpts[idx])

    return clusters
def link_cpx(vcf, bkpt_window=300, cpx_dist=2000):
    """
    Cluster breakpoints that lie within `bkpt_window` of one another.

    Parameters
    ----------
    vcf
        Breakpoint VCF; `vcf.filename` is converted to a BedTool and `vcf`
        itself is handed to `extract_breakpoints`.
    bkpt_window : int
        Window (bp) passed to `bedtools window`.
    cpx_dist : int
        Not used by this function.

    Returns
    -------
    list of deque
        One deque of breakpoint records per connected component.
    """
    def _not_both(hit, svtype):
        # Keep the hit unless both intervals carry the given svtype
        return not (hit.fields[4] == svtype and hit.fields[10] == svtype)

    bed = svu.vcf2bedtool(vcf.filename, annotate_ins=False)

    # All pairs of breakpoints within the window. Self-hits are kept (the
    # self-hit filter is disabled), as are mobile-element insertions.
    hits = bed.window(bed, w=bkpt_window).saveas()

    # Drop DEL/DEL pairs, then DUP/DUP pairs
    for cnv_type in ("DEL", "DUP"):
        hits = hits.filter(_not_both, cnv_type).saveas()

    # Linked variant IDs, naturally sorted
    id_pairs = [(hit[3], hit[9]) for hit in hits.intervals]
    unique_ids = set(itertools.chain.from_iterable(id_pairs))
    linked_IDs = np.array(natsort.natsorted(unique_ids))

    # Variant ID -> graph index
    bkpt_idxs = {}
    for position, variant_id in enumerate(linked_IDs):
        bkpt_idxs[variant_id] = position
    indexed_links = np.array(
        [(bkpt_idxs[left], bkpt_idxs[right]) for left, right in id_pairs])

    # Records corresponding to the linked breakpoints
    bkpts = extract_breakpoints(vcf, bkpt_idxs)

    # Sparse graph seeded with the identity; add an edge for each link whose
    # samples overlap and whose records pass close_enough
    graph = sps.eye(len(linked_IDs), dtype=np.uint16, format='lil')
    for src, dst in indexed_links:
        if not samples_overlap(bkpts[src], bkpts[dst]):
            continue
        if close_enough(bkpts[src], bkpts[dst]):
            graph[src, dst] = 1

    # One deque per connected component. No post-filtering of CNV-only or
    # singleton clusters is applied (those filters are disabled).
    n_comp, labels = sps.csgraph.connected_components(graph)
    clusters = [deque() for _ in range(n_comp)]
    for idx, label in enumerate(labels):
        clusters[label].append(bkpts[idx])

    return clusters
def link_cpx(vcf, bkpt_window=300):
    """
    Cluster breakpoints that lie within `bkpt_window` of one another.

    Parameters
    ----------
    vcf
        Breakpoint VCF; `vcf.filename` is converted to a BedTool and `vcf`
        itself is handed to `extract_breakpoints`.
    bkpt_window : int
        Window (bp) passed to `bedtools window`.

    Returns
    -------
    list of deque
        One deque of breakpoint records per connected component.
    """
    def _not_both(hit, svtype):
        # Keep the hit unless both intervals carry the given svtype
        return not (hit.fields[4] == svtype and hit.fields[10] == svtype)

    bed = svu.vcf2bedtool(vcf.filename, annotate_ins=False)

    # All pairs of breakpoints within the window
    hits = bed.window(bed, w=bkpt_window).saveas()

    # Drop DEL/DEL pairs, then DUP/DUP pairs
    for cnv_type in ("DEL", "DUP"):
        hits = hits.filter(_not_both, cnv_type).saveas()

    # Linked variant IDs, naturally sorted
    id_pairs = [(hit[3], hit[9]) for hit in hits.intervals]
    unique_ids = set(itertools.chain.from_iterable(id_pairs))
    linked_IDs = np.array(natsort.natsorted(unique_ids))

    # Variant ID -> graph index
    bkpt_idxs = {}
    for position, variant_id in enumerate(linked_IDs):
        bkpt_idxs[variant_id] = position
    indexed_links = np.array(
        [(bkpt_idxs[left], bkpt_idxs[right]) for left, right in id_pairs])

    # Records corresponding to the linked breakpoints
    bkpts = extract_breakpoints(vcf, bkpt_idxs)

    # Compute each linked record's called-sample set once, up front, so the
    # pairwise loop below only compares ready-made sets
    sample_sets_dict = {}
    for idx in set(indexed_links.flatten().tolist()):
        sample_sets_dict[idx] = set(svu.get_called_samples(bkpts[idx]))

    # Sparse graph seeded with the identity; add an edge for each link whose
    # samples overlap and whose records pass close_enough
    graph = sps.eye(len(linked_IDs), dtype=np.uint16, format='lil')
    for src, dst in indexed_links:
        if not samples_overlap(sample_sets_dict[src], sample_sets_dict[dst]):
            continue
        if close_enough(bkpts[src], bkpts[dst]):
            graph[src, dst] = 1

    # One deque per connected component
    n_comp, labels = sps.csgraph.connected_components(graph)
    clusters = [deque() for _ in range(n_comp)]
    for idx, label in enumerate(labels):
        clusters[label].append(bkpts[idx])

    return clusters
def annotate_vcf(vcf, gencode, noncoding, annotated_vcf):
    """
    Annotate every record of `vcf` and write the result to `annotated_vcf`.

    Parameters
    ----------
    vcf : pysam.VariantFile
        Input VCF. Its header is extended in place with the annotation INFO
        lines before the output file is opened.
    gencode : pbt.BedTool
        Gencode gene annotations. When None, the GENCODE_INFO header lines
        are not added.
    noncoding : pbt.BedTool
        Noncoding elements. When None, the NONCODING_INFO header lines are
        not added.
    annotated_vcf : str
        Path to output VCF
    """
    # INFO keys that are collapsed into MSV_EXON_OVR for MULTIALLELIC records
    multiallelic_keys = 'LOF DUP_LOF COPY_GAIN DUP_PARTIAL'.split()

    # Add metadata lines for annotations
    header = vcf.header
    if gencode is not None:
        for line in GENCODE_INFO:
            header.add_line(line)
    if noncoding is not None:
        for line in NONCODING_INFO:
            header.add_line(line)

    # Open output file
    fout = pysam.VariantFile(annotated_vcf, 'w', header=header)

    # Everything after the open is guarded so the output handle is closed
    # even when annotation or writing raises.
    try:
        # Annotate genic hits
        if isinstance(vcf.filename, bytes):
            fname = vcf.filename.decode()
        else:
            fname = vcf.filename

        sv = svu.vcf2bedtool(fname, split_bnd=True, split_cpx=True,
                             simple_sinks=True, include_unresolved=False)

        effects = annotate(sv, gencode, noncoding)
        # Keyed by record ID; each value maps INFO key -> annotation value
        effects = effects.to_dict(orient='index')

        # Add results to variant records and save
        for record in vcf:
            anno = effects.get(record.id)
            if anno is None:
                # No annotation for this record: pass it through untouched
                fout.write(record)
                continue

            # Handle general catch-all intersection for MULTIALLELIC variants:
            # the gene lists of the selected keys are merged (order-preserving,
            # de-duplicated) into a single MSV_EXON_OVR field.
            is_multiallelic = 'MULTIALLELIC' in record.filter
            multi_ovr = []
            for info, genelist in anno.items():
                if genelist == 'NA':
                    continue
                if is_multiallelic and info in multiallelic_keys:
                    for gene in genelist.split(','):
                        if gene not in multi_ovr:
                            multi_ovr.append(gene)
                else:
                    record.info[info] = genelist
            if multi_ovr:
                record.info['MSV_EXON_OVR'] = ','.join(multi_ovr)

            if 'NEAREST_TSS' in record.info:
                record.info['INTERGENIC'] = True

            fout.write(record)
    finally:
        fout.close()
def _case_fails_coverage(has_ref_panel_gts, bases_covered, case_length,
                         overlap_frac):
    """Return True when a case variant should be reported as filtered."""
    if not has_ref_panel_gts:
        return False
    return not (bases_covered / case_length) > overlap_frac


def filter_cnv_on_coverage(single_sample_vcf_file, ref_vcf_file, svtype,
                           case_sample, overlap_frac, variant_gts_allowed):
    """
    List single-sample CNVs that are not sufficiently covered by matching
    reference panel calls.

    The work is done by a `bedtools`/`awk` pipeline run through subprocess.
    Three files are written to the current working directory:
    `single_sample_calls.bed`, `ref_panel_calls.bed` and
    `final_merged_intersection.bed`.

    Parameters
    ----------
    single_sample_vcf_file
        Single-sample VCF, passed to svu.vcf2bedtool.
    ref_vcf_file
        Reference panel VCF, passed to svu.vcf2bedtool.
    svtype : str
        Only variants of this SV type are converted and compared.
    case_sample : str
        Sample name removed from each variant's sample list before counting.
    overlap_frac : float
        A variant is kept out of the result when the fraction of its length
        covered by matching calls exceeds this value.
    variant_gts_allowed : int
        Number of sample mismatches tolerated (see the loop below).

    Returns
    -------
    list
        IDs of variants that have more than `variant_gts_allowed` non-case
        samples and whose matching coverage does not exceed `overlap_frac`.

    Raises
    ------
    RuntimeError
        If the final `bedtools intersect` exits with a non-zero status.
    """
    single_sample_bed = svu.vcf2bedtool(single_sample_vcf_file,
                                        annotate_ins=False,
                                        include_samples=True,
                                        svtypes=[svtype])
    ref_bed = svu.vcf2bedtool(ref_vcf_file,
                              annotate_ins=False,
                              include_samples=True,
                              svtypes=[svtype])

    # In bash the equivalent pipeline is:
    #   bedtools coverage -a single_sample_calls.bed -b ref_panel_calls.bed -d
    #       (per-base coverage of query by intervals in ref)
    #   | awk '{OFS="\t"; print $1,$2,$3,$8,$9}'
    #       (drop the sample list and extra fields)
    #   | bedtools groupby -g 1,2,3,5 -c 4 -o min,max
    #       (group regions with the same coverage value)
    #   | awk '$5 > 0 {OFS="\t"; print $1,$2+$5-1,$2+$6}'
    #       (turn these regions into new bed intervals)
    #   | bedtools intersect -a stdin -b ref_panel_calls.bed -wb
    #       (ref intervals that overlapped these regions)
    #   | bedtools groupby -g 1,2,3 -c 10 -o distinct
    #       (condense the sample lists)
    #   | bedtools intersect -a single_sample_calls.bed -b stdin -wao
    #       (intersect with the query, printing the amount of overlap)
    #
    # pybedtools cannot run this without blowing up disk space (no working
    # streaming support), hence the subprocess pipeline below.
    single_sample_bed.saveas('single_sample_calls.bed')
    ref_bed.saveas('ref_panel_calls.bed')

    stages = [
        ['bedtools', 'coverage', '-a', 'single_sample_calls.bed',
         '-b', 'ref_panel_calls.bed', '-d'],
        ['awk', '{OFS="\t"; print $1,$2,$3,$8,$9}'],
        ['bedtools', 'groupby', '-g', '1,2,3,5', '-c', '4', '-o', 'min,max'],
        ['awk', '$5 > 0 {OFS="\t"; print $1,$2+$5-1,$2+$6}'],
        ['bedtools', 'intersect', '-a', 'stdin',
         '-b', 'ref_panel_calls.bed', '-wb'],
        ['bedtools', 'groupby', '-g', '1,2,3', '-c', '10', '-o', 'distinct'],
        ['bedtools', 'intersect', '-a', 'single_sample_calls.bed',
         '-b', 'stdin', '-wao'],
    ]

    processes = []
    # The context manager guarantees the output handle is closed
    with open('final_merged_intersection.bed', 'w') as out_handle:
        upstream = None
        last_index = len(stages) - 1
        for index, command in enumerate(stages):
            sink = out_handle if index == last_index else subprocess.PIPE
            proc = subprocess.Popen(command, stdin=upstream, stdout=sink)
            if upstream is not None:
                # Drop our copy of the pipe so the earlier stage receives
                # SIGPIPE if this stage exits first.
                upstream.close()
            upstream = proc.stdout
            processes.append(proc)

        return_code = processes[-1].wait()
        # Reap the earlier stages so no zombie processes are left behind
        for proc in processes[:-1]:
            proc.wait()

    if return_code != 0:
        raise RuntimeError(
            'intersection pipeline process exited with return code ' +
            str(return_code))

    intersection = pybedtools.BedTool('final_merged_intersection.bed')

    filtered_variant_ids = []
    current_case_id = ''
    has_ref_panel_gts = False
    bases_covered_by_matching_calls = 0
    current_case_length = -1

    # Rows for the same case variant are consecutive; accumulate per variant
    # and evaluate each one when the ID changes.
    for intx in intersection:
        new_case_id = intx.name
        if new_case_id != current_case_id:
            if current_case_id != '' and _case_fails_coverage(
                    has_ref_panel_gts, bases_covered_by_matching_calls,
                    current_case_length, overlap_frac):
                filtered_variant_ids.append(current_case_id)
            current_case_id = new_case_id
            has_ref_panel_gts = False
            bases_covered_by_matching_calls = 0
            current_case_length = intx.end - intx.start

        # Samples carrying the case variant, excluding the case sample itself
        variant_samples = set(intx.fields[6].split(','))
        variant_samples.discard(case_sample)
        if len(variant_samples) > variant_gts_allowed:
            has_ref_panel_gts = True

        # "." in the first column of the second interval means no overlap
        if intx.fields[7] != ".":
            ref_panel_gts = set(intx.fields[10].split(','))
            if len(variant_samples - ref_panel_gts) <= variant_gts_allowed:
                bases_covered_by_matching_calls += int(intx.fields[11])

    # The loop only evaluates a variant when the next one starts, so the last
    # variant has to be evaluated here.
    if current_case_id != '' and _case_fails_coverage(
            has_ref_panel_gts, bases_covered_by_matching_calls,
            current_case_length, overlap_frac):
        filtered_variant_ids.append(current_case_id)

    return filtered_variant_ids
def vcf2bed(argv):
    """
    Command-line entry point: convert a VCF to BED.

    Parameters
    ----------
    argv : list of str
        Command-line arguments (without the program name). When empty, the
        help text is printed and the process exits with status 1.
    """
    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    cli.add_argument('vcf', help='VCF to convert.')
    cli.add_argument('bed',
                     help='Converted bed. Specify `-` or `stdout` '
                     'to write to stdout.')
    cli.add_argument('--no-samples', dest='include_samples',
                     action='store_false', default=True,
                     help='Don\'t include comma-delimited list of called '
                     'samples for each variant.')
    cli.add_argument('-i', '--info', action='append',
                     help='INFO field to include as column in output. '
                     'May be specified more than once. To include all INFO '
                     'fields, specify `--info ALL`. INFO fields are '
                     'reported in the order in which they are requested. '
                     'If ALL INFO fields are requested, they are reported '
                     'in the order in which they appear in the VCF header.')
    cli.add_argument('--include-filters', action='store_true', default=False,
                     help='Include FILTER status in output, with the same ' +
                     'behavior an INFO field.')
    cli.add_argument('--split-bnd', action='store_true', default=False,
                     help='Report two entries in bed file for each BND.')
    cli.add_argument('--split-cpx', action='store_true', default=False,
                     help='Report entries for each CPX rearrangement interval.')
    cli.add_argument('--no-header', dest='header', action='store_false',
                     default=True, help='Suppress header.')
    cli.add_argument('--no-sort-coords', dest='no_sort_coords',
                     action='store_true', default=False,
                     help='Do not sort start/end coordinates '
                     'per record before writing to bed.')
    cli.add_argument('--no-unresolved', dest='no_unresolved',
                     action='store_true', default=False,
                     help='Do not output unresolved variants.')
    cli.add_argument('--simple-sinks', dest='simple_sinks',
                     action='store_true', default=False,
                     help='Report all INS sinks as 1bp intervals.')

    # With no arguments at all, show usage instead of an argparse error
    if not argv:
        cli.print_help()
        sys.exit(1)
    args = cli.parse_args(argv)

    # `-` or `stdin` selects standard input as the VCF source
    vcf_source = sys.stdin if args.vcf in ('-', 'stdin') else args.vcf
    vcf = pysam.VariantFile(vcf_source)

    # Assemble the header columns in output order
    columns = ['#chrom', 'start', 'end', 'name', 'svtype']
    if args.include_samples:
        columns.append('samples')
    if args.info:
        if 'ALL' in args.info:
            columns.extend(vcf.header.info.keys())
        else:
            columns.extend(args.info)
    if args.include_filters:
        columns.append('FILTER')
    header = '\t'.join(columns)

    bt = svu.vcf2bedtool(vcf,
                         split_bnd=args.split_bnd,
                         include_samples=args.include_samples,
                         include_strands=False,
                         split_cpx=args.split_cpx,
                         include_infos=args.info,
                         annotate_ins=False,
                         report_alt=True,
                         no_sort_coords=args.no_sort_coords,
                         simple_sinks=args.simple_sinks,
                         include_unresolved=not args.no_unresolved,
                         include_filters=args.include_filters)

    # Write to stdout or to the requested file, with or without the header
    if args.bed in ('stdout', '-'):
        if args.header:
            sys.stdout.write(header + '\n')
        sys.stdout.write(str(bt))
        return

    if args.header:
        bt.saveas(args.bed, trackline=header)
    else:
        bt.saveas(args.bed)
def merge_pesr_depth(pesr_vcf, depth_vcf, frac=0.8):
    """
    Merge depth-based records into overlapping PE/SR records.

    Parameters
    ----------
    pesr_vcf, depth_vcf
        Input VCFs. Each is read fully into memory, then `reset()` and
        converted to a BedTool.
    frac : float
        Minimum reciprocal overlap for a depth record to be absorbed into a
        PE/SR record of the same SV type.

    Yields
    ------
    record
        PE/SR and remaining depth records in sorted order, with every FORMAT
        key except GT removed. Records with no called samples are skipped.
    """
    # Memory inefficient but it's easier and shouldn't matter too much
    # now that the variants have been filtered down
    records = {
        'pesr': {record.id: record for record in pesr_vcf},
        'depth': {record.id: record for record in depth_vcf},
    }

    # Wipe MEMBERS from prior clustering
    for source in ('pesr', 'depth'):
        for ID, record in records[source].items():
            record.info['MEMBERS'] = [ID]

    # Reset for bedtool creation
    pesr_vcf.reset()
    depth_vcf.reset()
    pesr_bed = svu.vcf2bedtool(pesr_vcf, split_bnd=False,
                               include_strands=False)
    depth_bed = svu.vcf2bedtool(depth_vcf, split_bnd=False,
                                include_strands=False)

    # Merge depth records with PE/SR records if they share `frac` reciprocal
    # overlap (80% by default).
    # NOTE(review): the column indices assume five BED columns per interval
    # (name at 3/8, svtype at 4/9).
    sect = pesr_bed.intersect(depth_bed, wa=True, wb=True, r=True, f=frac)

    # A set is used because IDs are only de-duplicated and tested for
    # membership; a depth record may overlap several PE/SR records.
    filtered_depth_IDs = set()
    for pair in sect.intervals:
        # Check SV types match
        if pair.fields[4] != pair.fields[9]:
            continue

        pesr_id, depth_id = pair.fields[3], pair.fields[8]

        # Add depth record's samples to PE/SR
        filtered_depth_IDs.add(depth_id)
        pesr_record = records['pesr'][pesr_id]
        depth_record = records['depth'][depth_id]

        # Update metadata and samples
        pesr_record.info['MEMBERS'] = (pesr_record.info['MEMBERS'] +
                                       (depth_record.id, ))
        pesr_record.info['SOURCES'] = (pesr_record.info['SOURCES'] +
                                       ('depth', ))
        add_samples(pesr_record, depth_record)

    # Remove overlapping depth records (not performed in the loop above to
    # account for double overlaps)
    # TODO: handle double overlap of depth calls
    for ID in filtered_depth_IDs:
        records['depth'].pop(ID)

    # In remaining depth-only calls, add samples to PE/SR record if the
    # record covers 90% of the depth-only call.
    sect = pesr_bed.intersect(depth_bed, wa=True, wb=True, F=0.9)

    for pair in sect.intervals:
        # Check SV types match
        if pair.fields[4] != pair.fields[9]:
            continue

        pesr_id, depth_id = pair.fields[3], pair.fields[8]

        # Skip depth records we already merged via reciprocal overlap
        if depth_id in filtered_depth_IDs:
            continue

        # If sample is in both depth record and pe/sr record, remove it from
        # depth record
        depth_record = records['depth'][depth_id]
        pesr_record = records['pesr'][pesr_id]

        merge_nested_depth_record(pesr_record, depth_record)

    # Merge records together
    def _sort_key(record):
        return (record.chrom, record.pos, record.info['CHR2'], record.stop)

    pesr_records = sorted(records['pesr'].values(), key=_sort_key)
    depth_records = sorted(records['depth'].values(), key=_sort_key)
    for record in heapq.merge(pesr_records, depth_records, key=_sort_key):
        # Clean out unwanted format keys. The keys are snapshotted first so
        # deletion never happens while iterating over a live view.
        for key in list(record.format.keys()):
            if key != 'GT':
                del record.format[key]

        record.info['SOURCES'] = sorted(set(record.info['SOURCES']))
        record.info['MEMBERS'] = sorted(set(record.info['MEMBERS']))

        # Skip emptied depth records
        if len(svu.get_called_samples(record)) == 0:
            continue

        yield record
def merge_pesr_depth(pesr_vcf, depth_vcf, frac=0.5, sample_overlap=0.5):
    """
    Merge depth-based records into overlapping PE/SR records.

    Writes `filtered_pesr.bed` and `filtered_depth.bed` to the current
    working directory as a side effect.

    Parameters
    ----------
    pesr_vcf, depth_vcf
        Input VCFs. Each is read fully into memory, then `reset()` and
        converted to a BedTool.
    frac : float
        Minimum reciprocal overlap for a depth record to be absorbed into a
        PE/SR record of the same SV type.
    sample_overlap : float
        Currently not used: `svu.samples_overlap` is called with its own
        defaults. NOTE(review): confirm whether this should be forwarded.

    Yields
    ------
    record
        PE/SR records and cleaned remaining depth records in sorted order.
        Records with no called samples are skipped.
    """
    # Memory inefficient but it's easier and shouldn't matter too much
    # now that the variants have been filtered down
    records = {
        'pesr': {record.id: record for record in pesr_vcf},
        'depth': {record.id: record for record in depth_vcf},
    }

    # Wipe MEMBERS from prior clustering
    for source in ('pesr', 'depth'):
        for ID, record in records[source].items():
            record.info['MEMBERS'] = [ID]

    # The first PE/SR record is handed to clean_depth_record further down.
    # NOTE(review): an empty pesr_vcf makes next() raise here.
    pesr_vcf.reset()
    base_record = next(pesr_vcf)

    # Reset for bedtool creation
    pesr_vcf.reset()
    depth_vcf.reset()
    pesr_bed = svu.vcf2bedtool(pesr_vcf, split_bnd=False,
                               include_samples=True, include_strands=False,
                               report_alt=False)
    depth_bed = svu.vcf2bedtool(depth_vcf, split_bnd=False,
                                include_samples=True, include_strands=False,
                                report_alt=False)

    # Remove records with no samples
    def _filter_allref(feature):
        "Returns True (keep) only for 6-column features with called samples"
        if len(feature.fields) != 6:
            return False
        return feature.fields[5] not in ['.', '']

    pesr_bed = pesr_bed.filter(_filter_allref).saveas('filtered_pesr.bed')
    depth_bed = depth_bed.filter(_filter_allref).saveas('filtered_depth.bed')

    # Merge depth records with PE/SR records if they share `frac` reciprocal
    # overlap (50% by default). Each interval has six columns, so the depth
    # side starts at column 6 (name at 9, svtype at 10, samples at 11).
    sect = pesr_bed.intersect(depth_bed, wa=True, wb=True, r=True, f=frac)

    # A set is used because IDs are only de-duplicated before removal; a
    # depth record may overlap several PE/SR records.
    filtered_depth_IDs = set()
    for pair in sect.intervals:
        # Check SV types match
        if pair.fields[4] != pair.fields[10]:
            continue

        # Get vcf records
        pesr_id, depth_id = pair.fields[3], pair.fields[9]
        pesr_record = records['pesr'][pesr_id]
        depth_record = records['depth'][depth_id]

        # Check for sample overlap
        samp_ovr = svu.samples_overlap(samplesA=pair.fields[5].split(','),
                                       samplesB=pair.fields[11].split(','))
        if not samp_ovr:
            continue

        # Note removal of depth ID
        filtered_depth_IDs.add(depth_id)

        # Update metadata and samples
        pesr_record.info['MEMBERS'] = (pesr_record.info.get('MEMBERS', ()) +
                                       (depth_record.id, ))
        pesr_record.info['ALGORITHMS'] = (pesr_record.info['ALGORITHMS'] +
                                          ('depth', ))
        svu.update_best_genotypes(pesr_record,
                                  [pesr_record, depth_record],
                                  preserve_multiallelic=True)
        if ('varGQ' in pesr_record.info.keys() and
                'varGQ' in depth_record.info.keys()):
            pesr_record.info['varGQ'] = max(pesr_record.info['varGQ'],
                                            depth_record.info['varGQ'])

        # Union the per-sample EV values of both records. EV is read from the
        # depth record's per-sample data, so its presence must be checked
        # there as well (the INFO mapping was previously checked instead).
        for sample in pesr_record.samples:
            if sample not in depth_record.samples:
                continue
            if ('EV' not in pesr_record.samples[sample].keys() or
                    'EV' not in depth_record.samples[sample].keys()):
                continue
            pesr_ev = pesr_record.samples[sample]['EV']
            depth_ev = depth_record.samples[sample]['EV']
            # Leave the sample untouched when either side has no value
            if pesr_ev is None or depth_ev is None:
                continue
            pesr_record.samples[sample]['EV'] = tuple(
                sorted(set(pesr_ev).union(depth_ev)))

    # Remove overlapping depth records (not performed in the loop above to
    # account for double overlaps)
    # TODO: handle double overlap of depth calls
    for ID in filtered_depth_IDs:
        records['depth'].pop(ID)

    # The additional "PE/SR record covers 90% of the depth-only call" merge
    # (merge_nested_depth_record) was SFARI-only and is not performed here.

    # Merge records together
    def _sort_key(record):
        return (record.chrom, record.pos, record.info['CHR2'], record.stop)

    pesr_records = sorted(records['pesr'].values(), key=_sort_key)
    depth_records = sorted(records['depth'].values(), key=_sort_key)
    depth_records = [clean_depth_record(base_record, r) for r in depth_records]

    for record in heapq.merge(pesr_records, depth_records, key=_sort_key):
        # Unwanted FORMAT keys are expected to be handled upstream
        # (add_genotypes), so none are removed here.
        record.info['ALGORITHMS'] = sorted(set(record.info['ALGORITHMS']))
        record.info['MEMBERS'] = sorted(set(record.info.get('MEMBERS', ())))

        # Skip emptied depth records
        if len(svu.get_called_samples(record)) == 0:
            continue

        yield record