def find_homologous_regions(seq, genomesource, segments, window_size=500, offset=500):
    """Align windows of ``seq`` to the genome and cluster hits outside ``segments``.

    ``seq`` is sliced into windows of ``window_size`` characters whose start
    positions advance by ``offset``.  Each window is aligned with
    ``genomesource.bwa.align(..., secondary_hit_cutoff=0.5)``.  Every returned
    alignment is turned into a "+"-strand ``intervals.Locus`` and kept unless
    ``intervals.overlaps(locus, segments)`` is true.

    Args:
        seq: Sequence to scan; it is measured with ``len`` and sliced.
        genomesource: Object exposing ``bwa.align`` and ``bwa.ChrIDToName``.
        segments: Passed as the second argument to ``intervals.overlaps``;
            hits that overlap it are discarded.
        window_size: Length of each slice of ``seq`` that is aligned.
        offset: Step between consecutive window start positions.

    Returns:
        The result of ``cluster_loci`` applied to the retained loci.
    """
    homologous_regions = []

    # NOTE(review): the stop value is exclusive, so when
    # len(seq) > window_size a window starting exactly at
    # len(seq) - window_size is never aligned, nor is any trailing partial
    # window.  Confirm that skipping the tail is intended.
    scan_stop = max(1, len(seq) - window_size)

    for window_start in range(0, scan_stop, offset):
        window_seq = seq[window_start:window_start + window_size]
        alignments = genomesource.bwa.align(window_seq, secondary_hit_cutoff=0.5)

        # Report the first alignment returned for this window on stdout.
        if len(alignments) > 0:
            print("BEST:", alignments[0])

        # A distinct name is used here so the window index above is not
        # rebound by the inner loop.
        for alignment in alignments:
            chrom = genomesource.bwa.ChrIDToName(alignment.reference_id)
            locus = intervals.Locus(
                chrom, alignment.reference_start, alignment.reference_end, "+")
            if not intervals.overlaps(locus, segments):
                homologous_regions.append(locus)

    return cluster_loci(homologous_regions)
def blacklist(self, blacklist_loci):
    """Replace the stored blacklist with chromosome-matched copies of ``blacklist_loci``.

    ``self._blacklist`` is reset to an empty list, then for every input locus
    a new ``intervals.Locus`` is appended whose chromosome name is the result
    of ``misc.match_chrom_format`` against this object's keys; start, end and
    strand are copied unchanged.

    Args:
        blacklist_loci: Iterable of objects with ``chrom``, ``start``, ``end``
            and ``strand`` attributes.
    """
    self._blacklist = []

    for entry in blacklist_loci:
        known_chroms = list(self.keys())
        matched_chrom = misc.match_chrom_format(entry.chrom, known_chroms)
        converted = intervals.Locus(
            matched_chrom, entry.start, entry.end, entry.strand)
        self._blacklist.append(converted)
def _get_pair_locus(aln1, aln2):
    """Return one "+"-strand locus spanning both alignments.

    Both alignments must report the same ``reference_name`` (enforced with an
    ``assert``).  The locus runs from the smaller ``reference_start`` to the
    larger ``reference_end`` of the two.

    Args:
        aln1: Alignment with ``reference_name``, ``reference_start`` and
            ``reference_end`` attributes.
        aln2: Second alignment with the same attributes.

    Returns:
        An ``intervals.Locus`` covering both alignments.
    """
    assert aln1.reference_name == aln2.reference_name

    leftmost = min(aln1.reference_start, aln2.reference_start)
    rightmost = max(aln1.reference_end, aln2.reference_end)
    return intervals.Locus(aln1.reference_name, leftmost, rightmost, "+")
def loci(self):
    """Return the locus or loci covered by this pair of alignments.

    When ``aln1`` and ``aln2`` share a ``reference_id`` the result is a
    one-element list holding a "+"-strand ``intervals.Locus`` on
    ``aln1.chrom`` that runs from the smaller ``reference_start`` to the
    larger ``reference_end``.  Otherwise the two alignments' own ``locus``
    values are returned, ``aln1`` first.
    """
    first, second = self.aln1, self.aln2

    if first.reference_id == second.reference_id:
        span_start = min(first.reference_start, second.reference_start)
        span_end = max(first.reference_end, second.reference_end)
        return [intervals.Locus(first.chrom, span_start, span_end, "+")]

    return [first.locus, second.locus]
def get_internal_segments(sv, extend=20):
    """Build padded "+"-strand loci for the segments of each "ref" chromosome part.

    For every part returned by ``sv.chrom_parts("ref")`` one
    ``intervals.Locus`` is produced per segment:

    * last segment of the part: ``start - extend`` to ``start + extend``
    * first segment of the part: ``end - extend`` to ``end + extend``
    * any other segment: ``start - extend`` to ``end + extend``

    The "last" test is applied before the "first" test, so a part with a
    single segment is treated as a last segment.

    Args:
        sv: Object exposing ``chrom_parts("ref")``; each part has ``segments``.
        extend: Padding applied around the chosen coordinates.

    Returns:
        List of the loci, in part order then segment order.
    """
    internal_segments = []

    for part in sv.chrom_parts("ref"):
        final_index = len(part.segments) - 1

        for index, segment in enumerate(part.segments):
            if index == final_index:
                lower, upper = segment.start - extend, segment.start + extend
            elif index == 0:
                lower, upper = segment.end - extend, segment.end + extend
            else:
                lower, upper = segment.start - extend, segment.end + extend

            # NOTE(review): the lower bound is not clamped at 0 here.
            internal_segments.append(
                intervals.Locus(segment.chrom, lower, upper, "+"))

    return internal_segments
def do_homology_search(datahub):
    """Search for and plot genomic regions homologous to the variant's sequences.

    Three stages run in order:

    1. Every segment of every "ref" chromosome part is converted to a
       "+"-strand ``intervals.Locus`` padded by 100 on each side (start
       clamped at 0), with its chromosome name passed through
       ``misc.match_chrom_format`` against ``datahub.genome.keys()``.
    2. For each "ref" part, ``find_homologous_regions`` is run on the part's
       sequence with those loci as the regions to exclude, and the result is
       handed to ``plot_homologous_regions`` together with the part id and
       the cumulative segment-length breakpoints.
    3. Segments whose ``id`` occurs only in the "alt" parts and whose length
       is at least 50 are fetched from ``datahub.variant.sources`` and
       searched/plotted the same way, with no excluded regions.

    Args:
        datahub: Object exposing ``variant`` and ``genome``.

    Returns:
        None.
    """
    logger.info(
        "Finding homologous genomic regions (segmental duplications)...")

    variant = datahub.variant

    # Stage 1: padded loci for all reference segments.  The padded locus
    # gets its own name so the loop variable is not reassigned.
    segment_loci = []
    for part in variant.chrom_parts("ref"):
        for segment in part.segments:
            cur_chrom = misc.match_chrom_format(segment.chrom, datahub.genome.keys())
            padded_locus = intervals.Locus(
                cur_chrom, max(0, segment.start - 100), segment.end + 100, "+")
            segment_loci.append(padded_locus)

    # Stage 2: search each reference part, excluding the padded loci.
    for part in variant.chrom_parts("ref"):
        # Cumulative segment lengths, with the final total dropped.
        breakpoints = numpy.cumsum([len(segment) for segment in part.segments])[:-1]
        seq = part.get_seq()

        homologous_regions = find_homologous_regions(seq, datahub.genome, segment_loci)
        plot_homologous_regions(seq, homologous_regions, datahub.genome, part.id, breakpoints)

    # Stage 3: also look for any reasonably-sized segments that are unique
    # to the alt allele.
    ref_ids = {
        segment.id
        for part in variant.chrom_parts("ref")
        for segment in part.segments
    }
    alt_ids = {
        segment.id
        for part in variant.chrom_parts("alt")
        for segment in part.segments
    }
    alt_only_ids = alt_ids - ref_ids

    alt_only_segments = [
        segment
        for part in variant.chrom_parts("alt")
        for segment in part.segments
        if segment.id in alt_only_ids
    ]

    for segment in alt_only_segments:
        if len(segment) < 50:
            continue

        seq = datahub.variant.sources[segment.chrom].get_seq(
            segment.chrom, segment.start, segment.end, segment.strand)

        homologous_regions = find_homologous_regions(seq, datahub.genome, [])
        plot_homologous_regions(
            seq, homologous_regions, datahub.genome,
            label="segment_{}".format(segment))
def locus(self):
    """Return this alignment's reference span as an ``intervals.Locus``.

    The chromosome is ``self.chrom``, falling back to the underlying read's
    ``reference_name`` when that is ``None``.  Start and end are the read's
    ``reference_start`` and ``reference_end`` with no CIGAR-based adjustment,
    and the strand is "-" when ``self.is_reverse`` is true, otherwise "+".
    """
    chrom = self.chrom
    if chrom is None:
        chrom = self._read.reference_name

    read = self._read
    return intervals.Locus(
        chrom,
        read.reference_start,
        read.reference_end,
        "-" if self.is_reverse else "+",
    )
def set_read_supports_allele(aln_set, aln, allele, score, read_stats, breakpoint_collection, min_overlap):
    """Record on ``aln_set`` that ``aln`` supports ``allele``, if it qualifies.

    The alignment is rejected (return value 0, nothing recorded) when it is
    not concordant according to ``aln.concordant(read_stats)``, when its
    insert size lies outside the reasonable range reported by ``read_stats``,
    or when ``get_overlaps`` returns nothing for it.

    Otherwise ``aln_set.supports_allele``, ``aln_set.support_prob``
    (``score / 40.0``) and ``aln_set.supporting_aln`` are set, the best
    overlap is stored on ``aln`` as the "OV" tag and ``aln.overlap``, and the
    JSON-encoded overlaps are stored as the "Ov" tag.

    Args:
        aln_set: Object that receives the support attributes.
        aln: Alignment under test; must have exactly one entry in ``loci``.
        allele: Value stored in ``aln_set.supports_allele``.
        score: Numeric score, divided by 40.0 to give the support value.
        read_stats: Passed to ``aln.concordant`` and queried for the insert
            size limits.
        breakpoint_collection: Forwarded to ``get_overlaps``.
        min_overlap: Accepted but not used in this function body.

    Returns:
        0 when the alignment is rejected, otherwise ``aln_set.support_prob``.
    """
    if not aln.concordant(read_stats):
        return 0

    assert len(aln.loci) == 1
    aln_locus = aln.loci[0]

    # Region between the later start and the earlier end of the two mates;
    # left as None when the alignment lacks the needed attributes.
    try:
        mate1, mate2 = aln.aln1, aln.aln2
        chrom = mate1.chrom
        inner_start = max(mate1.reference_start, mate2.reference_start)
        inner_end = min(mate1.reference_end, mate2.reference_end)
        unsequenced_insert_locus = intervals.Locus(chrom, inner_start, inner_end, "+")
    except AttributeError:
        unsequenced_insert_locus = None

    # Insert-size filter; skipped when the size or limits are unavailable.
    try:
        if (aln.insert_size > read_stats.max_reasonable_insert_size()
                or aln.insert_size < read_stats.min_reasonable_insert_size()):
            return 0
    except (IndexError, AttributeError):
        pass

    overlaps = get_overlaps(aln_locus, unsequenced_insert_locus, breakpoint_collection)
    if len(overlaps) == 0:
        return 0

    # Transpose the overlap entries; column 1 holds the second element of
    # every entry, and the largest of those is the best overlap.
    columns = list(zip(*overlaps.values()))
    best_overlap = max(columns[1])

    aln_set.supports_allele = allele
    aln_set.support_prob = score / 40.0
    aln_set.supporting_aln = aln

    aln.set_tag("OV", best_overlap)
    aln.set_tag("Ov", json.dumps(overlaps))
    aln.overlap = best_overlap

    return aln_set.support_prob