Exemplo n.º 1
0
def find_homologous_regions(seq, genomesource, segments, window_size=500, offset=500):
    """Find clustered genomic loci that windows of ``seq`` align to.

    ``seq`` is scanned in windows of ``window_size`` characters whose start
    positions advance by ``offset``.  Each window is aligned with
    ``genomesource.bwa.align`` (``secondary_hit_cutoff=0.5``), and every
    returned alignment is converted to a "+"-strand ``intervals.Locus``.
    Loci that overlap ``segments`` (as decided by ``intervals.overlaps``)
    are discarded; the remaining ones are passed to ``cluster_loci``.

    Args:
        seq: Sequence to scan; it is sliced and measured with ``len``.
        genomesource: Object exposing ``bwa.align`` and ``bwa.ChrIDToName``.
        segments: Loci to exclude, passed as-is to ``intervals.overlaps``.
        window_size: Length of each aligned window.
        offset: Step between the starts of consecutive windows.

    Returns:
        Whatever ``cluster_loci`` returns for the collected loci.

    Side effects:
        Prints the first alignment of every window that has alignments.
    """
    homologous_regions = []

    # NOTE(review): the range stops before ``len(seq) - window_size``, so the
    # trailing part of ``seq`` may never be aligned -- confirm this is intended.
    # ``max(1, ...)`` guarantees one window even when ``seq`` is short.
    for window_start in range(0, max(1, len(seq) - window_size), offset):
        curseq = seq[window_start:window_start + window_size]

        cur_alns = genomesource.bwa.align(curseq, secondary_hit_cutoff=0.5)

        if len(cur_alns) > 0:
            print("BEST:", cur_alns[0])

        # The alignment loop no longer reuses the window index as its own
        # (unused) counter, so the two loops cannot be confused.
        for cur_aln in cur_alns:
            chrom = genomesource.bwa.ChrIDToName(cur_aln.reference_id)
            locus = intervals.Locus(chrom, cur_aln.reference_start, cur_aln.reference_end, "+")

            if not intervals.overlaps(locus, segments):
                homologous_regions.append(locus)

    return cluster_loci(homologous_regions)
Exemplo n.º 2
0
    def blacklist(self, blacklist_loci):
        """Replace the stored blacklist with chromosome-renamed copies of the given loci.

        For each input locus the chromosome name is passed through
        ``misc.match_chrom_format`` together with the current list of this
        object's keys, and a new ``intervals.Locus`` with that name and the
        original start, end and strand is stored in ``self._blacklist``.
        """
        self._blacklist = []

        for entry in blacklist_loci:
            known_chroms = list(self.keys())
            matched_chrom = misc.match_chrom_format(entry.chrom, known_chroms)
            converted = intervals.Locus(matched_chrom, entry.start, entry.end,
                                        entry.strand)
            self._blacklist.append(converted)
Exemplo n.º 3
0
def _get_pair_locus(aln1, aln2):
    """Return one "+"-strand locus spanning both alignments.

    Both alignments must report the same ``reference_name`` (checked with an
    ``assert``).  The locus runs from the smaller ``reference_start`` to the
    larger ``reference_end`` of the two.
    """
    assert aln1.reference_name == aln2.reference_name

    span_start = min(aln1.reference_start, aln2.reference_start)
    span_end = max(aln1.reference_end, aln2.reference_end)

    return intervals.Locus(aln1.reference_name, span_start, span_end, "+")
Exemplo n.º 4
0
    def loci(self):
        """Return the loci covered by this pair of alignments, as a list.

        When the two alignments have different ``reference_id`` values, each
        alignment's own ``locus`` is returned (first alignment first).
        Otherwise a single "+"-strand locus is returned that spans from the
        smaller ``reference_start`` to the larger ``reference_end``.
        """
        first, second = self.aln1, self.aln2

        if first.reference_id != second.reference_id:
            return [first.locus, second.locus]

        chrom = first.chrom
        span_start = min(first.reference_start, second.reference_start)
        span_end = max(first.reference_end, second.reference_end)

        return [intervals.Locus(chrom, span_start, span_end, "+")]
Exemplo n.º 5
0
def get_internal_segments(sv, extend=20):
    """Build "+"-strand loci around the segments of the "ref" chromosome parts.

    For every part returned by ``sv.chrom_parts("ref")`` one locus is created
    per segment, padded by ``extend``:

    * last segment of the part: ``start - extend`` .. ``start + extend``
    * first segment of the part: ``end - extend`` .. ``end + extend``
    * any other segment: ``start - extend`` .. ``end + extend``

    The "last" rule is checked first, so a part with a single segment is
    handled by the "last segment" rule.

    Returns:
        List of ``intervals.Locus`` objects in part/segment order.
    """
    collected = []

    for part in sv.chrom_parts("ref"):
        segs = part.segments
        last_index = len(segs) - 1

        for index, seg in enumerate(segs):
            if index == last_index:
                lower, upper = seg.start - extend, seg.start + extend
            elif index == 0:
                lower, upper = seg.end - extend, seg.end + extend
            else:
                lower, upper = seg.start - extend, seg.end + extend

            collected.append(intervals.Locus(seg.chrom, lower, upper, "+"))

    return collected
Exemplo n.º 6
0
def do_homology_search(datahub):
    """Search for and plot regions homologous to the variant's sequences.

    Steps, in order:

    1. Every segment of the "ref" chromosome parts is turned into a
       "+"-strand locus padded by 100 on each side (start floored at 0),
       with the chromosome name passed through ``misc.match_chrom_format``.
    2. For each "ref" part, its sequence is searched with
       ``find_homologous_regions`` (excluding the padded loci) and the result
       is plotted together with the cumulative segment lengths (all but the
       last) as breakpoints.
    3. Segments whose ``id`` occurs only in the "alt" parts and whose length
       is at least 50 are searched and plotted individually, without any
       excluded loci.
    """
    logger.info(
        "Finding homologous genomic regions (segmental duplications)...")

    variant = datahub.variant

    # Step 1: padded loci of the reference segments, used as an exclusion list.
    padded_loci = []
    for part in variant.chrom_parts("ref"):
        for seg in part.segments:
            matched_chrom = misc.match_chrom_format(seg.chrom,
                                                    datahub.genome.keys())
            padded = intervals.Locus(matched_chrom, max(0, seg.start - 100),
                                     seg.end + 100, "+")
            padded_loci.append(padded)

    # Step 2: one search/plot per reference part.
    for part in variant.chrom_parts("ref"):
        segment_lengths = [len(seg) for seg in part.segments]
        breakpoints = numpy.cumsum(segment_lengths)[:-1]

        part_seq = part.get_seq()
        regions = find_homologous_regions(part_seq, datahub.genome,
                                          padded_loci)
        plot_homologous_regions(part_seq, regions, datahub.genome,
                                part.id, breakpoints)

    # Step 3: reasonably-sized segments that are unique to the alt allele.
    ref_ids = {seg.id for part in variant.chrom_parts("ref")
               for seg in part.segments}
    alt_ids = {seg.id for part in variant.chrom_parts("alt")
               for seg in part.segments}
    alt_only_ids = alt_ids - ref_ids

    alt_only_segments = []
    for part in variant.chrom_parts("alt"):
        for seg in part.segments:
            if seg.id in alt_only_ids:
                alt_only_segments.append(seg)

    for seg in alt_only_segments:
        if len(seg) < 50:
            continue

        source = datahub.variant.sources[seg.chrom]
        seg_seq = source.get_seq(seg.chrom, seg.start, seg.end, seg.strand)

        regions = find_homologous_regions(seg_seq, datahub.genome, [])
        plot_homologous_regions(seg_seq,
                                regions,
                                datahub.genome,
                                label="segment_{}".format(seg))
Exemplo n.º 7
0
    def locus(self):
        """Return this alignment's reference interval as an ``intervals.Locus``.

        The chromosome is ``self.chrom``, falling back to the wrapped read's
        ``reference_name`` when that is ``None``.  Start and end are the
        read's ``reference_start`` and ``reference_end``; the strand is "-"
        when ``self.is_reverse`` is truthy and "+" otherwise.
        """
        read = self._read

        chrom = self.chrom
        if chrom is None:
            chrom = read.reference_name

        # Coordinates are used exactly as reported by the read; an earlier
        # adjustment based on the first/last ``cigartuples`` entry (operation
        # code 4) was left disabled.
        strand = "-" if self.is_reverse else "+"

        return intervals.Locus(chrom, read.reference_start,
                               read.reference_end, strand)
Exemplo n.º 8
0
def set_read_supports_allele(aln_set, aln, allele, score, read_stats,
                             breakpoint_collection, min_overlap):
    """Record on ``aln_set`` that ``aln`` supports ``allele``, if it qualifies.

    The alignment is rejected (return value ``0``, nothing modified) when:

    * ``aln.concordant(read_stats)`` is falsy,
    * its ``insert_size`` lies outside the range given by
      ``read_stats.max_reasonable_insert_size()`` /
      ``read_stats.min_reasonable_insert_size()`` (an ``IndexError`` or
      ``AttributeError`` during this check skips it), or
    * ``get_overlaps`` reports no overlaps.

    Otherwise ``aln_set`` receives ``supports_allele``, ``support_prob``
    (``score / 40.0``) and ``supporting_aln``; ``aln`` receives the "OV" and
    "Ov" tags plus an ``overlap`` attribute.

    ``min_overlap`` is accepted but not used in this function.

    Returns:
        ``aln_set.support_prob`` on success, ``0`` otherwise.
    """
    if not aln.concordant(read_stats):
        return 0

    assert len(aln.loci) == 1
    aln_locus = aln.loci[0]

    # Region between the two mates' alignments; alignments lacking the
    # needed attributes simply have no such region.
    try:
        mate1, mate2 = aln.aln1, aln.aln2
        inner_start = max(mate1.reference_start, mate2.reference_start)
        inner_end = min(mate1.reference_end, mate2.reference_end)
        unsequenced_insert_locus = intervals.Locus(mate1.chrom, inner_start,
                                                   inner_end, "+")
    except AttributeError:
        unsequenced_insert_locus = None

    try:
        too_long = aln.insert_size > read_stats.max_reasonable_insert_size()
        if too_long or aln.insert_size < read_stats.min_reasonable_insert_size():
            return 0
    except (IndexError, AttributeError):
        pass

    overlaps = get_overlaps(aln_locus, unsequenced_insert_locus,
                            breakpoint_collection)
    if len(overlaps) == 0:
        return 0

    # Transpose the overlap records and take the largest value found in
    # their second position.
    columns = list(zip(*overlaps.values()))
    best_overlap = max(columns[1])

    aln_set.supports_allele = allele
    aln_set.support_prob = score / 40.0  #(1 - mapq.phred_to_prob(score, 10.0))
    aln_set.supporting_aln = aln

    aln.set_tag("OV", best_overlap)
    aln.set_tag("Ov", json.dumps(overlaps))
    aln.overlap = best_overlap

    return aln_set.support_prob