예제 #1
0
def get_reads_unpaired(sample, datahub):
    """Yield batches of alignments overlapping the variant's search regions.

    Reads are fetched from ``sample.bam`` for every region returned by
    ``datahub.variant.search_regions(sample.search_distance)``.
    Supplementary, duplicate and secondary reads are skipped, as are reads
    whose ``mapq`` is below ``datahub.args.min_mapq`` (when that option is
    truthy). Each surviving read is wrapped in ``alignment.Alignment``.

    Yields:
        Lists of ``alignment.Alignment`` objects. When
        ``datahub.args.batch_size`` is not None, a list is yielded as soon as
        it reaches that many reads. A final list holding whatever remains is
        always yielded, so the last batch may be empty.
    """
    logger.info("Loading more reads...")
    batch = []

    for region in datahub.variant.search_regions(sample.search_distance):
        region_chrom = region.chrom
        region_start = region.start
        region_end = region.end

        # The BAM may name chromosomes differently from the variant, so
        # translate the name against the BAM's own reference list.
        bam_chrom = misc.match_chrom_format(region_chrom,
                                            sample.bam.references)

        for read in sample.bam.fetch(bam_chrom, region_start, region_end):
            if read.is_supplementary or read.is_duplicate or read.is_secondary:
                continue

            mapq_cutoff = datahub.args.min_mapq
            if mapq_cutoff and read.mapq < mapq_cutoff:
                continue

            batch.append(alignment.Alignment(read))

            batch_limit = datahub.args.batch_size
            if batch_limit is None or len(batch) < batch_limit:
                continue

            # Batch is full: hand it to the consumer and start a new one.
            yield batch
            logger.info("Loading more reads...")
            batch = []

    yield batch
예제 #2
0
    def get_seq(self, chrom, start, end, strand):
        """Return the sequence of ``chrom`` from ``start`` through ``end``.

        The chromosome name is first matched against the names present in
        ``self.fasta``. ``end`` is treated as inclusive: the slice taken is
        ``[start:end + 1]``. When ``strand`` is ``"-"`` the result is passed
        through ``misc.reverse_comp`` before being returned.
        """
        known_chroms = list(self.fasta.keys())
        fasta_chrom = misc.match_chrom_format(chrom, known_chroms)

        forward_seq = self.fasta[fasta_chrom][start:end + 1]
        return misc.reverse_comp(forward_seq) if strand == "-" else forward_seq
예제 #3
0
    def blacklist(self, blacklist_loci):
        """Replace the stored blacklist with normalized copies of the loci.

        For each locus in ``blacklist_loci`` a new ``intervals.Locus`` is
        built with the same start, end and strand, but with its chromosome
        name matched against ``self.keys()``. The results are stored, in
        order, in ``self._blacklist``.
        """
        self._blacklist = []

        for locus in blacklist_loci:
            original_chrom = locus.chrom
            known_chroms = list(self.keys())
            matched_chrom = misc.match_chrom_format(original_chrom,
                                                    known_chroms)
            normalized = intervals.Locus(matched_chrom, locus.start,
                                         locus.end, locus.strand)
            self._blacklist.append(normalized)
예제 #4
0
def do_homology_search(datahub):
    """Search for and plot regions homologous to the variant's sequences.

    Two passes are made:

    1. For every chromosome part of the "ref" allele, the part's sequence is
       searched with ``find_homologous_regions`` and the result is plotted
       with ``plot_homologous_regions``. The padded reference segment loci
       are passed to the search as its third argument.
    2. For every segment that appears only in the "alt" allele and has a
       length of at least 50, the segment's sequence is fetched from
       ``datahub.variant.sources`` and searched and plotted the same way,
       with an empty locus list.
    """
    logger.info(
        "Finding homologous genomic regions (segmental duplications)...")

    variant = datahub.variant

    # Every reference segment, widened by 100 on each side (start clamped at
    # 0) and placed on the "+" strand, with its chromosome name matched to
    # the genome's naming.
    padded_ref_loci = [
        intervals.Locus(
            misc.match_chrom_format(ref_segment.chrom, datahub.genome.keys()),
            max(0, ref_segment.start - 100),
            ref_segment.end + 100,
            "+")
        for ref_part in variant.chrom_parts("ref")
        for ref_segment in ref_part.segments
    ]

    for ref_part in variant.chrom_parts("ref"):
        # Running totals of segment lengths, without the final total;
        # presumably the internal segment boundaries within the part.
        segment_lengths = [len(ref_segment) for ref_segment in ref_part.segments]
        breakpoints = numpy.cumsum(segment_lengths)[:-1]

        part_seq = ref_part.get_seq()
        part_hits = find_homologous_regions(part_seq, datahub.genome,
                                            padded_ref_loci)
        plot_homologous_regions(part_seq, part_hits, datahub.genome,
                                ref_part.id, breakpoints)

    # Also look for any reasonably-sized segments unique to the alt allele.
    ref_ids = {
        seg.id
        for part in variant.chrom_parts("ref") for seg in part.segments
    }
    alt_ids = {
        seg.id
        for part in variant.chrom_parts("alt") for seg in part.segments
    }
    alt_only_ids = alt_ids.difference(ref_ids)

    alt_only_segments = []
    for alt_part in variant.chrom_parts("alt"):
        for alt_segment in alt_part.segments:
            if alt_segment.id in alt_only_ids:
                alt_only_segments.append(alt_segment)

    for alt_segment in alt_only_segments:
        if len(alt_segment) < 50:
            continue

        source = datahub.variant.sources[alt_segment.chrom]
        segment_seq = source.get_seq(alt_segment.chrom, alt_segment.start,
                                     alt_segment.end, alt_segment.strand)

        segment_hits = find_homologous_regions(segment_seq, datahub.genome, [])
        plot_homologous_regions(segment_seq,
                                segment_hits,
                                datahub.genome,
                                label="segment_{}".format(alt_segment))