def get_reads_unpaired(sample, datahub):
    """Yield lists of alignments fetched around the variant for ``sample``.

    Reads are fetched from ``sample.bam`` for every region returned by
    ``datahub.variant.search_regions(sample.search_distance)``, and each
    read that is kept is wrapped in ``alignment.Alignment``.

    Supplementary, duplicate and secondary reads are skipped, as are reads
    whose ``mapq`` is below ``datahub.args.min_mapq`` when that option is
    truthy.

    A list is yielded as soon as it holds at least
    ``datahub.args.batch_size`` alignments (no limit when that option is
    ``None``). Whatever has accumulated afterwards is yielded once at the
    end, so the final list may be empty.
    """
    logger.info("Loading more reads...")
    batch = []

    for region in datahub.variant.search_regions(sample.search_distance):
        raw_chrom, lo, hi = region.chrom, region.start, region.end
        # Resolve the region's chromosome name against the BAM's own
        # reference names before fetching.
        contig = misc.match_chrom_format(raw_chrom, sample.bam.references)

        for read in sample.bam.fetch(contig, lo, hi):
            is_auxiliary = (
                read.is_supplementary or read.is_duplicate or read.is_secondary
            )
            if is_auxiliary:
                continue

            # A falsy min_mapq (None or 0) disables the quality filter.
            mapq_floor = datahub.args.min_mapq
            if mapq_floor and read.mapq < mapq_floor:
                continue

            batch.append(alignment.Alignment(read))

            limit = datahub.args.batch_size
            if limit is None or len(batch) < limit:
                continue

            # Batch is full: hand it to the caller and start a new one.
            yield batch
            logger.info("Loading more reads...")
            batch = []

    yield batch
def get_seq(self, chrom, start, end, strand):
    """Return the sequence of ``chrom`` from ``start`` through ``end``.

    ``chrom`` is first resolved against the keys of ``self.fasta`` via
    ``misc.match_chrom_format``. ``end`` is inclusive: the slice taken is
    ``start:end + 1``. When ``strand`` is ``"-"`` the result is passed
    through ``misc.reverse_comp`` before being returned.
    """
    known_names = list(self.fasta.keys())
    contig = misc.match_chrom_format(chrom, known_names)

    # +1 because the caller's end coordinate is inclusive.
    forward = self.fasta[contig][start:end + 1]

    return misc.reverse_comp(forward) if strand == "-" else forward
def blacklist(self, blacklist_loci):
    """Replace ``self._blacklist`` with re-keyed copies of ``blacklist_loci``.

    For every input locus a new ``intervals.Locus`` is created with the
    same start, end and strand, and with its chromosome name resolved
    against ``self.keys()`` via ``misc.match_chrom_format``. Any previous
    contents of ``self._blacklist`` are discarded.
    """
    self._blacklist = []

    for entry in blacklist_loci:
        contig = misc.match_chrom_format(entry.chrom, list(self.keys()))
        normalized = intervals.Locus(
            contig, entry.start, entry.end, entry.strand)
        self._blacklist.append(normalized)
def do_homology_search(datahub):
    """Find and plot genomic regions homologous to the variant's sequences.

    Two passes are made:

    1. For every "ref" chromosome part of ``datahub.variant``, its full
       sequence is searched with ``find_homologous_regions`` and plotted
       with ``plot_homologous_regions``, together with the offsets of the
       junctions between the part's segments.
    2. Every segment that appears only in the "alt" parts (by ``id``) and
       is at least 50 long is searched and plotted on its own, labelled
       ``segment_<segment>``.
    """
    logger.info(
        "Finding homologous genomic regions (segmental duplications)...")
    variant = datahub.variant

    # Collect every reference segment as a "+"-strand locus padded by 100 on
    # each side (start clamped at 0). These are handed to the search below;
    # presumably so hits on the variant's own location can be recognised --
    # confirm in find_homologous_regions.
    padded_ref_loci = []
    for ref_part in variant.chrom_parts("ref"):
        for ref_segment in ref_part.segments:
            contig = misc.match_chrom_format(
                ref_segment.chrom, datahub.genome.keys())
            padded_ref_loci.append(
                intervals.Locus(
                    contig,
                    max(0, ref_segment.start - 100),
                    ref_segment.end + 100,
                    "+"))

    for ref_part in variant.chrom_parts("ref"):
        # Running total of segment lengths, minus the last entry: the
        # offsets within the part's sequence where one segment ends and the
        # next begins.
        segment_lengths = [len(seg) for seg in ref_part.segments]
        breakpoints = numpy.cumsum(segment_lengths)[:-1]

        part_seq = ref_part.get_seq()
        hits = find_homologous_regions(
            part_seq, datahub.genome, padded_ref_loci)
        plot_homologous_regions(
            part_seq, hits, datahub.genome, ref_part.id, breakpoints)

    # Also look for any reasonably-sized segments that are unique to the
    # alt allele.
    ref_ids = {
        seg.id
        for part in variant.chrom_parts("ref")
        for seg in part.segments
    }
    alt_ids = {
        seg.id
        for part in variant.chrom_parts("alt")
        for seg in part.segments
    }
    novel_ids = alt_ids - ref_ids

    novel_segments = []
    for alt_part in variant.chrom_parts("alt"):
        for alt_segment in alt_part.segments:
            if alt_segment.id in novel_ids:
                novel_segments.append(alt_segment)

    for alt_segment in novel_segments:
        if len(alt_segment) < 50:
            continue

        # The segment's chromosome name selects the source that provides
        # its sequence.
        source = datahub.variant.sources[alt_segment.chrom]
        segment_seq = source.get_seq(
            alt_segment.chrom, alt_segment.start, alt_segment.end,
            alt_segment.strand)

        hits = find_homologous_regions(segment_seq, datahub.genome, [])
        plot_homologous_regions(
            segment_seq, hits, datahub.genome,
            label="segment_{}".format(alt_segment))