def convert_region(region, pflank):
    # re-center the interval on its midpoint and extend by pflank on each side
    top = (region.end + region.start) // 2
    return Interval(region.chrom, top - pflank, top + pflank + 1,
                    region.name, region.score, region.strand)
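# A minimal usage sketch of convert_region, assuming pybedtools is available;
# the peak coordinates below are made up for illustration.
from pybedtools import Interval

peak = Interval('chr1', 100, 151, name='peak1', score='42', strand='+')
centered = convert_region(peak, pflank=25)
# the midpoint is (100 + 151) // 2 = 125, so the result spans 100..151 (51 bp)
print(centered.start, centered.end)  # 100 151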
                    default=0, type=int,
                    help="Score threshold for the binding peaks")
args = parser.parse_args()

trackopts = "track name=\"%s\" description=\"%s\" visibility=1 color=0,60,120 useScore=1" % (
    args.name, args.description)
print(trackopts)

regions = BedTool(args.path)
regions = BedTool([x for x in regions if float(x.score) > args.minscore])

# UCSC useScore grayscale bins; map score percentiles onto them
score_range = [166, 277, 388, 499, 611, 722, 833, 945, 1000]
percentile_range = list(np.linspace(0, 100, len(score_range) + 1))
scores = [float(x.score) for x in regions]
thresholds = np.percentile(scores, percentile_range)
thresholds[-1] *= 1.01  # keep the maximum score inside the top bin

updated_regions = []
for region in regions:
    for ucsc_score, t1, t2 in zip(score_range, thresholds, thresholds[1:]):
        if t1 <= float(region.score) < t2:
            updated_regions.append(
                Interval('chr1', region.start, region.end, region.name,
                         str(ucsc_score - 2), region.strand))

for interval in updated_regions:
    sys.stdout.write(str(interval))
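# For reference, a standalone sketch of the same percentile-to-UCSC-grayscale
# binning on toy scores (assumes only numpy; the scores are invented and not
# part of the original script).
import numpy as np

score_range = [166, 277, 388, 499, 611, 722, 833, 945, 1000]
toy_scores = np.array([1.0, 2.5, 3.0, 7.2, 9.9, 12.0, 20.0, 35.0, 50.0])
thresholds = np.percentile(toy_scores, np.linspace(0, 100, len(score_range) + 1))
thresholds[-1] *= 1.01  # keep the maximum score inside the top bin
for s in toy_scores:
    for ucsc, t1, t2 in zip(score_range, thresholds, thresholds[1:]):
        if t1 <= s < t2:
            print("%6.1f -> useScore %d" % (s, ucsc))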
             ylabel, va='center', rotation='vertical', fontsize='xx-large')
plt.savefig(path, format=args.format)
plt.close()

##########################################################################################################################

regions = BedTool([
    Interval(x.chrom, max(0, x.start - args.flen), x.end + args.flen,
             name=x.name, score=x.attrs['zscores'], strand=x.strand)
    for x in regions
])

annotation = BedTool(args.annotation)
if not args.custom:
    annotation = [x for x in annotation if x[2] in ['gene', 'pseudogene']]

rawannotated = regions.intersect(annotation, wo=True)
annotated = defaultdict(list)


def get_annotation(intersection, offset):
    return max(intersection.start, int(intersection[offset + 3])), min(
        intersection.end, int(intersection[offset + 4])), dict([
            current_drop.append(int(a[1]))
            gcdrops.append(tuple(current_drop))
            current_drop = []

gcdrops_intervals = []
for c, (start, end) in enumerate(gcdrops, start=1):
    score = min(gc_content[start:end])
    if score <= 0.35:
        gcdrops_intervals.append(
            Interval('chr1', start, end, name="drop%d" % c,
                     score="%1.5f" % score, strand='+'))

gcdrops_intervals = BedTool(gcdrops_intervals)
print(len(gcdrops_intervals))

# count how many regions fall into each GC drop
gcdrops_regions = gcdrops_intervals.intersect(regions, c=True)
def consensus(
        bed: Iterable[BedTool],
        weights: List[int],
        threshold: float,
        merge: Callable[[Iterable[Interval], Interval], Interval] = _default_merge
) -> BedTool:
    """
    :param bed: original bed files
    :param weights: per-file vote weights; pass all ones if unsure
    :param threshold: minimum sum of votes required to call a region conserved
    :param merge: function to transfer meta information from the voted intervals to the new interval
    :return: consensus regions as a BedTool
    """
    # 1. Create intervals that have the same number of full hits with respect to the original bed files.
    # 2. Threshold regions by the number of hits.
    # |▓▓▓| |▓▓▓▓▓▓▓▓▓|
    # |▓▓|  |▓▓▓▓▓▓▓▓▓|
    # |▓|   |▓▓▓▓▓▓▓▓▓|
    # -----------------------
    # |▓|▓| |▓▓|▓▓|▓▓▓|▓▓|▓▓|
    # |3|2| |1 |2 |3 |2 |1 |
    result = []

    # Loop over the bed files in parallel: at each step take the sub-interval
    # between the two smallest boundaries and count its hits.
    iterators = [iter(b.sort()) for b in bed]
    assert len(weights) == len(iterators), "len(regions) != len(weights)"
    intervals = [(ind, next(b_iter)) for ind, b_iter in enumerate(iterators)]

    while intervals:
        boundaries = sorted(
            set([(inter.chrom, inter.start) for _, inter in intervals] +
                [(inter.chrom, inter.end) for _, inter in intervals]))
        # [(chrom, boundary), ...]
        schrom, start = boundaries[0]
        echrom, end = boundaries[1]
        assert schrom == echrom

        hits = []
        for ind, inter in intervals:
            if inter.chrom == schrom and inter.start <= start and end <= inter.end:
                hits.append(weights[ind])
        assert len(hits) > 0

        if sum(hits) >= threshold:
            consolidated = Interval(schrom, start, end)
            consolidated = merge(hits, consolidated)
            result.append(consolidated)

        # Push the intervals forward
        new_intervals = []
        for ind, inter in intervals:
            if inter.chrom != schrom:
                new_intervals.append((ind, inter))
                continue
            assert inter.start >= start
            if start == inter.start:
                assert end <= inter.end
                inter.start = end
                # request the next interval once this one is exhausted
                if inter.start == inter.end:
                    try:
                        inter = next(iterators[ind])
                        new_intervals.append((ind, inter))
                    except StopIteration:
                        continue
                else:
                    new_intervals.append((ind, inter))
            else:
                new_intervals.append((ind, inter))
        intervals = new_intervals

    return BedTool(result).sort().merge()
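# A usage sketch for consensus() on toy data, assuming _default_merge returns
# the interval unchanged; with threshold=2, only positions covered by at least
# two of the three inputs survive.
from pybedtools import BedTool

a = BedTool('chr1\t0\t30', from_string=True)
b = BedTool('chr1\t10\t40', from_string=True)
c = BedTool('chr1\t20\t50', from_string=True)
for iv in consensus([a, b, c], weights=[1, 1, 1], threshold=2):
    print(iv)  # expected: chr1 10-40, the region covered by >= 2 inputs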
def get_flanks(include_file, seeds, up_distance, down_distance):
    # .intervals returns the interval tree
    include = include_file.intervals

    def bad_cov(intervals):
        # brute-force unique-base coverage; only used for sanity checks like
        # assert bad_cov(flanks) == sum(i.length for i in flanks)
        coverage = set()
        for i in intervals:
            coverage.update(range(i.start, i.end))
        return len(coverage)

    fmt = "{s.chrom}\t{s.start}\t{s.end}\t{t}\t{left}\t{right}\t{n_intervals}\n"
    for seed in seeds:
        seed_hits = include.all_hits(seed)
        # and either continue or yield None?
        if seed_hits == []:
            continue
        seed_hits = sorted(seed_hits, key=attrgetter('start'))

        if seed.strand == "-":
            region_left = Interval(seed.chrom, max(0, seed.start - down_distance), seed.start)
            region_right = Interval(seed.chrom, seed.end, seed.end + up_distance)
        else:
            region_left = Interval(seed.chrom, max(0, seed.start - up_distance), seed.start)
            region_right = Interval(seed.chrom, seed.end, seed.end + down_distance)
        # this is needed, unfortunately
        region_right.file_type = region_left.file_type = "bed"

        # this won't handle overlapping include intervals...
        include_left = sorted(include.all_hits(region_left), key=attrgetter('start'))
        include_right = sorted(include.all_hits(region_right), key=attrgetter('start'))

        if include_left:
            # truncate so the seed point is excluded; flanks stay unique for
            # left and right, and the seed is added back at the end
            include_left[-1].end = min(include_left[-1].end, seed.start)
            # adjust the start so we get exactly the requested padding
            include_left[0].start = max(include_left[0].start, region_left.start)
        if include_right:
            # truncate to the flanking region
            include_right[0].start = max(include_right[0].start, seed.end)
            include_right[-1].end = min(include_right[-1].end, region_right.end)

        assert include_left == [] or include_left[-1].end <= seed.start
        assert include_right == [] or seed.end <= include_right[0].start

        # truncate seed_hits to the actual seed region
        seed_hits[0].start = max(seed_hits[0].start, seed.start)
        seed_hits[-1].end = min(seed_hits[-1].end, seed.end)

        flanks = include_left + include_right
        total_bases = sum(i.length for i in flanks) + sum(i.length for i in seed_hits)
        yield flanks, seed_hits, total_bases, seed
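# A usage sketch for get_flanks() with hypothetical file names; include_file
# and seeds are assumed to be pybedtools BedTool objects over the same genome,
# so that include_file.intervals yields an interval tree with all_hits().
from pybedtools import BedTool

include = BedTool('mappable_regions.bed')  # hypothetical include track
seeds = BedTool('seed_intervals.bed')      # hypothetical seeds
for flanks, seed_hits, total_bases, seed in get_flanks(
        include, seeds, up_distance=1000, down_distance=500):
    print(seed.chrom, seed.start, seed.end, total_bases, len(flanks))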
def testRichCmp(self):
    # be obsessive . . .
    def check(a, b, eq, le, ge, lt, gt):
        # verify every rich comparison operator against the expected outcome
        self.assertEqual(a == b, eq)
        self.assertEqual(a != b, not eq)
        self.assertEqual(a <= b, le)
        self.assertEqual(a >= b, ge)
        self.assertEqual(a < b, lt)
        self.assertEqual(a > b, gt)

    # ==
    check(Interval("chr21", 100, 200), Interval("chr21", 100, 200),
          eq=True, le=True, ge=True, lt=False, gt=False)
    check(Interval("chr21", 100, 100), Interval("chr21", 100, 100),
          eq=True, le=True, ge=True, lt=False, gt=False)
    # != because of strand
    check(Interval("chr21", 100, 200, strand='+'),
          Interval("chr21", 100, 200, strand='-'),
          eq=False, le=False, ge=False, lt=False, gt=False)
    # a >= b
    check(Interval("chr21", 100, 300), Interval("chr21", 100, 200),
          eq=False, le=False, ge=True, lt=False, gt=False)
    # a <= b
    check(Interval("chr21", 100, 300), Interval("chr21", 300, 300),
          eq=False, le=True, ge=False, lt=False, gt=False)
    check(Interval("chr21", 100, 300), Interval("chr21", 250, 300),
          eq=False, le=True, ge=False, lt=False, gt=False)
    # a < b
    check(Interval("chr21", 100, 200), Interval("chr21", 201, 300),
          eq=False, le=True, ge=False, lt=True, gt=False)
    # a > b
    check(Interval("chr21", 201, 300), Interval("chr21", 100, 200),
          eq=False, le=False, ge=True, lt=False, gt=True)
    # a != b (different chromosomes)
    check(Interval("none", 1, 100), Interval("chr21", 1, 100),
          eq=False, le=False, ge=False, lt=False, gt=False)

    # nested intervals should raise NotImplementedError
    a = Interval("chr21", 100, 200)
    b = Interval("chr21", 50, 300)
    for op in ('__eq__', '__ne__', '__le__', '__ge__', '__lt__', '__gt__'):
        self.assertRaises(NotImplementedError, getattr(a, op), b)
def __getitem__(self, idx):
    from pybedtools import Interval
    if self.fasta_extractor is None:
        # first call: set up the fasta/bigwig extractors
        self.fasta_extractor = FastaExtractor(self.ds.fasta_file, use_strand=True)
        self.bw_extractors = {
            task: [BigwigExtractor(track) for track in task_spec.tracks]
            for task, task_spec in self.ds.task_specs.items() if task in self.tasks
        }
        self.bias_bw_extractors = {
            task: [BigwigExtractor(track) for track in task_spec.tracks]
            for task, task_spec in self.ds.bias_specs.items()
        }

    # Get the genomic interval for that particular datapoint
    interval = Interval(self.dfm.iat[idx, 0],  # chrom
                        self.dfm.iat[idx, 1],  # start
                        self.dfm.iat[idx, 2])  # end

    # Transform the input interval (e.g. for augmentation)
    if self.interval_transformer is not None:
        interval = self.interval_transformer(interval)

    # resize the intervals to the desired widths
    target_interval = resize_interval(deepcopy(interval), self.peak_width)
    seq_interval = resize_interval(deepcopy(interval), self.seq_width)

    # This only kicks in when we specify the taskname from dataspec
    # in the 3rd column. E.g. it doesn't apply when using intervals_file
    interval_from_task = self.dfm.iat[idx, 3] if self.intervals_file is None else ''

    # extract the DNA sequence and one-hot encode it
    sequence = self.fasta_extractor([seq_interval])[0]
    inputs = {"seq": sequence}

    # extract the profile counts from the bigwigs
    cuts = {
        f"{task}/profile": _run_extractors(self.bw_extractors[task],
                                           [target_interval],
                                           sum_tracks=spec.sum_tracks)[0]
        for task, spec in self.ds.task_specs.items() if task in self.tasks
    }
    if self.track_transform is not None:
        for task in self.tasks:
            cuts[f'{task}/profile'] = self.track_transform(cuts[f'{task}/profile'])

    # Add the binary activity labels
    for i, task in enumerate(self.tasks):
        cuts[f'{task}/activity'] = self.dfm.iat[idx, (4 + i)]

    # Add the total number of counts
    for task in self.tasks:
        cuts[f'{task}/counts'] = self.total_count_transform(
            cuts[f'{task}/profile'].sum(0))

    if len(self.ds.bias_specs) > 0:
        # Extract the bias tracks
        biases = {
            bias_task: _run_extractors(self.bias_bw_extractors[bias_task],
                                       [target_interval],
                                       sum_tracks=spec.sum_tracks)[0]
            for bias_task, spec in self.ds.bias_specs.items()
        }
        task_biases = {
            f"bias/{task}/profile": np.concatenate(
                [biases[bt] for bt in self.task_bias_tracks[task]], axis=-1)
            for task in self.tasks
        }
        if self.track_transform is not None:
            for task in self.tasks:
                task_biases[f'bias/{task}/profile'] = self.track_transform(
                    task_biases[f'bias/{task}/profile'])

        # Add the total number of bias counts
        for task in self.tasks:
            task_biases[f'bias/{task}/counts'] = self.total_count_transform(
                task_biases[f'bias/{task}/profile'].sum(0))
        inputs = {**inputs, **task_biases}

    if self.include_classes:
        # Optionally, add binary labels from the additional columns
        # in the tsv intervals file
        classes = {
            f"{task}/class": self.dfm.iat[idx, i + 3]
            for i, task in enumerate(self.dfm_tasks) if task in self.tasks
        }
        cuts = {**cuts, **classes}

    out = {"inputs": inputs, "targets": cuts}

    if self.include_metadata:
        # remember the metadata (which genomic interval was used)
        out['metadata'] = {
            "range": GenomicRanges(
                chr=target_interval.chrom,
                start=target_interval.start,
                end=target_interval.stop,
                id=idx,
                strand=(target_interval.strand
                        if target_interval.strand is not None else "*"),
            ),
            "interval_from_task": interval_from_task
        }
    return out
def testOverlaps(self):
    i = Interval("chr21", 9719768, 9739768)
    hits = self.bed.all_hits(i)
    self.assertEqual(len(hits), 8)
    for hit in hits:
        self.assert_(hit.start <= 9739768 and hit.end >= 9719768)
def setUp(self):
    self.file = os.path.join(PATH, self.file)
    start, end, strand = 1, 100, "+"
    self.i = Interval("chr1", start, end, strand=strand)
    self.start, self.end, self.strand = start, end, strand
def _fetch(self, interval, istart, iend):
    seq = self.fasta.extract(Interval(interval.chrom, istart, iend))
    seq = Sequence(name=interval.chrom, seq=seq, start=istart, end=iend)
    return seq
def __init__(self, data, reverse_complement_bool=False, contig=None, strand=None):
    """
    Constructor

    :param data: genome of the organism under study
    :type data: GFGenome or pyensembl.Genome object
    :param reverse_complement_bool: True if the reverse complement of the 5' UTR
        sequence is required for "-" strand transcripts
    :type reverse_complement_bool: bool
    :param contig: optional, number of the chromosome without 'chr'
    :type contig: str
    :param strand: optional, chromosome strand, '+' or '-'
    :return: initializes the following attributes:
        - seq[]: list of 5' UTR sequences with exons and introns
        - seq_exons[]: list of 5' UTR sequences with only exons. NOTE: it gives
          the exons corresponding to the 5' UTR, so the last corresponding exon
          gets cropped at start_codon_positions[0]
        - intervals[]: list of 5' UTR intervals
        - transcripts{}: key - transcript id, value - index of the corresponding 5' UTR
        - exons{}: key - 5' UTR index, value - list of (exon sequence, exon interval) tuples
    """
    # NOTE: two 5' UTRs are considered equal only if both their seq and seq_exons are equal
    self.seq_exons = []
    self.intervals = []
    self.transcripts = {}
    self.exons = {}
    count = 0

    def process_plus(transcript):
        # collect the 5' UTR exons of a "+" strand transcript and register
        # the UTR if it has not been seen before
        nonlocal count
        temp_exon_list = []
        start = transcript.start
        start_pos = 0
        for exon in transcript.exons:
            if transcript.start_codon_positions[0] >= exon.start >= start:
                if exon.end > transcript.start_codon_positions[0]:
                    # the last 5' UTR exon is cropped at the start codon
                    temp_exon_list.append(
                        (transcript.five_prime_utr_sequence[
                             start_pos:start_pos + transcript.start_codon_positions[0] - exon.start],
                         Interval(transcript.contig, exon.start,
                                  transcript.start_codon_positions[0] - 1,
                                  exon.id, 0, "+")))  # dummy score
                else:
                    temp_exon_list.append(
                        (transcript.five_prime_utr_sequence[
                             start_pos:start_pos + exon.end - exon.start + 1],
                         Interval(transcript.contig, exon.start, exon.end,
                                  exon.id, 0, "+")))  # dummy score
                start_pos += exon.end - exon.start + 1
                start = exon.start
        # apparently two 5' UTRs can have the same exonic sequences
        # but different exonic+intronic sequences
        if transcript.five_prime_utr_sequence not in self.seq_exons:
            is_new = True
        else:
            pos = self.seq_exons.index(transcript.five_prime_utr_sequence)
            is_new = (self.intervals[pos].strand == "+"
                      and self.intervals[pos].start != transcript.exons[0].start)
        if is_new:
            self.seq_exons.append(transcript.five_prime_utr_sequence)
            self.intervals.append(
                Interval(transcript.contig, transcript.exons[0].start,
                         transcript.exons[-1].end, "5' UTR", 0, "+"))  # dummy score
            self.exons[count] = temp_exon_list
            self.transcripts[transcript.id] = count
            count += 1
        else:
            self.transcripts[transcript.id] = self.seq_exons.index(
                transcript.five_prime_utr_sequence)

    def process_minus(transcript, utr_interval):
        # same as process_plus but for "-" strand transcripts; utr_interval is
        # the interval registered for a newly seen UTR (the two call sites
        # construct it differently, matching the original branches)
        nonlocal count
        temp_exon_list = []
        end = transcript.end
        temp_reverse_seq = reverse_complement(transcript.five_prime_utr_sequence)
        start_pos = len(temp_reverse_seq)
        for exon in transcript.exons:
            if transcript.start_codon_positions[2] <= exon.end <= end:
                if exon.start < transcript.start_codon_positions[2]:
                    # the last 5' UTR exon is cropped at the start codon
                    temp_exon_list.append(
                        (temp_reverse_seq[:start_pos],
                         Interval(transcript.contig,
                                  transcript.start_codon_positions[2] + 1,
                                  exon.end, exon.id, 0, "-")))  # dummy score
                else:
                    temp_exon_list.append(
                        (temp_reverse_seq[start_pos - (exon.end - exon.start + 1):start_pos],
                         Interval(transcript.contig, exon.start, exon.end,
                                  exon.id, 0, "-")))  # dummy score
                start_pos = start_pos - (exon.end - exon.start) - 1
                end = exon.end
        if reverse_complement_bool:
            temp_exon_list = [(reverse_complement(seq), iv)
                              for seq, iv in temp_exon_list]
            current_transcript_seq = transcript.five_prime_utr_sequence
        else:
            current_transcript_seq = reverse_complement(transcript.five_prime_utr_sequence)
        if current_transcript_seq not in self.seq_exons:
            is_new = True
        else:
            pos = self.seq_exons.index(current_transcript_seq)
            is_new = (self.intervals[pos].strand == "-"
                      and self.intervals[pos].start != transcript.exons[-1].end)
        if is_new:
            self.seq_exons.append(current_transcript_seq)
            self.intervals.append(utr_interval)
            self.exons[count] = temp_exon_list
            self.transcripts[transcript.id] = count
            count += 1
        else:
            self.transcripts[transcript.id] = self.seq_exons.index(current_transcript_seq)

    if strand is None:
        for transcript in data.transcripts(contig, '+'):
            if transcript.contains_start_codon:
                process_plus(transcript)
        for transcript in data.transcripts(contig, '-'):
            if transcript.contains_start_codon:
                process_minus(
                    transcript,
                    Interval(transcript.contig, transcript.exons[0].start,
                             transcript.exons[-1].end, "5' UTR", 0, "-"))  # dummy score
    elif strand == "+":
        for transcript in data.transcripts(contig, '+'):
            if transcript.contains_start_codon:
                process_plus(transcript)
    else:
        for transcript in data.transcripts(contig, '-'):
            if transcript.contains_start_codon:
                process_minus(
                    transcript,
                    Interval(transcript.contig,
                             transcript.start_codon_positions[2] + 1,
                             transcript.end, "5' UTR", 0, "-"))  # dummy score
def interval(self):
    return Interval(self.chrom, self.start, self.end, self.out_fname)
def setUpClass(cls):
    cls.intervals = [
        Interval("ref1", 0, 5, "file1"),
        Interval("ref2", 10, 12, "file2"),
    ]
def get_nucleobase_mutation_table(self, vcf):
    """
    Get a table which shows whether a certain nucleobase in the Kozak sequence
    or the stop codon context was mutated or not.

    :param vcf: path to the vcf.gz, or a file opened using cyvcf2
    :type vcf: string or an "opened" file
    :return: pd.DataFrame with columns K_i (position i in the Kozak sequence),
        S_i (position i in the stop codon context), gene_id and transcript_id;
        cell values: NaN - no variant, 1 - heterozygous variant, 2 - homozygous variant
    """
    # only for Kozak sequence and stop codon context + transcript_id column
    columns = [
        "K_0", "K_1", "K_2", "K_3", "K_4", "K_5", "K_6", "K_7", "K_8", "K_9",
        "K_10", "K_11", "K_12", "K_13", "K_14",
        "S_0", "S_1", "S_2", "S_3", "S_4", "S_5", "S_6", "S_7", "S_8", "S_9",
        "S_10", "S_11", "S_12", "S_13", "S_14",
        "transcript_id", "gene_id", "name"
    ]
    df_nucleobases = pd.DataFrame(columns=columns)
    nucleobases_lines = []
    mutator = VCFMutator(False, True, vcf, True)
    contigs = [str(c) for c in range(1, 23)] + ['X', 'Y']

    for contig in contigs:
        # the '+' and '-' strands are handled identically, except that the
        # '-' strand sequences are reverse complemented
        for strand in ('+', '-'):
            for transcript in self.transcripts(contig, strand):
                if not (transcript.contains_stop_codon
                        and transcript.contains_start_codon):
                    continue
                kozak_seq = transcript.get_Kozak_seq()
                stop_codon_context = transcript.get_stop_codon_context()
                if strand == '-':
                    kozak_seq = reverse_complement(kozak_seq)
                    stop_codon_context = reverse_complement(stop_codon_context)
                interval_kozak = Interval("chr" + transcript.contig,
                                          transcript.start_codon_positions[0] - 6,
                                          transcript.start_codon_positions[0] + 9,
                                          "NA", 0, strand)
                interval_stop = Interval("chr" + transcript.contig,
                                         transcript.stop_codon_positions[0] - 6,
                                         transcript.stop_codon_positions[0] + 9,
                                         "NA", 0, strand)
                df_nucleobases_line = mutator.mutate_codon_context(
                    [interval_kozak, interval_stop],
                    [kozak_seq, stop_codon_context], ["K_", "S_"])
                if len(kozak_seq) < 15:
                    # shift the columns of a truncated Kozak sequence so that
                    # K_14 always refers to the last context position
                    new_columns = []
                    for column in df_nucleobases_line:
                        if column.find("K_") != -1:
                            new_columns.append(
                                "K_" + str(int(column[2:]) + (15 - len(kozak_seq))))
                        else:
                            new_columns.append(column)
                    df_nucleobases_line.columns = new_columns
                df_nucleobases_line["transcript_id"] = transcript.id
                df_nucleobases_line["gene_id"] = transcript.gene_id
                nucleobases_lines.append(df_nucleobases_line)

    df_nucleobases = pd.concat(nucleobases_lines, ignore_index=True)
    df_nucleobases = df_nucleobases.drop(['name'], axis=1)
    return df_nucleobases
def _draw_gene_annotation(fig, genes, chrom, start, end):
    wbed = BedTool([Interval(chrom, start, end)])
    regions = genes.intersect(wbed, wa=True, u=True)
    xs = []
    ys = []
    textxpos = []
    textypos = []
    names = []
    offset = 3.5 - .07
    rangeannot = []
    for region in regions:
        names.append(region.name)
        x, y = draw_gene(offset, region)
        xs += x
        ys += y
        textxpos.append(region.end + 500)
        textypos.append(offset)
        rangeannot.append(
            f"{region.chrom}:{region.start}-{region.end};{region.strand}")
        # stack the gene labels; wrap around once the bottom is reached
        offset -= 0.2
        if offset <= 0.0:
            offset = 3.5 - 0.07
    if len(textxpos) > 0:
        plobjs = [
            go.Scatter(
                x=xs,
                y=ys,
                mode="lines",
                fill="toself",
                name="Genes",
                marker=dict(color="goldenrod"),
            ),
            go.Scatter(
                x=textxpos,
                y=textypos,
                text=names,
                mode="text",
                name="Genes",
                customdata=rangeannot,
                hovertemplate="%{text}<br>%{customdata}",
                showlegend=False,
            ),
        ]
        return plobjs
def vcfrec2interval(record):
    """Given a VCF record object, return a pybedtools Interval object."""
    # NOTE: VCF is 1-based and BED is 0-based half-open,
    # so the coordinate conversion is done manually
    return Interval(record.CHROM, record.POS - 1, record.POS)
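# Example of the conversion: a SNP at VCF POS=1000 becomes the half-open BED
# interval [999, 1000). _Rec is a hypothetical stand-in for a real VCF record.
class _Rec:
    CHROM, POS = 'chr1', 1000

iv = vcfrec2interval(_Rec())
print(iv.chrom, iv.start, iv.end)  # chr1 999 1000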
def flow(self):
    """Data flow generator."""
    refs = np.zeros((self.batch_size,
                     self.binsize - self.bioseq.garray.order + 1, 1,
                     pow(self.bioseq._alphabetsize, self.bioseq.garray.order)))
    alts = np.zeros_like(refs)

    vcf = VariantFile(self.variants).fetch()

    if self.annotation is not None:
        varbed = BedTool(self.variants)
        n_vcf_fields = len(varbed[0].fields)
        vcf_strand_augment = iter(varbed.intersect(self.annotation, loj=True))

    try:
        while True:
            # construct the genomic region
            names = []
            chroms = []
            poss = []
            rallele = []
            aallele = []
            ibatch = 0
            while ibatch < self.batch_size:
                rec = next(vcf)
                rec_strandedness = '+'
                if self.annotation is not None:
                    rec_aug = next(vcf_strand_augment)
                    rec_strandedness = '-' if '-' in rec_aug[n_vcf_fields:] else '+'
                if not self.is_compatible(rec):
                    continue

                start, end = self.get_interval(rec)
                names.append(rec.id if rec.id is not None else '')
                chroms.append(rec.chrom)
                poss.append(rec.pos - 1)
                rallele.append(rec.ref.upper())
                aallele.append(rec.alts[0].upper())

                iref = self.bioseq._getsingleitem(
                    Interval(rec.chrom, start, end)).copy()
                ialt = iref.copy()

                for o in range(self.bioseq.garray.order):
                    # index position affected by this order offset
                    pos_to_change = self.binsize // 2 + o - \
                        self.bioseq.garray.order + \
                        (0 if self.binsize % 2 == 0 else 1)
                    irefbase = iref[pos_to_change]
                    irefbase = irefbase // pow(self.bioseq._alphabetsize, o)
                    irefbase = irefbase % self.bioseq._alphabetsize

                    if self.ignore_reference_match:
                        # process the variant even if it does not
                        # match with the reference base
                        replacement = (NMAP[rec.ref.upper()] - irefbase) * \
                            pow(self.bioseq._alphabetsize, o)
                        iref[pos_to_change] += replacement
                        replacement = (NMAP[rec.alts[0].upper()] - irefbase) * \
                            pow(self.bioseq._alphabetsize, o)
                        ialt[pos_to_change] += replacement
                        continue

                    if NMAP[rec.ref.upper()] != irefbase:
                        self.logger.info(
                            'VCF reference and reference genome not compatible. '
                            'Expected reference {}, but VCF indicates {}. '
                            .format(irefbase, NMAP[rec.ref.upper()]) +
                            'VCF-Record: {}:{}-{}>{};{}. Skipped.'.format(
                                rec.chrom, rec.pos, rec.ref, rec.alts[0], rec.id))
                    else:
                        # at this point, it is ensured that the VCF reference
                        # agrees with the reference genome.
                        replacement = (NMAP[rec.alts[0].upper()] -
                                       NMAP[rec.ref.upper()]) * \
                            pow(self.bioseq._alphabetsize, o)
                        ialt[pos_to_change] += replacement

                if rec_strandedness == '-':
                    ialt = self.bioseq._revcomp(ialt)
                    iref = self.bioseq._revcomp(iref)

                alt = as_onehot(ialt[None, :], self.bioseq.garray.order,
                                self.bioseq._alphabetsize)
                alts[ibatch] = alt
                ref = as_onehot(iref[None, :], self.bioseq.garray.order,
                                self.bioseq._alphabetsize)
                refs[ibatch] = ref
                ibatch += 1

            yield names, chroms, poss, rallele, aallele, refs, alts
    except StopIteration:
        refs = refs[:ibatch]
        alts = alts[:ibatch]
        yield names, chroms, poss, rallele, aallele, refs, alts
def test_logzscore_normalization(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    def loading(garray):
        garray[Interval('chr1', 0, 150), 0] = np.repeat(10, 150).reshape(-1, 1)
        garray[Interval('chr2', 0, 300), 0] = np.repeat(100, 300).reshape(-1, 1)
        return garray

    from janggu.data.genomicarray import LogTransform, ZScore

    def gidx():
        # a fresh indexer for every created array
        return GenomicIndexer.create_from_genomesize({'chr1': 150, 'chr2': 300})

    # the array should build with any of the log/zscore normalizer variants
    ga = create_genomic_array(gidx(), stranded=False, typecode='float32',
                              storage='ndarray', cache=None, loader=loading)
    for normalizer in ([LogTransform()], [ZScore()],
                       [LogTransform(), ZScore()], ['zscorelog']):
        ga = create_genomic_array(gidx(), stranded=False, typecode='float32',
                                  storage='ndarray', cache=None,
                                  loader=loading, normalizer=normalizer)

    for store in ['ndarray', 'hdf5']:
        ga = create_genomic_array(gidx(), stranded=False, typecode='float32',
                                  storage=store, cache="cache_file",
                                  loader=loading, normalizer=['zscorelog'])
        np.testing.assert_allclose(ga.weighted_mean(), np.asarray([0.0]),
                                   rtol=1e-5, atol=1e-5)
        np.testing.assert_allclose(ga.weighted_sd(), np.asarray([1.]),
                                   rtol=1e-5, atol=1e-5)
        np.testing.assert_allclose(ga[Interval('chr1', 100, 101)],
                                   np.asarray([[[-1.412641340027806]]]),
                                   rtol=1e-5, atol=1e-5)
        np.testing.assert_allclose(ga[Interval('chr2', 100, 101)],
                                   np.asarray([[[0.706320670013903]]]),
                                   rtol=1e-5, atol=1e-5)
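# The asserted constants above follow from a log transform plus a
# length-weighted z-score over the two constant tracks. A rough standalone
# sketch of that arithmetic (not janggu's exact implementation, which may
# differ e.g. in the pseudo-count or normalization details):
import numpy as np

vals = np.concatenate([np.full(150, 10.0), np.full(300, 100.0)])
logvals = np.log(vals + 1)  # log transform with a pseudo-count of 1
zscores = (logvals - logvals.mean()) / logvals.std()
print(zscores[0], zscores[-1])  # approx -1.41 (chr1) and 0.71 (chr2)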
def flow(self):
    """Data flow generator."""
    refs = np.zeros((self.batch_size,
                     self.binsize - self.bioseq.garray.order + 1, 1,
                     pow(self.bioseq._alphabetsize, self.bioseq.garray.order)))
    alts = np.zeros_like(refs)

    # get variants
    vcf = VariantFile(self.variants).fetch()

    def _get_replacement(new_nucleotide, previous_nucleotide, o):
        # helper function for replacing old with new nucleotides
        return (new_nucleotide - previous_nucleotide) * \
            pow(self.bioseq._alphabetsize, o)

    # the annotation is used to inform about the strandedness
    # when evaluating the variant
    if self.annotation is not None:
        varbed = BedTool(self.variants)
        n_vcf_fields = len(varbed[0].fields)
        vcf_strand_augment = iter(varbed.intersect(self.annotation, loj=True))

    try:
        while True:
            # construct the genomic region
            names = []
            chroms = []
            poss = []
            rallele = []
            aallele = []
            ibatch = 0

            # prepare mini-batches of variants
            while ibatch < self.batch_size:
                rec = next(vcf)
                rec_strandedness = '+'
                if self.annotation is not None:
                    rec_aug = next(vcf_strand_augment)
                    rec_strandedness = '-' if '-' in rec_aug[n_vcf_fields:] else '+'
                if not self.is_compatible(rec):
                    continue

                start, end = self.get_interval(rec)
                names.append(rec.id if rec.id is not None else '')
                chroms.append(rec.chrom)
                poss.append(rec.pos - 1)
                rallele.append(rec.ref.upper())
                aallele.append(rec.alts[0].upper())

                # obtain the nucleotide indices around the variant
                iref = self.bioseq._getsingleitem(
                    Interval(rec.chrom, start, end)).copy()
                ialt = iref.copy()

                for o in range(self.bioseq.garray.order):
                    # in this loop we adjust the original DNA sequence
                    # by substituting the alternative allele
                    #
                    # the loop is required for the higher-order nucleotide
                    # representation, in which a single variant position
                    # affects multiple mutually overlapping positions in the
                    # one-hot encoding
                    #
                    # furthermore, the alternative allele is only set if the
                    # reference allele matches the reference genome,
                    # unless the ignore_reference_match option was used

                    # this is the position at which to change the nucleotide
                    position_to_change = self.binsize // 2 + o - \
                        self.bioseq.garray.order + \
                        (0 if self.binsize % 2 == 0 else 1)

                    # determine the reference nucleotide;
                    # this would just be irefbase itself for order=1, but for
                    # the higher-order representation it needs to be decoded,
                    # e.g. TT for order=2 would be irefbase==15,
                    # which should give the nucleotides 3, 3
                    irefbase = iref[position_to_change]
                    irefbase = irefbase // pow(self.bioseq._alphabetsize, o)
                    irefbase = irefbase % self.bioseq._alphabetsize

                    if self.ignore_reference_match:
                        # process the variant even if it does not match
                        # the reference base: replace the nucleotides in the
                        # reference and in the alternative allele
                        iref[position_to_change] += _get_replacement(
                            NMAP[rec.ref.upper()], irefbase, o)
                        ialt[position_to_change] += _get_replacement(
                            NMAP[rec.alts[0].upper()], irefbase, o)
                        continue

                    if NMAP[rec.ref.upper()] != irefbase:
                        self.logger.info(
                            'VCF reference and reference genome not compatible. '
                            'Expected reference {}, but VCF indicates {}. '
                            .format(irefbase, NMAP[rec.ref.upper()]) +
                            'VCF-Record: {}:{}-{}>{};{}. Skipped.'.format(
                                rec.chrom, rec.pos, rec.ref, rec.alts[0], rec.id))
                    else:
                        # at this point, it is ensured that the VCF reference
                        # agrees with the reference genome: keep the reference
                        # as it is and only change the alternative allele
                        ialt[position_to_change] += _get_replacement(
                            NMAP[rec.alts[0].upper()], NMAP[rec.ref.upper()], o)

                # if the strandedness is negative (from the annotation),
                # the DNA sequences are reverse complemented
                if rec_strandedness == '-':
                    ialt = self.bioseq._revcomp(ialt)
                    iref = self.bioseq._revcomp(iref)

                alt = as_onehot(ialt[None, :], self.bioseq.garray.order,
                                self.bioseq._alphabetsize)
                alts[ibatch] = alt
                ref = as_onehot(iref[None, :], self.bioseq.garray.order,
                                self.bioseq._alphabetsize)
                refs[ibatch] = ref
                ibatch += 1

            yield names, chroms, poss, rallele, aallele, refs, alts
    except StopIteration:
        refs = refs[:ibatch]
        alts = alts[:ibatch]
        yield names, chroms, poss, rallele, aallele, refs, alts
def loading(garray):
    garray[Interval('chr1', 0, 150), 0] = np.random.normal(loc=10, size=150).reshape(-1, 1)
    garray[Interval('chr2', 0, 300), 0] = np.random.normal(loc=100, size=300).reshape(-1, 1)
    return garray
parser.add_argument('--outdir', nargs='?', required=True, type=str,
                    help="Path to the output directory for the plots")
parser.add_argument('--plotformat', nargs='?', default='png', type=str,
                    help="Plot format")
args = parser.parse_args()

region = BedTool([
    Interval('chr1', args.start, args.end, strand='+', score='0', name='region')
])

exp2protein = {}
with open(args.table) as f:
    next(f)  # skip the header line
    for l in f:
        a = l.strip().split("\t")
        exp2protein[".".join(a[1].split(".")[:-1])] = a[3]

peakfiles = [
    os.path.join(args.path, f) for f in listdir(args.path)
    if isfile(os.path.join(args.path, f)) and 'annotated' in f
def test_check_resolution_collapse_compatibility(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    def loading(garray):
        garray[Interval('chr1', 0, 150), 0] = np.repeat(10, 150).reshape(-1, 1)
        garray[Interval('chr2', 0, 300), 0] = np.repeat(1, 300).reshape(-1, 1)
        return garray

    with pytest.raises(Exception):
        # Error because resolution=50 but no collapser defined
        ga = create_genomic_array(GenomicIndexer.create_from_genomesize(
            {'chr1': 150, 'chr2': 300}),
            stranded=False, typecode='float32', storage="ndarray",
            cache=None, resolution=50, loader=loading,
            collapser=None, normalizer=['tpm'])

    with pytest.raises(Exception):
        # Error because resolution=None but no collapser defined
        ga = create_genomic_array(GenomicIndexer.create_from_genomesize(
            {'chr1': 150, 'chr2': 300}),
            stranded=False, typecode='float32', storage="ndarray",
            cache=None, resolution=None, loader=loading,
            collapser=None, normalizer=['tpm'])

    ga = create_genomic_array(GenomicIndexer.create_from_file(
        [Interval('chr1', 0, 150),
         Interval('chr2', 0, 150),
         Interval('chr2', 150, 300)],
        binsize=150, stepsize=None),
        stranded=False, typecode='float32', storage="ndarray",
        cache=None, resolution=1, loader=loading)

    ga = create_genomic_array(GenomicIndexer.create_from_file(
        [Interval('chr1', 0, 150), Interval('chr2', 0, 300)],
        binsize=None, stepsize=None, collapse=True),
        stranded=False, typecode='float32', storage="ndarray",
        cache='test', resolution=None, loader=loading,
        store_whole_genome=None, collapser='sum')

    ga = create_genomic_array(GenomicIndexer.create_from_file(
        [Interval('chr1', 0, 150), Interval('chr2', 0, 300)],
        binsize=None, stepsize=None, collapse=True),
        stranded=False, typecode='float32', storage="ndarray",
        cache=None, resolution=None, loader=loading,
        collapser='sum', normalizer=['tpm'])
def profile_counts_fragments(file, genomicregion, selected_barcodes=None, binsize=50):
    """
    Generates pseudo-bulk tracks.

    Parameters
    ----------
    file : str
        Input bam file.
    genomicregion : str
        Genomic coordinates. E.g. 'chr1:5000-10000'
    selected_barcodes : list(str) or None
        Contains a list of barcodes to consider for the profile.
        If None, all barcodes are considered. Default=None.
    binsize : int
        Resolution of the signal track in bp. Default: 50

    Returns
    -------
    anndata.AnnData
        AnnData object containing the read counts for the given locus.
    """
    bed = BedTool(file)

    def split_iv(gr):
        chr_, res = gr.split(':')
        start, end = res.split('-')
        return chr_, int(start), int(end)

    chrom, start, end = split_iv(genomicregion)

    intersect = bed.intersect(BedTool([Interval(chrom, start, end)]), wa=True)
    if len(intersect) == 0:
        raise ValueError(f'No data in {genomicregion}')

    positions = []
    cells = []
    barcodemap = OrderedDict()
    if selected_barcodes is not None:
        for i, sb in enumerate(selected_barcodes):
            barcodemap[sb] = i

    for region in intersect:
        bar = region.name
        if selected_barcodes is not None and bar not in selected_barcodes:
            # skip barcode if not in the selected_barcodes list
            continue
        if bar not in barcodemap:
            barcodemap[bar] = len(barcodemap)
        if region.start >= start:
            positions.append(region.start - start)
            cells.append(barcodemap[bar])
        if region.end < end:
            positions.append(region.end - start)
            cells.append(barcodemap[bar])

    smat = coo_matrix((np.ones(len(positions)), (positions, cells)),
                      shape=(end - start + 1, len(barcodemap)),
                      dtype='int32')

    # smoothing at binsize resolution
    data = np.ones((binsize, smat.shape[0]))
    offsets = np.arange(binsize)
    di = dia_matrix((data, offsets), shape=(smat.shape[0], smat.shape[0]))
    smat = di.dot(smat).tocsr()
    smat = smat[::binsize]

    var = pd.DataFrame({
        'chrom': [chrom] * int(np.ceil((end - start + 1) / binsize)),
        'start': np.arange(start, end + 1, binsize),
        'end': np.arange(start + binsize, end + binsize + 1, binsize)
    })
    obs = pd.DataFrame(index=[bc for bc in barcodemap])
    adata = AnnData(smat.T.tocsr(), obs=obs, var=var)
    adata.raw = adata
    return adata
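# A usage sketch for profile_counts_fragments with a hypothetical fragment file
# and barcode list; the result is a cells x bins AnnData for the requested locus.
adata = profile_counts_fragments('fragments.bed', 'chr1:5000-10000',
                                 selected_barcodes=['AAACGG', 'TTTCAA'],
                                 binsize=100)
print(adata.shape)       # (n_barcodes, n_bins)
print(adata.var.head())  # per-bin chrom/start/end coordinates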
def loading(garray):
    garray[Interval('chr1', 0, 150), 0] = np.repeat(10, 150).reshape(-1, 1)
    garray[Interval('chr2', 0, 300), 0] = np.repeat(1, 300).reshape(-1, 1)
    return garray
def test_fasta_extractor_over_chr_end():
    extractor = FastaExtractor('tests/data/fasta_test.fa')
    intervals = [Interval('chr1', 0, 100), Interval('chr1', 1, 101)]
    with pytest.raises(ValueError):
        data = extractor(intervals)
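# One way to avoid such out-of-bounds errors is to clamp requested intervals to
# the chromosome boundaries before extraction; a sketch assuming chromosome
# sizes are known (e.g. from a .fai index).
from pybedtools import Interval

def clamp_interval(iv, chrom_sizes):
    # truncate an interval to [0, chromosome length)
    return Interval(iv.chrom, max(0, iv.start), min(iv.end, chrom_sizes[iv.chrom]))

chrom_sizes = {'chr1': 100}  # hypothetical
print(clamp_interval(Interval('chr1', 1, 101), chrom_sizes))  # chr1  1  100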
help="Manimum allowed maximum AT content inside a peak") args = parser.parse_args() at_content_dict = coverage2dict(args.attrack) #get gc drops gcdrops = defaultdict(list) with open(args.atdrops) as f: current_drop = [] for l in f: a = l.strip().split("\t") if (not current_drop and a[3] == 'max'): current_drop.append(int(a[1])) if (current_drop and a[3] == 'min'): current_drop.append(int(a[1])) gcdrops[a[0]].append(tuple(current_drop)) current_drop = [] for chrom, positions in gcdrops.items(): for c, (start, end) in enumerate(positions, start=1): score = max(at_content_dict[chrom][start:end]) if (score >= args.minat): sys.stdout.write( str( Interval(chrom, start, end, name="drop_%s_%d" % (chrom, c), score="%1.3f" % score, strand='+')))
def setUp(self):
    self.file = os.path.join(PATH, self.file)
    start, end, strand = 9719768, 9739768, "-"
    self.i = Interval("chr21", start, end, strand=strand)
    self.start, self.end, self.strand = start, end, strand
    for s in sequence:
        if s == 'A' and prev == 'T':
            count += 1
            prev = ''
        elif s == 'T' and prev == 'A':
            count += 1
            prev = ''
        else:
            prev = s
    return count


atstretches = BedTool([
    Interval(x.chrom, x.start, x.end, x.name,
             str(count_at_steps(x.attrs['seq'])), x.strand)
    for x in atstretches if check_motif(x.attrs['seq'])
])

# derive a division of the AT stretches based on their lengths
lengths = np.array([len(x) for x in atstretches])
l_division = [7, 9, 11, 14, 17, 20, 30, 40, 50]
# [int(x) for x in sorted(list(set(np.percentile(lengths, np.linspace(0, 100, 18)))))]
l_division[-1] += 1  # make the last bin inclusive

ac_division = [
    0, 1, 2, 3, 4, 5, 6, 7, 11,