def __init__(self, intervals_file, fasta_file, dnase_file, mappability_file=None, use_linecache=True):
    """Set up the interval iterator and the genomic signal extractors."""
    # Interval source: the linecache-backed BedTool speeds up repeated
    # random access in __getitem__.
    if use_linecache:
        linecache.clearcache()
        bed_cls = BedToolLinecache
    else:
        bed_cls = BedTool
    self.bt = bed_cls(intervals_file)

    # Sequence and DNase signal extractors
    self.fasta_extractor = FastaExtractor(fasta_file)
    self.dnase_extractor = BigwigExtractor(dnase_file)

    # Mappability track: when not given, fall back to the bundled hg19
    # Duke 35bp uniqueness track, downloading it on first use.
    if mappability_file is None:
        mappability_file = os.path.join(
            this_dir, "../../template/dataloader_files",
            "wgEncodeDukeMapabilityUniqueness35bp.bigWig")
        if not os.path.exists(mappability_file):
            print("Downloading the mappability file")
            urlretrieve(
                "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
                mappability_file)
            print("Download complete")
    self.mappability_extractor = BigwigExtractor(mappability_file)
def __getitem__(self, idx):
    """Build the (fwd + reverse-complement) input stack for interval `idx`.

    Returns a dict with "inputs" (seq+mappability+DNase tracks, their RC
    versions, and meta/gencode features), empty "targets" and "metadata".
    """
    if self.fasta_extractor is None:
        # Lazily (re-)open the extractors, e.g. after a worker-process fork.
        self.fasta_extractor = FastaExtractor(self.fasta_file)
        self.dnase_extractor = BigwigExtractor(self.dnase_file)
        self.mappability_extractor = BigwigExtractor(self.mappability_file)

    # Get the interval and, if needed, re-center it to exactly SEQ_WIDTH
    interval = self.bt[idx]
    if interval.stop - interval.start != self.SEQ_WIDTH:
        center = (interval.start + interval.stop) // 2
        interval.start = center - self.SEQ_WIDTH // 2
        interval.end = center + self.SEQ_WIDTH // 2 + self.SEQ_WIDTH % 2

    # Binary gencode overlap features (one flag per annotation bed;
    # `.count` is the overlap count column added by intersect -c)
    gencode_counts = np.array([v[idx].count for k, v in self.overlap_beds],
                              dtype=bool)

    # One-hot sequence and its reverse complement
    seq = np.squeeze(self.fasta_extractor([interval]), axis=0)
    seq_rc = seq[::-1, ::-1]

    # DNase signal. BUGFIX: `axis=0` was previously passed to the extractor
    # call instead of to np.squeeze (cf. the fasta_extractor call above).
    dnase = np.squeeze(self.dnase_extractor([interval]), axis=0)[:, np.newaxis]
    dnase[np.isnan(dnase)] = 0  # NA fill
    dnase_rc = dnase[::-1]

    bigwig_list = [seq]
    bigwig_rc_list = [seq_rc]

    # Mappability signal (same `axis=0` misplacement fixed here as well)
    mappability = np.squeeze(self.mappability_extractor([interval]),
                             axis=0)[:, np.newaxis]
    mappability[np.isnan(mappability)] = 0  # NA fill
    mappability_rc = mappability[::-1]
    bigwig_list.append(mappability)
    bigwig_rc_list.append(mappability_rc)
    bigwig_list.append(dnase)
    bigwig_rc_list.append(dnase_rc)

    ranges = GenomicRanges.from_interval(interval)
    ranges_rc = GenomicRanges.from_interval(interval)
    ranges_rc.strand = "-"

    return {
        "inputs": [
            np.concatenate(bigwig_list, axis=-1),  # stack along the last axis
            np.concatenate(bigwig_rc_list, axis=-1),  # RC version
            np.append(self.meta_feat, gencode_counts)
        ],
        "targets": {},  # No Targets
        "metadata": {
            "ranges": ranges,
            "ranges_rc": ranges_rc
        }
    }
def extract_single(self, interval):
    """Extract the bigwig signal for one interval, reversed on '-' strand
    when strand-awareness is enabled."""
    # Lazily construct the underlying extractor on first use.
    if self.batch_extractor is None:
        from genomelake.extractors import BigwigExtractor
        self.batch_extractor = BigwigExtractor(self.bigwig_file)

    # Optionally remap the interval before extraction.
    transform = self.interval_transform
    if transform is not None:
        interval = transform(interval)

    signal = self.batch_extractor([interval], nan_as_zero=self.nan_as_zero)[0]

    # Flip the track for minus-strand intervals when requested.
    flip = self.use_strand and interval.strand == '-'
    return signal[::-1] if flip else signal
def __init__(self, intervals_file, fasta_file, dnase_file, cell_line=None,
             RNAseq_PC_file=None, mappability_file=None, use_linecache=True):
    """Set up interval, sequence, DNase, mappability extractors and the
    RNA-seq meta-feature vector."""
    # Interval source
    if use_linecache:
        linecache.clearcache()
        bed_cls = BedToolLinecache
    else:
        bed_cls = BedTool
    self.bt = bed_cls(intervals_file)

    # Sequence / DNase extractors
    self.fasta_extractor = FastaExtractor(fasta_file)
    self.dnase_extractor = BigwigExtractor(dnase_file)

    # Mappability track: default to the bundled hg19 Duke 35bp uniqueness
    # track, downloading it when missing.
    if mappability_file is None:
        mappability_file = os.path.join(
            this_dir, "../../template/dataloader_files",
            "wgEncodeDukeMapabilityUniqueness35bp.bigWig")
        if not os.path.exists(mappability_file):
            print("Downloading the mappability file")
            urlretrieve(
                "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
                mappability_file)
            print("Download complete")
    self.mappability_extractor = BigwigExtractor(mappability_file)

    # RNA-seq meta features: either an explicit table or the bundled
    # per-cell-line table.
    if cell_line is None:
        if RNAseq_PC_file is None:
            raise ValueError(
                "RNAseq_PC_file has to be specified when cell_line=None")
        assert os.path.exists(RNAseq_PC_file)
    else:
        # Using the pre-defined cell-line
        rp = os.path.join(this_dir, "dataloader_files/RNAseq_features/")
        RNAseq_PC_file = os.path.join(rp, cell_line, "meta.txt")
    self.meta_feat = pd.read_csv(RNAseq_PC_file, sep="\t", header=None)[0].values
def __getitem__(self, idx):
    """Build (fwd + reverse-complement) sequence/DNase inputs for interval `idx`."""
    if self.fasta_extractor is None:
        # Lazily (re-)open the extractors, e.g. after a worker-process fork.
        self.fasta_extractor = FastaExtractor(self.fasta_file)
        self.dnase_extractor = BigwigExtractor(self.dnase_file)

    # Get the interval and, if needed, re-center it to exactly SEQ_WIDTH
    interval = self.bt[idx]
    if interval.stop - interval.start != self.SEQ_WIDTH:
        center = (interval.start + interval.stop) // 2
        interval.start = center - self.SEQ_WIDTH // 2
        interval.end = center + self.SEQ_WIDTH // 2 + self.SEQ_WIDTH % 2

    # One-hot sequence and its reverse complement
    seq = np.squeeze(self.fasta_extractor([interval]), axis=0)
    seq_rc = seq[::-1, ::-1]

    # DNase. BUGFIX: `axis=0` was previously passed to the extractor call
    # instead of to np.squeeze (cf. the fasta_extractor call above).
    dnase = np.squeeze(self.dnase_extractor([interval]), axis=0)[:, np.newaxis]
    dnase[np.isnan(dnase)] = 0  # NA fill
    dnase_rc = dnase[::-1]

    bigwig_list = [seq]
    bigwig_rc_list = [seq_rc]
    bigwig_list.append(dnase)
    bigwig_rc_list.append(dnase_rc)

    ranges = GenomicRanges.from_interval(interval)
    ranges_rc = GenomicRanges.from_interval(interval)
    ranges_rc.strand = "-"

    return {
        "inputs": [
            np.concatenate(bigwig_list, axis=-1),  # stack along the last axis
            np.concatenate(bigwig_rc_list, axis=-1),  # RC version
        ],
        "targets": {},  # No Targets
        "metadata": {
            "ranges": ranges,
            "ranges_rc": ranges_rc
        }
    }
def __init__(self, intervals_file, fasta_file, dnase_file, use_linecache=True):
    """Open the interval list plus the FASTA and DNase extractors."""
    # Interval source; linecache-backed lookup speeds up random access.
    if use_linecache:
        linecache.clearcache()
        bed_cls = BedToolLinecache
    else:
        bed_cls = BedTool
    self.bt = bed_cls(intervals_file)
    # Signal extractors
    self.fasta_extractor = FastaExtractor(fasta_file)
    self.dnase_extractor = BigwigExtractor(dnase_file)
class StrandedBigWigExtractor:
    """Strand-aware BigWig signal extractor.

    NOTE: The extractor is not thread-safe. If you wish to use it with
    multiprocessing, create a new extractor object in each process.

    # Arguments
        bigwig_file: path to the bigwig file
        interval_transform: optional callable applied to each interval
            before extraction
        use_strand: if True, reverse the signal of '-' strand intervals
        nan_as_zero: passed through to the underlying extractor
    """

    def __init__(self, bigwig_file, interval_transform=None, use_strand=False, nan_as_zero=True):
        self.nan_as_zero = nan_as_zero
        self.use_strand = use_strand
        self.bigwig_file = bigwig_file
        self.interval_transform = interval_transform
        # Created lazily on the first extraction.
        self.batch_extractor = None

    def extract_single(self, interval):
        """Extract the signal for a single interval."""
        if self.batch_extractor is None:
            from genomelake.extractors import BigwigExtractor
            self.batch_extractor = BigwigExtractor(self.bigwig_file)
        if self.interval_transform is not None:
            interval = self.interval_transform(interval)
        arr = self.batch_extractor([interval], nan_as_zero=self.nan_as_zero)[0]
        if self.use_strand and interval.strand == '-':
            arr = arr[::-1]
        return arr

    def extract(self, intervals, progbar=False):
        """Extract and stack the signals for a list of intervals."""
        return np.stack([
            self.extract_single(interval)
            for interval in tqdm(intervals, disable=not progbar)
        ])

    def close(self):
        # BUGFIX: close() used to raise AttributeError when called before
        # any extraction (the lazy batch_extractor was still None).
        if self.batch_extractor is not None:
            return self.batch_extractor.close()
def __getitem__(self, idx):
    """Return the one-hot sequence and summed bigwig coverage for row `idx`."""
    if self.fasta_extractor is None:
        # First access: open the FASTA and one extractor per bigwig track.
        self.fasta_extractor = FastaExtractor(self.fasta_file)
        self.bigwig_extractors = {
            assay: [BigwigExtractor(path) for path in self.bigwigs[assay]]
            for assay in self.bigwigs
        }

    interval, labels = self.tsv[idx]

    # Intervals need to be 1000bp wide
    interval = resize_interval(interval, 1000)
    assert interval.stop - interval.start == 1000

    # One-hot encode the sequence
    seq = np.squeeze(self.fasta_extractor([interval]))
    interval_wide = resize_interval(deepcopy(interval), self.track_width)

    # Total counts per assay: sum the per-base tracks, then sum over positions.
    targets = {
        assay: sum([ex([interval_wide])[0] for ex in extractors]).sum()
        for assay, extractors in self.bigwig_extractors.items()
    }

    return {
        "inputs": {"seq": seq},
        "targets": targets,
        "metadata": {
            "ranges": GenomicRanges(interval.chrom, interval.start,
                                    interval.stop, str(idx)),
            "ranges_wide": GenomicRanges.from_interval(interval_wide),
            "name": interval.name
        }
    }
# In[4]: # get intervals for day0 data day0_intervals = list(BedTool(data.intervals['day0'])) print '# of Intervals Extracted for day0: {}'.format(len(day0_intervals)) # In[5]: # create an ArrayExtractor for ATAC-seq for day0 with 140 base pairs bw_140bp_day0 = ArrayExtractor(data.input_atac['day0']['140']) print 'Finished extracting bigwig for day0, 140bp' # In[6]: # create a BigWigExtractor for histone makr 'H3K27ac' for day0 bw_histone_mark_day0 = BigwigExtractor(data.output_histone['day0']['H3K27ac']) print 'Finished extracting bigwig for day0, 140bp' # In[7]: # normalize day0 intervals normalized_day0_intervals = [ normalize_interval(interval, window_size) for interval in day0_intervals if normalize_interval(interval, window_size) ] print 'Finished normalizing day0 intervals!' # In[8]: assert (len(day0_intervals) == len(normalized_day0_intervals)) print "Examples of original intervals"
print '# of Test Intervals: {}'.format(len(test_intervals)) # Get input/output data directories data = Data_Directories() print data.intervals.keys() print data.input_atac[day].keys() print data.output_histone[day].keys() # Extract input candidates # Create an ArrayExtractor for ATAC-seq of a given day and specified fragment length input_candidates = ArrayExtractor(data.input_atac[day][frag]) print 'Finished extracting bigwig for {}, {}bp'.format(day, frag) # Extract output candiates # Create a BigWigExtractor for histone mark of a given day output_candidates = BigwigExtractor(data.output_histone[day][histone]) print 'Finished extracting bigwig for {}, {}'.format(day, histone) # Normalize train intervals normalized_train_intervals = [normalize_interval(interval, window_size) for interval in train_intervals if normalize_interval(interval, window_size)] print 'Finished normalizing train intervals!' # Normalize val intervals normalized_val_intervals = [normalize_interval(interval, window_size) for interval in val_intervals if normalize_interval(interval, window_size)] print 'Finished normalizing val intervals!' # Normalize test intervals normalized_test_intervals = [normalize_interval(interval, window_size) for interval in test_intervals if normalize_interval(interval, window_size)] print 'Finished normalizing test intervals!' # Fetch intervals of sample_num normalized_train_intervals = normalized_train_intervals[:sample_num] normalized_val_intervals = normalized_val_intervals[:int(sample_num*0.2)]
def __getitem__(self, idx):
    """Return inputs/targets(/metadata) for the `idx`-th interval.

    Extractors are created lazily on the first call; depending on
    `self.bcolz` either bcolz-backed ArrayExtractors or plain
    fasta/bigwig extractors are used.
    """
    if self.fasta_extractor is None:
        # Use array extractors
        if self.bcolz:
            self.fasta_extractor = ArrayExtractor(self.ds.fasta_file,
                                                  in_memory=False)
            # One pair of extractors (pos/neg strand counts) per task
            self.bw_extractors = {
                task: [
                    ArrayExtractor(task_spec.pos_counts, in_memory=False),
                    ArrayExtractor(task_spec.neg_counts, in_memory=False)
                ]
                for task, task_spec in self.ds.task_specs.items()
                if task in self.tasks
            }
            self.bias_bw_extractors = {
                task: [
                    ArrayExtractor(task_spec.pos_counts, in_memory=False),
                    ArrayExtractor(task_spec.neg_counts, in_memory=False)
                ]
                for task, task_spec in self.ds.bias_specs.items()
                if task in self.tasks
            }
        else:
            # Use normal fasta/bigwig extractors
            assert not self.bcolz
            # first call
            self.fasta_extractor = FastaExtractor(self.ds.fasta_file,
                                                  use_strand=True)
            self.bw_extractors = {
                task: [
                    BigwigExtractor(task_spec.pos_counts),
                    BigwigExtractor(task_spec.neg_counts)
                ]
                for task, task_spec in self.ds.task_specs.items()
                if task in self.tasks
            }
            # NOTE(review): unlike the bcolz branch, bias extractors here are
            # NOT filtered by `task in self.tasks` — confirm this asymmetry
            # is intentional.
            self.bias_bw_extractors = {
                task: [
                    BigwigExtractor(task_spec.pos_counts),
                    BigwigExtractor(task_spec.neg_counts)
                ]
                for task, task_spec in self.ds.bias_specs.items()
            }
    # Setup the intervals
    interval = Interval(
        self.dfm.iat[idx, 0],  # chrom
        self.dfm.iat[idx, 1],  # start
        self.dfm.iat[idx, 2])  # end

    # Transform the input interval (for say augmentation...)
    if self.interval_transformer is not None:
        interval = self.interval_transformer(interval)
    # Resize to the target (profile) and sequence widths
    target_interval = resize_interval(deepcopy(interval), self.peak_width)
    seq_interval = resize_interval(deepcopy(interval), self.seq_width)

    # This only kicks in when we specify the taskname from dataspec
    # to the 3rd column. E.g. it doesn't apply when using intervals_file
    interval_from_task = self.dfm.iat[
        idx, 3] if self.intervals_file is None else ''

    # extract seq + tracks
    sequence = self.fasta_extractor([seq_interval])[0]

    if not self.only_classes:
        # Per-task profile counts, key layout controlled by taskname_first
        if self.taskname_first:
            cuts = {
                f"{task}/profile": run_extractors(
                    self.bw_extractors[task], [target_interval],
                    ignore_strand=spec.ignore_strand)[0]
                for task, spec in self.ds.task_specs.items()
                if task in self.tasks
            }
        else:
            cuts = {
                f"profile/{task}": run_extractors(
                    self.bw_extractors[task], [target_interval],
                    ignore_strand=spec.ignore_strand)[0]
                for task, spec in self.ds.task_specs.items()
                if task in self.tasks
            }
        # Add counts
        if self.target_transformer is not None:
            cuts = self.target_transformer.transform(cuts)

        # Add bias tracks
        if len(self.ds.bias_specs) > 0:
            biases = {
                bias_task: run_extractors(
                    self.bias_bw_extractors[bias_task], [target_interval],
                    ignore_strand=spec.ignore_strand)[0]
                for bias_task, spec in self.ds.bias_specs.items()
            }
            # Concatenate the bias tracks assigned to each task
            task_biases = {
                f"bias/{task}/profile": np.concatenate(
                    [biases[bt] for bt in self.task_bias_tracks[task]],
                    axis=-1)
                for task in self.tasks
            }
            if self.target_transformer is not None:
                # log(1 + total) count summary per task
                for task in self.tasks:
                    task_biases[f'bias/{task}/counts'] = np.log(
                        1 + task_biases[f'bias/{task}/profile'].sum(0))
                # total_count_bias = np.concatenate([np.log(1 + x[k].sum(0))
                #                                    for k, x in biases.items()], axis=-1)
                # task_biases['bias/total_counts'] = total_count_bias
            if self.profile_bias_pool_size is not None:
                # Smooth the bias profile with one moving average per pool
                # size and concatenate the smoothed variants channel-wise.
                for task in self.tasks:
                    task_biases[f'bias/{task}/profile'] = np.concatenate(
                        [
                            moving_average(
                                task_biases[f'bias/{task}/profile'],
                                n=pool_size)
                            for pool_size in to_list(
                                self.profile_bias_pool_size)
                        ],
                        axis=-1)
            # Bias tracks become additional model inputs alongside the seq
            sequence = {"seq": sequence, **task_biases}
    else:
        # only_classes: no profile targets at all
        cuts = dict()

    if self.include_classes:
        if self.taskname_first:
            # Get the classes from the tsv file
            classes = {
                f"{task}/class": self.dfm.iat[idx, i + 3]
                for i, task in enumerate(self.dfm_tasks)
                if task in self.tasks
            }
        else:
            classes = {
                f"class/{task}": self.dfm.iat[idx, i + 3]
                for i, task in enumerate(self.dfm_tasks)
                if task in self.tasks
            }
        cuts = {**cuts, **classes}

    out = {"inputs": sequence, "targets": cuts}

    if self.include_metadata:
        # Record which genomic range was actually used
        out['metadata'] = {
            "range": GenomicRanges(
                chr=target_interval.chrom,
                start=target_interval.start,
                end=target_interval.stop,
                id=idx,
                strand=(target_interval.strand
                        if target_interval.strand is not None
                        else "*"),
            ),
            "interval_from_task": interval_from_task
        }
    return out
t1 - t0)  # NOTE(review): closes a print/format statement truncated before
          # this fragment

#fetch validation inputs (day3, ATAC-seq)
t2 = time.time()
val_inputs = None
if not process_all:
    # Sub-sample the day3 intervals when not processing everything
    normalized_day3_intervals = random.sample(normalized_day3_intervals, sample_num)
# NOTE(review): original formatting was lost; assuming this line is outside
# the `if` (val_inputs is computed in both cases) — confirm.
val_inputs = coarse_normalize_input(bw_140bp_day3(normalized_day3_intervals))
print val_inputs.shape
t3 = time.time()
print 'Time spent for getting signals of intervals for day3 atac-seq: {}'.format(
    t3 - t2)

# fetch outputs (day0, histone)
histone_mark = BigwigExtractor(data.output_histone['day0']['H3K27ac'])
outputs = None
outputs = histone_mark(normalized_day0_intervals)
outputs = np.nan_to_num(outputs)  # NaN -> 0 before normalization
# Output normalization: double-log or quantile transform
if output_norm_scheme == 'dl':
    outputs = double_log_transform(outputs)
elif output_norm_scheme == 'quant':
    outputs = quantile_transform(outputs, n_quantiles=50, random_state=7)
outputs = np.expand_dims(outputs, axis=2)
# NOTE(review): both prints below show the post-expansion shape; the first
# was presumably meant to run before np.expand_dims — confirm.
print 'Output Shape (of one sample): ', outputs[0].shape
print 'Expanded Output Shape: ', outputs[0].shape

# fetch validation outputs (day3, histone)
val_histone_mark = BigwigExtractor(data.output_histone['day3']['H3K27ac'])
val_outputs = None
val_outputs = val_histone_mark(normalized_day3_intervals)
def __getitem__(self, idx):
    """Return inputs/targets(/metadata) for the `idx`-th interval of the
    intervals table, using lazily-created fasta/bigwig extractors."""
    from pybedtools import Interval
    if self.fasta_extractor is None:
        # first call
        # Use normal fasta/bigwig extractors
        self.fasta_extractor = FastaExtractor(self.ds.fasta_file, use_strand=True)
        # One extractor per track, grouped by task
        self.bw_extractors = {task: [BigwigExtractor(track) for track in task_spec.tracks]
                              for task, task_spec in self.ds.task_specs.items()
                              if task in self.tasks}
        self.bias_bw_extractors = {task: [BigwigExtractor(track) for track in task_spec.tracks]
                                   for task, task_spec in self.ds.bias_specs.items()}

    # Get the genomic interval for that particular datapoint
    interval = Interval(self.dfm.iat[idx, 0],  # chrom
                        self.dfm.iat[idx, 1],  # start
                        self.dfm.iat[idx, 2])  # end

    # Transform the input interval (for say augmentation...)
    if self.interval_transformer is not None:
        interval = self.interval_transformer(interval)

    # resize the intervals to the desired widths
    target_interval = resize_interval(deepcopy(interval), self.peak_width)
    seq_interval = resize_interval(deepcopy(interval), self.seq_width)

    # This only kicks in when we specify the taskname from dataspec
    # to the 3rd column. E.g. it doesn't apply when using intervals_file
    interval_from_task = self.dfm.iat[idx, 3] if self.intervals_file is None else ''

    # extract DNA sequence + one-hot encode it
    sequence = self.fasta_extractor([seq_interval])[0]
    inputs = {"seq": sequence}

    # extract the profile counts from the bigwigs
    cuts = {f"{task}/profile": _run_extractors(self.bw_extractors[task],
                                               [target_interval],
                                               sum_tracks=spec.sum_tracks)[0]
            for task, spec in self.ds.task_specs.items()
            if task in self.tasks}
    # Optional per-profile transformation
    if self.track_transform is not None:
        for task in self.tasks:
            cuts[f'{task}/profile'] = self.track_transform(cuts[f'{task}/profile'])

    # Add total number of counts
    for task in self.tasks:
        cuts[f'{task}/counts'] = self.total_count_transform(cuts[f'{task}/profile'].sum(0))

    if len(self.ds.bias_specs) > 0:
        # Extract the bias tracks
        biases = {bias_task: _run_extractors(self.bias_bw_extractors[bias_task],
                                             [target_interval],
                                             sum_tracks=spec.sum_tracks)[0]
                  for bias_task, spec in self.ds.bias_specs.items()}
        # Concatenate the bias tracks assigned to each task channel-wise
        task_biases = {f"bias/{task}/profile": np.concatenate([biases[bt] for bt in self.task_bias_tracks[task]],
                                                              axis=-1)
                       for task in self.tasks}
        if self.track_transform is not None:
            for task in self.tasks:
                task_biases[f'bias/{task}/profile'] = self.track_transform(task_biases[f'bias/{task}/profile'])
        # Add total number of bias counts
        for task in self.tasks:
            task_biases[f'bias/{task}/counts'] = self.total_count_transform(task_biases[f'bias/{task}/profile'].sum(0))
        # Bias tracks are model inputs, not targets
        inputs = {**inputs, **task_biases}

    if self.include_classes:
        # Optionally, add binary labels from the additional columns in the tsv intervals file
        classes = {f"{task}/class": self.dfm.iat[idx, i + 3]
                   for i, task in enumerate(self.dfm_tasks)
                   if task in self.tasks}
        cuts = {**cuts, **classes}

    out = {"inputs": inputs, "targets": cuts}

    if self.include_metadata:
        # remember the metadata (what genomic interval was used)
        out['metadata'] = {"range": GenomicRanges(chr=target_interval.chrom,
                                                  start=target_interval.start,
                                                  end=target_interval.stop,
                                                  id=idx,
                                                  strand=(target_interval.strand
                                                          if target_interval.strand is not None
                                                          else "*"),
                                                  ),
                           "interval_from_task": interval_from_task}
    return out
def test_bigwig_extractor(test_bigwig_and_intervals):
    """BigwigExtractor reproduces the fixture's expected signal."""
    bw_path, query_intervals, expected = test_bigwig_and_intervals
    extracted = BigwigExtractor(bw_path)(query_intervals)
    assert (extracted == expected).all()
def __init__(self, intervals_file, fasta_file, dnase_file, cell_line=None, RNAseq_PC_file=None, mappability_file=None, GENCODE_dir=None, use_linecache=True):
    """Set up interval/sequence/DNase/mappability extractors, pre-compute
    gencode overlap features, and load the RNA-seq meta-feature vector."""
    # intervals
    if use_linecache:
        # Linecache-backed BedTool speeds up random access
        linecache.clearcache()
        BT = BedToolLinecache
    else:
        BT = BedTool
    self.bt = BT(intervals_file)
    # Fasta
    self.fasta_extractor = FastaExtractor(fasta_file)
    # DNase
    self.dnase_extractor = BigwigExtractor(dnase_file)
    # mappability
    if mappability_file is None:
        # download the mappability file if not existing
        mappability_file = os.path.join(
            this_dir, "../../template/dataloader_files",
            "wgEncodeDukeMapabilityUniqueness35bp.bigWig")
        if not os.path.exists(mappability_file):
            print("Downloading the mappability file")
            urlretrieve(
                "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
                mappability_file)
            print("Download complete")
    self.mappability_extractor = BigwigExtractor(mappability_file)
    # Gencode features: (name, bed) pairs of annotation tracks
    if GENCODE_dir is None:
        gp = os.path.join(this_dir, "dataloader_files/gencode_features/")
    else:
        gp = GENCODE_dir
    self.gencode_beds = [
        ("cpg", BedTool(gp + '/cpgisland.bed.gz')),
        ("cds", BedTool(gp + '/wgEncodeGencodeBasicV19.cds.merged.bed.gz')),
        ("intron", BedTool(gp + '/wgEncodeGencodeBasicV19.intron.merged.bed.gz')),
        ("promoter", BedTool(gp + '/wgEncodeGencodeBasicV19.promoter.merged.bed.gz')),
        ("utr5", BedTool(gp + '/wgEncodeGencodeBasicV19.utr5.merged.bed.gz')),
        ("utr3", BedTool(gp + '/wgEncodeGencodeBasicV19.utr3.merged.bed.gz')),
    ]
    # Overlap beds - could be done incrementally
    # (intersect -c appends the per-interval overlap count as a column)
    print("Overlapping all the bed-files")
    # The BT() and .fn are there in order to leverage BedToolLinecache
    self.overlap_beds = [(b, BT(self.bt.intersect(v, wa=True, c=True).fn))
                         for b, v in self.gencode_beds]
    print("Assesing the file")
    # Sanity check: overlap bed has one row per input interval
    assert len(self.overlap_beds[1][1]) == len(self.bt)
    # Get the metadata features: either an explicit RNAseq PC file or the
    # bundled table for a known cell line
    if cell_line is None:
        if RNAseq_PC_file is None:
            raise ValueError(
                "RNAseq_PC_file has to be specified when cell_line=None")
        assert os.path.exists(RNAseq_PC_file)
    else:
        # Using the pre-defined cell-line
        rp = os.path.join(this_dir, "dataloader_files/RNAseq_features/")
        RNAseq_PC_file = os.path.join(rp, cell_line, "meta.txt")
    self.meta_feat = pd.read_csv(RNAseq_PC_file, sep="\t", header=None)[0].values