def extractor(intervals_file, input_data_sources, target_data_sources=None, batch_size=128):
    """Yield batches of extracted genomelake data for bed-like intervals.

    Args:
        intervals_file: tsv file; assumes bed-like `chrom start end id` format.
        input_data_sources: dict mapping input name -> genomelake directory
        target_data_sources: dict, optional, mapping target name -> genomelake directory
        batch_size: int, number of intervals per yielded batch
    """
    bed = pybedtools.BedTool(intervals_file)
    input_extractors = {name: ArrayExtractor(path)
                        for name, path in input_data_sources.items()}
    target_extractors = None
    if target_data_sources is not None:
        target_extractors = {name: ArrayExtractor(path)
                             for name, path in target_data_sources.items()}
    for batch in batch_iter(bed, batch_size):
        out = {}
        # [..., None] appends the channel axis expected by conv1d layers
        out['inputs'] = {name: ext(batch)[..., None]
                         for name, ext in input_extractors.items()}
        if target_extractors is not None:
            out['targets'] = {name: ext(batch)[..., None]
                              for name, ext in target_extractors.items()}
        # collect per-interval metadata
        chroms, starts, ends, names = [], [], [], []
        for iv in batch:
            chroms.append(iv.chrom)
            starts.append(iv.start)
            ends.append(iv.stop)
            names.append(iv.name)
        out['metadata'] = {
            'ranges': {
                'chr': np.array(chroms),
                'start': np.array(starts),
                'end': np.array(ends),
                'id': np.array(names),
            }
        }
        yield out
def test_array_extractor_fasta(mode, in_memory):
    """ArrayExtractor over a fasta-derived bcolz dir yields the expected one-hot encoding."""
    out_dir = 'tests/data/fasta_test_dir_{}_{}'.format(mode, in_memory)
    backend.extract_fasta_to_file(
        'tests/data/fasta_test.fa', out_dir, mode=mode, overwrite=True)
    ext = ArrayExtractor(out_dir, in_memory=in_memory)
    queries = [Interval('chr1', 0, 10), Interval('chr2', 0, 10)]
    # One-hot rows for each base; N maps to a uniform 0.25 distribution.
    a = [1., 0., 0., 0.]
    c = [0., 1., 0., 0.]
    g = [0., 0., 1., 0.]
    t = [0., 0., 0., 1.]
    n = [0.25, 0.25, 0.25, 0.25]
    expected = np.array([
        [a, c, c, g, t, a, c, c, g, t],   # chr1[0:10]
        [a, c, g, t, n, a, c, g, t, n],   # chr2[0:10]
    ], dtype=np.float32)
    assert np.array_equal(ext(queries), expected)
def test_array_extractor_bigwig(test_bigwig_and_intervals, mode, in_memory):
    """ArrayExtractor over a bigwig-derived bcolz dir reproduces the fixture's values."""
    bw_path, queries, expected = test_bigwig_and_intervals
    out_dir = "{}.dir".format(bw_path)
    backend.extract_bigwig_to_file(
        bw_path, out_dir, mode=mode, overwrite=True)
    ext = ArrayExtractor(out_dir, in_memory=in_memory)
    observed = ext(queries)
    assert (observed == expected).all()
# retrieve data data = Data_Directories() print data.intervals.keys() print data.input_atac['day0'].keys() print data.output_histone['day0'].keys() # In[4]: # get intervals for day0 data day0_intervals = list(BedTool(data.intervals['day0'])) print '# of Intervals Extracted for day0: {}'.format(len(day0_intervals)) # In[5]: # create an ArrayExtractor for ATAC-seq for day0 with 140 base pairs bw_140bp_day0 = ArrayExtractor(data.input_atac['day0']['140']) print 'Finished extracting bigwig for day0, 140bp' # In[6]: # create a BigWigExtractor for histone makr 'H3K27ac' for day0 bw_histone_mark_day0 = BigwigExtractor(data.output_histone['day0']['H3K27ac']) print 'Finished extracting bigwig for day0, 140bp' # In[7]: # normalize day0 intervals normalized_day0_intervals = [ normalize_interval(interval, window_size) for interval in day0_intervals if normalize_interval(interval, window_size) ]
train_intervals = list(BedTool(train_dir)) val_intervals = list(BedTool(val_dir)) test_intervals = list(BedTool(test_dir)) print '# of Train Intervals: {}'.format(len(train_intervals)) print '# of Val Intervals: {}'.format(len(val_intervals)) print '# of Test Intervals: {}'.format(len(test_intervals)) # Get input/output data directories data = Data_Directories() print data.intervals.keys() print data.input_atac[day].keys() print data.output_histone[day].keys() # Extract input candidates # Create an ArrayExtractor for ATAC-seq of a given day and specified fragment length input_candidates = ArrayExtractor(data.input_atac[day][frag]) print 'Finished extracting bigwig for {}, {}bp'.format(day, frag) # Extract output candiates # Create a BigWigExtractor for histone mark of a given day output_candidates = BigwigExtractor(data.output_histone[day][histone]) print 'Finished extracting bigwig for {}, {}'.format(day, histone) # Normalize train intervals normalized_train_intervals = [normalize_interval(interval, window_size) for interval in train_intervals if normalize_interval(interval, window_size)] print 'Finished normalizing train intervals!' # Normalize val intervals normalized_val_intervals = [normalize_interval(interval, window_size) for interval in val_intervals if normalize_interval(interval, window_size)] print 'Finished normalizing val intervals!' # Normalize test intervals normalized_test_intervals = [normalize_interval(interval, window_size) for interval in test_intervals if normalize_interval(interval, window_size)]
def __getitem__(self, idx):
    """Return one example for row `idx` of self.dfm.

    Returns:
        dict with:
          - "inputs": the one-hot sequence from the fasta extractor, or
            {"seq": ..., "bias/<task>/...": ...} when bias tracks are configured
          - "targets": profile cuts and/or class labels, depending on
            only_classes / include_classes / taskname_first flags
          - "metadata" (only when include_metadata): GenomicRanges of the
            resized target interval plus "interval_from_task"
    """
    # Extractors are created lazily on the first __getitem__ call
    # (self.fasta_extractor is None until then).
    # NOTE(review): presumably deferred so the object can be constructed/pickled
    # before opening file handles — confirm with the caller/DataLoader setup.
    if self.fasta_extractor is None:
        # Use array extractors
        if self.bcolz:
            self.fasta_extractor = ArrayExtractor(self.ds.fasta_file,
                                                  in_memory=False)
            # one [pos_counts, neg_counts] extractor pair per task in self.tasks
            self.bw_extractors = {
                task: [ArrayExtractor(task_spec.pos_counts, in_memory=False),
                       ArrayExtractor(task_spec.neg_counts, in_memory=False)]
                for task, task_spec in self.ds.task_specs.items()
                if task in self.tasks
            }
            self.bias_bw_extractors = {
                task: [ArrayExtractor(task_spec.pos_counts, in_memory=False),
                       ArrayExtractor(task_spec.neg_counts, in_memory=False)]
                for task, task_spec in self.ds.bias_specs.items()
                if task in self.tasks
            }
        else:
            # Use normal fasta/bigwig extractors
            assert not self.bcolz
            # first call
            self.fasta_extractor = FastaExtractor(self.ds.fasta_file,
                                                  use_strand=True)
            self.bw_extractors = {
                task: [BigwigExtractor(task_spec.pos_counts),
                       BigwigExtractor(task_spec.neg_counts)]
                for task, task_spec in self.ds.task_specs.items()
                if task in self.tasks
            }
            # NOTE(review): unlike the bcolz branch (and unlike bw_extractors
            # above), this dict is NOT filtered by `if task in self.tasks` —
            # confirm whether the asymmetry is intentional.
            self.bias_bw_extractors = {
                task: [BigwigExtractor(task_spec.pos_counts),
                       BigwigExtractor(task_spec.neg_counts)]
                for task, task_spec in self.ds.bias_specs.items()
            }
    # Setup the intervals
    interval = Interval(
        self.dfm.iat[idx, 0],  # chrom
        self.dfm.iat[idx, 1],  # start
        self.dfm.iat[idx, 2])  # end
    # Transform the input interval (for say augmentation...)
    if self.interval_transformer is not None:
        interval = self.interval_transformer(interval)
    # deepcopy so that resizing for targets doesn't mutate the interval used
    # for the (possibly wider) sequence window
    target_interval = resize_interval(deepcopy(interval), self.peak_width)
    seq_interval = resize_interval(deepcopy(interval), self.seq_width)
    # This only kicks in when we specify the taskname from dataspec
    # to the 3rd column. E.g. it doesn't apply when using intervals_file
    interval_from_task = self.dfm.iat[
        idx, 3] if self.intervals_file is None else ''
    # extract seq + tracks
    sequence = self.fasta_extractor([seq_interval])[0]
    if not self.only_classes:
        # taskname_first only changes the key layout: "<task>/profile"
        # vs "profile/<task>"
        if self.taskname_first:
            cuts = {
                f"{task}/profile": run_extractors(
                    self.bw_extractors[task], [target_interval],
                    ignore_strand=spec.ignore_strand)[0]
                for task, spec in self.ds.task_specs.items()
                if task in self.tasks
            }
        else:
            cuts = {
                f"profile/{task}": run_extractors(
                    self.bw_extractors[task], [target_interval],
                    ignore_strand=spec.ignore_strand)[0]
                for task, spec in self.ds.task_specs.items()
                if task in self.tasks
            }
        # Add counts
        if self.target_transformer is not None:
            cuts = self.target_transformer.transform(cuts)
        # Add bias tracks
        if len(self.ds.bias_specs) > 0:
            biases = {
                bias_task: run_extractors(
                    self.bias_bw_extractors[bias_task], [target_interval],
                    ignore_strand=spec.ignore_strand)[0]
                for bias_task, spec in self.ds.bias_specs.items()
            }
            # concatenate each task's bias tracks along the channel axis
            task_biases = {
                f"bias/{task}/profile": np.concatenate(
                    [biases[bt] for bt in self.task_bias_tracks[task]],
                    axis=-1)
                for task in self.tasks
            }
            if self.target_transformer is not None:
                # log(1 + total counts) per bias channel
                for task in self.tasks:
                    task_biases[f'bias/{task}/counts'] = np.log(
                        1 + task_biases[f'bias/{task}/profile'].sum(0))
                # total_count_bias = np.concatenate([np.log(1 + x[k].sum(0))
                #                                    for k, x in biases.items()], axis=-1)
                # task_biases['bias/total_counts'] = total_count_bias
            if self.profile_bias_pool_size is not None:
                # smooth the bias profile with one moving average per pool
                # size, concatenated along the channel axis
                for task in self.tasks:
                    task_biases[f'bias/{task}/profile'] = np.concatenate(
                        [moving_average(task_biases[f'bias/{task}/profile'],
                                        n=pool_size)
                         for pool_size in to_list(self.profile_bias_pool_size)],
                        axis=-1)
            # bias tracks ride along as extra model inputs, not targets
            sequence = {"seq": sequence, **task_biases}
    else:
        cuts = dict()
    if self.include_classes:
        if self.taskname_first:
            # Get the classes from the tsv file
            classes = {
                f"{task}/class": self.dfm.iat[idx, i + 3]
                for i, task in enumerate(self.dfm_tasks)
                if task in self.tasks
            }
        else:
            classes = {
                f"class/{task}": self.dfm.iat[idx, i + 3]
                for i, task in enumerate(self.dfm_tasks)
                if task in self.tasks
            }
        cuts = {**cuts, **classes}
    out = {"inputs": sequence, "targets": cuts}
    if self.include_metadata:
        out['metadata'] = {
            "range": GenomicRanges(
                chr=target_interval.chrom,
                start=target_interval.start,
                end=target_interval.stop,
                id=idx,
                strand=(target_interval.strand
                        if target_interval.strand is not None else "*"),
            ),
            "interval_from_task": interval_from_task
        }
    return out
def __init__(self,
             ds,
             peak_width=200,
             seq_width=None,
             incl_chromosomes=None,
             excl_chromosomes=None,
             intervals_file=None,
             bcolz=False,
             in_memory=False,
             include_metadata=True,
             taskname_first=False,
             tasks=None,
             include_classes=False,
             only_classes=False,
             shuffle=True,
             interval_transformer=None,
             target_transformer=None,
             profile_bias_pool_size=None):
    """Dataset for loading the bigwigs and fastas.

    Args:
        ds (basepair.src.schemas.DataSpec or str): data specification containing
            the fasta file, bed files and bigWig file paths; a str is treated
            as a path and loaded with DataSpec.load.
        peak_width: resize the target intervals to this width
        seq_width: width of the sequence window; defaults to peak_width
        incl_chromosomes (list of str): restrict to these chromosomes
        excl_chromosomes (list of str): exclude these chromosomes
        intervals_file: if specified, use these regions to train the model.
            If not specified, the regions are inferred from the dataspec.
        bcolz: If True, the bigwig/fasta files are in the genomelake bcolz format
        in_memory: If True, load the whole bcolz into memory.
            Only applicable when bcolz=True
        include_metadata: if True, __getitem__ adds a 'metadata' entry
        taskname_first: key layout, "<task>/profile" vs "profile/<task>"
        tasks: subset of tasks to use; defaults to all tasks in the dataspec
        include_classes: if True, also load class labels from the tsv
        only_classes: if True, load only classes (requires include_classes)
        shuffle: if True, shuffle the interval table once at construction
        interval_transformer: optional callable applied to each interval
            (e.g. augmentation)
        target_transformer: trained preprocessor object containing the
            .transform method, applied to the profile targets
        profile_bias_pool_size: int or list of ints; moving-average window
            size(s) applied to the bias profiles
    """
    # Accept either a path to a dataspec yaml or an already-loaded DataSpec
    if isinstance(ds, str):
        self.ds = DataSpec.load(ds)
    else:
        self.ds = ds
    self.peak_width = peak_width
    # sequence window defaults to the target window
    if seq_width is None:
        self.seq_width = peak_width
    else:
        self.seq_width = seq_width
    self.shuffle = shuffle
    self.intervals_file = intervals_file
    self.incl_chromosomes = incl_chromosomes
    self.excl_chromosomes = excl_chromosomes
    self.target_transformer = target_transformer
    self.include_classes = include_classes
    self.only_classes = only_classes
    self.taskname_first = taskname_first
    # only_classes is meaningless without class labels
    if self.only_classes:
        assert self.include_classes
    self.profile_bias_pool_size = profile_bias_pool_size
    # not specified yet -- extractors are created lazily (see __getitem__)
    self.fasta_extractor = None
    self.bw_extractors = None
    self.bias_bw_extractors = None
    self.include_metadata = include_metadata
    self.interval_transformer = interval_transformer
    self.bcolz = bcolz
    self.in_memory = in_memory
    if not self.bcolz and self.in_memory:
        raise ValueError(
            "in_memory option only applicable when bcolz=True")
    # Load chromosome lengths
    if self.bcolz:
        # bcolz layout: chromosome lengths come from the genomelake
        # metadata.json next to the arrays
        p = json.loads(
            (Path(self.ds.fasta_file) / "metadata.json").read_text())
        self.chrom_lens = {c: v[0] for c, v in p['file_shapes'].items()}
    else:
        # plain fasta: read lengths from the .fai index via pysam FastaFile
        fa = FastaFile(self.ds.fasta_file)
        self.chrom_lens = {
            name: l
            for name, l in zip(fa.references, fa.lengths)
        }
        if len(self.chrom_lens) == 0:
            # NOTE(review): the second string is not an f-string, so
            # {self.ds.fasta_file} is printed literally — likely a bug;
            # left unchanged here.
            raise ValueError(
                f"no chromosomes found in fasta file: {self.ds.fasta_file}. "
                "Make sure the file path is correct and that the fasta index file {self.ds.fasta_file}.fai is up to date"
            )
        del fa
    if self.intervals_file is None:
        # No explicit intervals: derive the interval table from the peak bed
        # files declared per-task in the dataspec
        self.dfm = load_beds(bed_files={
            task: task_spec.peaks
            for task, task_spec in self.ds.task_specs.items()
            if task_spec.peaks is not None
        },
                             chromosome_lens=self.chrom_lens,
                             excl_chromosomes=self.excl_chromosomes,
                             incl_chromosomes=self.incl_chromosomes,
                             resize_width=max(self.peak_width,
                                              self.seq_width))
        assert list(
            self.dfm.columns)[:4] == ["chrom", "start", "end", "task"]
        if self.shuffle:
            # one-time shuffle of the whole table
            self.dfm = self.dfm.sample(frac=1)
        self.tsv = None
        self.dfm_tasks = None
    else:
        # Explicit intervals tsv: read it (with labels) through TsvReader
        self.tsv = TsvReader(self.intervals_file,
                             num_chr=False,
                             label_dtype=int,
                             mask_ambigous=-1,
                             incl_chromosomes=incl_chromosomes,
                             excl_chromosomes=excl_chromosomes,
                             chromosome_lens=self.chrom_lens,
                             resize_width=max(self.peak_width,
                                              self.seq_width))
        if self.shuffle:
            self.tsv.shuffle_inplace()
        self.dfm = self.tsv.df  # use the data-frame from tsv
        self.dfm_tasks = self.tsv.get_target_names()  # remember the tasks
    if tasks is None:
        self.tasks = list(self.ds.task_specs)
    else:
        self.tasks = tasks
    # When everything fits in memory, build the extractors eagerly instead of
    # lazily in __getitem__
    if self.bcolz and self.in_memory:
        self.fasta_extractor = ArrayExtractor(self.ds.fasta_file,
                                              in_memory=True)
        self.bw_extractors = {
            task: [
                ArrayExtractor(task_spec.pos_counts, in_memory=True),
                ArrayExtractor(task_spec.neg_counts, in_memory=True)
            ]
            for task, task_spec in self.ds.task_specs.items()
            if task in self.tasks
        }
        self.bias_bw_extractors = {
            task: [
                ArrayExtractor(task_spec.pos_counts, in_memory=True),
                ArrayExtractor(task_spec.neg_counts, in_memory=True)
            ]
            for task, task_spec in self.ds.bias_specs.items()
            if task in self.tasks
        }
    # class labels require the tsv-provided task names
    if self.include_classes:
        assert self.dfm_tasks is not None
    if self.dfm_tasks is not None:
        assert set(self.tasks).issubset(self.dfm_tasks)
    # setup bias maps per task: which bias tracks apply to each task
    self.task_bias_tracks = {
        task: [
            bias for bias, spec in self.ds.bias_specs.items()
            if task in spec.tasks
        ]
        for task in self.tasks
    }
def extractor(intervals_file, input_data_sources, target_data_sources=None, batch_size=128):
    """Yield batches of extracted genomelake data for bed-like intervals.

    Args:
        intervals_file: tsv file; assumes bed-like `chrom start end id` format.
        input_data_sources: dict mapping input name -> genomelake directory,
            or the path to a zip file that inflate_data_sources can unpack.
        target_data_sources: dict, optional, mapping target name -> genomelake
            directory.
        batch_size: int, number of intervals per yielded batch.

    Yields:
        dict with 'inputs', optionally 'targets', and 'metadata' whose
        'ranges' entry is a GenomicRanges over the batch intervals.

    Raises:
        Exception: if input_data_sources is neither a dict nor a zip file path.
    """
    if not isinstance(input_data_sources, dict):
        import zipfile
        if zipfile.is_zipfile(input_data_sources):
            input_data_sources = inflate_data_sources(input_data_sources)
        else:
            # BUGFIX: message said "python direct" instead of "python dict"
            raise Exception(
                "input_data_sources has to be a python dict or the path to a zipped directory!"
            )
    bt = pybedtools.BedTool(intervals_file)
    input_data_extractors = {
        key: ArrayExtractor(data_source)
        for key, data_source in input_data_sources.items()
    }
    target_data_extractors = None
    if target_data_sources is not None:
        target_data_extractors = {
            key: ArrayExtractor(data_source)
            for key, data_source in target_data_sources.items()
        }
    for intervals_batch in batch_iter(bt, batch_size):
        out = {}
        # [..., None] adds the channel axis expected by conv1d layers;
        # `ext` avoids shadowing this function's own name inside the loop
        out['inputs'] = {
            key: ext(intervals_batch)[..., None]
            for key, ext in input_data_extractors.items()
        }
        if target_data_extractors is not None:
            out['targets'] = {
                key: ext(intervals_batch)[..., None]
                for key, ext in target_data_extractors.items()
            }
        # collect per-interval metadata
        chrom, start, end, ids = [], [], [], []
        for interval in intervals_batch:
            chrom.append(interval.chrom)
            start.append(interval.start)
            end.append(interval.stop)
            ids.append(interval.name)
        out['metadata'] = {
            'ranges': GenomicRanges(
                chr=np.array(chrom),
                start=np.array(start),
                end=np.array(end),
                # BUGFIX: was np.array(id) — the `id` builtin function,
                # not the collected interval names
                id=np.array(ids))
        }
        yield out