示例#1
0
    def __init__(self,
                 intervals_file,
                 fasta_file,
                 dnase_file,
                 mappability_file=None,
                 use_linecache=True):
        """Dataloader setup.

        Args:
          intervals_file: path to a BED file with the query intervals
          fasta_file: path to the reference genome fasta file
          dnase_file: path to the DNase bigWig file
          mappability_file: path to the mappability bigWig; if None, the
            hg19 Duke 35bp uniqueness track is downloaded on first use
          use_linecache: if True, index the BED file via linecache for
            faster random access
        """
        # intervals
        if use_linecache:
            # drop any stale cached lines before re-reading the BED file
            linecache.clearcache()
            BT = BedToolLinecache
        else:
            BT = BedTool

        self.bt = BT(intervals_file)

        # Fasta
        self.fasta_extractor = FastaExtractor(fasta_file)

        # DNase
        self.dnase_extractor = BigwigExtractor(dnase_file)

        # mappability
        if mappability_file is None:
            # fall back to the bundled location; download the track if missing
            mappability_file = os.path.join(
                this_dir, "../../template/dataloader_files",
                "wgEncodeDukeMapabilityUniqueness35bp.bigWig")
            if not os.path.exists(mappability_file):
                # make sure the target directory exists before downloading,
                # otherwise urlretrieve fails with IOError
                download_dir = os.path.dirname(mappability_file)
                if download_dir and not os.path.isdir(download_dir):
                    os.makedirs(download_dir)
                print("Downloading the mappability file")
                urlretrieve(
                    "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
                    mappability_file)
                print("Download complete")

        self.mappability_extractor = BigwigExtractor(mappability_file)
示例#2
0
    def __getitem__(self, idx):
        """Return model inputs and metadata for interval `idx`."""
        if self.fasta_extractor is None:
            # Lazily (re-)open the extractors on first access
            # Fasta
            self.fasta_extractor = FastaExtractor(self.fasta_file)
            # DNase
            self.dnase_extractor = BigwigExtractor(self.dnase_file)
            self.mappability_extractor = BigwigExtractor(self.mappability_file)

        # Get the interval
        interval = self.bt[idx]
        if interval.stop - interval.start != self.SEQ_WIDTH:
            # resize the interval to SEQ_WIDTH, keeping it centered
            center = (interval.start + interval.stop) // 2
            interval.start = center - self.SEQ_WIDTH // 2
            interval.end = center + self.SEQ_WIDTH // 2 + self.SEQ_WIDTH % 2

        # Get the gencode features (binary overlap indicators per annotation)
        gencode_counts = np.array([v[idx].count for k, v in self.overlap_beds],
                                  dtype=bool)

        # Run the fasta extractor
        seq = np.squeeze(self.fasta_extractor([interval]), axis=0)
        seq_rc = seq[::-1, ::-1]  # reverse-complement of the one-hot sequence

        # Dnase
        # BUG FIX: `axis=0` belongs to np.squeeze (drop the batch dimension),
        # not to the extractor call (compare with the fasta line above).
        dnase = np.squeeze(self.dnase_extractor([interval]),
                           axis=0)[:, np.newaxis]
        dnase[np.isnan(dnase)] = 0  # NA fill
        dnase_rc = dnase[::-1]

        bigwig_list = [seq]
        bigwig_rc_list = [seq_rc]
        # same axis fix as for dnase above
        mappability = np.squeeze(self.mappability_extractor([interval]),
                                 axis=0)[:, np.newaxis]
        mappability[np.isnan(mappability)] = 0  # NA fill
        mappability_rc = mappability[::-1]
        bigwig_list.append(mappability)
        bigwig_rc_list.append(mappability_rc)
        bigwig_list.append(dnase)
        bigwig_rc_list.append(dnase_rc)

        ranges = GenomicRanges.from_interval(interval)
        ranges_rc = GenomicRanges.from_interval(interval)
        ranges_rc.strand = "-"

        return {
            "inputs": [
                np.concatenate(bigwig_list,
                               axis=-1),  # stack along the last axis
                np.concatenate(bigwig_rc_list, axis=-1),  # RC version
                np.append(self.meta_feat, gencode_counts)
            ],
            "targets": {},  # No Targets
            "metadata": {
                "ranges": ranges,
                "ranges_rc": ranges_rc
            }
        }
示例#3
0
    def extract_single(self, interval):
        """Extract the bigwig signal for a single interval."""
        if self.batch_extractor is None:
            # lazy import/open: keeps the object picklable until first use
            from genomelake.extractors import BigwigExtractor
            self.batch_extractor = BigwigExtractor(self.bigwig_file)

        query = interval if self.interval_transform is None \
            else self.interval_transform(interval)
        signal = self.batch_extractor([query],
                                      nan_as_zero=self.nan_as_zero)[0]
        # flip to 5'->3' orientation for minus-strand intervals
        if self.use_strand and query.strand == '-':
            return signal[::-1]
        return signal
示例#4
0
    def __init__(self,
                 intervals_file,
                 fasta_file,
                 dnase_file,
                 cell_line=None,
                 RNAseq_PC_file=None,
                 mappability_file=None,
                 use_linecache=True):
        """Dataloader setup.

        Args:
          intervals_file: path to a BED file with the query intervals
          fasta_file: path to the reference genome fasta file
          dnase_file: path to the DNase bigWig file
          cell_line: pre-defined cell line whose bundled RNA-seq features
            should be used; mutually exclusive with RNAseq_PC_file
          RNAseq_PC_file: tab-separated file with the RNA-seq PC features
            (required when cell_line is None)
          mappability_file: path to the mappability bigWig; if None, the
            hg19 Duke 35bp uniqueness track is downloaded on first use
          use_linecache: if True, index the BED file via linecache for
            faster random access
        """
        # intervals
        if use_linecache:
            # drop any stale cached lines before re-reading the BED file
            linecache.clearcache()
            BT = BedToolLinecache
        else:
            BT = BedTool

        self.bt = BT(intervals_file)

        # Fasta
        self.fasta_extractor = FastaExtractor(fasta_file)

        # DNase
        self.dnase_extractor = BigwigExtractor(dnase_file)

        # mappability
        if mappability_file is None:
            # fall back to the bundled location; download the track if missing
            mappability_file = os.path.join(
                this_dir, "../../template/dataloader_files",
                "wgEncodeDukeMapabilityUniqueness35bp.bigWig")
            if not os.path.exists(mappability_file):
                # make sure the target directory exists before downloading,
                # otherwise urlretrieve fails with IOError
                download_dir = os.path.dirname(mappability_file)
                if download_dir and not os.path.isdir(download_dir):
                    os.makedirs(download_dir)
                print("Downloading the mappability file")
                urlretrieve(
                    "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
                    mappability_file)
                print("Download complete")

        self.mappability_extractor = BigwigExtractor(mappability_file)

        # Get the metadata features
        if cell_line is None:
            if RNAseq_PC_file is None:
                raise ValueError(
                    "RNAseq_PC_file has to be specified when cell_line=None")
            if not os.path.exists(RNAseq_PC_file):
                # raise instead of assert: asserts vanish under `python -O`
                raise FileNotFoundError(
                    "RNAseq_PC_file does not exist: " + RNAseq_PC_file)
        else:
            # Using the pre-defined cell-line
            rp = os.path.join(this_dir, "dataloader_files/RNAseq_features/")
            RNAseq_PC_file = os.path.join(rp, cell_line, "meta.txt")
        self.meta_feat = pd.read_csv(RNAseq_PC_file, sep="\t",
                                     header=None)[0].values
示例#5
0
    def __getitem__(self, idx):
        """Return model inputs and metadata for interval `idx`."""
        if self.fasta_extractor is None:
            # Lazily (re-)open the extractors on first access
            # Fasta
            self.fasta_extractor = FastaExtractor(self.fasta_file)
            # DNase
            self.dnase_extractor = BigwigExtractor(self.dnase_file)

        # Get the interval
        interval = self.bt[idx]
        if interval.stop - interval.start != self.SEQ_WIDTH:
            # resize the interval to SEQ_WIDTH, keeping it centered
            center = (interval.start + interval.stop) // 2
            interval.start = center - self.SEQ_WIDTH // 2
            interval.end = center + self.SEQ_WIDTH // 2 + self.SEQ_WIDTH % 2

        # Run the fasta extractor
        seq = np.squeeze(self.fasta_extractor([interval]), axis=0)
        seq_rc = seq[::-1, ::-1]  # reverse-complement of the one-hot sequence

        # Dnase
        # BUG FIX: `axis=0` belongs to np.squeeze (drop the batch dimension),
        # not to the extractor call (compare with the fasta line above).
        dnase = np.squeeze(self.dnase_extractor([interval]),
                           axis=0)[:, np.newaxis]
        dnase[np.isnan(dnase)] = 0  # NA fill
        dnase_rc = dnase[::-1]

        bigwig_list = [seq]
        bigwig_rc_list = [seq_rc]
        bigwig_list.append(dnase)
        bigwig_rc_list.append(dnase_rc)

        ranges = GenomicRanges.from_interval(interval)
        ranges_rc = GenomicRanges.from_interval(interval)
        ranges_rc.strand = "-"

        return {
            "inputs": [
                np.concatenate(bigwig_list,
                               axis=-1),  # stack along the last axis
                np.concatenate(bigwig_rc_list, axis=-1),  # RC version
            ],
            "targets": {},  # No Targets
            "metadata": {
                "ranges": ranges,
                "ranges_rc": ranges_rc
            }
        }
示例#6
0
    def __init__(self,
                 intervals_file,
                 fasta_file,
                 dnase_file,
                 use_linecache=True):
        """Open the interval list and the fasta/DNase extractors.

        Args:
          intervals_file: path to a BED file with the query intervals
          fasta_file: path to the reference genome fasta file
          dnase_file: path to the DNase bigWig file
          use_linecache: if True, index the BED file via linecache for
            faster random access
        """
        # Pick the BedTool flavour: linecache-backed lookups are faster
        # for repeated random access into the BED file.
        bed_cls = BedTool
        if use_linecache:
            linecache.clearcache()
            bed_cls = BedToolLinecache
        self.bt = bed_cls(intervals_file)

        # Sequence and DNase signal extractors
        self.fasta_extractor = FastaExtractor(fasta_file)
        self.dnase_extractor = BigwigExtractor(dnase_file)
示例#7
0
class StrandedBigWigExtractor:
    """Strand-aware big-wig file extractor.

    NOTE: The extractor is not thread-safe.
    If you wish to use it with multiprocessing,
    create a new extractor object in each process.

    # Arguments
      bigwig_file: path to the bigwig file
      interval_transform: optional callable applied to each interval
        before extraction
      use_strand: if True, reverse the signal of minus-strand intervals
        so values are always reported 5'->3'
      nan_as_zero: if True, replace NaN values in the bigwig with 0
    """
    def __init__(self,
                 bigwig_file,
                 interval_transform=None,
                 use_strand=False,
                 nan_as_zero=True):
        self.nan_as_zero = nan_as_zero
        self.use_strand = use_strand
        self.bigwig_file = bigwig_file
        self.interval_transform = interval_transform
        # opened lazily on first extraction
        self.batch_extractor = None

    def extract_single(self, interval):
        """Extract the signal track for a single interval."""
        if self.batch_extractor is None:
            # lazy import/open: keeps the object picklable until first use
            from genomelake.extractors import BigwigExtractor
            self.batch_extractor = BigwigExtractor(self.bigwig_file)

        if self.interval_transform is not None:
            interval = self.interval_transform(interval)
        arr = self.batch_extractor([interval], nan_as_zero=self.nan_as_zero)[0]
        if self.use_strand and interval.strand == '-':
            arr = arr[::-1]
        return arr

    def extract(self, intervals, progbar=False):
        """Extract and stack the signal for a list of intervals."""
        return np.stack([
            self.extract_single(interval)
            for interval in tqdm(intervals, disable=not progbar)
        ])

    def close(self):
        """Close the underlying bigwig handle.

        BUG FIX: previously raised AttributeError when called before any
        extraction (batch_extractor still None); now a safe no-op.
        """
        if self.batch_extractor is not None:
            return self.batch_extractor.close()
示例#8
0
    def __getitem__(self, idx):
        """Return the sequence input, summed bigwig targets and metadata
        for the interval at row `idx` of the tsv."""
        if self.fasta_extractor is None:
            # first access: open the extractors lazily
            self.fasta_extractor = FastaExtractor(self.fasta_file)
            self.bigwig_extractors = {
                assay: [BigwigExtractor(path) for path in self.bigwigs[assay]]
                for assay in self.bigwigs
            }

        interval, labels = self.tsv[idx]
        interval = resize_interval(interval, 1000)
        # Intervals need to be 1000bp wide
        assert interval.stop - interval.start == 1000

        # one-hot sequence, batch dimension removed
        seq = np.squeeze(self.fasta_extractor([interval]))

        interval_wide = resize_interval(deepcopy(interval), self.track_width)

        # total signal over the wide interval, summed across replicate tracks
        targets = {}
        for assay in self.bigwig_extractors:
            replicate_signals = [
                extractor([interval_wide])[0]
                for extractor in self.bigwig_extractors[assay]
            ]
            targets[assay] = sum(replicate_signals).sum()

        metadata = {
            "ranges": GenomicRanges(interval.chrom, interval.start,
                                    interval.stop, str(idx)),
            "ranges_wide": GenomicRanges.from_interval(interval_wide),
            "name": interval.name,
        }

        return {
            "inputs": {"seq": seq},
            "targets": targets,
            "metadata": metadata,
        }
示例#9
0
# In[4]:

# get intervals for day0 data
day0_intervals = list(BedTool(data.intervals['day0']))
print '# of Intervals Extracted for day0: {}'.format(len(day0_intervals))

# In[5]:

# create an ArrayExtractor for ATAC-seq for day0 with 140 base pairs
bw_140bp_day0 = ArrayExtractor(data.input_atac['day0']['140'])
print 'Finished extracting bigwig for day0, 140bp'

# In[6]:

# create a BigWigExtractor for histone makr 'H3K27ac' for day0
bw_histone_mark_day0 = BigwigExtractor(data.output_histone['day0']['H3K27ac'])
print 'Finished extracting bigwig for day0, 140bp'

# In[7]:

# normalize day0 intervals
normalized_day0_intervals = [
    normalize_interval(interval, window_size) for interval in day0_intervals
    if normalize_interval(interval, window_size)
]
print 'Finished normalizing day0 intervals!'

# In[8]:

assert (len(day0_intervals) == len(normalized_day0_intervals))
print "Examples of original intervals"
示例#10
0
print '# of Test Intervals: {}'.format(len(test_intervals))

# Get input/output data directories
data = Data_Directories()
print data.intervals.keys()
print data.input_atac[day].keys()
print data.output_histone[day].keys()

# Extract input candidates
# Create an ArrayExtractor for ATAC-seq of a given day and specified fragment length
input_candidates = ArrayExtractor(data.input_atac[day][frag])
print 'Finished extracting bigwig for {}, {}bp'.format(day, frag)

# Extract output candiates
# Create a BigWigExtractor for histone mark of a given day
output_candidates = BigwigExtractor(data.output_histone[day][histone])
print 'Finished extracting bigwig for {}, {}'.format(day, histone)

# Normalize train intervals
normalized_train_intervals = [normalize_interval(interval, window_size) for interval in train_intervals if normalize_interval(interval, window_size)]
print 'Finished normalizing train intervals!'
# Normalize val intervals
normalized_val_intervals = [normalize_interval(interval, window_size) for interval in val_intervals if normalize_interval(interval, window_size)]
print 'Finished normalizing val intervals!'
# Normalize test intervals
normalized_test_intervals = [normalize_interval(interval, window_size) for interval in test_intervals if normalize_interval(interval, window_size)]
print 'Finished normalizing test intervals!'

# Fetch intervals of sample_num
normalized_train_intervals = normalized_train_intervals[:sample_num]
normalized_val_intervals = normalized_val_intervals[:int(sample_num*0.2)]
示例#11
0
    def __getitem__(self, idx):
        """Assemble model inputs/targets/metadata for the interval at row `idx`.

        On the first call, lazily opens the fasta/bigwig extractors (either
        bcolz-backed ArrayExtractors or plain file extractors depending on
        `self.bcolz`). The interval is then resized to the configured
        peak/sequence widths and the one-hot sequence, profile counts and
        (optionally) bias tracks and class labels are extracted.
        """
        if self.fasta_extractor is None:
            # Use array extractors
            if self.bcolz:
                self.fasta_extractor = ArrayExtractor(self.ds.fasta_file,
                                                      in_memory=False)
                # one [pos-strand, neg-strand] extractor pair per task
                self.bw_extractors = {
                    task: [
                        ArrayExtractor(task_spec.pos_counts, in_memory=False),
                        ArrayExtractor(task_spec.neg_counts, in_memory=False)
                    ]
                    for task, task_spec in self.ds.task_specs.items()
                    if task in self.tasks
                }
                self.bias_bw_extractors = {
                    task: [
                        ArrayExtractor(task_spec.pos_counts, in_memory=False),
                        ArrayExtractor(task_spec.neg_counts, in_memory=False)
                    ]
                    for task, task_spec in self.ds.bias_specs.items()
                    if task in self.tasks
                }
            else:
                # Use normal fasta/bigwig extractors
                assert not self.bcolz
                # first call
                self.fasta_extractor = FastaExtractor(self.ds.fasta_file,
                                                      use_strand=True)
                self.bw_extractors = {
                    task: [
                        BigwigExtractor(task_spec.pos_counts),
                        BigwigExtractor(task_spec.neg_counts)
                    ]
                    for task, task_spec in self.ds.task_specs.items()
                    if task in self.tasks
                }
                # NOTE(review): unlike the bcolz branch above, bias tasks are
                # NOT filtered by `task in self.tasks` here -- confirm whether
                # this asymmetry is intentional.
                self.bias_bw_extractors = {
                    task: [
                        BigwigExtractor(task_spec.pos_counts),
                        BigwigExtractor(task_spec.neg_counts)
                    ]
                    for task, task_spec in self.ds.bias_specs.items()
                }

        # Setup the intervals
        interval = Interval(
            self.dfm.iat[idx, 0],  # chrom
            self.dfm.iat[idx, 1],  # start
            self.dfm.iat[idx, 2])  # end

        # Transform the input interval (for say augmentation...)
        if self.interval_transformer is not None:
            interval = self.interval_transformer(interval)

        # resize to the configured peak/sequence widths
        target_interval = resize_interval(deepcopy(interval), self.peak_width)
        seq_interval = resize_interval(deepcopy(interval), self.seq_width)

        # This only kicks in when we specify the taskname from dataspec
        # to the 3rd column. E.g. it doesn't apply when using intervals_file
        interval_from_task = self.dfm.iat[
            idx, 3] if self.intervals_file is None else ''

        # extract seq + tracks
        sequence = self.fasta_extractor([seq_interval])[0]

        if not self.only_classes:
            # profile targets, keyed "<task>/profile" or "profile/<task>"
            # depending on the configured key ordering
            if self.taskname_first:
                cuts = {
                    f"{task}/profile":
                    run_extractors(self.bw_extractors[task], [target_interval],
                                   ignore_strand=spec.ignore_strand)[0]
                    for task, spec in self.ds.task_specs.items()
                    if task in self.tasks
                }
            else:
                cuts = {
                    f"profile/{task}":
                    run_extractors(self.bw_extractors[task], [target_interval],
                                   ignore_strand=spec.ignore_strand)[0]
                    for task, spec in self.ds.task_specs.items()
                    if task in self.tasks
                }

            # Add counts
            if self.target_transformer is not None:
                cuts = self.target_transformer.transform(cuts)

            # Add bias tracks
            if len(self.ds.bias_specs) > 0:

                biases = {
                    bias_task:
                    run_extractors(self.bias_bw_extractors[bias_task],
                                   [target_interval],
                                   ignore_strand=spec.ignore_strand)[0]
                    for bias_task, spec in self.ds.bias_specs.items()
                }

                # concatenate all bias tracks mapped to each task
                task_biases = {
                    f"bias/{task}/profile": np.concatenate(
                        [biases[bt] for bt in self.task_bias_tracks[task]],
                        axis=-1)
                    for task in self.tasks
                }

                if self.target_transformer is not None:
                    # log(1 + total) count summaries of the bias profiles
                    for task in self.tasks:
                        task_biases[f'bias/{task}/counts'] = np.log(
                            1 + task_biases[f'bias/{task}/profile'].sum(0))
                    # total_count_bias = np.concatenate([np.log(1 + x[k].sum(0))
                    #                                    for k, x in biases.items()], axis=-1)
                    # task_biases['bias/total_counts'] = total_count_bias

                if self.profile_bias_pool_size is not None:
                    # smooth the bias profile with one moving average per
                    # pool size and stack the results along the last axis
                    for task in self.tasks:
                        task_biases[f'bias/{task}/profile'] = np.concatenate(
                            [
                                moving_average(
                                    task_biases[f'bias/{task}/profile'],
                                    n=pool_size) for pool_size in to_list(
                                        self.profile_bias_pool_size)
                            ],
                            axis=-1)

                # bias tracks are fed as *inputs* alongside the sequence
                sequence = {"seq": sequence, **task_biases}
        else:
            cuts = dict()

        if self.include_classes:
            if self.taskname_first:
                # Get the classes from the tsv file
                classes = {
                    f"{task}/class": self.dfm.iat[idx, i + 3]
                    for i, task in enumerate(self.dfm_tasks)
                    if task in self.tasks
                }
            else:
                classes = {
                    f"class/{task}": self.dfm.iat[idx, i + 3]
                    for i, task in enumerate(self.dfm_tasks)
                    if task in self.tasks
                }
            cuts = {**cuts, **classes}

        out = {"inputs": sequence, "targets": cuts}

        if self.include_metadata:
            # remember which genomic range was used ("*" = unstranded)
            out['metadata'] = {
                "range":
                GenomicRanges(
                    chr=target_interval.chrom,
                    start=target_interval.start,
                    end=target_interval.stop,
                    id=idx,
                    strand=(target_interval.strand
                            if target_interval.strand is not None else "*"),
                ),
                "interval_from_task":
                interval_from_task
            }
        return out
示例#12
0
    t1 - t0)

#fetch validation inputs (day3, ATAC-seq)
t2 = time.time()
val_inputs = None
if not process_all:
    normalized_day3_intervals = random.sample(normalized_day3_intervals,
                                              sample_num)
val_inputs = coarse_normalize_input(bw_140bp_day3(normalized_day3_intervals))
print val_inputs.shape
t3 = time.time()
print 'Time spent for getting signals of intervals for day3 atac-seq: {}'.format(
    t3 - t2)

# fetch outputs (day0, histone)
histone_mark = BigwigExtractor(data.output_histone['day0']['H3K27ac'])
outputs = None
outputs = histone_mark(normalized_day0_intervals)
outputs = np.nan_to_num(outputs)
if output_norm_scheme == 'dl':
    outputs = double_log_transform(outputs)
elif output_norm_scheme == 'quant':
    outputs = quantile_transform(outputs, n_quantiles=50, random_state=7)
outputs = np.expand_dims(outputs, axis=2)
print 'Output Shape (of one sample): ', outputs[0].shape
print 'Expanded Output Shape: ', outputs[0].shape

# fetch validation outputs (day3, histone)
val_histone_mark = BigwigExtractor(data.output_histone['day3']['H3K27ac'])
val_outputs = None
val_outputs = val_histone_mark(normalized_day3_intervals)
示例#13
0
    def __getitem__(self, idx):
        """Assemble model inputs/targets/metadata for the interval at row `idx`.

        Lazily opens the fasta and bigwig extractors on the first call, resizes
        the interval to the configured sequence/peak widths and extracts the
        one-hot sequence, per-task profile counts and (optionally) bias tracks
        and class labels.
        """
        from pybedtools import Interval

        if self.fasta_extractor is None:
            # first call
            # Use normal fasta/bigwig extractors
            self.fasta_extractor = FastaExtractor(self.ds.fasta_file, use_strand=True)

            # one BigwigExtractor per track, grouped by task
            self.bw_extractors = {task: [BigwigExtractor(track) for track in task_spec.tracks]
                                  for task, task_spec in self.ds.task_specs.items() if task in self.tasks}

            self.bias_bw_extractors = {task: [BigwigExtractor(track) for track in task_spec.tracks]
                                       for task, task_spec in self.ds.bias_specs.items()}

        # Get the genomic interval for that particular datapoint
        interval = Interval(self.dfm.iat[idx, 0],  # chrom
                            self.dfm.iat[idx, 1],  # start
                            self.dfm.iat[idx, 2])  # end

        # Transform the input interval (for say augmentation...)
        if self.interval_transformer is not None:
            interval = self.interval_transformer(interval)

        # resize the intervals to the desired widths
        target_interval = resize_interval(deepcopy(interval), self.peak_width)
        seq_interval = resize_interval(deepcopy(interval), self.seq_width)

        # This only kicks in when we specify the taskname from dataspec
        # to the 3rd column. E.g. it doesn't apply when using intervals_file
        interval_from_task = self.dfm.iat[idx, 3] if self.intervals_file is None else ''

        # extract DNA sequence + one-hot encode it
        sequence = self.fasta_extractor([seq_interval])[0]
        inputs = {"seq": sequence}

        # extract the profile counts from the bigwigs
        cuts = {f"{task}/profile": _run_extractors(self.bw_extractors[task],
                                                   [target_interval],
                                                   sum_tracks=spec.sum_tracks)[0]
                for task, spec in self.ds.task_specs.items() if task in self.tasks}
        if self.track_transform is not None:
            # optional user-supplied transform applied to each profile track
            for task in self.tasks:
                cuts[f'{task}/profile'] = self.track_transform(cuts[f'{task}/profile'])

        # Add total number of counts
        for task in self.tasks:
            cuts[f'{task}/counts'] = self.total_count_transform(cuts[f'{task}/profile'].sum(0))

        if len(self.ds.bias_specs) > 0:
            # Extract the bias tracks
            biases = {bias_task: _run_extractors(self.bias_bw_extractors[bias_task],
                                                 [target_interval],
                                                 sum_tracks=spec.sum_tracks)[0]
                      for bias_task, spec in self.ds.bias_specs.items()}

            # concatenate all bias tracks mapped to each task
            task_biases = {f"bias/{task}/profile": np.concatenate([biases[bt]
                                                                   for bt in self.task_bias_tracks[task]],
                                                                  axis=-1)
                           for task in self.tasks}

            if self.track_transform is not None:
                for task in self.tasks:
                    task_biases[f'bias/{task}/profile'] = self.track_transform(task_biases[f'bias/{task}/profile'])

            # Add total number of bias counts
            for task in self.tasks:
                task_biases[f'bias/{task}/counts'] = self.total_count_transform(task_biases[f'bias/{task}/profile'].sum(0))

            # bias tracks are fed as model *inputs* alongside the sequence
            inputs = {**inputs, **task_biases}

        if self.include_classes:
            # Optionally, add binary labels from the additional columns in the tsv intervals file
            classes = {f"{task}/class": self.dfm.iat[idx, i + 3]
                       for i, task in enumerate(self.dfm_tasks) if task in self.tasks}
            cuts = {**cuts, **classes}

        out = {"inputs": inputs,
               "targets": cuts}

        if self.include_metadata:
            # remember the metadata (what genomic interval was used; "*" = unstranded)
            out['metadata'] = {"range": GenomicRanges(chr=target_interval.chrom,
                                                      start=target_interval.start,
                                                      end=target_interval.stop,
                                                      id=idx,
                                                      strand=(target_interval.strand
                                                              if target_interval.strand is not None
                                                              else "*"),
                                                      ),
                               "interval_from_task": interval_from_task}
        return out
def test_bigwig_extractor(test_bigwig_and_intervals):
    # unpack the fixture: bigwig path, query intervals and the expected array
    bigwig_path, query_intervals, expected = test_bigwig_and_intervals
    extracted = BigwigExtractor(bigwig_path)(query_intervals)
    assert (extracted == expected).all()
示例#15
0
    def __init__(self,
                 intervals_file,
                 fasta_file,
                 dnase_file,
                 cell_line=None,
                 RNAseq_PC_file=None,
                 mappability_file=None,
                 GENCODE_dir=None,
                 use_linecache=True):
        """Dataloader setup.

        Args:
          intervals_file: path to a BED file with the query intervals
          fasta_file: path to the reference genome fasta file
          dnase_file: path to the DNase bigWig file
          cell_line: pre-defined cell line whose bundled RNA-seq features
            should be used; mutually exclusive with RNAseq_PC_file
          RNAseq_PC_file: tab-separated file with the RNA-seq PC features
            (required when cell_line is None)
          mappability_file: path to the mappability bigWig; if None, the
            hg19 Duke 35bp uniqueness track is downloaded on first use
          GENCODE_dir: directory with the gencode annotation bed files;
            defaults to the bundled dataloader_files/gencode_features
          use_linecache: if True, index the BED files via linecache for
            faster random access
        """
        # intervals
        if use_linecache:
            # drop any stale cached lines before re-reading the BED file
            linecache.clearcache()
            BT = BedToolLinecache
        else:
            BT = BedTool

        self.bt = BT(intervals_file)

        # Fasta
        self.fasta_extractor = FastaExtractor(fasta_file)

        # DNase
        self.dnase_extractor = BigwigExtractor(dnase_file)

        # mappability
        if mappability_file is None:
            # fall back to the bundled location; download the track if missing
            mappability_file = os.path.join(
                this_dir, "../../template/dataloader_files",
                "wgEncodeDukeMapabilityUniqueness35bp.bigWig")
            if not os.path.exists(mappability_file):
                # make sure the target directory exists before downloading,
                # otherwise urlretrieve fails with IOError
                download_dir = os.path.dirname(mappability_file)
                if download_dir and not os.path.isdir(download_dir):
                    os.makedirs(download_dir)
                print("Downloading the mappability file")
                urlretrieve(
                    "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
                    mappability_file)
                print("Download complete")

        self.mappability_extractor = BigwigExtractor(mappability_file)

        # Gencode features
        if GENCODE_dir is None:
            gp = os.path.join(this_dir, "dataloader_files/gencode_features/")
        else:
            gp = GENCODE_dir
        # annotation tracks used as binary overlap features
        self.gencode_beds = [
            ("cpg", BedTool(os.path.join(gp, 'cpgisland.bed.gz'))),
            ("cds",
             BedTool(os.path.join(gp, 'wgEncodeGencodeBasicV19.cds.merged.bed.gz'))),
            ("intron",
             BedTool(os.path.join(gp, 'wgEncodeGencodeBasicV19.intron.merged.bed.gz'))),
            ("promoter",
             BedTool(os.path.join(gp, 'wgEncodeGencodeBasicV19.promoter.merged.bed.gz'))),
            ("utr5",
             BedTool(os.path.join(gp, 'wgEncodeGencodeBasicV19.utr5.merged.bed.gz'))),
            ("utr3",
             BedTool(os.path.join(gp, 'wgEncodeGencodeBasicV19.utr3.merged.bed.gz'))),
        ]
        # Overlap beds - could be done incrementally
        print("Overlapping all the bed-files")
        # The BT() and .fn are there in order to leverage BedToolLinecache
        self.overlap_beds = [(b, BT(self.bt.intersect(v, wa=True, c=True).fn))
                             for b, v in self.gencode_beds]
        print("Assessing the file")  # fixed misspelled progress message
        assert len(self.overlap_beds[1][1]) == len(self.bt)

        # Get the metadata features
        if cell_line is None:
            if RNAseq_PC_file is None:
                raise ValueError(
                    "RNAseq_PC_file has to be specified when cell_line=None")
            if not os.path.exists(RNAseq_PC_file):
                # raise instead of assert: asserts vanish under `python -O`
                raise FileNotFoundError(
                    "RNAseq_PC_file does not exist: " + RNAseq_PC_file)
        else:
            # Using the pre-defined cell-line
            rp = os.path.join(this_dir, "dataloader_files/RNAseq_features/")
            RNAseq_PC_file = os.path.join(rp, cell_line, "meta.txt")
        self.meta_feat = pd.read_csv(RNAseq_PC_file, sep="\t",
                                     header=None)[0].values