def test_gaps():
    """Smoke-test sequence extraction on the filled regions of hg19 chr1.

    Builds a genome restricted to ``chr1``, checks chromosome membership,
    extracts the sequences of the regions returned by ``filled`` and
    finally deletes the genome.
    """
    genome = Genome("hg19", chromosomes=["chr1"])

    # Only the requested chromosome must be reported as available.
    assert "chr1" in genome
    assert "chr2" not in genome

    # Extracting the sequences of the filled regions must not raise.
    filled_regions = genome.filled(chromosomes=["chr1"])
    genome.bed_to_sequence(filled_regions)

    genome.delete()
def get_gaps_statistics(
    genome: Genome,
    max_gap_size: int,
    window_size: int
) -> Tuple[int, np.ndarray, np.ndarray]:
    """Return number, mean and covariance of the gap masks of a genome.

    Parameters
    --------------------------
    genome: Genome,
        The genome whose gaps are analysed.
    max_gap_size: int,
        Only gaps strictly shorter than this value are considered.
    window_size: int,
        The window size every considered gap is expanded to
        (center-aligned) before its sequence is extracted.

    Returns
    --------------------------
    Tuple containing the number of considered gaps, the per-position mean
    of the boolean "is n" mask and the covariance of the mask positions.
    """
    # Keep only the gaps that are shorter than the requested threshold.
    all_gaps = genome.gaps()
    gap_lengths = all_gaps.chromEnd - all_gaps.chromStart
    small_gaps = all_gaps[gap_lengths < max_gap_size]

    # Centre a window of the requested size on every remaining gap.
    windows = expand_bed_regions(small_gaps, window_size, alignment="center")

    # Lower-cased nucleotide sequences of the windows.
    lowered = genome.bed_to_sequence(windows).sequence.str.lower()

    # One row per window: True wherever the nucleotide is an "n".
    mask = np.array([list(nucleotides) for nucleotides in lowered]) == "n"

    return len(mask), mask.mean(axis=0), np.cov(mask.T)
def __init__(
    self,
    genome: Genome,
    bed: pd.DataFrame,
    batch_size: int,
    nucleotides: str = "actg",
    unknown_nucleotide_value: float = 0.25,
    random_state: int = 42,
    elapsed_epochs: int = 0,
    shuffle: bool = True
):
    """Create a new BedSequence object.

    Parameters
    --------------------
    genome: Genome,
        Genomic assembly from which the sequences are extracted.
    bed: pd.DataFrame,
        DataFrame with at least the bed columns "chromStart" and
        "chromEnd" used here, plus whatever ``genome.bed_to_sequence``
        requires (e.g. "chrom").
    batch_size: int,
        Batch size, forwarded to the parent constructor.
    nucleotides: str = "actg",
        Nucleotides to consider when encoding the sequences.
    unknown_nucleotide_value: float = 0.25,
        Value stored for the encoding of unknown nucleotides.
    random_state: int = 42,
        Random state, forwarded to the parent constructor.
    elapsed_epochs: int = 0,
        Number of elapsed epochs, forwarded to the parent constructor.
    shuffle: bool = True,
        Whether to shuffle the sequence, forwarded to the parent
        constructor.

    Raises
    --------------------
    ValueError:
        If the regions of the bed file do not all have the same length.
    """
    # All the windows of the bed file must share one single length.
    region_lengths = (bed.chromEnd - bed.chromStart).values
    if len(set(region_lengths)) != 1:
        raise ValueError("The bed file regions must have the same length!")

    self._window_length = region_lengths[0]
    self._nucleotides = nucleotides
    self._nucleotides_number = len(nucleotides)
    self._unknown_nucleotide_value = unknown_nucleotide_value

    # Extract the sequences of the bed regions from the given genome and
    # hand their numeric encoding to the parent class.
    # NOTE(review): self.nucleotides is defined outside this block,
    # presumably a property exposing self._nucleotides.
    sequences = np.array(genome.bed_to_sequence(bed), dtype=str)
    encoded_sequences = nucleotides_to_numbers(self.nucleotides, sequences)
    super().__init__(
        encoded_sequences,
        batch_size,
        random_state=random_state,
        elapsed_epochs=elapsed_epochs,
        shuffle=shuffle
    )
def test_gaps():
    """Check gaps and filled regions of hg19 chr1 against their sequences.

    Gap regions must be non-empty and made only of "n" nucleotides, while
    filled regions must be non-empty and contain no "n" at all, both with
    and without a "strand" column in the bed.
    """
    genome = Genome("hg19", chromosomes=["chr1"])
    assert "chr1" in genome
    assert "chr2" not in genome

    # No gap may have zero length.
    gap_regions = genome.gaps(["chr1"])
    gap_lengths = gap_regions.chromEnd - gap_regions.chromStart
    assert (gap_lengths != 0).all()

    # Every window tessellated out of a gap must be made only of Ns.
    gap_windows = tessellate_bed(gap_regions, 200, verbose=False)
    for sequence in genome.bed_to_sequence(gap_windows):
        assert set(sequence.lower()) == {"n"}

    # No filled region may have zero length.
    filled_regions = genome.filled(["chr1"])
    filled_lengths = filled_regions.chromEnd - filled_regions.chromStart
    assert (filled_lengths != 0).all()

    # Windows tessellated out of filled regions must not contain any N.
    filled_windows = tessellate_bed(filled_regions, 200, verbose=False)
    for sequence in genome.bed_to_sequence(filled_windows):
        assert "n" not in sequence.lower()

    # Same check once an unspecified strand column is added to the bed.
    filled_windows["strand"] = "."
    for sequence in genome.bed_to_sequence(filled_windows):
        assert "n" not in sequence.lower()

    genome.delete()
def preprocess_mode_exec(c):
    """Run the preprocessing mode described by the configuration ``c``.

    The function loads the labelled enhancer/promoter regions, optionally
    samples them, writes them as a bed file, extracts and saves their hg19
    sequences and, for every requested cell line, saves the epigenetic
    features and the labels aligned with those sequences.

    Parameters
    --------------------
    c:
        Mapping-like configuration. The keys read here are
        'import_path', 'export_path', 'cell_lines', 'window_size',
        'dataset', 'sample' and, only when 'sample' is truthy,
        'sample_perc'.

    Raises
    --------------------
    FileNotFoundError:
        If the import path does not exist.
    AssertionError:
        If enhancers and promoters have a different number of columns, if
        regions and epigenetic data have different lengths, or if the
        sequences and the epigenetic data are not aligned.
    """
    logging.basicConfig(
        format='[%(asctime)s] - %(levelname)s - %(message)s',
        level=logging.DEBUG)
    logging.debug("PREPROCESSING MODE")

    root_path = c['import_path']
    saving_path = c['export_path']
    cell_lines = c['cell_lines']
    window_size = c['window_size']
    dataset_type = c['dataset']

    if not os.path.exists(root_path):
        raise FileNotFoundError("Files path not found: {}".format(root_path))

    if not os.path.exists(saving_path):
        # The placeholder was previously never filled in.
        logging.debug(
            "{} not found, folder will be created".format(saving_path))
        os.makedirs(saving_path)

    label_epi_path = get_full_path(root_path, window_size, dataset_type)

    # Importing regions for enhancers and promoters
    enhancers_regions, promoters_regions = get_regions(root_path)

    # Importing and converting labels of enhancers and promoters and
    # joining them in a single dataframe
    full_sequences = get_categorical_labels(label_epi_path)
    logging.debug("Saving the sequences bed file in {}".format(saving_path))

    # Positional indices of the sampled rows; only set and used when
    # sampling is enabled, so the same rows can be taken from the
    # epigenetic data further below.
    rows = None
    if c['sample']:
        sample_size = int(len(full_sequences) * c['sample_perc'])
        # NOTE: np.random.randint draws with replacement, hence the same
        # row may be sampled more than once.
        rows = np.random.randint(len(full_sequences), size=sample_size)
        full_sequences = full_sequences.iloc[rows]

    full_sequences.to_csv(
        "{}/sequences.bed".format(saving_path),
        sep="\t",
        columns=['chrom', 'chromStart', 'chromEnd'],
        header=False,
        index=False)

    logging.debug("Downloading the hg19 genome")
    # Sorted distinct chromosome names. Iterating a groupby built on a
    # one-element list yields 1-tuples as keys on recent pandas versions,
    # so the names are read directly from the column instead.
    chroms = sorted(full_sequences['chrom'].dropna().unique())
    hg19 = Genome(assembly="hg19", chromosomes=chroms)

    logging.debug("Extracting the sequences from the hg19 genome")
    sequences = hg19.bed_to_sequence(full_sequences)

    logging.debug("Saving sequences to file...")
    seqIO_seq = [
        creating_seqIO(
            "{}:{}-{}".format(
                row['chrom'], row['chromStart'], row['chromEnd']),
            Seq(row['sequence'].upper()))
        for _, row in sequences.iterrows()
    ]
    save_sequences(saving_path, seqIO_seq)

    # Importing epigenetic data
    logging.debug("Importing epigenetic data for: {}".format(
        ", ".join(cell_lines)))
    logging.debug(
        "-------------------------------------------------------------")

    for cell_line in cell_lines:
        logging.debug("Importing {} data".format(cell_line))
        df_epi_enanchers, df_epi_promoters = get_epigenetic_data(
            label_epi_path, cell_line)

        # Building the column -> type dictionary. The loop variable is
        # deliberately not called ``c`` to avoid shadowing the
        # configuration argument.
        converting_dictionary = {
            column: get_type(column)
            for column in df_epi_promoters.columns
        }
        df_epi_enanchers = df_epi_enanchers.astype(converting_dictionary)
        df_epi_promoters = df_epi_promoters.astype(converting_dictionary)

        assert len(df_epi_promoters.columns) == len(df_epi_enanchers.columns)
        logging.debug("number features for {}: {}".format(
            cell_line, len(df_epi_promoters.columns) - 4))

        logging.debug("Number of missing values in enhancers: {}".format(
            df_epi_enanchers.isna().sum().sum()))
        logging.debug("Number of missing values in promoters: {}".format(
            df_epi_promoters.isna().sum().sum()))
        df_epi_enanchers = fill_missing(df_epi_enanchers, metric="median")
        df_epi_promoters = fill_missing(df_epi_promoters, metric="median")

        assert len(enhancers_regions) == len(df_epi_enanchers)
        logging.debug("Enhancers - regions: {}, epigenetics: {}".format(
            len(enhancers_regions), len(df_epi_enanchers)))
        assert len(promoters_regions) == len(df_epi_promoters)
        logging.debug("Promoters - regions: {}, epigenetics: {}".format(
            len(promoters_regions), len(df_epi_promoters)))

        full_epi = append_without_duplicates(
            df_epi_enanchers, df_epi_promoters)
        if c['sample']:
            # Take the very same rows that were sampled for the sequences.
            full_epi = full_epi.iloc[rows]

        # Check that sequences and epigenetic data are aligned before
        # saving anything.
        assert len(full_sequences) == len(full_epi)
        assert_frame_equal(
            full_sequences[['chrom', 'chromStart', 'chromEnd']],
            full_epi[['chrom', 'chromStart', 'chromEnd']])

        logging.debug("Number of total sequences: {}".format(
            len(full_sequences)))

        logging.debug("Saving results in {}".format(saving_path))
        # The first four columns are skipped, consistently with the
        # feature count logged above.
        np.savetxt(
            "{}/{}_epigenetic.txt".format(saving_path, cell_line),
            full_epi.iloc[:, 4:].values,
            fmt='%f')
        np.savetxt(
            "{}/{}_labels.txt".format(saving_path, cell_line),
            full_sequences[cell_line].values,
            fmt='%s')
        logging.debug(
            "-------------------------------------------------------------")
class GenomeWindowsGenerator:
    """Generator of batches of fixed-size windows taken from a genome.

    The filled regions of the selected chromosomes are split in windows of
    ``window_size``, their sequences are extracted and divided in a
    training and a validation set according to the chromosome they belong
    to. Batches are then produced by mapping ``one_hot_encoder`` over
    buffers of windows through a pool of worker processes.

    NOTE(review): the methods decorated with ``cache_method`` presumably
    store their result at the formatted path; confirm against the
    decorator implementation.
    """

    # Supported values for the ``n_type`` constructor argument.
    n_types = ["uniform", "normal"]

    def __init__(self,
                 assembly,
                 window_size,
                 batch_size,
                 buffer_size=None,
                 max_gap_size=100,
                 train_chromosomes=None,
                 val_chromosomes=None,
                 cache_dir=None,
                 lazy_load=True,
                 clear_cache=False,
                 compile_on_start=True,
                 n_type="uniform"):
        """Create a new generator.

        Parameters
        --------------------
        assembly:
            Genomic assembly, forwarded to ``Genome``.
        window_size:
            Size of the windows to generate.
        batch_size:
            Number of windows in every batch.
        buffer_size = None:
            Number of batches prepared at once. When falsy, the cpu count
            is used.
        max_gap_size = 100:
            Largest gap size (inclusive) considered when rendering the
            gap mask.
        train_chromosomes = None:
            Chromosomes used for training. When falsy, all the
            chromosomes of the genome are used.
        val_chromosomes = None:
            Chromosomes used for validation. When falsy, no validation
            chromosome is used.
        cache_dir = None:
            Cache directory. When falsy, the ``CACHE_PATH`` environment
            variable is used and, as a last resort, "/tmp".
        lazy_load = True:
            Forwarded to ``Genome``.
        clear_cache = False:
            Whether to remove the cache directory of this generator
            before starting.
        compile_on_start = True:
            Whether to call ``compile`` at the end of the constructor.
        n_type = "uniform":
            One of ``n_types``. It is stored and it is part of the
            instance hash.

        Raises
        --------------------
        ValueError:
            If ``n_type`` is not one of ``n_types``.
        """
        self.assembly, self.window_size = assembly, window_size
        self.max_gap_size, self.batch_size = max_gap_size, batch_size
        self.val_chromosomes = val_chromosomes

        # Buffersize default None == cpu count for optimal performance:
        if not buffer_size:
            buffer_size = cpu_count()
        self.buffer_size = buffer_size

        # Validate the type of N: the message lists the allowed values
        # (it previously echoed back the rejected value only).
        if n_type not in self.n_types:
            raise ValueError(
                "n_type must be one of {}, got {!r}".format(
                    self.n_types, n_type))
        self.n_type = n_type

        # Get the cache dir
        cache_dir = cache_dir or os.environ.get("CACHE_PATH", None) or "/tmp"
        self._cache_directory = "/".join(
            [cache_dir, assembly, str(window_size)])

        if clear_cache:
            self.clean_cache()

        # Generate a pool of processes to save the overhead
        self.workers = max(2, cpu_count())
        self.pool = Pool(self.workers)

        # Preprocess all the possible data
        self.genome = Genome(
            assembly=assembly,
            lazy_load=lazy_load,
            cache_directory=cache_dir,
        )

        if not val_chromosomes:
            self.val_chromosomes = []

        # If no chromosomes passed then use all the genome
        if not train_chromosomes:
            self.chromosomes = sorted(self.genome)
        else:
            self.chromosomes = train_chromosomes + self.val_chromosomes

        # Identifier of this configuration, used in the cache paths.
        self.instance_hash = sha256({
            "assembly": self.assembly,
            "chromosomes": self.chromosomes,
            "window_size": self.window_size,
            "max_gap_size": self.max_gap_size,
            "n_type": n_type,
        })

        if compile_on_start:
            self.compile()

    def compile(self):
        """Prepare train/validation windows and the gaps statistics."""
        filled = self._filled()
        windows = self._tasselize_windows(filled, self.window_size)
        sequences = self._encode_sequences(windows)
        self._windows_train, self._windows_val = self._train_val_split(
            sequences)
        gap_mask = self._render_gaps()
        self._mean, self._cov = _model_gaps(gap_mask)

    def _train_val_split(self, sequences):
        """Split the window sequences by chromosome in train and validation.

        ``sequences`` maps every chromosome to a DataFrame with a
        ``sequence`` column. Returns the tuple of lists
        ``(windows_train, windows_val)``.
        """
        # TODO do we need a seed here?
        validation = set(self.val_chromosomes)

        # The per-chromosome lists are chained instead of being summed,
        # as summing lists copies the accumulated result at every step.
        windows_train = list(itertools.chain.from_iterable(
            sequences[chrom].sequence.tolist()
            for chrom in tqdm(
                self.chromosomes, desc="Groupping Train windows", leave=False)
            if chrom not in validation
        ))
        windows_val = list(itertools.chain.from_iterable(
            sequences[chrom].sequence.tolist()
            for chrom in tqdm(
                self.chromosomes, desc="Groupping val windows", leave=False)
            if chrom in validation
        ))
        return windows_train, windows_val

    def steps_per_epoch(self):
        """Return the number of full training batches."""
        return len(self._windows_train) // self.batch_size

    def validation_steps(self):
        """Return the number of full validation batches."""
        return len(self._windows_val) // self.batch_size

    @cache_method("{_cache_directory}/{instance_hash}_filled.pkl")
    def _filled(self):
        """Return the filled regions of the selected chromosomes."""
        return self.genome.filled(chromosomes=self.chromosomes)

    @cache_method("{_cache_directory}/{instance_hash}_gap_mask.pkl")
    def _render_gaps(self):
        """Return the boolean "is n" mask of the windows around small gaps."""
        gaps = self.genome.gaps(chromosomes=self.chromosomes)
        # Keeping only small gaps. The explicit copy makes the following
        # column assignments act on an independent DataFrame instead of
        # on a slice of the original one.
        gaps = gaps[
            gaps.chromEnd - gaps.chromStart <= self.max_gap_size
        ].copy()
        # Expand windows: centre a window of window_size on every gap.
        mid_point = ((gaps.chromEnd + gaps.chromStart) / 2).astype(int)
        half_window = self.window_size / 2
        gaps["chromStart"] = (mid_point - half_window).astype(int)
        gaps["chromEnd"] = (mid_point + half_window).astype(int)
        # Rendering gap sequences
        gapped_sequences = self.genome.bed_to_sequence(gaps)
        # Rendering gap mask
        return np.array([
            np.array(list(sequence.lower())) == "n"
            for sequence in gapped_sequences.sequence
        ])

    @cache_method("{_cache_directory}/{instance_hash}_tasselized.pkl")
    def _tasselize_windows(self, bed: pd.DataFrame, window_size: int):
        """Split every region of ``bed`` in windows using the process pool.

        Every row is turned into a ``(chrom, chromStart, chromEnd,
        window_size)`` task for ``tasselize_window`` and the results are
        concatenated in a single DataFrame.
        """
        tasks = (
            (row.chrom, row.chromStart, row.chromEnd, window_size)
            for _, row in bed.iterrows()
        )
        return pd.concat(
            list(
                tqdm(self.pool.imap(tasselize_window, tasks),
                     total=bed.shape[0],
                     desc="Tasselizing windows",
                     leave=False)))

    @cache_method("{_cache_directory}/{instance_hash}_sequences.pkl")
    def _encode_sequences(self, windows):
        """Return the sequences of ``windows`` grouped by chromosome."""
        bed = self.genome.bed_to_sequence(windows)
        return {chrom: data for chrom, data in bed.groupby("chrom")}

    def batchsize_scheduler(self):
        """Yield forever the batch size to use for the next buffer."""
        while True:
            yield self.batch_size

    def _buffer_generator(self, dataset):
        """Yield buffers made of ``buffer_size`` batches of raw windows."""
        iterable = _dataset_generator(dataset)
        for batch_size in self.batchsize_scheduler():
            yield [
                list(itertools.islice(iterable, batch_size))
                for _ in range(self.buffer_size)
            ]

    def _buffer_encoder_generator(self, dataset):
        """Yield buffers whose batches went through ``one_hot_encoder``."""
        for buffer in self._buffer_generator(dataset):
            yield list(self.pool.imap(one_hot_encoder, buffer))

    def _generator(self, dataset):
        """Yield the encoded batches of ``dataset`` one at a time."""
        for buffer in self._buffer_encoder_generator(dataset):
            for batch in buffer:
                yield batch

    def generator(self):
        """Return the generator of the training batches."""
        return self._generator(self._windows_train)

    def validation_data(self):
        """Return the generator of the validation batches.

        Raises
        --------------------
        ValueError:
            If no validation chromosome was specified.
        """
        if not self.val_chromosomes:
            raise ValueError("Can't return the val generator since "
                             "no val chromosomes were specified")
        return self._generator(self._windows_val)

    def clean_cache(self):
        """Remove the cache directory of this generator, if it exists."""
        if os.path.exists(self._cache_directory):
            shutil.rmtree(self._cache_directory)

    def close(self):
        """Close and join the process pool, if it was created."""
        # The pool may be missing when the constructor failed early.
        if "pool" in vars(self):
            self.pool.close()
            self.pool.join()