def test_tessellate():
    hg19 = Genome("hg19", chromosomes=["chrM"])
    filled = hg19.filled(chromosomes=["chrM"])
    tessellate_bed(filled, window_size=200, alignment="left")
    tessellate_bed(filled, window_size=200, alignment="right")
    tessellate_bed(filled, window_size=200, alignment="center")
    hg19.delete()
Example #2
def get_gaps_statistics(
        genome: Genome, max_gap_size: int,
        window_size: int) -> Tuple[int, np.ndarray, np.ndarray]:
    """Return number, mean and covariance of gaps.

    Parameters
    --------------------------
    genome: Genome,
        The genome to use.
    max_gap_size: int,
        The maximum gap size to take into consideration.
    window_size: int,
        The target window size.

    Returns
    --------------------------
    Tuple containing the number of gaps, their positional mean and covariance.
    """
    # Obtaining gaps
    gaps = genome.gaps()
    # Getting gaps whose size is below given threshold
    gaps = gaps[gaps.chromEnd - gaps.chromStart < max_gap_size]
    # Expanding gaps to given window size
    gaps = expand_bed_regions(gaps, window_size, alignment="center")
    # Retrieving the sequences corresponding to given gaps
    sequences = genome.bed_to_sequence(gaps).sequence.str.lower()
    # Obtaining a mask of gaps
    gaps_mask = np.array([list(sequence) for sequence in sequences]) == "n"
    number = len(gaps_mask)
    mean = gaps_mask.mean(axis=0)
    covariance = np.cov(gaps_mask.T)
    return number, mean, covariance
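A minimal usage sketch for get_gaps_statistics, assuming the same Genome API used throughout these examples; the chromosome choice and sizes are illustrative:

def example_gaps_statistics():
    # Restrict the download to a single chromosome to keep the example small.
    hg19 = Genome("hg19", chromosomes=["chr21"])
    number, mean, covariance = get_gaps_statistics(
        hg19,
        max_gap_size=100,
        window_size=200
    )
    # mean holds the per-position gap frequency across the window,
    # covariance the window_size x window_size positional covariance.
    assert mean.shape == (200,)
    assert covariance.shape == (200, 200)
    hg19.delete()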
Example #3
def test_expand_bed_regions():
    hg19 = Genome("hg19", chromosomes=["chr2", "chr3"])
    gaps = hg19.gaps(chromosomes=["chr2", "chr3"])
    gaps = gaps[gaps.chromEnd - gaps.chromStart < 500]
    result = expand_bed_regions(gaps, 200, "left")
    assert (result.chromEnd - result.chromStart == 200).all()
    result = expand_bed_regions(gaps, 201, "right")
    assert (result.chromEnd - result.chromStart == 201).all()
    result = expand_bed_regions(gaps, 200, "center")
    assert (result.chromEnd - result.chromStart == 200).all()
    result = expand_bed_regions(gaps, 201, "center")
    assert (result.chromEnd - result.chromStart == 201).all()
    result = expand_bed_regions(gaps, 173, "center")
    assert (result.chromEnd - result.chromStart == 173).all()
def test_multivariate_gap_center_sequence():
    hg19 = Genome("hg19", chromosomes=["chr1", "chr2", "chr3"])

    _, mean, covariance = get_gaps_statistics(
        hg19,
        100,
        200
    )

    gap_sequence = MultivariateGapCenterSequence(
        assembly=hg19,
        bed=get_test_bed(),
        gaps_mean=mean,
        gaps_covariance=covariance,
        batch_size=32
    )

    x1, y1 = gap_sequence[0]
    x2, y2 = gap_sequence[0]

    assert (x1 == 0.25).any()
    assert set((0.25, 0.0, 1.0)) == set(np.unique(x1))
    assert (x1 == x2).all()
    assert (y1 == y2).all()

    assert x1.shape == (gap_sequence.batch_size, 200, 4)

    cnn_model().fit_generator(
        gap_sequence,
        steps_per_epoch=gap_sequence.steps_per_epoch,
        epochs=2,
        verbose=0,
        shuffle=True
    )
def test_simulated_download_failure():
    # Ensure the genome is fully downloaded before corrupting it.
    for _ in Genome("sacCer3", chromosomes=sacCer3_chromosomes).items():
        pass
    sacCer3 = Genome("sacCer3", chromosomes=sacCer3_chromosomes)
    path = sacCer3._chromosome_path("chrI")
    # Corrupt the cached chromosome file to simulate a download failure.
    with open(path, "w") as f:
        f.write("Totally not JSON")
    with pytest.raises(Exception):
        sacCer3 = Genome("sacCer3", chromosomes=sacCer3_chromosomes)
    sacCer3.delete()
def test_gaps():
    hg19 = Genome("hg19", chromosomes=["chr1"])
    assert "chr1" in hg19
    assert "chr2" not in hg19
    filled = hg19.filled(chromosomes=["chr1"])
    hg19.bed_to_sequence(filled)
    hg19.delete()
Example #7
def get_sequence(epigenomes):
    window_size = 200
    genome = Genome('hg19')
    sequences = {
        region: to_dataframe(flat_one_hot_encode(genome, data, window_size),
                             window_size)
        for region, data in epigenomes.items()
    }
    return sequences
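A hedged call sketch: to_dataframe and flat_one_hot_encode are project helpers not defined in this listing, and the structure of epigenomes (a dict mapping region names such as "promoters" to bed-like DataFrames) is an assumption.

sequences = get_sequence(epigenomes)
# One flattened one-hot DataFrame per region type.
promoters_one_hot = sequences["promoters"]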
Example #8
    def __init__(self,
                 genome: Genome,
                 bed: pd.DataFrame,
                 batch_size: int,
                 nucleotides: str = "actg",
                 unknown_nucleotide_value: float = 0.25,
                 random_state: int = 42,
                 elapsed_epochs: int = 0,
                 shuffle: bool = True):
        """Return new BedSequence object.

        Parameters
        --------------------
        genome: Genome,
            Genomic assembly from ucsc from which to extract sequences.
        bed: pd.DataFrame,
            Pandas DataFrame containing minimal bed columns,
            like "chrom", "chromStart" and "chromEnd".
        batch_size: int,
            Batch size to be returned for each request.
        nucleotides: str = "actg",
            Nucleotides to consider when one-hot encoding.
        unknown_nucleotide_value: float = 0.25,
            The default value to use for encoding unknown nucleotides.
        random_state: int = 42,
            Starting random_state to use if shuffling the dataset.
        elapsed_epochs: int = 0,
            Number of elapsed epochs to init state of generator.
        shuffle: bool = True,
            Whether or not to shuffle the sequence.

        Raises
        --------------------
        ValueError:
            If the bed file regions do not all have the same length.
        """
        # Every window in the bed file must be
        # of the same length.
        if len(set((bed.chromEnd - bed.chromStart).values)) != 1:
            raise ValueError("The bed file regions must have the same length!")

        self._window_length = (bed.chromEnd - bed.chromStart).values[0]
        self._nucleotides = nucleotides
        self._nucleotides_number = len(nucleotides)
        self._unknown_nucleotide_value = unknown_nucleotide_value

        # We extract the sequences of the bed file from
        # the given genome.
        sequences = np.array(genome.bed_to_sequence(bed), dtype=str)

        super().__init__(nucleotides_to_numbers(self.nucleotides, sequences),
                         batch_size,
                         random_state=random_state,
                         elapsed_epochs=elapsed_epochs,
                         shuffle=shuffle)
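A hedged usage sketch for this constructor, reusing the Genome and tessellate_bed calls shown in the other examples; the chromosome and batch size are illustrative:

hg19 = Genome("hg19", chromosomes=["chr17"])
filled = hg19.filled(chromosomes=["chr17"])
# Every region must share one length, as enforced by the ValueError above.
bed = tessellate_bed(filled, window_size=200, verbose=False)
sequence = BedSequence(hg19, bed, batch_size=32)
batch = sequence[0]  # a one-hot encoded batch rendered by the parent class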
Example #9
def test_wiggle():
    hg19 = Genome("hg19", chromosomes=["chr17"])
    filled = hg19.filled(chromosomes=["chr17"])
    wiggles = wiggle_bed_regions(
        filled,
        max_wiggle_size=100,
        wiggles=10,
        seed=42
    )
    path = "{pwd}/expected_wiggles.csv".format(
        pwd=os.path.dirname(os.path.abspath(__file__))
    )
    if not os.path.exists(path):
        wiggles.to_csv(path, index=False)
    pd.testing.assert_frame_equal(
        wiggles,
        pd.read_csv(path),
        check_dtype=False
    )
    hg19.delete()
Example #10
def test_gaps():
    hg19 = Genome("hg19", chromosomes=["chr1"])
    assert "chr1" in hg19
    assert "chr2" not in hg19
    # Check that no gap has zero length
    gaps = hg19.gaps(["chr1"])
    assert (gaps.chromEnd - gaps.chromStart != 0).all()
    # Converting gaps to sequences: they should all be Ns
    gaps_tesselate = tessellate_bed(gaps, 200, verbose=False)
    gaps_sequences = hg19.bed_to_sequence(gaps_tesselate)
    for gap in gaps_sequences:
        assert set(gap.lower()) == set(["n"])
    filled = hg19.filled(["chr1"])
    assert (filled.chromEnd - filled.chromStart != 0).all()
    filled_tesselate = tessellate_bed(filled, 200, verbose=False)
    filled_sequences = hg19.bed_to_sequence(filled_tesselate)
    for fl in filled_sequences:
        assert "n" not in fl.lower()
    filled_tesselate["strand"] = "."
    filled_sequences = hg19.bed_to_sequence(filled_tesselate)
    for fl in filled_sequences:
        assert "n" not in fl.lower()
    hg19.delete()
Example #11
def get_data(
    parameters: Tuple[Tuple[str, int, str], str]
) -> Union[Tuple[pd.DataFrame, np.ndarray], Tuple[np.ndarray, np.ndarray]]:
    load_parameters, data_type = parameters
    if data_type == 'epigenomic':
        dataset, labels = load_dataset(load_parameters)
        dataset.reset_index(drop=True, inplace=True)
        return dataset, labels
    if data_type == 'sequences':
        epigenomes, labels = load_dataset(load_parameters)
        genome = Genome('hg19')
        bed = epigenomes.reset_index()[epigenomes.index.names]
        batch_size = len(labels)
        # Build a single batch containing every sample and return it.
        sequence = MixedSequence(
            x=BedSequence(genome,
                          bed.iloc[np.arange(batch_size)],
                          batch_size=batch_size),
            y=labels[np.arange(batch_size)],
            batch_size=batch_size
        )
        return sequence[0]
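A hedged invocation sketch: the contents of the load_parameters tuple are assumptions based only on the type hint, and load_dataset is a project helper not shown here.

# Hypothetical parameters: (cell line, window size, region type), plus the data type.
dataset, labels = get_data((("HEK293", 200, "enhancers"), "epigenomic"))
x, y = get_data((("HEK293", 200, "enhancers"), "sequences"))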
Example #12
def test_genomic_sequence_determinism():
    batch_size = 32
    epochs = 5
    enhancers = pd.read_csv("tests/enhancers.csv")
    promoters = pd.read_csv("tests/promoters.csv")

    genome = Genome("hg19", chromosomes=["chr1"])
    for region in tqdm((enhancers, promoters), desc="Region types"):
        y = np.arange(0, len(region), dtype=np.int64)
        mixed_sequence = MixedSequence(x=BedSequence(genome, region,
                                                     batch_size),
                                       y=VectorSequence(y, batch_size))
        reference_mixed_sequence = MixedSequence(
            x=BedSequence(genome,
                          region,
                          batch_size=len(region),
                          shuffle=False),
            y=VectorSequence(y, batch_size=len(region), shuffle=False))
        X, _ = reference_mixed_sequence[0]
        for _ in trange(epochs, desc="Epochs", leave=False):
            for step in range(mixed_sequence.steps_per_epoch):
                xi, yi = mixed_sequence[step]
                assert (X[yi.astype(int)] == xi).all()
            mixed_sequence.on_epoch_end()
Example #13
def get_genome() -> Genome:
    """Download genome or retrieve it if given path"""
    genome = _cache.get('genome') or Genome(
        'hg19', cache_directory=get_default('assembly_path'))
    _cache['genome'] = genome
    return genome
def test_empty_genome():
    with pytest.raises(ValueError):
        Genome("hg19", filters=("", ))
def test_create_new_genome_object():
    sacCer3 = Genome(
        "sacCer3",
        chromosomes=sacCer3_chromosomes,
    )
    for path in glob("{path}/*.json".format(path=sacCer3.path)):
        os.remove(path)
    with pytest.warns(RuntimeWarning):
        sacCer3 = Genome("sacCer3", chromosomes=sacCer3_chromosomes)
    sacCer3 = Genome("sacCer3", chromosomes=sacCer3_chromosomes)
    sacCer3.gaps()
    sacCer3.filled()
    str(sacCer3)
    sacCer3.delete()
def get_genome(assembly):
    return Genome(assembly)
def test_unavailable_genome():
    with pytest.raises(ValueError):
        Genome("hg1")
Example #18
def visualize(cell_line, epigenomes, labels):
    genome = Genome("hg19")
    sequences = {
        region: to_dataframe(flat_one_hot_encode(genome, data, 200), 200)
        for region, data in epigenomes.items()
    }
    tasks = {
        "x": [
            *[val.values for val in epigenomes.values()],
            *[val.values for val in sequences.values()]
        ],
        "y": [
            *[val.values.ravel() for val in labels.values()],
            *[val.values.ravel() for val in labels.values()]
        ],
        "titles": [
            "Epigenomes promoters", "Epigenomes enhancers",
            "Sequences promoters", "Sequences enhancers"
        ]
    }

    xs = tasks["x"]
    ys = tasks["y"]
    titles = tasks["titles"]

    assert len(xs) == len(ys) == len(titles)

    for x, y in zip(xs, ys):
        assert x.shape[0] == y.shape[0]
    print("test")
    colors = np.array([
        "tab:blue",
        "tab:orange",
    ])

    fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(32, 8))

    for x, y, title, axis in tqdm(zip(xs, ys, titles, axes.flatten()),
                                  desc="Computing PCAs",
                                  total=len(xs)):
        axis.scatter(*pca(x).T, s=1, color=colors[y])
        axis.xaxis.set_visible(False)
        axis.yaxis.set_visible(False)
        axis.set_title(f"PCA decomposition - {title}")
    plt.savefig("./imgs/" + cell_line + "/PCA decomposition")
    plt.show()

    for perpexity in tqdm((50, 500), desc="Running perplexities"):
        fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(40, 10))
        for x, y, title, axis in tqdm(zip(xs, ys, titles, axes.flatten()),
                                      desc="Computing TSNEs",
                                      total=len(xs)):
            axis.scatter(*ulyanov_tsne(x, perplexity=perpexity).T,
                         s=1,
                         color=colors[y])
            axis.xaxis.set_visible(False)
            axis.yaxis.set_visible(False)
            axis.set_title(f"TSNE decomposition - {title}")
        fig.tight_layout()
        fig.savefig("./imgs/" + cell_line + f"/TSNE_" + str(perpexity))
        plt.show()
Example #19
class GenomeWindowsGenerator:

    n_types = ["uniform", "normal"]

    def __init__(self,
                 assembly,
                 window_size,
                 batch_size,
                 buffer_size=None,
                 max_gap_size=100,
                 train_chromosomes=None,
                 val_chromosomes=None,
                 cache_dir=None,
                 lazy_load=True,
                 clear_cache=False,
                 compile_on_start=True,
                 n_type="uniform"):
        self.assembly = assembly
        self.window_size = window_size
        self.max_gap_size = max_gap_size
        self.batch_size = batch_size
        self.val_chromosomes = val_chromosomes

        # Buffer size defaults to the CPU count for optimal performance.
        if not buffer_size:
            buffer_size = cpu_count()
        self.buffer_size = buffer_size

        # Validate the type of N
        if n_type not in self.n_types:
            raise ValueError("n_type must be one of %s" % self.n_types)
        self.n_type = n_type

        # Get the cache dir
        cache_dir = cache_dir or os.environ.get("CACHE_PATH", None) or "/tmp"

        self._cache_directory = "/".join(
            [cache_dir, assembly, str(window_size)])

        if clear_cache:
            self.clean_cache()

        # Generate a pool of processes to save the overhead
        self.workers = max(2, cpu_count())
        self.pool = Pool(self.workers)

        # Preprocess all the possible data
        self.genome = Genome(
            assembly=assembly,
            lazy_load=lazy_load,
            cache_directory=cache_dir,
        )

        if not val_chromosomes:
            self.val_chromosomes = []

        # If no chromosomes were passed, use the whole genome
        if not train_chromosomes:
            self.chromosomes = sorted(list(self.genome))
        else:
            self.chromosomes = train_chromosomes + self.val_chromosomes

        self.instance_hash = sha256({
            "assembly": self.assembly,
            "chromosomes": self.chromosomes,
            "window_size": self.window_size,
            "max_gap_size": self.max_gap_size,
            "n_type": n_type,
        })

        if compile_on_start:
            self.compile()

    def compile(self):
        filled = self._filled()
        windows = self._tasselize_windows(filled, self.window_size)
        sequences = self._encode_sequences(windows)

        self._windows_train, self._windows_val = self._train_val_split(
            sequences)

        gap_mask = self._render_gaps()
        self._mean, self._cov = _model_gaps(gap_mask)

    def _train_val_split(self, sequences):
        # Concatenate the windows of each chromosome, sending the
        # validation chromosomes to the validation split.
        windows_train = sum([
            sequences[chrom].sequence.tolist() for chrom in tqdm(
                self.chromosomes, desc="Grouping train windows", leave=False)
            if chrom not in self.val_chromosomes
        ], [])
        windows_val = sum(
            (sequences[chrom].sequence.tolist() for chrom in tqdm(
                self.chromosomes, desc="Grouping val windows", leave=False)
             if chrom in self.val_chromosomes), [])
        return windows_train, windows_val

    def steps_per_epoch(self):
        return len(self._windows_train) // self.batch_size

    def validation_steps(self):
        return len(self._windows_val) // self.batch_size

    @cache_method("{_cache_directory}/{instance_hash}_filled.pkl")
    def _filled(self):
        return self.genome.filled(chromosomes=self.chromosomes)

    @cache_method("{_cache_directory}/{instance_hash}_gap_mask.pkl")
    def _render_gaps(self):
        # Retrieve the gaps of the selected chromosomes
        gaps = self.genome.gaps(chromosomes=self.chromosomes)
        # Keeping only small gaps
        gaps = gaps[gaps.chromEnd - gaps.chromStart <= self.max_gap_size]
        # Expand windows
        mid_point = ((gaps.chromEnd + gaps.chromStart) / 2).astype(int)
        gaps.chromStart = (mid_point - self.window_size / 2).astype(int)
        gaps.chromEnd = (mid_point + self.window_size / 2).astype(int)
        # Rendering gap sequences
        gapped_sequences = self.genome.bed_to_sequence(gaps)
        # Rendering gap mask
        return np.array([
            np.array(list(sequence.lower())) == "n"
            for sequence in gapped_sequences.sequence
        ])

    @cache_method("{_cache_directory}/{instance_hash}_tasselized.pkl")
    def _tasselize_windows(self, bed: pd.DataFrame, window_size: int):
        # Tessellate each region into fixed-size windows in parallel
        tasks = ((row.chrom, row.chromStart, row.chromEnd, window_size)
                 for _, row in bed.iterrows())
        return pd.concat(
            list(
                tqdm(self.pool.imap(tasselize_window, tasks),
                     total=bed.shape[0],
                     desc="Tasselizing windows",
                     leave=False)))

    @cache_method("{_cache_directory}/{instance_hash}_sequences.pkl")
    def _encode_sequences(self, windows):
        bed = self.genome.bed_to_sequence(windows)
        return {chrom: data for chrom, data in bed.groupby("chrom")}

    def batchsize_scheduler(self):
        while True:
            yield self.batch_size

    def _buffer_generator(self, dataset):
        iterable = _dataset_generator(dataset)
        for batch_size in self.batchsize_scheduler():
            yield [
                list(itertools.islice(iterable, batch_size))
                for _ in range(self.buffer_size)
            ]

    def _buffer_encoder_generator(self, dataset):
        for buffer in self._buffer_generator(dataset):
            yield list(self.pool.imap(one_hot_encoder, buffer))

    def _generator(self, dataset):
        for buffer in self._buffer_encoder_generator(dataset):
            for batch in buffer:
                yield batch

    def generator(self):
        return self._generator(self._windows_train)

    def validation_data(self):
        if not self.val_chromosomes:
            raise ValueError("Can't return the val generator since "
                             "no val chromosomes were specified")
        return self._generator(self._windows_val)

    def clean_cache(self):
        if os.path.exists(self._cache_directory):
            shutil.rmtree(self._cache_directory)

    def close(self):
        if "pool" in vars(self):
            self.pool.close()
            self.pool.join()
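A minimal instantiation sketch under the defaults above; the chromosome choices are illustrative and the call will download hg19 and precompute windows on construction:

generator = GenomeWindowsGenerator(
    assembly="hg19",
    window_size=200,
    batch_size=32,
    train_chromosomes=["chr1", "chr2"],
    val_chromosomes=["chr3"],
)
training_batches = generator.generator()
validation_batches = generator.validation_data()
steps = generator.steps_per_epoch()
generator.close()  # release the worker pool when done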
Example #20
    def __init__(self,
                 assembly,
                 window_size,
                 batch_size,
                 buffer_size=None,
                 max_gap_size=100,
                 train_chromosomes=None,
                 val_chromosomes=None,
                 cache_dir=None,
                 lazy_load=True,
                 clear_cache=False,
                 compile_on_start=True,
                 n_type="uniform"):
        self.assembly = assembly
        self.window_size = window_size
        self.max_gap_size = max_gap_size
        self.batch_size = batch_size
        self.val_chromosomes = val_chromosomes

        # Buffer size defaults to the CPU count for optimal performance.
        if not buffer_size:
            buffer_size = cpu_count()
        self.buffer_size = buffer_size

        # Validate the type of N
        if n_type not in self.n_types:
            raise ValueError("n_type must be one of %s" % self.n_types)
        self.n_type = n_type

        # Get the cache dir
        cache_dir = cache_dir or os.environ.get("CACHE_PATH", None) or "/tmp"

        self._cache_directory = "/".join(
            [cache_dir, assembly, str(window_size)])

        if clear_cache:
            self.clean_cache()

        # Generate a pool of processes to save the overhead
        self.workers = max(2, cpu_count())
        self.pool = Pool(self.workers)

        # Preprocess all the possible data
        self.genome = Genome(
            assembly=assembly,
            lazy_load=lazy_load,
            cache_directory=cache_dir,
        )

        if not val_chromosomes:
            self.val_chromosomes = []

        # If no chromosomes were passed, use the whole genome
        if not train_chromosomes:
            self.chromosomes = sorted(list(self.genome))
        else:
            self.chromosomes = train_chromosomes + self.val_chromosomes

        self.instance_hash = sha256({
            "assembly": self.assembly,
            "chromosomes": self.chromosomes,
            "window_size": self.window_size,
            "max_gap_size": self.max_gap_size,
            "n_type": n_type,
        })

        if compile_on_start:
            self.compile()
def get_holdouts(batch_size: int = 128,
                 max_wiggle_size: int = 150,
                 wiggles: int = 10,
                 random_state: int = 42,
                 window_size: int = 500,
                 test_size: float = 0.3,
                 verbose: bool = True,
                 nrows: int = None):
    """Return generator with training and testing holdouts.

    Parameters
    ---------------------------
    batch_size: int = 128,
        The batch size to use.
        Since the task is significantly unbalanced, consider using high
        batch sizes.
    max_wiggle_size: int = 150,
        Amount to wiggle the windows.
    wiggles: int = 10,
        Number of wiggles per sample.
    random_state: int = 42,
        Random state to use for reproducibility.
    window_size: int = 500,
        Window size to use.
    test_size: float = 0.3,
        Percentage to leave for the test set.
    verbose: bool = True,
        Whether or not to show the loading bar.
    nrows: int = None,
        Number of rows to read. Useful to test the pipeline.

    Raises
    ----------------------------
    ValueError,
        If the given window size is less than or equal to twice the
        given maximum wiggle size.

    Returns
    ----------------------------
    Generator with the training holdouts.
    """
    if window_size <= max_wiggle_size * 2:
        raise ValueError(
            ("Given window size {} is less or equal than twice the "
             "given max_wiggle_size {}. This may lead the central SNV "
             "to fall outside the region, hence causing a false positive. "
             "Please either increase the window size or reduce the "
             "maximum wiggle size.").format(window_size, max_wiggle_size))

    # Load the bed file
    bed = pd.read_csv(os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                   "mendelian_snv.csv.gz"),
                      nrows=nrows)

    # Expand (or compress) given bed file windows to required size
    bed = expand_bed_regions(bed, window_size)

    # Load the genomic assembly
    assembly = Genome("hg19", verbose=False)

    # Retrieve set of unique folds
    unique_folds = bed.folds.unique()

    # For each holdout
    for fold in tqdm(
            unique_folds,
            desc="Holdouts",
            disable=not verbose,
    ):
        # Compute the folds mask
        folds_mask = (bed.folds != fold).values
        # We get the training bed partition
        # In this partition, we get all the folds that do not go into the
        # test partition.
        train_bed = bed.iloc[folds_mask]
        # And the testing bed partition: only the single fold left out
        # of training in this iteration of the 10-fold CV.
        test_bed = bed.iloc[~folds_mask]
        # We wiggle the bed regions the desired amount to generate
        # the required amount of wiggles.
        # We wiggle only the training positives, as wiggling the training
        # negatives might create false negatives.
        positives = train_bed[(train_bed.labels == 1).values]
        # If wiggles are requested
        if wiggles > 0:
            # Computing the wiggles
            wiggled_train_bed = wiggle_bed_regions(positives, max_wiggle_size,
                                                   wiggles, random_state)
            # Concatenating the training data
            train_bed = pd.concat([wiggled_train_bed, train_bed])
        # Shuffle the training data
        # INFO: This shuffle should not be needed, but just for peace of mind.
        train_bed = train_bed.sample(frac=1, random_state=random_state + fold)
        # Shuffle the test data
        # INFO: This shuffle should not be needed, but just for peace of mind.
        test_bed = test_bed.sample(frac=1, random_state=random_state + fold)
        # And we return the computed training sequences.
        yield (create_sequence(train_bed, assembly, batch_size),
               create_sequence(test_bed, assembly, batch_size))
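A hedged consumption sketch: create_sequence is a project helper assumed to yield keras-compatible sequences, and cnn_model stands in for any model accepting (window_size, 4) one-hot inputs.

for train_sequence, test_sequence in get_holdouts(batch_size=256, nrows=1000):
    # nrows keeps the run small, as the docstring suggests.
    model = cnn_model()
    model.fit(
        train_sequence,
        validation_data=test_sequence,
        epochs=1
    )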
Example #22
def train_model_seq(models, epigenomes, nlabels, region_type, cell_line):
    # Reproducibility
    os.environ['PYTHONHASHSEED'] = '0'
    np.random.seed(42)

    splits = 11
    holdouts = StratifiedShuffleSplit(
        n_splits=splits, test_size=0.2, random_state=42)
    genome = Genome("hg19")
    bed = to_bed(epigenomes[region_type])
    labels = nlabels[region_type].values.ravel()
    if os.path.exists(cell_line + "_" + region_type + "_sequence.json"):
        results = compress_json.local_load(
            cell_line + "_" + region_type + "_sequence.json")
    else:
        results = []
    class_w = class_weight.compute_class_weight(
        'balanced', np.unique(labels), labels)
    class_w = dict(enumerate(class_w))
    print("Class weights: " + str(class_w))

    for i, (train_index, test_index) in tqdm(
            enumerate(holdouts.split(bed, labels)),
            total=splits,
            desc="Computing holdouts",
            dynamic_ncols=True):
        train, test = get_holdout(
            train_index, test_index, bed, labels, genome, 1024)
        print("="*80)
        for model in tqdm(models,
                          total=len(models),
                          desc="Training models",
                          leave=False,
                          dynamic_ncols=True):
            if precomputed(results, model.name, i):
                continue
            history = model.fit(
                train,
                steps_per_epoch=train.steps_per_epoch,
                validation_data=test,
                validation_steps=test.steps_per_epoch,
                epochs=1000,
                shuffle=True,
                verbose=False,
                class_weight=class_w,
                callbacks=[
                    EarlyStopping(monitor="val_loss", mode="min",
                                  patience=50, restore_best_weights=True),
                ]
            ).history
            scores = pd.DataFrame(history).iloc[-1].to_dict()
            results.append({
                "model": model.name,
                "run_type": "train",
                "holdout": i,
                **{
                    key: value
                    for key, value in scores.items()
                    if not key.startswith("val_")
                }
            })
            results.append({
                "model": model.name,
                "run_type": "test",
                "holdout": i,
                **{
                    key[4:]: value
                    for key, value in scores.items()
                    if key.startswith("val_")
                }
            })
            compress_json.local_dump(
                results, cell_line + "_" + region_type + "_sequence.json")
    # Build the results dataframe once, after all holdouts have run.
    df = pd.DataFrame(results).drop(columns="holdout")
    return df
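A hedged invocation sketch; the models list and the epigenomes/nlabels dictionaries follow the shapes the function reads above, with placeholder names:

results_df = train_model_seq(
    models=[cnn_model()],  # keras models exposing a .name attribute
    epigenomes={"promoters": promoters_epigenomes},
    nlabels={"promoters": promoters_labels},
    region_type="promoters",
    cell_line="HEK293",
)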
Example #23
def preprocess_mode_exec(c):
    logging.basicConfig(format='[%(asctime)s] - %(levelname)s - %(message)s',
                        level=logging.DEBUG)
    logging.debug("PREPROCESSING MODE")

    root_path = c['import_path']
    saving_path = c['export_path']
    cell_lines = c['cell_lines']
    window_size = c['window_size']
    dataset_type = c['dataset']

    if not os.path.exists(root_path):
        raise FileNotFoundError("Files path not found: {}".format(root_path))

    if not os.path.exists(saving_path):
        logging.debug("{} not found, folder will be created".format(saving_path))
        os.makedirs(saving_path)

    label_epi_path = get_full_path(root_path, window_size, dataset_type)

    # Importing regions for enhancers and promoters
    enhancers_regions, promoters_regions = get_regions(root_path)

    # Import and convert the labels of enhancers and promoters,
    # joining them into a single dataframe
    full_sequences = get_categorical_labels(label_epi_path)
    logging.debug("Saving the sequences bed file in {}".format(saving_path))

    rows = 0
    if c['sample']:
        sample_size = int(len(full_sequences) * c['sample_perc'])
        rows = np.random.randint(len(full_sequences), size=sample_size)
        full_sequences = full_sequences.iloc[rows]
    full_sequences.to_csv("{}/sequences.bed".format(saving_path),
                          sep="\t",
                          columns=['chrom', 'chromStart', 'chromEnd'],
                          header=False,
                          index=False)

    logging.debug("Downloading the hg19 genome")
    chroms = [k for k, _ in full_sequences.groupby(['chrom'])]
    hg19 = Genome(assembly="hg19", chromosomes=chroms)
    logging.debug("Downloading the hg19 genome")
    sequences = hg19.bed_to_sequence(full_sequences)

    logging.debug("Saving sequences to file...")
    seqIO_seq = [
        creating_seqIO(
            "{}:{}-{}".format(row['chrom'],
                              row['chromStart'], row['chromEnd']),
            Seq(row['sequence'].upper())) for _, row in sequences.iterrows()
    ]
    save_sequences(saving_path, seqIO_seq)

    # Importing epigenetic data
    logging.debug("Importing epigenetic data for: {}".format(
        ", ".join(cell_lines)))
    logging.debug(
        "-------------------------------------------------------------")
    for l in cell_lines:
        logging.debug("Importing {} data".format(l))

        df_epi_enanchers, df_epi_promoters = get_epigenetic_data(
            label_epi_path, l)

        # building type dictionary
        converting_dictionary = {
            c: get_type(c)
            for c in df_epi_promoters.columns
        }
        df_epi_enanchers = df_epi_enanchers.astype(converting_dictionary)
        df_epi_promoters = df_epi_promoters.astype(converting_dictionary)

        assert len(df_epi_promoters.columns) == len(df_epi_enanchers.columns)
        logging.debug("number features for {}: {}".format(
            l,
            len(df_epi_promoters.columns) - 4))
        logging.debug("Number of missing values in enhancers: {}".format(
            df_epi_enanchers.isna().sum().sum()))
        logging.debug("Number of missing values in promoters: {}".format(
            df_epi_promoters.isna().sum().sum()))

        df_epi_enanchers = fill_missing(df_epi_enanchers, metric="median")
        df_epi_promoters = fill_missing(df_epi_promoters, metric="median")

        assert len(enhancers_regions) == len(df_epi_enanchers)
        logging.debug("Enhancers - regions: {}, epigenetics: {}".format(
            len(enhancers_regions), len(df_epi_enanchers)))

        assert len(promoters_regions) == len(df_epi_promoters)
        logging.debug("Promoters - regions: {}, epigenetics: {}".format(
            len(promoters_regions), len(df_epi_promoters)))

        full_epi = append_without_duplicates(df_epi_enanchers,
                                             df_epi_promoters)
        if c['sample']:
            full_epi = full_epi.iloc[rows]
        # Check that the dataframes are aligned before saving.
        assert len(full_sequences) == len(full_epi)
        assert_frame_equal(full_sequences[['chrom', 'chromStart', 'chromEnd']],
                           full_epi[['chrom', 'chromStart', 'chromEnd']])
        logging.debug("Number of total sequences: {}".format(
            len(full_sequences)))

        logging.debug("Saving results in {}".format(saving_path))
        np.savetxt("{}/{}_epigenetic.txt".format(saving_path, l),
                   full_epi.iloc[:, 4:].values,
                   fmt='%f')
        np.savetxt("{}/{}_labels.txt".format(saving_path, l),
                   full_sequences[l].values,
                   fmt='%s')

        logging.debug(
            "-------------------------------------------------------------")
Example #24
#!/usr/bin/env python
# coding: utf-8

import os
import argparse

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors
import seaborn
import scipy.stats
import pysam
from tqdm import tqdm
from ucsc_genomes_downloader import Genome

tqdm.pandas()

# Download the hg38 assembly used throughout the script.
hg38 = Genome(assembly="hg38")

parser = argparse.ArgumentParser(
    description='Process histograms, scatter plots and metaplots')

parser.add_argument('cell_type')
parser.add_argument('tabix_file')
parser.add_argument('fragments')

args = parser.parse_args()
cell_type = args.cell_type

tabix_file = pysam.TabixFile(args.tabix_file)

os.system(
    'gunzip -c {} | bedtools intersect -sorted -c -a /home/John/JohnProject/reference/DHS_adjusted_6mer_bias_adjustedby_30_sorted_no_blacklist.unique.bed -b - > {}/index_cuts_{}_intersect.bed'