def test_simulated_download_failure():
    """Overwrite a cached chromosome file with junk and expect reloading to fail."""
    # Iterate once over every item of a throw-away Genome instance
    # (presumably to make sure the chromosome files exist on disk).
    for _unused in Genome("sacCer3", chromosomes=sacCer3_chromosomes).items():
        continue

    genome = Genome("sacCer3", chromosomes=sacCer3_chromosomes)

    # Replace the content of the chrI file with text that is not valid JSON.
    corrupted_file = genome._chromosome_path("chrI")
    with open(corrupted_file, "w") as handle:
        handle.write("Totally not JSON")

    # Building the genome again on top of the corrupted file must raise.
    with pytest.raises(Exception):
        Genome("sacCer3", chromosomes=sacCer3_chromosomes)

    genome.delete()
def test_create_new_genome_object():
    """Remove the JSON files of a genome and check it is rebuilt with a warning."""
    genome = Genome("sacCer3", chromosomes=sacCer3_chromosomes)

    # Delete every JSON file stored under the genome path.
    json_pattern = "{path}/*.json".format(path=genome.path)
    for json_file in glob(json_pattern):
        os.remove(json_file)

    # The first re-creation after the removal is expected to emit a
    # RuntimeWarning; the one that follows is simply expected to succeed.
    with pytest.warns(RuntimeWarning):
        genome = Genome("sacCer3", chromosomes=sacCer3_chromosomes)
    genome = Genome("sacCer3", chromosomes=sacCer3_chromosomes)

    # Smoke-test the main accessors and the string representation.
    genome.gaps()
    genome.filled()
    str(genome)
    genome.delete()
def test_gaps():
    """Check chromosome membership and sequence extraction on hg19 chr1."""
    genome = Genome("hg19", chromosomes=["chr1"])

    # Only the requested chromosome must be reported as part of the genome.
    assert "chr1" in genome
    assert "chr2" not in genome

    # Smoke test: the filled regions must be convertible to sequences.
    genome.bed_to_sequence(genome.filled(chromosomes=["chr1"]))
    genome.delete()
def test_tessellate():
    """Smoke-test ``tessellate_bed`` on hg19 chrM with every alignment option."""
    genome = Genome("hg19", chromosomes=["chrM"])
    filled_regions = genome.filled(chromosomes=["chrM"])

    # Same window size, one call per supported alignment.
    for alignment in ("left", "right", "center"):
        tessellate_bed(filled_regions, window_size=200, alignment=alignment)

    genome.delete()
def test_multivariate_gap_center_sequence():
    """Check batch content, repeatability and shape of the gap-center sequence.

    The sequence is finally fed to a model for two epochs as a smoke test.
    """
    genome = Genome("hg19", chromosomes=["chr1", "chr2", "chr3"])

    # Only the second and third returned values are needed here.
    _ignored, gaps_mean, gaps_covariance = get_gaps_statistics(genome, 100, 200)

    sequence = MultivariateGapCenterSequence(
        assembly=genome,
        bed=get_test_bed(),
        gaps_mean=gaps_mean,
        gaps_covariance=gaps_covariance,
        batch_size=32
    )

    # Fetch the same batch twice to verify that indexing is repeatable.
    first_x, first_y = sequence[0]
    second_x, second_y = sequence[0]

    # The input batch must contain 0.25 and be made only of 0.25, 0.0 and 1.0.
    assert (first_x == 0.25).any()
    assert set(np.unique(first_x)) == {0.25, 0.0, 1.0}

    # Both fetches must return identical inputs and outputs.
    assert (first_x == second_x).all()
    assert (first_y == second_y).all()

    expected_shape = (sequence.batch_size, 200, 4)
    assert first_x.shape == expected_shape

    model = cnn_model()
    model.fit_generator(
        sequence,
        steps_per_epoch=sequence.steps_per_epoch,
        epochs=2,
        verbose=0,
        shuffle=True
    )
# Example #6
# 0
def get_sequence(epigenomes, region):
    """Return a flat one-hot encoded dataframe for every entry of ``epigenomes``.

    Parameters
    ---------------------------
    epigenomes:
        Mapping whose items are iterated; each value is passed to
        ``flat_one_hot_encode`` together with the hg19 genome.
    region:
        NOTE(review): this argument is never read by the function. The
        keys of ``epigenomes`` are used instead - confirm whether it can
        be dropped by the callers.

    Returns
    ---------------------------
    Dictionary with the same keys as ``epigenomes`` and the result of
    ``to_dataframe`` as values.
    """
    window_size = 200
    genome = Genome('hg19')
    sequences = {}
    for region_name, data in epigenomes.items():
        encoded = flat_one_hot_encode(genome, data, window_size)
        sequences[region_name] = to_dataframe(encoded, window_size)
    return sequences
# Example #7
# 0
def test_expand_bed_regions():
    """Check that ``expand_bed_regions`` yields regions of the requested size."""
    genome = Genome("hg19", chromosomes=["chr2", "chr3"])
    all_gaps = genome.gaps(chromosomes=["chr2", "chr3"])

    # Keep only the gaps shorter than 500 bases.
    short_gaps = all_gaps[all_gaps.chromEnd - all_gaps.chromStart < 500]

    # (requested size, alignment) pairs, including even and odd sizes.
    cases = (
        (200, "left"),
        (201, "right"),
        (200, "center"),
        (201, "center"),
        (173, "center"),
    )
    for size, alignment in cases:
        expanded = expand_bed_regions(short_gaps, size, alignment)
        assert (expanded.chromEnd - expanded.chromStart == size).all()
# Example #8
# 0
def get_data(
    parameters: Tuple[Tuple[str, int, str], str]
) -> Tuple[pd.DataFrame, np.array] or List[np.array, np.array]:
    """Load either the epigenomic table or the sequence batch of a dataset.

    Parameters
    ---------------------------
    parameters:
        Pair ``(load_parameters, data_type)``. ``load_parameters`` is
        forwarded untouched to ``load_dataset``; ``data_type`` selects the
        branch and is either ``'epigenomic'`` or ``'sequences'``.

    Returns
    ---------------------------
    For ``'epigenomic'`` the dataset (index reset in place) and its labels.
    For ``'sequences'`` the first element obtained by iterating a
    ``MixedSequence`` whose batch size equals the number of labels.
    For any other ``data_type`` the function falls through and returns None.

    NOTE(review): the return annotation uses ``or`` between two types, so
    only the first one is actually recorded; ``typing.Union`` was most
    likely intended.
    """
    load_parameters, data_type = parameters

    if data_type == 'epigenomic':
        dataset, labels = load_dataset(load_parameters)
        dataset.reset_index(drop=True, inplace=True)
        return dataset, labels

    if data_type != 'sequences':
        return None

    epigenomes, labels = load_dataset(load_parameters)
    genome = Genome('hg19')
    # The bed coordinates live in the index of the epigenomic dataframe.
    bed = epigenomes.reset_index()[epigenomes.index.names]
    # A single batch as large as the whole dataset.
    batch_size = len(labels)
    every_row = np.arange(batch_size)
    mixed_sequence = MixedSequence(
        x=BedSequence(genome, bed.iloc[every_row], batch_size=batch_size),
        y=labels[every_row],
        batch_size=batch_size)
    return list(mixed_sequence)[0]
# Example #9
# 0
def test_wiggle():
    """Compare seeded wiggles of hg19 chr17 against a stored CSV snapshot.

    The snapshot is written on the first run, when the file does not exist
    yet next to this module.
    """
    genome = Genome("hg19", chromosomes=["chr17"])
    wiggles = wiggle_bed_regions(
        genome.filled(chromosomes=["chr17"]),
        max_wiggle_size=100,
        wiggles=10,
        seed=42
    )

    module_directory = os.path.dirname(os.path.abspath(__file__))
    path = "{pwd}/expected_wiggles.csv".format(pwd=module_directory)

    # Bootstrap the snapshot when it is missing.
    if not os.path.exists(path):
        wiggles.to_csv(path, index=False)

    expected = pd.read_csv(path)
    # Dtypes are not compared: only the values have to match.
    pd.testing.assert_frame_equal(wiggles, expected, check_dtype=False)
    genome.delete()
# Example #10
# 0
def test_gaps():
    """Check gap and filled regions of hg19 chr1 and their extracted sequences."""
    genome = Genome("hg19", chromosomes=["chr1"])
    assert "chr1" in genome
    assert "chr2" not in genome

    # No gap region may have zero length.
    gaps = genome.gaps(["chr1"])
    gap_lengths = gaps.chromEnd - gaps.chromStart
    assert (gap_lengths != 0).all()

    # Sequences extracted from gap windows must be made of "n" only.
    gap_windows = tessellate_bed(gaps, 200, verbose=False)
    for sequence in genome.bed_to_sequence(gap_windows):
        assert set(sequence.lower()) == {"n"}

    # No filled region may have zero length either.
    filled = genome.filled(["chr1"])
    filled_lengths = filled.chromEnd - filled.chromStart
    assert (filled_lengths != 0).all()

    # Sequences extracted from filled windows must not contain any "n".
    filled_windows = tessellate_bed(filled, 200, verbose=False)
    for sequence in genome.bed_to_sequence(filled_windows):
        assert "n" not in sequence.lower()

    # Same check once a "strand" column set to "." has been added.
    filled_windows["strand"] = "."
    for sequence in genome.bed_to_sequence(filled_windows):
        assert "n" not in sequence.lower()

    genome.delete()
# Example #11
# 0
def test_genomic_sequence_determinism():
    """Check that batched inputs always stay aligned with their outputs.

    The output vector holds the row positions, so every batch can be
    compared against a reference built as one single unshuffled batch.
    """
    batch_size = 32
    epochs = 5
    region_types = (
        pd.read_csv("tests/enhancers.csv"),
        pd.read_csv("tests/promoters.csv"),
    )

    genome = Genome("hg19", chromosomes=["chr1"])
    for region in tqdm(region_types, desc="Region types"):
        rows_number = len(region)
        # Row positions used as labels: they identify each sample.
        positions = np.arange(0, rows_number, dtype=np.int64)

        batched = MixedSequence(
            x=BedSequence(genome, region, batch_size),
            y=VectorSequence(positions, batch_size))

        # Reference: everything in one batch, explicitly not shuffled.
        reference = MixedSequence(
            x=BedSequence(genome,
                          region,
                          batch_size=rows_number,
                          shuffle=False),
            y=VectorSequence(positions,
                             batch_size=rows_number,
                             shuffle=False))
        reference_x, _ = reference[0]

        for _ in trange(epochs, desc="Epochs", leave=False):
            for step in range(batched.steps_per_epoch):
                batch_x, batch_y = batched[step]
                # The labels of the batch index the reference inputs.
                assert (reference_x[batch_y.astype(int)] == batch_x).all()
            batched.on_epoch_end()
# Example #12
# 0
def get_genome() -> Genome:
    """Return the hg19 genome stored in ``_cache``, creating it when missing.

    A new ``Genome`` is built with the ``'assembly_path'`` default as cache
    directory whenever the cached entry is absent or falsy; the returned
    object is always written back to ``_cache['genome']``.
    """
    genome = _cache.get('genome')
    if not genome:
        genome = Genome(
            'hg19', cache_directory=get_default('assembly_path'))
    _cache['genome'] = genome
    return genome
# Example #13
# 0
    def __init__(self,
                 assembly,
                 window_size,
                 batch_size,
                 buffer_size=None,
                 max_gap_size=100,
                 train_chromosomes=None,
                 val_chromosomes=None,
                 cache_dir=None,
                 lazy_load=True,
                 clear_cache=False,
                 compile_on_start=True,
                 n_type="uniform"):
        """Store the configuration, load the genome and optionally compile.

        Parameters
        ---------------------------
        assembly:
            Assembly name, forwarded to ``Genome`` and used as a component
            of the cache directory path.
        window_size:
            Window size; part of the cache directory path and of the
            instance hash.
        batch_size:
            Batch size, stored on the instance.
        buffer_size:
            Buffer size; when falsy, ``cpu_count()`` is used.
        max_gap_size:
            Maximum gap size, stored on the instance and hashed.
        train_chromosomes:
            Training chromosomes. When falsy, every chromosome of the
            genome is used (sorted) and ``val_chromosomes`` is not added.
        val_chromosomes:
            Validation chromosomes; a falsy value is replaced by ``[]``.
        cache_dir:
            Cache root. Falls back to the ``CACHE_PATH`` environment
            variable and then to ``/tmp``.
        lazy_load:
            Forwarded to ``Genome``.
        clear_cache:
            When true, ``self.clean_cache()`` is called.
        compile_on_start:
            When true, ``self.compile()`` is called at the end.
        n_type:
            Must be one of ``self.n_types``.

        Raises
        ---------------------------
        ValueError,
            If ``n_type`` is not in ``self.n_types``.
        """
        self.assembly, self.window_size = assembly, window_size
        self.max_gap_size, self.batch_size, self.val_chromosomes = max_gap_size, batch_size, val_chromosomes

        # Buffersize default None == cpu count for optimal performance:
        if not buffer_size:
            buffer_size = cpu_count()
        self.buffer_size = buffer_size

        # Validate the type of N. The message lists the allowed values
        # (it used to interpolate the rejected value instead); ``format``
        # is used so that a tuple of allowed values is rendered safely.
        if n_type not in self.n_types:
            raise ValueError(
                "n_type must be one of {}, got {!r}".format(
                    self.n_types, n_type))
        self.n_type = n_type

        # Get the cache dir
        cache_dir = cache_dir or os.environ.get("CACHE_PATH", None) or "/tmp"

        self._cache_directory = "/".join(
            [cache_dir, assembly, str(window_size)])

        if clear_cache:
            self.clean_cache()

        # Generate a pool of processes to save the overhead
        self.workers = max(2, cpu_count())
        self.pool = Pool(self.workers)

        # Preprocess all the possible data
        self.genome = Genome(
            assembly=assembly,
            lazy_load=lazy_load,
            cache_directory=cache_dir,
        )

        if not val_chromosomes:
            self.val_chromosomes = []

        # If no chromosomes passed then use all the genome
        if not train_chromosomes:
            self.chromosomes = sorted(list(self.genome))
        else:
            self.chromosomes = train_chromosomes + self.val_chromosomes

        # Hash of the parameters that determine the generated data.
        self.instance_hash = sha256({
            "assembly": self.assembly,
            "chromosomes": self.chromosomes,
            "window_size": self.window_size,
            "max_gap_size": self.max_gap_size,
            "n_type": n_type,
        })

        if compile_on_start:
            self.compile()
def get_genome(assembly):
    """Build and return a ``Genome`` for the given assembly name."""
    genome = Genome(assembly)
    return genome
def test_empty_genome():
    """A filter tuple holding only an empty string must raise ValueError."""
    empty_filters = ("", )
    with pytest.raises(ValueError):
        Genome("hg19", filters=empty_filters)
def test_unavailable_genome():
    """Requesting the assembly name "hg1" must raise ValueError."""
    unknown_assembly = "hg1"
    with pytest.raises(ValueError):
        Genome(unknown_assembly)
def get_holdouts(batch_size: int = 128,
                 max_wiggle_size: int = 150,
                 wiggles: int = 10,
                 random_state: int = 42,
                 window_size: int = 500,
                 test_size: float = 0.3,
                 verbose: bool = True,
                 nrows: int = None):
    """Yield one (training, testing) pair of sequences per fold.

    The bed file ``mendelian_snv.csv.gz`` stored next to this module is
    loaded, resized to the requested window size and split according to
    its ``folds`` column: each fold is used once as the test partition.

    Parameters
    ---------------------------
    batch_size: int = 128,
        The batch size to use.
        Since the task is significantly unbalanced, consider using high
        batch sizes.
    max_wiggle_size: int = 150,
        Amount to wiggle the windows.
    wiggles: int = 10,
        Number of wiggles per sample. No wiggle is computed when this
        value is not strictly positive.
    random_state: int = 42,
        Random state to use for reproducibility.
    window_size: int = 500,
        Window size to use.
    test_size: float = 0.3,
        Percentage to leave for the test set.
        NOTE(review): this value is not read anywhere in the function,
        the partitions come from the ``folds`` column.
    verbose: bool = True,
        Whether to show the loading bar.
    nrows: int = None,
        Number of rows to read. Useful to test the pipeline.

    Raises
    ----------------------------
    ValueError,
        If the given window size is less than or equal to twice the given
        maximum wiggle size. As this is a generator function, the error
        is raised when the first element is requested.

    Returns
    ----------------------------
    Generator of tuples with the training and the testing sequences.
    """
    if 2 * max_wiggle_size >= window_size:
        message = (
            "Given window size {} is less or equal than twice the "
            "given max_wiggle_size {}. This may lead the central SNV "
            "to fall outside the region, hence causing a false positive. "
            "Please either increase the window size or reduce the "
            "maximum wiggle size."
        ).format(window_size, max_wiggle_size)
        raise ValueError(message)

    # The bed file is shipped in the same directory as this module.
    module_directory = os.path.dirname(os.path.abspath(__file__))
    bed = pd.read_csv(
        os.path.join(module_directory, "mendelian_snv.csv.gz"),
        nrows=nrows
    )

    # Bring every window of the bed file to the required size.
    bed = expand_bed_regions(bed, window_size)

    # Genomic assembly shared by all the generated sequences.
    assembly = Genome("hg19", verbose=False)

    for fold in tqdm(bed.folds.unique(), desc="Holdouts",
                     disable=not verbose):
        # Rows of the current fold form the test partition,
        # all the remaining rows form the training partition.
        held_out = (bed.folds == fold).values
        train_bed = bed.iloc[~held_out]
        test_bed = bed.iloc[held_out]

        # Only the training positives get wiggled: wiggling the training
        # negatives might create false negatives.
        positives = train_bed[(train_bed.labels == 1).values]
        if wiggles > 0:
            wiggled_positives = wiggle_bed_regions(
                positives, max_wiggle_size, wiggles, random_state)
            # The wiggled rows are placed before the original training rows.
            train_bed = pd.concat([wiggled_positives, train_bed])

        # Both partitions are shuffled with a seed that depends on the fold.
        # INFO: these shuffles should not be needed, they are kept just for
        # peace of mind.
        fold_seed = random_state + fold
        train_bed = train_bed.sample(frac=1, random_state=fold_seed)
        test_bed = test_bed.sample(frac=1, random_state=fold_seed)

        yield (create_sequence(train_bed, assembly, batch_size),
               create_sequence(test_bed, assembly, batch_size))
# Example #18
# 0
def visualize(cell_line, epigenomes, labels):
    """Plot and save PCA and t-SNE decompositions of epigenomes and sequences.

    Parameters
    ---------------------------
    cell_line:
        Name of the cell line; the figures are saved under
        ``./imgs/<cell_line>/``, which is created when missing.
    epigenomes:
        Mapping from region type to epigenomic dataframe.
    labels:
        Mapping from region type to labels dataframe. The flattened values
        are used to index a two-colour palette, so they are assumed to be
        0/1 integers - TODO confirm.

    NOTE(review): titles and the four-column layout assume that both
    mappings hold exactly two entries, promoters first and enhancers
    second - verify against the caller.
    """
    # Local import: the output directory has to exist before saving.
    import os

    genome = Genome("hg19")
    sequences = {
        region: to_dataframe(flat_one_hot_encode(genome, data, 200), 200)
        for region, data in epigenomes.items()
    }

    # Epigenomic tables first, one-hot encoded sequences second.
    xs = [
        *[val.values for val in epigenomes.values()],
        *[val.values for val in sequences.values()]
    ]
    # The same labels are used for both representations.
    ys = [
        *[val.values.ravel() for val in labels.values()],
        *[val.values.ravel() for val in labels.values()]
    ]
    titles = [
        "Epigenomes promoters", "Epigenomes enhancers",
        "Sequences promoters", "Sequences enhancers"
    ]

    assert len(xs) == len(ys) == len(titles)

    for x, y in zip(xs, ys):
        assert x.shape[0] == y.shape[0]

    colors = np.array([
        "tab:blue",
        "tab:orange",
    ])

    # Saving used to fail with FileNotFoundError on a fresh checkout.
    output_directory = "./imgs/" + cell_line
    os.makedirs(output_directory, exist_ok=True)

    fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(32, 8))

    for x, y, title, axis in tqdm(zip(xs, ys, titles, axes.flatten()),
                                  desc="Computing PCAs",
                                  total=len(xs)):
        axis.scatter(*pca(x).T, s=1, color=colors[y])
        axis.xaxis.set_visible(False)
        axis.yaxis.set_visible(False)
        axis.set_title(f"PCA decomposition - {title}")
    fig.savefig(output_directory + "/PCA decomposition")
    plt.show()

    for perplexity in tqdm((50, 500), desc="Running perplexities"):
        fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(40, 10))
        for x, y, title, axis in tqdm(zip(xs, ys, titles, axes.flatten()),
                                      desc="Computing TSNEs",
                                      total=len(xs)):
            axis.scatter(*ulyanov_tsne(x, perplexity=perplexity).T,
                         s=1,
                         color=colors[y])
            axis.xaxis.set_visible(False)
            axis.yaxis.set_visible(False)
            axis.set_title(f"TSNE decomposition - {title}")
        fig.tight_layout()
        fig.savefig(output_directory + "/TSNE_" + str(perplexity))
        plt.show()
# Example #19
# 0
def train_model_seq(models, epigenomes, nlabels, region_type, cell_line):
    """Train every model on every holdout of the sequence data.

    Results are appended to ``<cell_line>_<region_type>_sequence.json``
    after each training, and (model, holdout) pairs reported as already
    computed by ``precomputed`` are skipped, so an interrupted run can be
    resumed.

    Parameters
    ---------------------------
    models:
        Iterable (with a length) of models exposing ``name`` and ``fit``.
    epigenomes:
        Mapping from region type to epigenomic dataframe; the entry of
        ``region_type`` is converted to bed with ``to_bed``.
    nlabels:
        Mapping from region type to labels dataframe.
    region_type:
        Key selecting the region in ``epigenomes`` and ``nlabels``.
    cell_line:
        Cell line name, used as prefix of the results file.

    Returns
    ---------------------------
    Dataframe with all the collected results, without the ``holdout``
    column.
    """
    # Reproducibility
    os.environ['PYTHONHASHSEED'] = '0'
    np.random.seed(42)

    splits = 11
    holdouts = StratifiedShuffleSplit(
        n_splits=splits, test_size=0.2, random_state=42)
    genome = Genome("hg19")
    bed = to_bed(epigenomes[region_type])
    labels = nlabels[region_type].values.ravel()

    # NOTE(review): the existence check is relative to the working
    # directory - confirm that ``compress_json.local_load`` resolves the
    # path the same way.
    results_path = cell_line + "_" + region_type + "_sequence.json"
    if os.path.exists(results_path):
        results = compress_json.local_load(results_path)
    else:
        results = []

    # Keyword arguments: recent scikit-learn versions reject the
    # positional form for ``classes`` and ``y``.
    class_w = class_weight.compute_class_weight(
        'balanced', classes=np.unique(labels), y=labels)
    class_w = dict(enumerate(class_w))
    print("Class weights: " + str(class_w))

    for i, (train_index, test_index) in tqdm(enumerate(holdouts.split(bed, labels)), total=splits, desc="Computing holdouts", dynamic_ncols=True):
        train, test = get_holdout(
            train_index, test_index, bed, labels, genome, 1024)
        print("="*80)
        for model in tqdm(models, total=len(models), desc="Training models", leave=False, dynamic_ncols=True):
            if precomputed(results, model.name, i):
                continue
            history = model.fit(
                train,
                steps_per_epoch=train.steps_per_epoch,
                validation_data=test,
                validation_steps=test.steps_per_epoch,
                epochs=1000,
                shuffle=True,
                verbose=False,
                class_weight=class_w,
                callbacks=[
                    EarlyStopping(monitor="val_loss", mode="min",
                                  patience=50, restore_best_weights=True),
                ]
            ).history
            # Metrics of the last recorded epoch.
            scores = pd.DataFrame(history).iloc[-1].to_dict()
            results.append({
                "model": model.name,
                "run_type": "train",
                "holdout": i,
                **{
                    key: value
                    for key, value in scores.items()
                    if not key.startswith("val_")
                }
            })
            # Validation metrics are stored without the "val_" prefix.
            results.append({
                "model": model.name,
                "run_type": "test",
                "holdout": i,
                **{
                    key[len("val_"):]: value
                    for key, value in scores.items()
                    if key.startswith("val_")
                }
            })
            # Checkpoint after every training so the run can be resumed.
            compress_json.local_dump(results, results_path)

    # Built after the loops: the dataframe used to be assigned only right
    # after a training, so a fully precomputed run (or an empty list of
    # models) ended with an unbound ``df``. ``errors="ignore"`` covers the
    # case of no results at all, where the column does not exist.
    return pd.DataFrame(results).drop(columns="holdout", errors="ignore")
# Example #20
# 0
def preprocess_mode_exec(c):
    """Run the preprocessing step described by the configuration ``c``.

    The function writes, inside ``c['export_path']``, the bed file of the
    sequences (``sequences.bed``), the sequences extracted from hg19 (via
    ``save_sequences``) and, for every cell line, the files
    ``<cell_line>_epigenetic.txt`` and ``<cell_line>_labels.txt``.

    Parameters
    ---------------------------
    c:
        Mapping with the keys ``import_path``, ``export_path``,
        ``cell_lines``, ``window_size``, ``dataset``, ``sample`` and, when
        ``sample`` is truthy, ``sample_perc``.

    Raises
    ---------------------------
    FileNotFoundError,
        If ``c['import_path']`` does not exist.
    AssertionError,
        If regions, epigenetic data and sequences are not aligned.
    """
    logging.basicConfig(format='[%(asctime)s] - %(levelname)s - %(message)s',
                        level=logging.DEBUG)
    logging.debug("PREPROCESSING MODE")

    root_path = c['import_path']
    saving_path = c['export_path']
    cell_lines = c['cell_lines']
    window_size = c['window_size']
    dataset_type = c['dataset']

    if not os.path.exists(root_path):
        raise FileNotFoundError("Files path not found: {}".format(root_path))

    if not os.path.exists(saving_path):
        # The placeholder used to be logged verbatim, without the path.
        logging.debug(
            "{} not found, folder will be created".format(saving_path))
        os.makedirs(saving_path)

    label_epi_path = get_full_path(root_path, window_size, dataset_type)

    # Importing regions for enhancers and promoters
    enhancers_regions, promoters_regions = get_regions(root_path)

    # Importing and converting labels of enhancers and promoters and join
    # them in a single dataframe
    full_sequences = get_categorical_labels(label_epi_path)
    logging.debug("Saving the sequences bed file in {}".format(saving_path))

    # Positions of the sampled rows, reused below on the epigenetic data.
    # NOTE(review): ``randint`` samples with replacement, so duplicated
    # rows are possible - confirm this is intended.
    rows = 0
    if c['sample']:
        sample_size = int(len(full_sequences) * c['sample_perc'])
        rows = np.random.randint(len(full_sequences), size=sample_size)
        full_sequences = full_sequences.iloc[rows]
    full_sequences.to_csv("{}/sequences.bed".format(saving_path),
                          sep="\t",
                          columns=['chrom', 'chromStart', 'chromEnd'],
                          header=False,
                          index=False)

    logging.debug("Downloading the hg19 genome")
    # Sorted unique chromosome names. Grouping by a one-element list
    # yields 1-tuples as keys on recent pandas versions, which are not
    # valid chromosome names.
    chroms = sorted(full_sequences['chrom'].dropna().unique().tolist())
    hg19 = Genome(assembly="hg19", chromosomes=chroms)
    logging.debug("Extracting the sequences from the hg19 genome")
    sequences = hg19.bed_to_sequence(full_sequences)

    logging.debug("Saving sequences to file...")
    seqIO_seq = [
        creating_seqIO(
            "{}:{}-{}".format(row['chrom'],
                              row['chromStart'], row['chromEnd']),
            Seq(row['sequence'].upper())) for _, row in sequences.iterrows()
    ]
    save_sequences(saving_path, seqIO_seq)

    # Importing epigenetic data
    logging.debug("Importing epigenetic data for: {}".format(
        ", ".join(cell_lines)))
    logging.debug(
        "-------------------------------------------------------------")
    for cell_line in cell_lines:
        logging.debug("Importing {} data".format(cell_line))

        df_epi_enanchers, df_epi_promoters = get_epigenetic_data(
            label_epi_path, cell_line)

        # Building the type dictionary from the promoters columns; the
        # same dictionary is applied to the enhancers.
        converting_dictionary = {
            column: get_type(column)
            for column in df_epi_promoters.columns
        }
        df_epi_enanchers = df_epi_enanchers.astype(converting_dictionary)
        df_epi_promoters = df_epi_promoters.astype(converting_dictionary)

        assert len(df_epi_promoters.columns) == len(df_epi_enanchers.columns)
        # The first four columns are not counted as features.
        logging.debug("number features for {}: {}".format(
            cell_line,
            len(df_epi_promoters.columns) - 4))
        logging.debug("Number of missing values in enhancers: {}".format(
            df_epi_enanchers.isna().sum().sum()))
        logging.debug("Number of missing values in promoters: {}".format(
            df_epi_promoters.isna().sum().sum()))

        df_epi_enanchers = fill_missing(df_epi_enanchers, metric="median")
        df_epi_promoters = fill_missing(df_epi_promoters, metric="median")

        assert len(enhancers_regions) == len(df_epi_enanchers)
        logging.debug("Enhancers - regions: {}, epigenetics: {}".format(
            len(enhancers_regions), len(df_epi_enanchers)))

        assert len(promoters_regions) == len(df_epi_promoters)
        logging.debug("Promoters - regions: {}, epigenetics: {}".format(
            len(promoters_regions), len(df_epi_promoters)))

        full_epi = append_without_duplicates(df_epi_enanchers,
                                             df_epi_promoters)
        if c['sample']:
            # Same rows that were sampled from the sequences.
            full_epi = full_epi.iloc[rows]
        # Check that the dataframes are aligned before saving.
        assert len(full_sequences) == len(full_epi)
        assert_frame_equal(full_sequences[['chrom', 'chromStart', 'chromEnd']],
                           full_epi[['chrom', 'chromStart', 'chromEnd']])
        logging.debug("Number of total sequences: {}".format(
            len(full_sequences)))

        logging.debug("Saving results in {}".format(saving_path))
        np.savetxt("{}/{}_epigenetic.txt".format(saving_path, cell_line),
                   full_epi.iloc[:, 4:].values,
                   fmt='%f')
        np.savetxt("{}/{}_labels.txt".format(saving_path, cell_line),
                   full_sequences[cell_line].values,
                   fmt='%s')

        logging.debug(
            "-------------------------------------------------------------")
# Example #21
# 0
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from tqdm import tqdm
tqdm.pandas()
import matplotlib.colors
import scipy.stats
from ucsc_genomes_downloader import Genome
hg38 = Genome(assembly="hg38")
import os
import pysam
import argparse

# Command-line interface: three required positional arguments.
parser = argparse.ArgumentParser(
    description='Process histograms, scatter plots and metaplots')

parser.add_argument('cell_type')
parser.add_argument('tabix_file')
parser.add_argument('fragments')

# Parsing happens at import time: this module is meant to run as a script.
args = parser.parse_args()
cell_type = args.cell_type

# Open the tabix file named on the command line; from here on
# ``tabix_file`` is the pysam handle, the path stays in ``args.tabix_file``.
tabix_file = pysam.TabixFile(args.tabix_file)

os.system(
    'gunzip -c {} | bedtools intersect -sorted -c -a /home/John/JohnProject/reference/DHS_adjusted_6mer_bias_adjustedby_30_sorted_no_blacklist.unique.bed -b - > {}/index_cuts_{}_intersect.bed'