Example #1
def test_fetch_remote(path, url):
    x_local = bbi.fetch(BW_FILE, 'chr21', 0, 100)
    x_remote = bbi.fetch(BW_URL, 'chr21', 0, 100)
    assert np.allclose(x_local, x_remote, equal_nan=True)

    x_local = bbi.fetch(BB_FILE, 'chr21', 0, 100)
    x_remote = bbi.fetch(BB_URL, 'chr21', 0, 100)
    assert np.allclose(x_local, x_remote, equal_nan=True)
Example #2
def test_fetch(path):
    x = bbi.fetch(path, 'chr21', 0, 1000)
    assert len(x) == 1000

    x = bbi.fetch(path, 'chr21', 0, 1000, bins=10)
    assert len(x) == 10

    with pytest.raises(KeyError):
        bbi.fetch(path, 'chr1', 0, 1000)
Example #3
def test_fetch_oob(path):
    x = bbi.fetch(path, 'chr21', -10, 1000, oob=np.nan)
    assert np.all(np.isnan(x[:10]))
    x = bbi.fetch(path, 'chr21', -10, 1000, oob=0)
    assert np.all(x[:10] == 0)

    n = bbi.chromsizes(path)['chr21']
    x = bbi.fetch(path, 'chr21', n - 1000, n + 10, oob=np.nan)
    assert np.all(np.isnan(x[-10:]))
    x = bbi.fetch(path, 'chr21', n - 1000, n + 10, oob=0)
    assert np.all(x[-10:] == 0)
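The three tests above cover the main `bbi.fetch` keyword arguments. A minimal sketch tying them together (the example.bw path is a placeholder; chr21 is assumed to exist in that file):

import numpy as np
import bbi

# Hypothetical bigWig file; fetch 1 Mb of chr21 downsampled to 1000 bins.
values = bbi.fetch('example.bw', 'chr21', 0, 1_000_000,
                   bins=1000,       # number of output bins
                   missing=np.nan,  # fill value for bases with no data
                   oob=0,           # fill value for out-of-bounds positions
                   summary='mean')  # per-bin aggregation statistic
print(values.shape)  # (1000,)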
Example #4
def get_bigwig_data(path, chrom, start, end, points):
    if os.path.exists(path):
        try:
            return bbi.fetch(path, chrom, start, end, bins=points)
        except (KeyError, OverflowError):
            pass
    return []
Example #5
def get_bigwig_tile(bwpath, zoom_level, start_pos, end_pos):
    chromsizes = get_chromsizes(bwpath)
    resolutions = get_zoom_resolutions(chromsizes)
    binsize = resolutions[zoom_level]

    arrays = []
    for cid, start, end in abs2genomic(chromsizes, start_pos, end_pos):
        n_bins = int(np.ceil((end - start) / binsize))
        try:
            chrom = chromsizes.index[cid]
            clen = chromsizes.values[cid]

            x = bbi.fetch(bwpath,
                          chrom,
                          start,
                          end,
                          bins=n_bins,
                          missing=np.nan)

            # drop the very last bin if it is smaller than the binsize
            if end == clen and clen % binsize != 0:
                x = x[:-1]
        except IndexError:
            # beyond the range of the available chromosomes
            # probably means we've requested a range of absolute
            # coordinates that stretch beyond the end of the genome
            x = np.zeros(n_bins)

        arrays.append(x)

    return np.concatenate(arrays)
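Several of these snippets depend on an abs2genomic helper that splits a range of absolute genomic coordinates into per-chromosome (cid, start, end) pieces. It is not shown in the examples; a sketch of one possible implementation, modeled on the HiGlass server code and assuming chromsizes is a pandas Series of chromosome lengths:

import numpy as np

def abs2genomic(chromsizes, start_pos, end_pos):
    # Cumulative offset of each chromosome in absolute coordinates.
    abs_chrom_offsets = np.r_[0, np.cumsum(chromsizes.values)]
    cid_lo, cid_hi = (
        np.searchsorted(abs_chrom_offsets, [start_pos, end_pos], side='right') - 1
    )
    rel_pos_lo = start_pos - abs_chrom_offsets[cid_lo]
    rel_pos_hi = end_pos - abs_chrom_offsets[cid_hi]
    start = rel_pos_lo
    for cid in range(cid_lo, cid_hi):
        # Full remainder of every chromosome before the last one.
        yield cid, start, chromsizes.values[cid]
        start = 0
    yield cid_hi, start, rel_pos_hi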
Example #6
def fetch_data(a):
    (bwpath, binsize, chromsizes, cid, start, end) = a
    n_bins = int(np.ceil((end - start) / binsize))
    try:
        chrom = chromsizes.index[cid]
        clen = chromsizes.values[cid]

        x = bbi.fetch(bwpath, chrom, start, end,
                      bins=n_bins, missing=np.nan)

        # drop the very last bin if it is smaller than the binsize
        if end == clen and clen % binsize != 0:
            x = x[:-1]
    except IndexError:
        # beyond the range of the available chromosomes
        # probably means we've requested a range of absolute
        # coordinates that stretch beyond the end of the genome
        x = np.full(n_bins, np.nan)
    except KeyError:
        # probably requested a chromosome that doesn't exist (e.g. chrM)
        x = np.full(n_bins, np.nan)

    return x
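fetch_data takes a single tuple argument, which is the calling convention multiprocessing.Pool.map expects. A hypothetical driver that fans the per-chromosome fetches out to a worker pool (get_bigwig_tile_parallel is a made-up name; abs2genomic is the helper sketched above):

from multiprocessing import Pool

import numpy as np

def get_bigwig_tile_parallel(bwpath, chromsizes, binsize, start_pos, end_pos):
    # One task tuple per chromosome chunk covered by the requested range.
    tasks = [(bwpath, binsize, chromsizes, cid, start, end)
             for cid, start, end in abs2genomic(chromsizes, start_pos, end_pos)]
    with Pool() as pool:
        arrays = pool.map(fetch_data, tasks)
    return np.concatenate(arrays)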
Example #7
def get(
    bw_path: str,
    chrom: str,
    start: int,
    end: int,
    bins: int,
    missing: float = 0.0,
):
    return bbi.fetch(bw_path, chrom, start, end, bins=bins, missing=missing)
Example #8
def chunk(bigwig, window_size, step_size, aggregation, chroms, verbose=False):
    base_bins = math.ceil(window_size / aggregation)

    chrom_values = []

    for chrom in chroms:
        if chrom not in bbi.chromsizes(bigwig):
            # Indexing chromsizes by a missing key would raise KeyError,
            # so report only the chromosome name.
            print("Skipping chrom (not in bigWig file):", chrom)
            continue

        chrom_size = bbi.chromsizes(bigwig)[chrom]

        values = np.zeros((math.ceil(
            (chrom_size - step_size) / step_size), base_bins))
        starts = np.arange(0, chrom_size - step_size, step_size)
        ends = np.append(np.arange(window_size, chrom_size, step_size),
                         chrom_size)
        bins = base_bins  # integer bin count matching the width of `values`

        # Extract all but the last window in one batch (faster than calling
        # `fetch` in a loop)
        values[:-1] = bbi.stackup(
            bigwig,
            [chrom] * (starts.size - 1),
            starts[:-1],
            ends[:-1],
            bins=bins,
            missing=0.0,
        )
        final_bins = math.ceil((ends[-1] - starts[-1]) / aggregation)
        # Extract the last window separately because its size is likely to
        # differ from the others
        values[-1, :final_bins] = bbi.fetch(bigwig,
                                            chrom,
                                            starts[-1],
                                            ends[-1],
                                            bins=final_bins,
                                            missing=0.0)

        if verbose:
            print(
                "Chrom: {}".format(chrom),
                "# win: {}".format(values.shape[0]),
                "Max:   {}".format(np.max(values)),
            )

        chrom_values.append(values)

    return chrom_values
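chunk returns one array of shape (n_windows, base_bins) per chromosome; a typical caller stacks them into a single matrix. A hypothetical call (the file name and parameters are placeholders):

import numpy as np

windows = np.concatenate(
    chunk('example.bw',
          window_size=12000,
          step_size=6000,
          aggregation=100,
          chroms=['chr1', 'chr2'])
)
print(windows.shape)  # (total_windows, 120), since ceil(12000 / 100) == 120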
Example #9
def fetch_data(a):
    (
        bwpath,
        binsize,
        chromsizes,
        aggregation_mode,
        range_mode,
        cid,
        start,
        end
    ) = a
    n_bins = int(np.ceil((end - start) / binsize))
    n_dim = 1

    if range_mode == 'minMax':
        n_dim = 2

    if range_mode == 'whisker':
        n_dim = 4

    x = np.zeros((n_bins, n_dim)) if n_dim > 1 else np.zeros(n_bins)

    try:
        chrom = chromsizes.index[cid]
        clen = chromsizes.values[cid]

        args = [bwpath, chrom, start, end]
        kwargs = {"bins": n_bins, "missing": np.nan}

        if range_mode == 'minMax':
            x[:, 0] = bbi.fetch(*args, **dict(kwargs, summary='min'))
            x[:, 1] = bbi.fetch(*args, **dict(kwargs, summary='max'))

        elif range_mode == 'whisker':
            x[:, 0] = bbi.fetch(*args, **dict(kwargs, summary='min'))
            x[:, 1] = bbi.fetch(*args, **dict(kwargs, summary='max'))
            x[:, 2] = bbi.fetch(*args, **dict(kwargs, summary='mean'))
            x[:, 3] = bbi.fetch(*args, **dict(kwargs, summary='std'))

        else:
            x[:] = bbi.fetch(*args, **dict(kwargs, summary=aggregation_mode))

        # drop the very last bin if it is smaller than the binsize
        if end == clen and clen % binsize != 0:
            x = x[:-1]
    except IndexError:
        # beyond the range of the available chromosomes
        # probably means we've requested a range of absolute
        # coordinates that stretch beyond the end of the genome
        x[:] = np.nan
    except KeyError:
        # probably requested a chromosome that doesn't exist (e.g. chrM)
        x[:] = np.nan

    return x
Example #10
def get_bigwig_tile(bwpath, zoom_level, start_pos, end_pos, chromsizes=None):
    if chromsizes is None:
        chromsizes = get_chromsizes(bwpath)
    resolutions = get_zoom_resolutions(chromsizes)
    binsize = resolutions[zoom_level]

    arrays = []
    for cid, start, end in abs2genomic(chromsizes, start_pos, end_pos):
        n_bins = int(np.ceil((end - start) / binsize))
        try:
            chrom = chromsizes.index[cid]
            clen = chromsizes.values[cid]

            x = bbi.fetch(bwpath,
                          chrom,
                          start,
                          end,
                          bins=n_bins,
                          missing=np.nan)

            # drop the very last bin if it is smaller than the binsize
            if end == clen and clen % binsize != 0:
                x = x[:-1]
        except IndexError:
            # beyond the range of the available chromosomes
            # probably means we've requested a range of absolute
            # coordinates that stretch beyond the end of the genome
            x = np.zeros(n_bins)

        arrays.append(x)

    return np.concatenate(arrays)
Example #11
File: gtkserver.py Project: lanl/4DGB
def SampleArray(arrayID, arraySlice, begin, end, numsamples):
    array = get_array_metadata(arrayID)

    data = []
    if ('sequence' in array['data']['values'][int(arraySlice)]):
        # there is a sequence array
        adata = array['data']['values'][int(arraySlice)]
        url = "{}/{}".format(PROJECT_HOME, adata['sequence']['url'])
        data = bbi.fetch(url, adata['sequence']['chrom'], int(begin), int(end),
                         int(numsamples))
    else:
        # no sequence array; fall back to the precomputed array data
        array = load_array_data(arrayID, arraySlice)
        interval = get_dataset_interval()
        sid = int(int(begin) / interval)
        # add one, to include the final element we want
        eid = int(int(end) / interval) + 1
        data = array['data']['values'][sid:eid]

    return jsonify({'data': list(data)})
Example #12
def test_fetch_missing(path):
    x = bbi.fetch(path, 'chr21', 0, 1000, missing=0)
    assert np.all(x[:10] == 0)
    x = bbi.fetch(path, 'chr21', 0, 1000, missing=np.nan)
    assert np.all(np.isnan(x[:10]))
Example #13
def bigwigs_to_multivec(input_bigwig_files, input_metadata_files, output_file,
                        starting_resolution):

    f = h5py.File(output_file, 'w')

    num_samples = len(input_bigwig_files)

    # Zip the input to create (bw, metadata) tuples
    zipped_input = zip(input_bigwig_files, input_metadata_files)

    # Create level zero groups
    info_group = f.create_group("info")
    resolutions_group = f.create_group("resolutions")
    chroms_group = f.create_group("chroms")

    # Set info attributes
    info_group.attrs['tile-size'] = 256

    # Prepare to fill in chroms dataset
    chromosomes = nc.get_chromorder('hg38')
    chromosomes = chromosomes[:25]  # TODO: should more than chr1-chrM be used?
    num_chromosomes = len(chromosomes)
    chroms_length_arr = np.array(
        [nc.get_chrominfo('hg38').chrom_lengths[x] for x in chromosomes],
        dtype="i8")
    chroms_name_arr = np.array(chromosomes, dtype="S23")

    chromosomes_set = set(chromosomes)
    chrom_name_to_length = dict(zip(chromosomes, chroms_length_arr))

    # Fill in chroms dataset entries "length" and "name"
    chroms_group.create_dataset("length", data=chroms_length_arr)
    chroms_group.create_dataset("name", data=chroms_name_arr)

    # Prepare to fill in resolutions dataset
    resolutions = [starting_resolution * (2**x) for x in range(16)]

    # Create each resolution group.
    for resolution in resolutions:
        resolution_group = resolutions_group.create_group(str(resolution))
        # TODO: remove the unnecessary "values" layer
        resolution_values_group = resolution_group.create_group("values")

        # Create each chromosome dataset.
        for chr_name, chr_len in zip(chromosomes, chroms_length_arr):
            chr_shape = (math.ceil(chr_len / resolution), num_samples)
            resolution_values_group.create_dataset(chr_name,
                                                   chr_shape,
                                                   dtype="f4",
                                                   fillvalue=np.nan,
                                                   compression='gzip')

    # Fill in data for each bigwig file.
    for bw_index, bw_file in tqdm(list(enumerate(input_bigwig_files)),
                                  desc='bigwigs'):
        if bbi.is_bigwig(bw_file):
            chromsizes = bbi.chromsizes(bw_file)
            matching_chromosomes = set(
                chromsizes.keys()).intersection(chromosomes_set)

            # Fill in data for each resolution of a bigwig file.
            for resolution in resolutions:
                # Fill in data for each chromosome of a resolution of a bigwig file.
                for chr_name in matching_chromosomes:
                    chr_len = chrom_name_to_length[chr_name]
                    chr_shape = (math.ceil(chr_len / resolution), num_samples)
                    arr = bbi.fetch(bw_file,
                                    chr_name,
                                    0,
                                    chr_len,
                                    chr_shape[0],
                                    summary="sum")
                    resolutions_group[str(
                        resolution)]["values"][chr_name][:, bw_index] = arr
        else:
            print(f"{bw_file} not is_bigwig")

        f.flush()

    f.close()

    max_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print(max_mem)

    # Append metadata to the top resolution row_infos attribute.
    row_infos = []
    for metadata_index, metadata_file in enumerate(input_metadata_files):
        with open(metadata_file) as mf:
            try:
                metadata_json = json.load(mf)
            except Exception as e:
                print(f"Error loading metadata file: {metadata_file}")
                print(e)
                metadata_json = None
        row_info = metadata_json_to_row_info(metadata_json)
        row_infos.append(row_info)

    row_infos_encoded = str(json.dumps(row_infos))

    f = h5py.File(output_file, 'r+')

    info_group = f["info"]
    info_group["row_infos"] = row_infos_encoded

    f.close()
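A hypothetical invocation of the converter above (all paths are placeholders); it writes a HiGlass-style multivec HDF5 file with one row per bigWig:

bigwigs_to_multivec(
    input_bigwig_files=['sample1.bw', 'sample2.bw'],
    input_metadata_files=['sample1.json', 'sample2.json'],
    output_file='samples.multires.mv5',
    starting_resolution=256,
)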
Example #14
def get_stats(bigwig, bigbed, norm_vals, window_size, step_size, aggregation,
              chrom):
    base_bins = math.ceil(window_size / aggregation)

    if chrom not in bbi.chromsizes(bigwig):
        # Indexing chromsizes by a missing key would raise KeyError,
        # so report only the chromosome name.
        print("Skipping chrom (not in bigWig file):", chrom)
        return None

    chrom_size = bbi.chromsizes(bigwig)[chrom]

    intervals = np.zeros((math.ceil(
        (chrom_size - step_size) / step_size), base_bins))
    starts = np.arange(0, chrom_size - step_size, step_size)
    ends = np.append(np.arange(window_size, chrom_size, step_size), chrom_size)
    bins = base_bins  # integer bin count matching the width of `intervals`

    # Extract all but the last window in one batch (faster than calling
    # `fetch` in a loop)
    intervals[:-1] = bbi.stackup(bigbed, [chrom] * (starts.size - 1),
                                 starts[:-1],
                                 ends[:-1],
                                 bins=bins)

    final_bins = math.ceil((ends[-1] - starts[-1]) / aggregation)
    # Extract the last window separately because its size is likely to
    # differ from the others
    intervals[-1, :final_bins] = bbi.fetch(bigbed,
                                           chrom,
                                           starts[-1],
                                           ends[-1],
                                           bins=final_bins,
                                           missing=0.0)

    intervals = np.round(intervals).astype(int)

    # 0. Number of intervals
    # 1. Min width of peaks
    # 2. Max width of peaks
    # 3. Median width of peaks
    # 4. Min distance of peaks
    # 5. Max distance of peaks
    # 6. Median distance of peaks
    # 7. Sum of height of peaks
    # 8. Max height of peaks
    # 9. Median height of peaks
    # 10. Median signal
    # 11. Total signal
    # 12. Peak coverage
    stats = np.zeros((norm_vals.shape[0], 13))

    stats[:, 0] = count_peaks(intervals)

    stats[:, 1] = peak_widths(intervals, np.min)
    stats[:, 2] = peak_widths(intervals, np.max)
    stats[:, 3] = peak_widths(intervals, np.median)

    stats[:, 4] = peak_distances(intervals, np.min)
    stats[:, 5] = peak_distances(intervals, np.max)
    stats[:, 6] = peak_distances(intervals, np.median)

    stats[:, 7] = peak_heights(intervals, norm_vals, stats[:, 0], np.nansum)
    stats[:, 8] = peak_heights(intervals, norm_vals, stats[:, 0], np.nanmax)
    stats[:, 9] = peak_heights(intervals, norm_vals, stats[:, 0], np.nanmedian)

    stats[:, 10] = np.median(norm_vals, axis=1)
    stats[:, 11] = np.sum(norm_vals, axis=1)
    stats[:, 12] = peak_widths(intervals, np.sum) / base_bins

    return stats, np.round(intervals).astype(int)
Example #15
def bigwigs_to_zarr(input_bigwig_files, output_file, starting_resolution,
                    name):
    # Short-hand for creating a DirectoryStore with a root group.
    f = zarr.open(output_file, mode='w')
    compressor = Zlib(level=1)

    num_samples = len(input_bigwig_files)

    # Create level zero groups
    chromosomes_group = f.create_group("chromosomes")

    # Prepare to fill in chroms dataset
    chromosomes = nc.get_chromorder('hg38')
    # TODO: should more than chr1-chrM be used?
    chromosomes = [str(chr_name) for chr_name in chromosomes[:25]]
    num_chromosomes = len(chromosomes)
    chroms_length_arr = np.array(
        [nc.get_chrominfo('hg38').chrom_lengths[x] for x in chromosomes],
        dtype="i8")
    chroms_cumsum_arr = np.concatenate(
        (np.array([0]), np.cumsum(chroms_length_arr)))

    chromosomes_set = set(chromosomes)
    chrom_name_to_length = dict(zip(chromosomes, chroms_length_arr))
    chrom_name_to_cumsum = dict(zip(chromosomes, chroms_cumsum_arr))

    # Prepare to fill in resolutions dataset
    resolutions = [starting_resolution * (2**x) for x in range(16)]

    # Create each chromosome dataset.
    for chr_name, chr_len in chrom_name_to_length.items():
        chr_group = chromosomes_group.create_group(chr_name)
        # Create each resolution group.
        for resolution in resolutions:
            chr_shape = (num_samples, math.ceil(chr_len / resolution))
            chr_group.create_dataset(str(resolution),
                                     shape=chr_shape,
                                     dtype="f4",
                                     fill_value=np.nan,
                                     compressor=compressor)

    # Fill in data for each bigwig file.
    for bw_index, bw_file in tqdm(list(enumerate(input_bigwig_files)),
                                  desc='bigwigs'):
        if bbi.is_bigwig(bw_file):
            chromsizes = bbi.chromsizes(bw_file)
            matching_chromosomes = set(
                chromsizes.keys()).intersection(chromosomes_set)

            # Fill in data for each resolution of a bigwig file.
            for resolution in resolutions:
                # Fill in data for each chromosome of a resolution of a bigwig file.
                for chr_name in matching_chromosomes:
                    chr_len = chrom_name_to_length[chr_name]
                    chr_shape = (num_samples, math.ceil(chr_len / resolution))
                    arr = bbi.fetch(bw_file,
                                    chr_name,
                                    0,
                                    chr_len,
                                    chr_shape[1],
                                    summary="sum")
                    chromosomes_group[chr_name][str(resolution)][
                        bw_index, :] = arr
        else:
            print(f"{bw_file} not is_bigwig")

    max_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print(max_mem)

    # Append metadata to the top resolution row_infos attribute.
    row_infos = []
    for bw_index, bw_file in enumerate(input_bigwig_files):
        row_infos.append({
            "cluster": int(bw_index + 1),
            "file": os.path.basename(bw_file)
        })

    # f.attrs should contain all tileset_info properties
    # For zarr, more attributes are used here to allow "serverless"
    f.attrs['row_infos'] = row_infos
    f.attrs['resolutions'] = sorted(resolutions, reverse=True)
    f.attrs['shape'] = [num_samples, 256]
    f.attrs['name'] = name
    f.attrs['coordSystem'] = "hg38"

    # https://github.com/zarr-developers/zarr-specs/issues/50
    f.attrs['multiscales'] = [{
        "version": "0.1",
        "name": chr_name,
        "datasets": [{
            "path": f"chromosomes/{chr_name}/{resolution}"
        } for resolution in sorted(resolutions, reverse=True)],
        "type": "zarr-multivec",
        "metadata": {
            "chromoffset": int(chrom_name_to_cumsum[chr_name]),
            "chromsize": int(chr_len),
        }
    } for (chr_name, chr_len) in zip(chromosomes, chroms_length_arr)]
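A sketch of reading the resulting store back, assuming it was written to out.zarr (the path, chromosome, and resolution are placeholders):

import zarr

z = zarr.open('out.zarr', mode='r')
print(z.attrs['resolutions'])        # highest to lowest, e.g. [..., 512, 256]
arr = z['chromosomes/chr1/256'][:]   # shape: (num_samples, ceil(chr_len / 256))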
Example #16
def chunk(
    bigwig,
    window_size,
    resolution,
    step_size,
    chroms,
    normalize=True,
    verbose=False,
):
    base_bins = np.ceil(window_size / resolution).astype(int)

    num_total_windows = 0
    bins = base_bins  # same integer bin count, reused for stackup

    for chrom in chroms:
        if chrom not in bbi.chromsizes(bigwig):
            continue
        chrom_size = bbi.chromsizes(bigwig)[chrom]
        num_total_windows += np.ceil(
            (chrom_size - step_size) / step_size
        ).astype(int)

    values = np.zeros((num_total_windows, base_bins))

    start = 0
    for chrom in chroms:
        if chrom not in bbi.chromsizes(bigwig):
            # Indexing chromsizes by a missing key would raise KeyError,
            # so report only the chromosome name.
            print("Skipping chrom (not in bigWig file):", chrom)
            continue

        chrom_size = bbi.chromsizes(bigwig)[chrom]
        num_windows = np.ceil((chrom_size - step_size) / step_size).astype(int)

        start_bps = np.arange(0, chrom_size - step_size, step_size)
        end_bps = np.append(
            np.arange(window_size, chrom_size, step_size), chrom_size
        )

        end = start + num_windows

        # Extract all but the last window in one batch (faster than calling
        # `fetch` in a loop)
        values[start : end - 1] = bbi.stackup(
            bigwig,
            [chrom] * (start_bps.size - 1),
            start_bps[:-1],
            end_bps[:-1],
            bins=bins,
            missing=0,
        )
        final_bins = np.ceil(
            (end_bps[-1] - start_bps[-1]) / resolution
        ).astype(int)
        # Extract the last window separately because its size is likely to
        # differ from the others
        values[end - 1, :final_bins] = bbi.fetch(
            bigwig,
            chrom,
            start_bps[-1],
            end_bps[-1],
            bins=final_bins,
            missing=0.0,
        )

        if normalize:
            values[start:end] = data.normalize(values[start:end])

        if verbose:
            print(
                "LOADING ::",
                "Chrom: {}".format(chrom),
                "| Num windows: {}".format(num_windows),
                "| Max value: {}".format(np.max(values[start:end])),
            )

        # Advance the row offset so the next chromosome doesn't overwrite
        # this one's windows.
        start = end

    return values
Example #17
def bigwigs_to_multivec(
    filepaths,
    output_file,
    assembly,
    chromsizes_filename,
    row_infos_filename,
    tile_size,
):
    with tempfile.TemporaryDirectory() as td:
        print("temporary dir:", td)

        temp_file = op.join(td, "temp.mv5")
        f_out = h5py.File(temp_file, "w")

        (chrom_info, chrom_names,
         chrom_lengths) = cch.load_chromsizes(chromsizes_filename, assembly)

        if row_infos_filename is not None:
            with open(row_infos_filename, "r") as f:
                row_infos = [line.strip().encode("utf8") for line in f]

        else:
            row_infos = None

        starting_resolution = 1
        resolution = starting_resolution
        for chrom in chrom_info.chrom_order:
            f_out.create_dataset(
                chrom,
                (
                    math.ceil(
                        chrom_info.chrom_lengths[chrom] / starting_resolution),
                    len(filepaths),
                ),
                fillvalue=np.nan,
                compression="gzip",
            )

        # Fill in data for each bigwig file.
        for bw_index, bw_file in tqdm(list(enumerate(filepaths)),
                                      desc="bigwigs"):
            if bbi.is_bigwig(bw_file):
                chromsizes = bbi.chromsizes(bw_file)
                matching_chromosomes = set(chromsizes.keys()).intersection(
                    set(chrom_names))

                # Fill in data for each resolution of a bigwig file.
                for chr_name in matching_chromosomes:
                    print("chr_name:", chr_name, resolution)
                    chr_len = chrom_info.chrom_lengths[chr_name]
                    chr_shape = (math.ceil(chr_len / resolution),
                                 len(filepaths))
                    arr = bbi.fetch(bw_file,
                                    chr_name,
                                    0,
                                    chr_len,
                                    chr_shape[0],
                                    summary="sum")
                    f_out[chr_name][:, bw_index] = arr
            else:
                print(f"{bw_file} not is_bigwig")

        f_out.flush()

        f_out.close()
        f_in = h5py.File(temp_file, "r")

        def agg(x):
            return x.T.reshape((x.shape[1], -1, 2)).sum(axis=2).T

        cmv.create_multivec_multires(
            f_in,
            chromsizes=zip(chrom_names, chrom_lengths),
            agg=agg,
            starting_resolution=starting_resolution,
            tile_size=tile_size,
            output_file=output_file,
            row_infos=row_infos,
        )
Example #18
def bigwigs_to_multivec(
    input_bigwig_files,
    output_file,
    starting_resolution
):

    f = h5py.File(output_file, 'w')

    num_samples = len(input_bigwig_files)

    # Create level zero groups
    info_group = f.create_group("info")
    resolutions_group = f.create_group("resolutions")
    chroms_group = f.create_group("chroms")

    # Set info attributes
    info_group.attrs['tile-size'] = 256

    # Prepare to fill in chroms dataset
    chromosomes = nc.get_chromorder(GENOME_BUILD)
    chromosomes = chromosomes[:25] # TODO: should more than chr1-chrM be used?
    chroms_length_arr = np.array([ nc.get_chrominfo('hg19').chrom_lengths[x] for x in chromosomes ], dtype="i8")
    chroms_name_arr = np.array(chromosomes, dtype="S23")

    chromosomes_set = set(chromosomes)
    chrom_name_to_length = dict(zip(chromosomes, chroms_length_arr))

    # Fill in chroms dataset entries "length" and "name"
    chroms_group.create_dataset("length", data=chroms_length_arr)
    chroms_group.create_dataset("name", data=chroms_name_arr)

    num_zoom_levels = math.floor(math.log2(GENOME_LENGTH / starting_resolution))

    # Prepare to fill in resolutions dataset
    resolutions = [starting_resolution * (2 ** x) for x in range(num_zoom_levels)]

    # Create each resolution group.
    for resolution in resolutions:
        resolution_group = resolutions_group.create_group(str(resolution))
        # TODO: remove the unnecessary "values" layer
        resolution_values_group = resolution_group.create_group("values")

        # Create each chromosome dataset.
        for chr_name, chr_len in zip(chromosomes, chroms_length_arr):
            chr_shape = (math.ceil(chr_len / resolution), num_samples)
            resolution_values_group.create_dataset(chr_name, chr_shape, dtype="f4", fillvalue=np.nan, compression='gzip')

    # Fill in data for each bigwig file.
    for bw_index, bw_file in enumerate(input_bigwig_files):
        if bbi.is_bigwig(bw_file):
            chromsizes = bbi.chromsizes(bw_file)
            matching_chromosomes = set(chromsizes.keys()).intersection(chromosomes_set)

            # Fill in data for each resolution of a bigwig file.
            for resolution in resolutions:
                # Fill in data for each chromosome of a resolution of a bigwig file.
                for chr_name in matching_chromosomes:
                    chr_len = chrom_name_to_length[chr_name]
                    num_bins = math.ceil(chr_len / resolution)
                    arr = bbi.fetch(bw_file, chr_name, 0, chr_len, num_bins, summary="sum")
                    resolutions_group[str(resolution)]["values"][chr_name][:,bw_index] = arr
        else:
            print(f"{bw_file} not is_bigwig")

        f.flush()

    f.close()

    max_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print(max_mem)

    # Append metadata to the top resolution row_infos attribute.
    row_infos = []
    for input_bigwig_file in input_bigwig_files:
        _, filename = os.path.split(input_bigwig_file)
        name, _ = os.path.splitext(filename)
        row_infos.append({
            'id': name
        })

    row_infos_encoded = str(json.dumps(row_infos))

    f = h5py.File(output_file, 'r+')

    info_group = f["info"]
    info_group["row_infos"] = row_infos_encoded

    f.close()