Exemplo n.º 1
0
    def test_load_kernel_config(self):
        """
        Check that json config files can be parsed properly and that bad
        configs throw errors.

        A dummy kernel matrix and a config pointing to it are written to
        disk, loaded back with cio.load_kernel_config and compared with the
        expected values. Three failure modes are then checked: a missing
        config file (OSError), an invalid parameter value and a missing
        parameter (both ValidationError).
        """

        def write_config(config):
            # Close the handle so the JSON is flushed to disk before
            # load_kernel_config reads it back.
            with open(self.tmp_path, "w") as config_handle:
                json.dump(config, config_handle)

        def expect_failure(config_path, exc_type, reason):
            # Explicit raise instead of `assert False`, so the check is not
            # stripped when Python runs with -O.
            try:
                cio.load_kernel_config(config_path, custom=True)
            except exc_type:
                return
            raise AssertionError(reason)

        # Generate dummy kernel
        m = np.random.random((17, 17))
        kernel_mat_path = self.tmp_path + "kernel"
        # Write kernel matrix to disk
        with open(kernel_mat_path, "w") as kernel:
            np.savetxt(kernel, m)

        try:
            # Generate associated config pointing to the kernel
            exp_config = {
                "name": "test_pattern",
                "kernels": [kernel_mat_path],
                "min_dist": 0,
                "max_dist": 10,
                "max_iterations": 1,
                "max_perc_undetected": 10,
                "min_separation": 1,
                "precision": 4,
                "resolution": 1000,
            }
            # Write config to disk
            write_config(exp_config)
            # Load kernel configs and check if values are correct
            obs_config_raw = cio.load_kernel_config(self.tmp_path, custom=True)
            obs_kernel_raw = obs_config_raw["kernels"][0]
            for param, exp_value in exp_config.items():
                # Kernels are stored as paths in the config but loaded as
                # matrices, so they are compared separately below.
                if param != "kernels":
                    assert exp_value == obs_config_raw[param]
            # check if matrix is preserved
            assert np.all(obs_kernel_raw == m)

            # Check if non-existing config yields explicit error
            expect_failure(
                self.tmp_path + "donotexist",
                OSError,
                "loading a non-existing config did not raise OSError",
            )

            # Check if wrong values in config yields explicit error
            bad_config = exp_config.copy()
            bad_config["max_dist"] = -1
            write_config(bad_config)
            expect_failure(
                self.tmp_path,
                ValidationError,
                "negative max_dist did not raise ValidationError",
            )

            # Check if missing parameters in config yields explicit error
            bad_config = exp_config.copy()
            bad_config.pop("precision")
            write_config(bad_config)
            expect_failure(
                self.tmp_path,
                ValidationError,
                "missing precision did not raise ValidationError",
            )
        finally:
            # Remove the kernel file even if one of the checks above failed
            os.unlink(kernel_mat_path)
Exemplo n.º 2
0
def cmd_detect(arguments):
    """
    Run pattern detection on a contact map and write the results to disk.

    The kernel config (preset or user-provided) is loaded and overridden
    with any command line values. The contact map is then preprocessed
    and each kernel is applied to every sub matrix in a multiprocessing
    pool, for up to "max_iterations" rounds per kernel. After each round
    the kernel is replaced by the pileup of the windows it detected.
    Detected patterns are filtered by minimum separation and minimum
    distance, then written along with their windows and, unless disabled,
    a pileup plot.

    Parameters
    ----------
    arguments : mapping
        Command line options indexed by name, e.g. "--kernel-config",
        "<contact_map>", "<output>", "--threads".

    Notes
    -----
    Calls sys.exit(1) if --win-fmt is neither "npy" nor "json" and
    sys.exit(0) if no pattern is detected with any kernel.
    """
    # Parse command line arguments for detect
    kernel_config_path = arguments["--kernel-config"]
    dump = arguments["--dump"]
    interchrom = arguments["--inter"]
    iterations = arguments["--iterations"]
    mat_path = arguments["<contact_map>"]
    max_dist = arguments["--max-dist"]
    min_dist = arguments["--min-dist"]
    min_separation = arguments["--min-separation"]
    n_mads = float(arguments["--n-mads"])
    pattern = arguments["--pattern"]
    perc_undetected = arguments["--perc-undetected"]
    precision = arguments["--precision"]
    resize = arguments["--resize-kernel"]
    threads = arguments["--threads"]
    output = arguments["<output>"]
    win_fmt = arguments["--win-fmt"]
    subsample = arguments["--subsample"]
    if subsample == "no":
        subsample = None
    plotting_enabled = False if arguments["--no-plotting"] else True
    smooth_trend = arguments["--smooth-trend"]
    if smooth_trend is None:
        smooth_trend = False
    # If output is not specified, use current directory
    if not output:
        output = pathlib.Path()
    else:
        output = pathlib.Path(output)
    output.mkdir(exist_ok=True)

    if win_fmt not in ["npy", "json"]:
        sys.stderr.write("Error: --win-fmt must be either json or npy.\n")
        sys.exit(1)
    # Read a user-provided kernel config if custom is true
    # Else, load a preset kernel config for input pattern
    # Configs are JSON files containing all parameter associated with the pattern
    # They are loaded into a dictionary in the form :
    # {"max_iterations": 3, "kernels": [kernel1, kernel2, ...], ...}
    # Where each kernel is a 2D numpy array representing the pattern
    if kernel_config_path is not None:
        custom = True
        # Loading input path as config
        config_path = kernel_config_path
    else:
        custom = False
        # Will use a preset config file matching pattern name
        config_path = pattern

    ### 0: LOAD INPUT
    # Command line values which may override the config: name -> (value, type)
    params = {
        "max_iterations": (iterations, int),
        "precision": (precision, float),
        "max_dist": (max_dist, int),
        "min_dist": (min_dist, int),
        "min_separation": (min_separation, int),
        "max_perc_undetected": (perc_undetected, float),
    }
    kernel_config = cio.load_kernel_config(config_path, custom)
    for param_name, (param_value, param_type) in params.items():
        kernel_config = _override_kernel_config(
            param_name, param_value, param_type, kernel_config
        )

    # NOTE: Temporary warning
    if interchrom:
        sys.stderr.write(
            "WARNING: Detection on interchromosomal matrices is expensive in RAM\n"
        )
    hic_genome = HicGenome(
        mat_path,
        inter=interchrom,
        kernel_config=kernel_config,
        dump=dump,
        smooth=smooth_trend,
    )
    ### 1: Process input signal
    #  Adapt size of kernel matrices based on the signal resolution
    if resize:
        for i, mat in enumerate(kernel_config["kernels"]):
            kernel_config["kernels"][i] = resize_kernel(
                mat,
                kernel_res=kernel_config["resolution"],
                signal_res=hic_genome.resolution,
            )
    hic_genome.kernel_config = kernel_config
    # Subsample Hi-C contacts from the matrix, if requested
    # NOTE: Subsampling has to be done before normalisation
    hic_genome.subsample(subsample)
    # Normalize (balance) matrix using ICE
    hic_genome.normalize(n_mads=n_mads)
    # Define how many diagonals should be used in intra-matrices
    hic_genome.compute_max_dist()
    # Split whole genome matrix into intra- and inter- sub matrices. Each sub
    # matrix is processed on the fly (obs / exp, trimming diagonals > max dist)
    hic_genome.make_sub_matrices()

    all_pattern_coords = []
    all_pattern_windows = []

    ### 2: DETECTION ON EACH SUBMATRIX
    n_sub_mats = hic_genome.sub_mats.shape[0]
    # Loop over the different kernel matrices for input pattern
    run_id = 0
    total_runs = (
        len(kernel_config["kernels"]) * kernel_config["max_iterations"]
    )
    sys.stderr.write("Detecting patterns...\n")
    # The context manager guarantees worker processes are released, including
    # when an error occurs or when no pattern is found.
    with mp.Pool(int(threads)) as pool:
        for kernel_id, kernel_matrix in enumerate(kernel_config["kernels"]):
            # Adjust kernel iteratively
            for i in range(kernel_config["max_iterations"]):
                cio.progress(
                    run_id,
                    total_runs,
                    f"Kernel: {kernel_id}, Iteration: {i}\n",
                )

                # Apply detection procedure to all sub matrices in parallel
                sub_mat_data = zip(
                    hic_genome.sub_mats.iterrows(),
                    [kernel_config for _ in range(n_sub_mats)],
                    [kernel_matrix for _ in range(n_sub_mats)],
                    [dump for _ in range(n_sub_mats)],
                )
                # Run detection in parallel on different sub matrices, and
                # show progress when gathering results
                sub_mat_results = []
                # A separate counter is used here so the iteration number `i`
                # recorded below is not overwritten. Results arrive in
                # arbitrary order, hence chromosome names are read from the
                # result itself rather than looked up by position.
                for n_done, result in enumerate(
                    pool.imap_unordered(_detect_sub_mat, sub_mat_data, 1)
                ):
                    cio.progress(
                        n_done,
                        n_sub_mats,
                        f"{result['chr1']}-{result['chr2']}",
                    )
                    sub_mat_results.append(result)
                # Convert coordinates from chromosome to whole genome bins
                kernel_coords = [
                    hic_genome.get_full_mat_pattern(
                        d["chr1"], d["chr2"], d["coords"]
                    )
                    for d in sub_mat_results
                    if d["coords"] is not None
                ]

                # Gather newly detected pattern coordinates
                try:
                    # Extract surrounding windows for each sub_matrix
                    kernel_windows = np.concatenate(
                        [
                            w["windows"]
                            for w in sub_mat_results
                            if w["windows"] is not None
                        ],
                        axis=0,
                    )
                    all_pattern_coords.append(
                        pd.concat(kernel_coords, axis=0).reset_index(
                            drop=True
                        )
                    )
                    # Add info about kernel and iteration which detected
                    # these patterns
                    all_pattern_coords[-1]["kernel_id"] = kernel_id
                    all_pattern_coords[-1]["iteration"] = i
                    all_pattern_windows.append(kernel_windows)

                # If no pattern was found with this kernel (concatenating an
                # empty list raises ValueError), skip directly to the next
                # one, skipping iterations
                except ValueError:
                    break

                # Update kernel with patterns detected at current iteration
                kernel_matrix = cid.pileup_patterns(kernel_windows)
                run_id += 1
    cio.progress(run_id, total_runs, f"Kernel: {kernel_id}, Iteration: {i}\n")

    # If no pattern detected on any chromosome, with any kernel, exit gracefully
    if len(all_pattern_coords) == 0:
        sys.stderr.write("No pattern detected ! Exiting.\n")
        sys.exit(0)

    # Combine patterns of all kernel matrices into a single array
    all_pattern_coords = pd.concat(all_pattern_coords, axis=0).reset_index(
        drop=True
    )
    # Combine all windows from different kernels into a single pile of windows
    all_pattern_windows = np.concatenate(all_pattern_windows, axis=0)

    # Compute minimum separation in bins and make sure it has a reasonable value
    separation_bins = int(
        kernel_config["min_separation"] // hic_genome.resolution
    )
    if separation_bins < 1:
        separation_bins = 1
    print(f"Minimum pattern separation is : {separation_bins}")
    # Remove patterns with overlapping windows (smeared patterns)
    distinct_patterns = cid.remove_neighbours(
        all_pattern_coords, win_size=separation_bins
    )

    # Drop patterns that are too close to each other
    all_pattern_coords = all_pattern_coords.loc[distinct_patterns, :]
    all_pattern_windows = all_pattern_windows[distinct_patterns, :, :]

    # Get from bins into basepair coordinates
    coords_1 = hic_genome.bins_to_coords(all_pattern_coords.bin1).reset_index(
        drop=True
    )
    coords_1.columns = [str(col) + "1" for col in coords_1.columns]
    coords_2 = hic_genome.bins_to_coords(all_pattern_coords.bin2).reset_index(
        drop=True
    )
    coords_2.columns = [str(col) + "2" for col in coords_2.columns]

    all_pattern_coords = pd.concat(
        [all_pattern_coords.reset_index(drop=True), coords_1, coords_2], axis=1
    )

    # Filter patterns closer than minimum distance from the diagonal if any
    min_dist_drop_mask = (
        all_pattern_coords.chrom1 == all_pattern_coords.chrom2
    ) & (
        np.abs(all_pattern_coords.start2 - all_pattern_coords.start1)
        < int(kernel_config["min_dist"])
    )
    # Reorder columns at the same time
    all_pattern_coords = all_pattern_coords.loc[
        ~min_dist_drop_mask,
        [
            "chrom1",
            "start1",
            "end1",
            "chrom2",
            "start2",
            "end2",
            "bin1",
            "bin2",
            "kernel_id",
            "iteration",
            "score",
        ],
    ]
    all_pattern_windows = all_pattern_windows[~min_dist_drop_mask, :, :]

    ### 3: WRITE OUTPUT
    sys.stderr.write(f"{all_pattern_coords.shape[0]} patterns detected\n")
    # Save patterns and their coordinates in a tsv file
    cio.write_patterns(
        all_pattern_coords, kernel_config["name"] + "_out", output
    )
    # Save windows as an array in an npy file
    cio.save_windows(
        all_pattern_windows,
        kernel_config["name"] + "_out",
        output,
        format=win_fmt,
    )

    # Generate pileup visualisations if requested
    if plotting_enabled:
        # Compute and plot pileup
        pileup_fname = ("pileup_of_{n}_{pattern}").format(
            pattern=kernel_config["name"], n=all_pattern_windows.shape[0]
        )
        windows_pileup = cid.pileup_patterns(all_pattern_windows)
        pileup_plot(windows_pileup, name=pileup_fname, output=output)
Exemplo n.º 3
0
def cmd_generate_config(arguments):
    """
    Generate a kernel config (JSON) and its kernel matrix files from a preset.

    The preset config is loaded and its kernels are optionally resized to
    --win-size. If --click is given, it is used as the contact map input:
    the map is displayed, the user selects windows by hand, and their
    smoothed pileup replaces the preset kernels. Each kernel is written to
    "{prefix}.{n}.txt" and the config, with kernels replaced by those paths,
    is written to "{prefix}.json".

    Parameters
    ----------
    arguments : mapping
        Command line options indexed by name: "<prefix>", "--preset",
        "--click", "--n-mads" and "--win-size".
    """
    # Parse command line arguments for generate_config
    prefix = arguments["<prefix>"]
    pattern = arguments["--preset"]
    click_find = arguments["--click"]
    n_mads = float(arguments["--n-mads"])
    win_size = arguments["--win-size"]

    cfg = cio.load_kernel_config(pattern, False)

    # If prefix involves a directory, create it
    prefix_dir = os.path.dirname(prefix)
    if prefix_dir:
        os.makedirs(prefix_dir, exist_ok=True)

    # If a specific window size is requested, resize all kernels
    if win_size != "auto":
        win_size = int(win_size)
        cfg["kernels"] = [
            resize_kernel(kernel, factor=win_size / kernel.shape[0])
            for kernel in cfg["kernels"]
        ]
    # Otherwise, just inherit window size from the kernel config
    else:
        win_size = cfg["kernels"][0].shape[0]

    # If click mode is enabled, build a kernel from scratch using
    # graphical display, otherwise, just inherit the pattern's kernel
    if click_find:
        hic_genome = HicGenome(
            click_find,
            inter=True,
            kernel_config=cfg,
        )
        # Normalize (balance) the whole genome matrix
        hic_genome.normalize(n_mads=n_mads)
        # Enforce full scanning distance: the whole matrix span in basepairs
        hic_genome.max_dist = hic_genome.matrix.shape[0] * hic_genome.resolution
        # Process each sub-matrix individually (detrend diag for intra)
        hic_genome.make_sub_matrices()
        processed_mat = hic_genome.gather_sub_matrices().tocsr()
        windows = click_finder(processed_mat, half_w=int((win_size - 1) / 2))
        # Pileup all recorded windows and convert to JSON serializable list
        pileup = ndi.gaussian_filter(cid.pileup_patterns(windows), 1)
        cfg["kernels"] = [pileup.tolist()]
        # Show the newly generated kernel to the user. The image is log10
        # transformed to match the colorbar label, and the upper color limit
        # is taken on the same log scale as the displayed values.
        log_pileup = np.log10(pileup)
        hm = plt.imshow(
            log_pileup,
            vmax=np.percentile(log_pileup, 99),
            cmap="afmhot_r",
        )
        cbar = plt.colorbar(hm)
        cbar.set_label("Log10 Hi-C contacts")
        plt.title("Manually generated kernel")
        plt.show()
    # Write kernel matrices to files with input prefix and replace kernels
    # by their path in config
    for mat_id, mat in enumerate(cfg["kernels"]):
        mat_path = f"{prefix}.{mat_id+1}.txt"
        np.savetxt(mat_path, mat)
        cfg["kernels"][mat_id] = mat_path

    # Write config to JSON file using prefix
    with open(f"{prefix}.json", "w") as config_handle:
        json.dump(cfg, config_handle, indent=4)
Exemplo n.º 4
0
def cmd_quantify(arguments):
    """
    Score a list of 2D positions against the kernels of a pattern.

    Positions are read from a 2D BED file. For each one, the window centered
    on the middle of both intervals is extracted from the preprocessed
    contact map and correlated (Pearson) with every kernel of the preset
    pattern; the best score over kernels is kept. Positions whose window
    falls outside or at the edge of their sub matrix get a NaN score and an
    all-zero window. Results are written to "{pattern}_quant.txt" in the
    output directory and windows are saved through cio.save_windows.

    Parameters
    ----------
    arguments : mapping
        Command line options indexed by name: "<bed2d>", "<contact_map>",
        "<output>", "--n-mads", "--pattern", "--inter", "--win-size",
        "--subsample" and "--win-fmt".
    """
    bed2d_path = arguments["<bed2d>"]
    mat_path = arguments["<contact_map>"]
    output = pathlib.Path(arguments["<output>"])
    n_mads = float(arguments["--n-mads"])
    pattern = arguments["--pattern"]
    inter = arguments["--inter"]
    win_size = arguments["--win-size"]
    if win_size != "auto":
        win_size = int(win_size)
    subsample = arguments["--subsample"]
    # Create directory if it does not exist
    if not output.exists():
        os.makedirs(output, exist_ok=True)
    # Load 6 cols from 2D BED file and infer header
    bed2d = cio.load_bed2d(bed2d_path)
    # Warn user if --inter is disabled but list contains inter patterns
    if not inter and len(bed2d.start1[bed2d.chrom1 != bed2d.chrom2]) > 0:
        sys.stderr.write(
            "Warning: The bed2d file contains interchromosomal patterns. "
            "These patterns will not be scanned unless --inter is used.\n"
        )
    # Parse kernel config
    kernel_config = cio.load_kernel_config(pattern, False)
    # Instantiate and preprocess contact map
    hic_genome = HicGenome(mat_path, inter=inter, kernel_config=kernel_config)
    # enforce full scanning distance in kernel config
    kernel_config["max_dist"] = (
        hic_genome.matrix.shape[0] * hic_genome.resolution
    )
    kernel_config["min_dist"] = 0
    # Notify contact map instance of changes in scanning distance
    hic_genome.kernel_config = kernel_config
    # Subsample Hi-C contacts from the matrix, if requested
    if subsample != "no":
        hic_genome.subsample(subsample)
    # Normalize (balance) matrix using ICE
    hic_genome.normalize(n_mads)
    # Define how many diagonals should be used in intra-matrices
    hic_genome.compute_max_dist()
    # Split whole genome matrix into intra- and inter- sub matrices. Each sub
    # matrix is processed on the fly (obs / exp, trimming diagonals > max dist)
    hic_genome.make_sub_matrices()
    # Initialize output structures
    bed2d["score"] = 0.0
    positions = bed2d.copy()
    # Window dimensions: requested size, or those of the first kernel
    if win_size != "auto":
        km = kn = win_size
    else:
        km, kn = kernel_config["kernels"][0].shape
    # Half-height and half-width of the window around each position
    kh = (km - 1) // 2
    kw = (kn - 1) // 2
    windows = np.zeros((positions.shape[0], km, kn))
    # For each position, we use the center of the BED interval
    positions["pos1"] = (positions.start1 + positions.end1) // 2
    positions["pos2"] = (positions.start2 + positions.end2) // 2
    # Use each kernel matrix available for the pattern
    for kernel_id, kernel_matrix in enumerate(kernel_config["kernels"]):
        # Only resize kernel matrix if explicitely requested. The factor is
        # relative to the kernel's own size: km already equals win_size
        # here, so win_size / km would always be 1 and leave the kernel
        # unresized.
        # NOTE(review): assumes resize_kernel yields a win_size x win_size
        # matrix for this factor - confirm.
        if win_size != "auto":
            kernel_matrix = resize_kernel(
                kernel_matrix, factor=win_size / kernel_matrix.shape[0]
            )
        # Iterate over intra- and inter-chromosomal sub-matrices
        for sub_mat in hic_genome.sub_mats.iterrows():
            mat = sub_mat[1]
            # Filter patterns falling onto this sub-matrix. Work on a copy so
            # that adding bin columns below does not write into a slice of
            # `positions`.
            sub_pat = positions.loc[
                (positions.chrom1 == mat.chr1) & (positions.chrom2 == mat.chr2)
            ].copy()
            sub_pat_idx = sub_pat.index.values
            # Convert genomic coordinates to bins for horizontal and vertical axes
            for ax in [1, 2]:
                sub_pat_ax = sub_pat.loc[:, [f"chrom{ax}", f"pos{ax}"]].rename(
                    columns={f"chrom{ax}": "chrom", f"pos{ax}": "pos"}
                )
                sub_pat_bins = hic_genome.coords_to_bins(sub_pat_ax)
                sub_pat[f"bin{ax}"] = sub_pat_bins

            # Check for nan bins (coords that do not match any Hi-C fragments)
            fall_out = np.isnan(sub_pat["bin1"]) | np.isnan(sub_pat["bin2"])
            if np.any(fall_out):
                n_out = int(fall_out.sum())
                sys.stderr.write(
                    f"{n_out} entr{'ies' if n_out > 1 else 'y'} outside "
                    "genomic coordinates of the Hi-C matrix will be ignored.\n"
                )
            # Convert bins from whole genome matrix to sub matrix
            sub_pat = hic_genome.get_sub_mat_pattern(
                mat.chr1, mat.chr2, sub_pat
            )
            m = mat.contact_map.matrix.tocsr()
            # Iterate over patterns from the 2D BED file
            for i, x, y in zip(sub_pat_idx, sub_pat.bin1, sub_pat.bin2):
                # Check if the window goes out of bound
                if np.all(np.isfinite([x, y])) and (
                    x - kh >= 0
                    and x + kh + 1 < m.shape[0]
                    and y - kw >= 0
                    and y + kw + 1 < m.shape[1]
                ):
                    x = int(x)
                    y = int(y)
                    # For each pattern, compute correlation score with all kernels
                    # but only keep the best
                    win = m[x - kh : x + kh + 1, y - kw : y + kw + 1].toarray()
                    try:
                        score = ss.pearsonr(
                            win.flatten(), kernel_matrix.flatten()
                        )[0]
                    # In case of NaNs introduced by division by 0 during detrend
                    except ValueError:
                        score = 0
                    # .at writes straight into bed2d; chained indexing
                    # (bed2d["score"][i] = ...) may write to a temporary copy
                    if score > bed2d.at[i, "score"] or kernel_id == 0:
                        bed2d.at[i, "score"] = score
                # Pattern falls outside or at the edge of the matrix
                else:
                    win = np.zeros((km, kn))
                    bed2d.at[i, "score"] = np.nan
                # Windows do not depend on the kernel: store them only once
                if kernel_id == 0:
                    windows[i, :, :] = win
    # Write results once all kernels have been scored
    bed2d.to_csv(
        output / f"{pattern}_quant.txt", sep="\t", header=True, index=False
    )
    cio.save_windows(
        windows,
        f"{pattern}_quant",
        output_dir=output,
        format=arguments["--win-fmt"],
    )
Exemplo n.º 5
0
def cmd_detect(args):
    """
    Run pattern detection on a contact map and write the results to disk.

    The kernel config (preset or user-provided) is loaded, overridden with
    any command line values, and its kernels are optionally resized to
    --win-size. The contact map is preprocessed and each kernel is applied
    to every sub matrix for up to "max_iterations" rounds. After each round
    the kernel is replaced by the pileup of the windows it detected.
    Detected patterns are filtered by minimum separation, minimum distance
    and missing p-values, then q-values are computed. Coordinates, windows
    and, unless disabled, a pileup plot are written using <prefix>.

    Parameters
    ----------
    args : mapping
        Command line options indexed by name, e.g. "--kernel-config",
        "<contact_map>", "<prefix>", "--threads".

    Raises
    ------
    ValueError
        If --win-size is an even number.

    Notes
    -----
    Calls sys.exit(1) if --win-fmt is neither "npy" nor "json" and
    sys.exit(0) if no pattern is detected with any kernel.
    """
    # Parse command line arguments for detect
    dump = args["--dump"]
    norm = args["--norm"]
    interchrom = args["--inter"]
    iterations = args["--iterations"]
    kernel_config_path = args["--kernel-config"]
    mat_path = args["<contact_map>"]
    max_dist = args["--max-dist"]
    min_dist = args["--min-dist"]
    min_separation = args["--min-separation"]
    n_mads = float(args["--n-mads"])
    prefix = args["<prefix>"]
    pattern = args["--pattern"]
    pearson = args["--pearson"]
    perc_zero = args["--perc-zero"]
    perc_undetected = args["--perc-undetected"]
    subsample = args["--subsample"]
    threads = int(args["--threads"])
    tsvd = 0.999 if args["--tsvd"] else None
    win_fmt = args["--win-fmt"]
    win_size = args["--win-size"]
    if subsample == "no":
        subsample = None
    plotting_enabled = False if args["--no-plotting"] else True
    smooth_trend = args["--smooth-trend"]
    if smooth_trend is None:
        smooth_trend = False

    # If prefix involves a directory, crash if it does not exist
    cio.check_prefix_dir(prefix)

    if win_fmt not in ["npy", "json"]:
        sys.stderr.write("Error: --win-fmt must be either json or npy.\n")
        sys.exit(1)
    # Read a user-provided kernel config if custom is true
    # Else, load a preset kernel config for input pattern
    # Configs are JSON files containing all parameter associated with the pattern
    # They are loaded into a dictionary in the form :
    # {"max_iterations": 3, "kernels": [kernel1, kernel2, ...], ...}
    # Where each kernel is a 2D numpy array representing the pattern
    if kernel_config_path is not None:
        custom = True
        # Loading input path as config
        config_path = kernel_config_path
    else:
        custom = False
        # Will use a preset config file matching pattern name
        config_path = pattern

    ### 0: LOAD INPUT
    # Command line values which may override the config: name -> (value, type)
    params = {
        "max_iterations": (iterations, int),
        "pearson": (pearson, float),
        "max_dist": (max_dist, int),
        "min_dist": (min_dist, int),
        "min_separation": (min_separation, int),
        "max_perc_undetected": (perc_undetected, float),
        "max_perc_zero": (perc_zero, float),
    }
    cfg = cio.load_kernel_config(config_path, custom)
    for param_name, (param_value, param_type) in params.items():
        cfg = _override_kernel_config(param_name, param_value, param_type, cfg)

    # Resize kernels if requested
    if win_size != "auto":
        win_size = int(win_size)
        if not win_size % 2:
            raise ValueError("--win-size must be odd")
        cfg["kernels"] = [
            resize_kernel(kernel, factor=win_size / kernel.shape[0])
            for kernel in cfg["kernels"]
        ]

    if interchrom:
        sys.stderr.write(
            "WARNING: Detection on interchromosomal matrices is expensive in RAM\n"
        )
    hic_genome = HicGenome(
        mat_path,
        inter=interchrom,
        kernel_config=cfg,
        dump=dump,
        smooth=smooth_trend,
        sample=subsample,
    )
    ### 1: Process input signal
    hic_genome.kernel_config = cfg
    # Normalize (balance) matrix using ICE
    hic_genome.normalize(norm=norm, n_mads=n_mads, threads=threads)
    # Define how many diagonals should be used in intra-matrices
    hic_genome.compute_max_dist()
    # Split whole genome matrix into intra- and inter- sub matrices. Each sub
    # matrix is processed on the fly (obs / exp, trimming diagonals > max dist)
    hic_genome.make_sub_matrices()

    all_coords = []
    all_windows = []

    ### 2: DETECTION ON EACH SUBMATRIX
    n_sub_mats = hic_genome.sub_mats.shape[0]
    # Loop over the different kernel matrices for input pattern
    run_id = 0
    # Use cfg to inform jobs whether they should run full convolution
    cfg["tsvd"] = tsvd
    total_runs = len(cfg["kernels"]) * cfg["max_iterations"]
    sys.stderr.write("Detecting patterns...\n")
    # A single pool is shared by all kernels and iterations, instead of
    # spawning (and leaking) a new one on every pass. It is only needed when
    # running on several threads.
    pool = mp.Pool(threads) if threads > 1 else None
    try:
        for kernel_id, kernel_matrix in enumerate(cfg["kernels"]):
            # Adjust kernel iteratively
            for i in range(cfg["max_iterations"]):
                cio.progress(
                    run_id,
                    total_runs,
                    f"Kernel: {kernel_id}, Iteration: {i}\n",
                )

                # Apply detection procedure to all sub matrices in parallel
                sub_mat_data = zip(
                    hic_genome.sub_mats.iterrows(),
                    [cfg for _ in range(n_sub_mats)],
                    [kernel_matrix for _ in range(n_sub_mats)],
                    [dump for _ in range(n_sub_mats)],
                )
                # Run detection in parallel on different sub matrices, and
                # show progress when gathering results
                sub_mat_results = []
                # Run in multiprocessing subprocesses
                if pool is not None:
                    dispatcher = pool.imap(_detect_sub_mat, sub_mat_data, 1)
                else:
                    dispatcher = map(_detect_sub_mat, sub_mat_data)
                for s, result in enumerate(dispatcher):
                    cio.progress(
                        s, n_sub_mats, f"{result['chr1']}-{result['chr2']}"
                    )
                    sub_mat_results.append(result)

                # Convert coordinates from chromosome to whole genome bins
                kernel_coords = [
                    hic_genome.get_full_mat_pattern(
                        d["chr1"], d["chr2"], d["coords"]
                    )
                    for d in sub_mat_results
                    if d["coords"] is not None
                ]

                # Gather newly detected pattern coordinates
                try:
                    # Extract surrounding windows for each sub_matrix
                    kernel_windows = np.concatenate(
                        [
                            w["windows"]
                            for w in sub_mat_results
                            if w["windows"] is not None
                        ],
                        axis=0,
                    )
                    all_coords.append(
                        pd.concat(kernel_coords, axis=0).reset_index(
                            drop=True
                        )
                    )
                    # Add info about kernel and iteration which detected
                    # these patterns
                    all_coords[-1]["kernel_id"] = kernel_id
                    all_coords[-1]["iteration"] = i
                    all_windows.append(kernel_windows)

                # If no pattern was found with this kernel (concatenating an
                # empty list raises ValueError), skip directly to the next
                # one, skipping iterations
                except ValueError:
                    break

                # Update kernel with patterns detected at current iteration
                kernel_matrix = cid.pileup_patterns(kernel_windows)
                run_id += 1
    finally:
        # Finish parallelized part: release workers whether detection
        # succeeded, found nothing or failed. All results have been gathered
        # at this point on the success path.
        if pool is not None:
            pool.terminate()
            pool.join()
    cio.progress(run_id, total_runs, f"Kernel: {kernel_id}, Iteration: {i}\n")
    # If no pattern detected on any chromosome, with any kernel, exit gracefully
    if len(all_coords) == 0:
        sys.stderr.write("No pattern detected ! Exiting.\n")
        sys.exit(0)
    # Combine patterns of all kernel matrices into a single array
    all_coords = pd.concat(all_coords, axis=0).reset_index(drop=True)
    # Combine all windows from different kernels into a single pile of windows
    all_windows = np.concatenate(all_windows, axis=0)

    # Compute minimum separation in bins and make sure it has a reasonable value
    separation_bins = int(cfg["min_separation"] // hic_genome.clr.binsize)
    if separation_bins < 1:
        separation_bins = 1
    print(f"Minimum pattern separation is : {separation_bins}")
    # Remove patterns with overlapping windows (smeared patterns)
    distinct_patterns = cid.remove_neighbours(
        all_coords, win_size=separation_bins
    )

    # Drop patterns that are too close to each other
    all_coords = all_coords.loc[distinct_patterns, :]
    all_windows = all_windows[distinct_patterns, :, :]

    # Get from bins into basepair coordinates
    coords_1 = hic_genome.bins_to_coords(all_coords.bin1).reset_index(
        drop=True
    )
    coords_1.columns = [str(col) + "1" for col in coords_1.columns]
    coords_2 = hic_genome.bins_to_coords(all_coords.bin2).reset_index(
        drop=True
    )
    coords_2.columns = [str(col) + "2" for col in coords_2.columns]

    all_coords = pd.concat(
        [all_coords.reset_index(drop=True), coords_1, coords_2], axis=1
    )

    # Filter patterns closer than minimum distance from the diagonal if any
    min_dist_drop_mask = (all_coords.chrom1 == all_coords.chrom2) & (
        np.abs(all_coords.start2 - all_coords.start1) < cfg["min_dist"]
    )
    all_coords = all_coords.loc[~min_dist_drop_mask, :]
    all_windows = all_windows[~min_dist_drop_mask, :, :]
    del min_dist_drop_mask

    # Remove patterns with nan p-values (no contact in window)
    pval_mask = all_coords.pvalue.isnull()
    all_coords = all_coords.loc[~pval_mask, :]
    all_windows = all_windows[~pval_mask, :, :]
    del pval_mask
    # Correct p-values for multiple testing using FDR
    all_coords["qvalue"] = fdr_correction(all_coords["pvalue"])
    # Reorder columns
    all_coords = all_coords.loc[
        :,
        [
            "chrom1",
            "start1",
            "end1",
            "chrom2",
            "start2",
            "end2",
            "bin1",
            "bin2",
            "kernel_id",
            "iteration",
            "score",
            "pvalue",
            "qvalue",
        ],
    ]

    ### 3: WRITE OUTPUT
    sys.stderr.write(f"{all_coords.shape[0]} patterns detected\n")
    # Save patterns and their coordinates in a tsv file

    sys.stderr.write(f"Saving patterns in {prefix}.tsv\n")
    cio.write_patterns(all_coords, prefix)
    # Save windows as an array in an npy file

    sys.stderr.write(f"Saving patterns in {prefix}.{win_fmt}\n")
    cio.save_windows(all_windows, prefix, fmt=win_fmt)

    # Generate pileup visualisations if requested
    if plotting_enabled:
        # Compute and plot pileup
        pileup_title = ("Pileup of {n} {pattern}").format(
            pattern=cfg["name"], n=all_windows.shape[0]
        )
        windows_pileup = cid.pileup_patterns(all_windows)
        # Symmetrize pileup for diagonal patterns
        if not cfg["max_dist"]:
            # Replace nan below diag by 0
            windows_pileup = np.nan_to_num(windows_pileup)
            # Add transpose, subtracting the diagonal so it is not counted twice
            windows_pileup += np.transpose(windows_pileup) - np.diag(
                np.diag(windows_pileup)
            )
        sys.stderr.write(f"Saving pileup plots in {prefix}.pdf\n")
        pileup_plot(windows_pileup, prefix, name=pileup_title)
Exemplo n.º 6
0
def cmd_generate_config(args):
    """
    Build a kernel configuration from a preset and write it to disk.

    The preset named by --preset is loaded and its kernels are optionally
    resized to --win-size. If --click is set, its value is handed to
    HicGenome and the preset kernels are replaced by a single kernel: the
    gaussian-smoothed pileup of the windows returned by click_finder. Each
    kernel matrix is then saved to "<prefix>.<n>.txt" (n starting at 1) and
    the configuration, with kernels replaced by those paths, is written to
    "<prefix>.json".

    Parameters
    ----------
    args : dict
        Parsed command line arguments. Keys read: "<prefix>", "--preset",
        "--click", "--n-mads", "--norm", "--win-size", "--threads",
        "--inter" and "--chroms".

    Raises
    ------
    ValueError
        If --win-size is given explicitly and is an even number.
    """
    # Parse command line args for generate_config
    prefix = args["<prefix>"]
    pattern = args["--preset"]
    click_find = args["--click"]
    n_mads = float(args["--n-mads"])
    norm = args["--norm"]
    win_size = args["--win-size"]
    threads = int(args["--threads"])
    inter = args["--inter"]
    chroms = args["--chroms"]

    cfg = cio.load_kernel_config(pattern, False)

    # If prefix involves a directory, crash if it does not exist
    cio.check_prefix_dir(prefix)

    # If a specific window size is requested, resize all kernels
    if win_size != "auto":
        win_size = int(win_size)
        if not win_size % 2:
            raise ValueError("--win-size must be odd")
        cfg["kernels"] = [
            resize_kernel(k, factor=win_size / k.shape[0])
            for k in cfg["kernels"]
        ]
    # Otherwise, just inherit window size from the kernel config
    else:
        win_size = cfg["kernels"][0].shape[0]

    # If click mode is enabled, build a kernel from scratch using
    # graphical display, otherwise, just inherit the pattern's kernel
    if click_find:
        # Number of pixels on each side of a window's center
        half_w = (win_size - 1) // 2
        hic_genome = HicGenome(click_find, inter=inter, kernel_config=cfg)
        # Normalize (balance) the whole genome matrix
        hic_genome.normalize(norm=norm, n_mads=n_mads, threads=threads)
        # Enforce full scanning distance: clr.shape[0] * clr.binsize
        hic_genome.max_dist = hic_genome.clr.shape[0] * hic_genome.clr.binsize
        # Process each sub-matrix individually (detrend diag for intra)
        hic_genome.make_sub_matrices()
        # By default, the whole genome is shown at once (takes lots of RAM)
        if chroms is None:
            for _, sub in hic_genome.sub_mats.iterrows():
                sub.contact_map.create_mat()
            processed_mat = hic_genome.gather_sub_matrices().tocsr()
            windows = click_finder(processed_mat, half_w=half_w)
        # If chromosomes were specified, their submatrices are shown one by one
        # taking less memory (but more tedious for the user)
        else:
            chrom_names = chroms.split(',')
            # Generate chromosome pairs to scan
            if inter:
                chrom_pairs = it.combinations_with_replacement(chrom_names, 2)
            else:
                chrom_pairs = [(ch, ch) for ch in chrom_names]
            windows = []
            for c1, c2 in chrom_pairs:
                try:
                    sub_mat = hic_genome.sub_mats.query(
                        '(chr1 == @c1) & (chr2 == @c2)'
                    )['contact_map'].values[0]
                # In case chromosomes have been entered in a different order
                except IndexError:
                    c1, c2 = c2, c1
                    sub_mat = hic_genome.sub_mats.query(
                        '(chr1 == @c1) & (chr2 == @c2)'
                    )['contact_map'].values[0]
                sub_mat.create_mat()
                chrom_wins = click_finder(
                    sub_mat.matrix.tocsr(),
                    half_w=half_w,
                    xlab=c2,
                    ylab=c1,
                )
                windows.append(chrom_wins)
                # Release the matrix before moving on to the next pair
                sub_mat.destroy_mat()
            windows = np.concatenate(windows, axis=0)

        # Pileup all recorded windows and convert to JSON serializable list
        pileup = ndi.gaussian_filter(cid.pileup_patterns(windows), 1)
        cfg["kernels"] = [pileup.tolist()]
        # Show the newly generated kernel to the user on a log10 scale, so
        # that the image matches the colorbar label. The color scale is
        # capped at the 99th percentile of the displayed (logged) values.
        log_pileup = np.log10(pileup)
        hm = plt.imshow(
            log_pileup, vmax=np.percentile(log_pileup, 99), cmap="afmhot_r"
        )
        cbar = plt.colorbar(hm)
        cbar.set_label("Log10 Hi-C contacts")
        plt.title("Manually generated kernel")
        plt.show()
    # Write kernel matrices to files with input prefix and replace kernels
    # by their path in config
    for mat_id, mat in enumerate(cfg["kernels"]):
        mat_path = f"{prefix}.{mat_id+1}.txt"
        np.savetxt(mat_path, mat)
        cfg["kernels"][mat_id] = mat_path

    # Write config to JSON file using prefix
    with open(f"{prefix}.json", "w") as config_handle:
        json.dump(cfg, config_handle, indent=4)
Exemplo n.º 7
0
def cmd_quantify(args):
    """
    Score a list of 2D coordinates against the kernel(s) of a pattern.

    Coordinates are read from a 2D BED file and the contact map is loaded
    and normalized. Every kernel of the selected configuration is then
    applied to every sub matrix. For each input coordinate the row with the
    highest score among kernels is kept, p-values are corrected into
    q-values, and the resulting table and the matching windows are written
    using the output prefix. A pileup plot is generated unless
    --no-plotting is set.

    Parameters
    ----------
    args : dict
        Parsed command line arguments. Keys read: "<bed2d>",
        "<contact_map>", "<prefix>", "--n-mads", "--pattern", "--inter",
        "--kernel-config", "--perc-zero", "--perc-undetected",
        "--no-plotting", "--threads", "--norm", "--tsvd", "--win-fmt",
        "--win-size" and "--subsample".

    Raises
    ------
    ValueError
        If --win-size is given explicitly and is an even number.
    SystemExit
        If --win-fmt is neither "npy" nor "json" (exit status 1).
    """
    bed2d_path = args["<bed2d>"]
    mat_path = args["<contact_map>"]
    prefix = args["<prefix>"]
    n_mads = float(args["--n-mads"])
    pattern = args["--pattern"]
    inter = args["--inter"]
    kernel_config_path = args["--kernel-config"]
    perc_zero = args["--perc-zero"]
    perc_undetected = args["--perc-undetected"]
    plotting_enabled = not args["--no-plotting"]
    threads = int(args["--threads"])
    norm = args["--norm"]
    tsvd = 0.999 if args["--tsvd"] else None
    win_fmt = args["--win-fmt"]
    if win_fmt not in ["npy", "json"]:
        sys.stderr.write("Error: --win-fmt must be either json or npy.\n")
        sys.exit(1)
    win_size = args["--win-size"]
    if win_size != "auto":
        win_size = int(win_size)
        # Reject invalid sizes up front, before loading and normalizing
        # the contact map
        if not win_size % 2:
            raise ValueError("--win-size must be odd")
    subsample = args["--subsample"]
    # If prefix involves a directory, crash if it does not exist
    cio.check_prefix_dir(prefix)
    # Load 6 cols from 2D BED file and infer header
    bed2d = cio.load_bed2d(bed2d_path)
    # Warn user if --inter is disabled but list contains inter patterns
    if not inter and (bed2d.chrom1 != bed2d.chrom2).any():
        sys.stderr.write(
            "Warning: The bed2d file contains interchromosomal patterns. "
            "These patterns will not be scanned unless --inter is used.\n"
        )
    if kernel_config_path is not None:
        custom = True
        # Loading input path as config
        config_path = kernel_config_path
    else:
        custom = False
        # Will use a preset config file matching pattern name
        config_path = pattern
    cfg = cio.load_kernel_config(config_path, custom)
    # Subsample Hi-C contacts from the matrix, if requested
    if subsample == "no":
        subsample = None
    # Instantiate and preprocess contact map
    hic_genome = HicGenome(
        mat_path, inter=inter, kernel_config=cfg, sample=subsample
    )
    # enforce max scanning distance to pattern at longest distance
    furthest = np.max(bed2d.start2 - bed2d.start1)
    max_diag = hic_genome.clr.shape[0] * hic_genome.clr.binsize
    cfg["max_dist"] = min(furthest, max_diag)
    cfg["min_dist"] = 0
    cfg["tsvd"] = tsvd
    cfg = _override_kernel_config("max_perc_zero", perc_zero, float, cfg)
    cfg = _override_kernel_config(
        "max_perc_undetected", perc_undetected, float, cfg
    )

    # Notify contact map instance of changes in scanning distance
    hic_genome.kernel_config = cfg
    # Normalize (balance) the matrix
    hic_genome.normalize(norm=norm, n_mads=n_mads, threads=threads)
    # Initialize output structures
    bed2d["score"] = np.nan
    bed2d["pvalue"] = np.nan
    positions = bed2d.copy()
    # Only resize kernel matrix if explicitly requested
    km, kn = cfg["kernels"][0].shape
    n_kernels = len(cfg['kernels'])
    if win_size != "auto":
        for i, k in enumerate(cfg["kernels"]):
            cfg["kernels"][i] = resize_kernel(k, factor=win_size / km)
        km = kn = win_size
        # Update kernel config after resizing kernels
        hic_genome.kernel_config = cfg
    # Define how many diagonals should be used in intra-matrices
    hic_genome.compute_max_dist()
    # Split whole genome matrix into intra- and inter- sub matrices. Each sub
    # matrix is processed on the fly (obs / exp, trimming diagonals > max dist)
    hic_genome.make_sub_matrices()
    windows = np.full((positions.shape[0], km, kn), np.nan)
    # We will store a copy of coordinates for each kernel
    bed2d_out = [bed2d.copy() for _ in range(n_kernels)]
    windows_out = [windows.copy() for _ in range(n_kernels)]
    # For each position, we use the center of the BED interval
    positions["pos1"] = (positions.start1 + positions.end1) // 2
    positions["pos2"] = (positions.start2 + positions.end2) // 2
    # Use each kernel matrix available for the pattern
    for kernel_id, kernel_matrix in enumerate(cfg["kernels"]):
        cio.progress(kernel_id, len(cfg["kernels"]), f"Kernel: {kernel_id}\n")
        n_sub_mats = hic_genome.sub_mats.shape[0]
        # Retrieve input positions for each submatrix and convert
        # coordinates from whole genome to submatrix.
        sub_pos = [
            _get_chrom_pos(positions, hic_genome, m[1].chr1, m[1].chr2)
            for m in hic_genome.sub_mats.iterrows()
        ]
        # One task per sub matrix: (sub matrix row, config, kernel, positions)
        sub_mat_data = zip(
            hic_genome.sub_mats.iterrows(),
            [cfg for _ in range(n_sub_mats)],
            [kernel_matrix for _ in range(n_sub_mats)],
            [s[1] for s in sub_pos],
        )
        # Run quantification on the different sub matrices (in worker
        # processes if threads > 1) and show progress when gathering results
        sub_mat_results = []
        pool = mp.Pool(threads) if threads > 1 else None
        try:
            if pool is None:
                dispatcher = map(_quantify_sub_mat, sub_mat_data)
            else:
                dispatcher = pool.imap(_quantify_sub_mat, sub_mat_data, 1)
            for s, result in enumerate(dispatcher):
                cio.progress(
                    s, n_sub_mats, f"{result['chr1']}-{result['chr2']}"
                )
                sub_mat_results.append(result)
        finally:
            # A pool is created for every kernel: release its worker
            # processes as soon as results are gathered (or on error)
            if pool is not None:
                pool.terminate()
                pool.join()

        for i, r in enumerate(sub_mat_results):
            # If there were no patterns on that sub matrix, just skip it
            if r['coords'] is None:
                continue
            sub_pat_idx = sub_pos[i][0]

            # Store results of this kernel; the best kernel for each
            # coordinate is selected once all kernels have been processed.
            # .loc is used instead of chained indexing so that the write is
            # guaranteed to reach the dataframe.
            # NOTE(review): assumes sub_pat_idx holds row labels of the
            # default integer index, consistent with its use as an index
            # into the windows array below -- confirm in _get_chrom_pos.
            try:
                kernel_coords = bed2d_out[kernel_id]
                kernel_coords.loc[sub_pat_idx, "score"] = (
                    r["coords"].score.values
                )
                kernel_coords.loc[sub_pat_idx, "pvalue"] = (
                    r["coords"].pvalue.values
                )
                windows_out[kernel_id][sub_pat_idx, :, :] = r["windows"]
            # Do nothing if no pattern was detected or matrix
            # is smaller than the kernel (-> patterns is None)
            except AttributeError:
                pass
    # Select the best score for each coordinate (among the different kernels)
    bed2d = pd.concat(bed2d_out, axis=0).reset_index(drop=True)
    windows = np.concatenate(windows_out, axis=0)
    # Rows are sorted by increasing score with missing scores first, so the
    # last row of each coordinate group is its highest valid score (a nan
    # is only kept when no kernel produced a score)
    bed2d = (
        bed2d
        .sort_values('score', ascending=True, na_position='first')
        .groupby(['chrom1', 'start1', 'chrom2', 'start2'], sort=False)
        .tail(1)
    )
    # Keep the windows of selected rows, in the same order as the table
    windows = windows[bed2d.index, :, :]
    bed2d = bed2d.reset_index(drop=True)
    bed2d["bin1"] = hic_genome.coords_to_bins(
        bed2d.loc[:, ["chrom1", "start1"]].rename(
            columns={"chrom1": "chrom", "start1": "pos"}
        )
    )
    bed2d["bin2"] = hic_genome.coords_to_bins(
        bed2d.loc[:, ["chrom2", "start2"]].rename(
            columns={"chrom2": "chrom", "start2": "pos"}
        )
    )
    bed2d["qvalue"] = fdr_correction(bed2d["pvalue"])
    bed2d = bed2d.loc[
        :,
        [
            "chrom1",
            "start1",
            "end1",
            "chrom2",
            "start2",
            "end2",
            "bin1",
            "bin2",
            "score",
            "pvalue",
            "qvalue",
        ],
    ]
    # Set p-values and q-values of invalid scores to nan
    invalid_score = np.isnan(bed2d.score)
    bed2d.loc[invalid_score, "pvalue"] = np.nan
    bed2d.loc[invalid_score, "qvalue"] = np.nan
    # Sort by whole genome coordinates to match input order
    bed2d = bed2d.sort_values(['bin1', 'bin2'], ascending=True)
    # Apply the same reordering to windows so that the i-th window still
    # belongs to the i-th row of the output table
    windows = windows[bed2d.index.values, :, :]
    bed2d = bed2d.reset_index(drop=True)
    cio.write_patterns(bed2d, prefix)
    cio.save_windows(windows, prefix, fmt=win_fmt)
    # Generate pileup visualisations if requested
    if plotting_enabled:
        # Compute and plot pileup
        pileup_title = ("pileup_of_{n}_{pattern}").format(
            pattern=cfg["name"], n=windows.shape[0]
        )
        windows_pileup = cid.pileup_patterns(windows)
        # Symmetrize pileup for diagonal patterns
        if not cfg["max_dist"]:
            # Replace nan below diag by 0
            windows_pileup = np.nan_to_num(windows_pileup)
            # Add transpose
            windows_pileup += np.transpose(windows_pileup) - np.diag(
                np.diag(windows_pileup)
            )
        sys.stderr.write(f"Saving pileup plots in {prefix}.pdf\n")
        pileup_plot(windows_pileup, prefix, name=pileup_title)
Exemplo n.º 8
0
from chromosight.utils.io import load_kernel_config
import pathlib
import sys
from os.path import basename

# Every JSON pattern config found next to this file is loaded and exposed
# as a module-level variable named after the file (without its extension).

# Module object on which the pattern configs are set
current_module = sys.modules[__name__]
# Directory containing this file, where the JSON configs are searched
kernel_dir = pathlib.Path(__file__).parent
for kernel_file in kernel_dir.glob("*.json"):
    # The pattern name is the config file name stripped of ".json"
    pattern_name = kernel_file.stem
    # Load the preset by name and bind it under that name on the module
    setattr(
        current_module, pattern_name, load_kernel_config(pattern_name, custom=False)
    )