def test_write_patterns(self):
    """Test if pattern coordinates are saved to disk as expected."""
    # Generate dummy pattern list
    res, n_patterns = 5000, 100000
    chrom_names = ["c1", "c2", "c3"]
    bins_per_chrom = [
        n_patterns // 3,
        n_patterns // 3,
        n_patterns // 3 + n_patterns % 3,
    ]
    tmp_coords = pd.DataFrame(
        {
            "chr1": np.repeat(chrom_names, bins_per_chrom),
            "start1": range(0, res * n_patterns, res),
            "end1": range(res, res * (n_patterns + 1), res),
            "chr2": np.repeat(chrom_names, bins_per_chrom),
            "start2": range(0, res * n_patterns, res),
            "end2": range(res, res * (n_patterns + 1), res),
            "bin1": range(n_patterns),
            "bin2": range(1, n_patterns + 1),
            "kernel_id": 0,
            "iteration": 0,
            "score": np.random.randint(0, 100, n_patterns),
        }
    )
    for dec in range(1, 5):
        cio.write_patterns(tmp_coords, self.tmp_file, self.tmp_dir, dec=dec)
        obs_coords = pd.read_csv(self.tmp_path + ".txt", sep="\t")
        assert obs_coords.shape == tmp_coords.shape
        assert np.all(
            np.isclose(obs_coords.score, np.round(tmp_coords.score, dec))
        )
        os.unlink(self.tmp_path + ".txt")
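The test above exercises a round-trip contract: scores rounded to `dec` decimals at write time must match `np.round(..., dec)` within floating-point tolerance after re-reading the TSV. A minimal, self-contained sketch of that contract, where `write_rounded` is a hypothetical stand-in for `cio.write_patterns` (not the library's actual implementation):

import numpy as np
import pandas as pd

def write_rounded(df, path, dec=2):
    # Hypothetical stand-in for cio.write_patterns: round scores to
    # `dec` decimals, then write as tab-separated text.
    out = df.copy()
    out["score"] = np.round(out["score"], dec)
    out.to_csv(path, sep="\t", index=False)

df = pd.DataFrame({"bin1": [0, 1], "bin2": [1, 2], "score": [0.12345, 9.87654]})
write_rounded(df, "patterns.txt", dec=3)
obs = pd.read_csv("patterns.txt", sep="\t")
# Floats that round-trip through text are compared with a tolerance, not ==
assert np.all(np.isclose(obs.score, np.round(df.score, 3)))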
def cmd_detect(arguments):
    # Parse command line arguments for detect
    kernel_config_path = arguments["--kernel-config"]
    dump = arguments["--dump"]
    interchrom = arguments["--inter"]
    iterations = arguments["--iterations"]
    mat_path = arguments["<contact_map>"]
    max_dist = arguments["--max-dist"]
    min_dist = arguments["--min-dist"]
    min_separation = arguments["--min-separation"]
    n_mads = float(arguments["--n-mads"])
    pattern = arguments["--pattern"]
    perc_undetected = arguments["--perc-undetected"]
    precision = arguments["--precision"]
    resize = arguments["--resize-kernel"]
    threads = arguments["--threads"]
    output = arguments["<output>"]
    win_fmt = arguments["--win-fmt"]
    subsample = arguments["--subsample"]
    if subsample == "no":
        subsample = None
    plotting_enabled = False if arguments["--no-plotting"] else True
    smooth_trend = arguments["--smooth-trend"]
    if smooth_trend is None:
        smooth_trend = False
    # If output is not specified, use the current directory
    if not output:
        output = pathlib.Path()
    else:
        output = pathlib.Path(output)
    output.mkdir(exist_ok=True)
    if win_fmt not in ["npy", "json"]:
        sys.stderr.write("Error: --win-fmt must be either json or npy.\n")
        sys.exit(1)
    # Read a user-provided kernel config if custom is true.
    # Otherwise, load a preset kernel config for the input pattern.
    # Configs are JSON files containing all parameters associated with the
    # pattern. They are loaded into a dictionary of the form:
    # {"max_iterations": 3, "kernels": [kernel1, kernel2, ...], ...}
    # where each kernel is a 2D numpy array representing the pattern.
    if kernel_config_path is not None:
        custom = True
        # Loading input path as config
        config_path = kernel_config_path
    else:
        custom = False
        # Will use a preset config file matching the pattern name
        config_path = pattern

    ### 0: LOAD INPUT
    params = {
        "max_iterations": (iterations, int),
        "precision": (precision, float),
        "max_dist": (max_dist, int),
        "min_dist": (min_dist, int),
        "min_separation": (min_separation, int),
        "max_perc_undetected": (perc_undetected, float),
    }
    kernel_config = cio.load_kernel_config(config_path, custom)
    for param_name, (param_value, param_type) in params.items():
        kernel_config = _override_kernel_config(
            param_name, param_value, param_type, kernel_config
        )

    # NOTE: Temporary warning
    if interchrom:
        sys.stderr.write(
            "WARNING: Detection on interchromosomal matrices is expensive in RAM\n"
        )
    hic_genome = HicGenome(
        mat_path,
        inter=interchrom,
        kernel_config=kernel_config,
        dump=dump,
        smooth=smooth_trend,
    )

    ### 1: PROCESS INPUT SIGNAL
    # Adapt the size of kernel matrices based on the signal resolution
    if resize:
        for i, mat in enumerate(kernel_config["kernels"]):
            kernel_config["kernels"][i] = resize_kernel(
                mat,
                kernel_res=kernel_config["resolution"],
                signal_res=hic_genome.resolution,
            )
    hic_genome.kernel_config = kernel_config
    # Subsample Hi-C contacts from the matrix, if requested
    # NOTE: Subsampling has to be done before normalisation
    hic_genome.subsample(subsample)
    # Normalize (balance) the matrix using ICE
    hic_genome.normalize(n_mads=n_mads)
    # Define how many diagonals should be used in intra-matrices
    hic_genome.compute_max_dist()
    # Split the whole genome matrix into intra- and inter- sub-matrices. Each
    # sub-matrix is processed on the fly (obs / exp, trimming diagonals > max dist)
    hic_genome.make_sub_matrices()

    all_pattern_coords = []
    all_pattern_windows = []

    ### 2: DETECTION ON EACH SUBMATRIX
    pool = mp.Pool(int(threads))
    n_sub_mats = hic_genome.sub_mats.shape[0]
    # Loop over the different kernel matrices for the input pattern
    run_id = 0
    total_runs = len(kernel_config["kernels"]) * kernel_config["max_iterations"]
    sys.stderr.write("Detecting patterns...\n")
    for kernel_id, kernel_matrix in enumerate(kernel_config["kernels"]):
        # Adjust the kernel iteratively
        for i in range(kernel_config["max_iterations"]):
            cio.progress(
                run_id, total_runs, f"Kernel: {kernel_id}, Iteration: {i}\n"
            )
            # Apply the detection procedure to all sub-matrices in parallel
            sub_mat_data = zip(
                hic_genome.sub_mats.iterrows(),
                [kernel_config for _ in range(n_sub_mats)],
                [kernel_matrix for _ in range(n_sub_mats)],
                [dump for _ in range(n_sub_mats)],
            )
            # Run detection in parallel on different sub-matrices, and show
            # progress when gathering results
            sub_mat_results = []
            for s, result in enumerate(
                pool.imap_unordered(_detect_sub_mat, sub_mat_data, 1)
            ):
                # Results arrive in arbitrary order with imap_unordered, so
                # read chromosome labels from the result itself rather than
                # indexing hic_genome.sub_mats positionally. Using `s` here
                # also avoids clobbering the iteration counter `i`, which is
                # recorded in the output table below.
                cio.progress(
                    s, n_sub_mats, f"{result['chr1']}-{result['chr2']}"
                )
                sub_mat_results.append(result)
            # Convert coordinates from chromosome to whole genome bins
            kernel_coords = [
                hic_genome.get_full_mat_pattern(
                    d["chr1"], d["chr2"], d["coords"]
                )
                for d in sub_mat_results
                if d["coords"] is not None
            ]
            # Gather newly detected pattern coordinates
            try:
                # Extract surrounding windows for each sub-matrix
                kernel_windows = np.concatenate(
                    [
                        w["windows"]
                        for w in sub_mat_results
                        if w["windows"] is not None
                    ],
                    axis=0,
                )
                all_pattern_coords.append(
                    pd.concat(kernel_coords, axis=0).reset_index(drop=True)
                )
                # Add info about the kernel and iteration which detected
                # these patterns
                all_pattern_coords[-1]["kernel_id"] = kernel_id
                all_pattern_coords[-1]["iteration"] = i
                all_pattern_windows.append(kernel_windows)
            # If no pattern was found with this kernel, skip directly to the
            # next one, abandoning the remaining iterations
            except ValueError:
                break
            # Update the kernel with patterns detected at the current iteration
            kernel_matrix = cid.pileup_patterns(kernel_windows)
            run_id += 1
    cio.progress(run_id, total_runs, f"Kernel: {kernel_id}, Iteration: {i}\n")

    # If no pattern was detected on any chromosome with any kernel, exit gracefully
    if len(all_pattern_coords) == 0:
        sys.stderr.write("No pattern detected! Exiting.\n")
        sys.exit(0)

    # Combine patterns of all kernel matrices into a single array
    all_pattern_coords = pd.concat(all_pattern_coords, axis=0).reset_index(
        drop=True
    )
    # Combine all windows from different kernels into a single pile of windows
    all_pattern_windows = np.concatenate(all_pattern_windows, axis=0)

    # Compute minimum separation in bins and make sure it has a reasonable value
    separation_bins = int(
        kernel_config["min_separation"] // hic_genome.resolution
    )
    if separation_bins < 1:
        separation_bins = 1
    print(f"Minimum pattern separation is: {separation_bins}")
    # Remove patterns with overlapping windows (smeared patterns)
    distinct_patterns = cid.remove_neighbours(
        all_pattern_coords, win_size=separation_bins
    )

    # Drop patterns that are too close to each other
    all_pattern_coords = all_pattern_coords.loc[distinct_patterns, :]
    all_pattern_windows = all_pattern_windows[distinct_patterns, :, :]

    # Get from bins into basepair coordinates
    coords_1 = hic_genome.bins_to_coords(all_pattern_coords.bin1).reset_index(
        drop=True
    )
    coords_1.columns = [str(col) + "1" for col in coords_1.columns]
    coords_2 = hic_genome.bins_to_coords(all_pattern_coords.bin2).reset_index(
        drop=True
    )
    coords_2.columns = [str(col) + "2" for col in coords_2.columns]
    all_pattern_coords = pd.concat(
        [all_pattern_coords.reset_index(drop=True), coords_1, coords_2], axis=1
    )

    # Filter out patterns closer than the minimum distance from the diagonal,
    # if any
    min_dist_drop_mask = (
        all_pattern_coords.chrom1 == all_pattern_coords.chrom2
    ) & (
        np.abs(all_pattern_coords.start2 - all_pattern_coords.start1)
        < int(kernel_config["min_dist"])
    )
    # Reorder columns at the same time
    all_pattern_coords = all_pattern_coords.loc[
        ~min_dist_drop_mask,
        [
            "chrom1",
            "start1",
            "end1",
            "chrom2",
            "start2",
            "end2",
            "bin1",
            "bin2",
            "kernel_id",
            "iteration",
            "score",
        ],
    ]
    all_pattern_windows = all_pattern_windows[~min_dist_drop_mask, :, :]

    ### 3: WRITE OUTPUT
    sys.stderr.write(f"{all_pattern_coords.shape[0]} patterns detected\n")
    # Save patterns and their coordinates in a tsv file
    cio.write_patterns(
        all_pattern_coords, kernel_config["name"] + "_out", output
    )
    # Save windows as an array in an npy file
    cio.save_windows(
        all_pattern_windows,
        kernel_config["name"] + "_out",
        output,
        format=win_fmt,
    )

    # Generate pileup visualisations if requested
    if plotting_enabled:
        # Compute and plot the pileup
        pileup_fname = "pileup_of_{n}_{pattern}".format(
            pattern=kernel_config["name"], n=all_pattern_windows.shape[0]
        )
        windows_pileup = cid.pileup_patterns(all_pattern_windows)
        pileup_plot(windows_pileup, name=pileup_fname, output=output)
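cmd_detect funnels every CLI override through `_override_kernel_config`, whose body does not appear in this section. A minimal sketch consistent with the call sites (the behaviour is assumed from usage, not taken from the source):

def _override_kernel_config(param_name, param_value, param_type, config):
    # Assumed behaviour, inferred from the call sites above: CLI values
    # arrive as strings (or None when the option was not given), so only
    # override the preset entry when a value was provided, casting it first.
    if param_value is not None:
        config[param_name] = param_type(param_value)
    return config

# Example: a preset loaded from JSON keeps its default unless the user
# passed e.g. --max-dist 500000 on the command line.
cfg = {"max_dist": 100000}
cfg = _override_kernel_config("max_dist", "500000", int, cfg)
assert cfg["max_dist"] == 500000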
def cmd_detect(args):
    # Parse command line arguments for detect
    dump = args["--dump"]
    norm = args["--norm"]
    interchrom = args["--inter"]
    iterations = args["--iterations"]
    kernel_config_path = args["--kernel-config"]
    mat_path = args["<contact_map>"]
    max_dist = args["--max-dist"]
    min_dist = args["--min-dist"]
    min_separation = args["--min-separation"]
    n_mads = float(args["--n-mads"])
    prefix = args["<prefix>"]
    pattern = args["--pattern"]
    pearson = args["--pearson"]
    perc_zero = args["--perc-zero"]
    perc_undetected = args["--perc-undetected"]
    subsample = args["--subsample"]
    threads = int(args["--threads"])
    tsvd = 0.999 if args["--tsvd"] else None
    win_fmt = args["--win-fmt"]
    win_size = args["--win-size"]
    if subsample == "no":
        subsample = None
    plotting_enabled = False if args["--no-plotting"] else True
    smooth_trend = args["--smooth-trend"]
    if smooth_trend is None:
        smooth_trend = False
    # If the prefix involves a directory, crash if it does not exist
    cio.check_prefix_dir(prefix)
    if win_fmt not in ["npy", "json"]:
        sys.stderr.write("Error: --win-fmt must be either json or npy.\n")
        sys.exit(1)
    # Read a user-provided kernel config if custom is true.
    # Otherwise, load a preset kernel config for the input pattern.
    # Configs are JSON files containing all parameters associated with the
    # pattern. They are loaded into a dictionary of the form:
    # {"max_iterations": 3, "kernels": [kernel1, kernel2, ...], ...}
    # where each kernel is a 2D numpy array representing the pattern.
    if kernel_config_path is not None:
        custom = True
        # Loading input path as config
        config_path = kernel_config_path
    else:
        custom = False
        # Will use a preset config file matching the pattern name
        config_path = pattern

    ### 0: LOAD INPUT
    params = {
        "max_iterations": (iterations, int),
        "pearson": (pearson, float),
        "max_dist": (max_dist, int),
        "min_dist": (min_dist, int),
        "min_separation": (min_separation, int),
        "max_perc_undetected": (perc_undetected, float),
        "max_perc_zero": (perc_zero, float),
    }
    cfg = cio.load_kernel_config(config_path, custom)
    for param_name, (param_value, param_type) in params.items():
        cfg = _override_kernel_config(param_name, param_value, param_type, cfg)

    # Resize kernels if requested
    if win_size != "auto":
        win_size = int(win_size)
        if not win_size % 2:
            raise ValueError("--win-size must be odd")
        resize = lambda m: resize_kernel(m, factor=win_size / m.shape[0])
        cfg["kernels"] = [resize(k) for k in cfg["kernels"]]

    if interchrom:
        sys.stderr.write(
            "WARNING: Detection on interchromosomal matrices is expensive in RAM\n"
        )
    hic_genome = HicGenome(
        mat_path,
        inter=interchrom,
        kernel_config=cfg,
        dump=dump,
        smooth=smooth_trend,
        sample=subsample,
    )

    ### 1: PROCESS INPUT SIGNAL
    hic_genome.kernel_config = cfg
    # Normalize (balance) the matrix using ICE
    hic_genome.normalize(norm=norm, n_mads=n_mads, threads=threads)
    # Define how many diagonals should be used in intra-matrices
    hic_genome.compute_max_dist()
    # Split the whole genome matrix into intra- and inter- sub-matrices. Each
    # sub-matrix is processed on the fly (obs / exp, trimming diagonals > max dist)
    hic_genome.make_sub_matrices()

    all_coords = []
    all_windows = []

    ### 2: DETECTION ON EACH SUBMATRIX
    n_sub_mats = hic_genome.sub_mats.shape[0]
    # Loop over the different kernel matrices for the input pattern
    run_id = 0
    # Use cfg to inform jobs whether they should run the full convolution
    cfg["tsvd"] = tsvd
    total_runs = len(cfg["kernels"]) * cfg["max_iterations"]
    sys.stderr.write("Detecting patterns...\n")
    for kernel_id, kernel_matrix in enumerate(cfg["kernels"]):
        # Adjust the kernel iteratively
        for i in range(cfg["max_iterations"]):
            cio.progress(
                run_id, total_runs, f"Kernel: {kernel_id}, Iteration: {i}\n"
            )
            # Apply the detection procedure to all sub-matrices in parallel
            sub_mat_data = zip(
                hic_genome.sub_mats.iterrows(),
                [cfg for _ in range(n_sub_mats)],
                [kernel_matrix for _ in range(n_sub_mats)],
                [dump for _ in range(n_sub_mats)],
            )
            # Run detection in parallel on different sub-matrices, and show
            # progress when gathering results
            sub_mat_results = []
            # Run in multiprocessing subprocesses
            if threads > 1:
                pool = mp.Pool(threads)
                dispatcher = pool.imap(_detect_sub_mat, sub_mat_data, 1)
            else:
                dispatcher = map(_detect_sub_mat, sub_mat_data)
            for s, result in enumerate(dispatcher):
                cio.progress(
                    s, n_sub_mats, f"{result['chr1']}-{result['chr2']}"
                )
                sub_mat_results.append(result)
            # Convert coordinates from chromosome to whole genome bins
            kernel_coords = [
                hic_genome.get_full_mat_pattern(
                    d["chr1"], d["chr2"], d["coords"]
                )
                for d in sub_mat_results
                if d["coords"] is not None
            ]
            # Gather newly detected pattern coordinates
            try:
                # Extract surrounding windows for each sub-matrix
                kernel_windows = np.concatenate(
                    [
                        w["windows"]
                        for w in sub_mat_results
                        if w["windows"] is not None
                    ],
                    axis=0,
                )
                all_coords.append(
                    pd.concat(kernel_coords, axis=0).reset_index(drop=True)
                )
                # Add info about the kernel and iteration which detected
                # these patterns
                all_coords[-1]["kernel_id"] = kernel_id
                all_coords[-1]["iteration"] = i
                all_windows.append(kernel_windows)
            # If no pattern was found with this kernel, skip directly to the
            # next one, abandoning the remaining iterations
            except ValueError:
                break
            # Update the kernel with patterns detected at the current iteration
            kernel_matrix = cid.pileup_patterns(kernel_windows)
            run_id += 1
    cio.progress(run_id, total_runs, f"Kernel: {kernel_id}, Iteration: {i}\n")

    # If no pattern was detected on any chromosome with any kernel, exit gracefully
    if len(all_coords) == 0:
        sys.stderr.write("No pattern detected! Exiting.\n")
        sys.exit(0)

    # Finish the parallelized part
    if threads > 1:
        pool.close()

    # Combine patterns of all kernel matrices into a single array
    all_coords = pd.concat(all_coords, axis=0).reset_index(drop=True)
    # Combine all windows from different kernels into a single pile of windows
    all_windows = np.concatenate(all_windows, axis=0)

    # Compute minimum separation in bins and make sure it has a reasonable value
    separation_bins = int(cfg["min_separation"] // hic_genome.clr.binsize)
    if separation_bins < 1:
        separation_bins = 1
    print(f"Minimum pattern separation is: {separation_bins}")
    # Remove patterns with overlapping windows (smeared patterns)
    distinct_patterns = cid.remove_neighbours(
        all_coords, win_size=separation_bins
    )

    # Drop patterns that are too close to each other
    all_coords = all_coords.loc[distinct_patterns, :]
    all_windows = all_windows[distinct_patterns, :, :]

    # Get from bins into basepair coordinates
    coords_1 = hic_genome.bins_to_coords(all_coords.bin1).reset_index(drop=True)
    coords_1.columns = [str(col) + "1" for col in coords_1.columns]
    coords_2 = hic_genome.bins_to_coords(all_coords.bin2).reset_index(drop=True)
    coords_2.columns = [str(col) + "2" for col in coords_2.columns]
    all_coords = pd.concat(
        [all_coords.reset_index(drop=True), coords_1, coords_2], axis=1
    )

    # Filter out patterns closer than the minimum distance from the diagonal,
    # if any
    min_dist_drop_mask = (all_coords.chrom1 == all_coords.chrom2) & (
        np.abs(all_coords.start2 - all_coords.start1) < cfg["min_dist"]
    )
    all_coords = all_coords.loc[~min_dist_drop_mask, :]
    all_windows = all_windows[~min_dist_drop_mask, :, :]
    del min_dist_drop_mask

    # Remove patterns with nan p-values (no contact in window)
    pval_mask = all_coords.pvalue.isnull()
    all_coords = all_coords.loc[~pval_mask, :]
    all_windows = all_windows[~pval_mask, :, :]
    del pval_mask

    # Correct p-values for multiple testing using FDR
    all_coords["qvalue"] = fdr_correction(all_coords["pvalue"])

    # Reorder columns
    all_coords = all_coords.loc[
        :,
        [
            "chrom1",
            "start1",
            "end1",
            "chrom2",
            "start2",
            "end2",
            "bin1",
            "bin2",
            "kernel_id",
            "iteration",
            "score",
            "pvalue",
            "qvalue",
        ],
    ]

    ### 3: WRITE OUTPUT
    sys.stderr.write(f"{all_coords.shape[0]} patterns detected\n")
    # Save patterns and their coordinates in a tsv file
    sys.stderr.write(f"Saving patterns in {prefix}.tsv\n")
    cio.write_patterns(all_coords, prefix)
    # Save windows as an array in an npy (or json) file
    sys.stderr.write(f"Saving windows in {prefix}.{win_fmt}\n")
    cio.save_windows(all_windows, prefix, fmt=win_fmt)

    # Generate pileup visualisations if requested
    if plotting_enabled:
        # Compute and plot the pileup
        pileup_title = "Pileup of {n} {pattern}".format(
            pattern=cfg["name"], n=all_windows.shape[0]
        )
        windows_pileup = cid.pileup_patterns(all_windows)
        # Symmetrize the pileup for patterns detected on the diagonal
        if not cfg["max_dist"]:
            # Replace nan below the diagonal by 0
            windows_pileup = np.nan_to_num(windows_pileup)
            # Add the transpose, subtracting the diagonal counted twice
            windows_pileup += np.transpose(windows_pileup) - np.diag(
                np.diag(windows_pileup)
            )
        sys.stderr.write(f"Saving pileup plots in {prefix}.pdf\n")
        pileup_plot(windows_pileup, prefix, name=pileup_title)
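After merging detections from all kernels, p-values are corrected for multiple testing with `fdr_correction`, whose body is not shown in this section. The standard Benjamini-Hochberg procedure it presumably implements looks like this (a self-contained sketch under that assumption, not the library's code):

import numpy as np

def bh_qvalues(pvalues):
    # Benjamini-Hochberg q-values: sort p-values, scale each by n / rank,
    # then enforce monotonicity from the largest p-value downwards.
    p = np.asarray(pvalues, dtype=float)
    n = p.size
    order = np.argsort(p)
    scaled = p[order] * n / np.arange(1, n + 1)
    scaled = np.minimum.accumulate(scaled[::-1])[::-1]
    q = np.empty(n)
    q[order] = np.minimum(scaled, 1.0)
    return q

# Small p-values survive correction; large ones are pushed towards 1
print(bh_qvalues([0.001, 0.01, 0.02, 0.8]))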
def cmd_quantify(args):
    bed2d_path = args["<bed2d>"]
    mat_path = args["<contact_map>"]
    prefix = args["<prefix>"]
    n_mads = float(args["--n-mads"])
    pattern = args["--pattern"]
    inter = args["--inter"]
    kernel_config_path = args["--kernel-config"]
    perc_zero = args["--perc-zero"]
    perc_undetected = args["--perc-undetected"]
    plotting_enabled = False if args["--no-plotting"] else True
    threads = int(args["--threads"])
    norm = args["--norm"]
    tsvd = 0.999 if args["--tsvd"] else None
    win_fmt = args["--win-fmt"]
    if win_fmt not in ["npy", "json"]:
        sys.stderr.write("Error: --win-fmt must be either json or npy.\n")
        sys.exit(1)
    win_size = args["--win-size"]
    if win_size != "auto":
        win_size = int(win_size)
    subsample = args["--subsample"]
    # If the prefix involves a directory, crash if it does not exist
    cio.check_prefix_dir(prefix)
    # Load 6 cols from the 2D BED file and infer the header
    bed2d = cio.load_bed2d(bed2d_path)
    # Warn the user if --inter is disabled but the list contains inter patterns
    if not inter and len(bed2d.start1[bed2d.chrom1 != bed2d.chrom2]) > 0:
        sys.stderr.write(
            "Warning: The bed2d file contains interchromosomal patterns. "
            "These patterns will not be scanned unless --inter is used.\n"
        )
    if kernel_config_path is not None:
        custom = True
        # Loading input path as config
        config_path = kernel_config_path
    else:
        custom = False
        # Will use a preset config file matching the pattern name
        config_path = pattern
    cfg = cio.load_kernel_config(config_path, custom)
    # Subsample Hi-C contacts from the matrix, if requested
    if subsample == "no":
        subsample = None
    # Instantiate and preprocess the contact map
    hic_genome = HicGenome(
        mat_path, inter=inter, kernel_config=cfg, sample=subsample
    )
    # Enforce the max scanning distance to reach the most distant input pattern
    furthest = np.max(bed2d.start2 - bed2d.start1)
    max_diag = hic_genome.clr.shape[0] * hic_genome.clr.binsize
    cfg["max_dist"] = min(furthest, max_diag)
    cfg["min_dist"] = 0
    cfg["tsvd"] = tsvd
    cfg = _override_kernel_config("max_perc_zero", perc_zero, float, cfg)
    cfg = _override_kernel_config(
        "max_perc_undetected", perc_undetected, float, cfg
    )

    # Notify the contact map instance of changes in scanning distance
    hic_genome.kernel_config = cfg
    # Normalize (balance) the matrix using ICE
    hic_genome.normalize(norm=norm, n_mads=n_mads, threads=threads)
    # Initialize output structures
    bed2d["score"] = np.nan
    bed2d["pvalue"] = np.nan
    positions = bed2d.copy()
    # Only resize the kernel matrix if explicitly requested
    km, kn = cfg["kernels"][0].shape
    n_kernels = len(cfg["kernels"])
    if win_size != "auto":
        if not win_size % 2:
            raise ValueError("--win-size must be odd")
        for i, k in enumerate(cfg["kernels"]):
            cfg["kernels"][i] = resize_kernel(k, factor=win_size / km)
        km = kn = win_size
    # Update the kernel config after resizing the kernels
    hic_genome.kernel_config = cfg
    # Define how many diagonals should be used in intra-matrices
    hic_genome.compute_max_dist()
    # Split the whole genome matrix into intra- and inter- sub-matrices. Each
    # sub-matrix is processed on the fly (obs / exp, trimming diagonals > max dist)
    hic_genome.make_sub_matrices()
    windows = np.full((positions.shape[0], km, kn), np.nan)
    # We will store a copy of coordinates for each kernel
    bed2d_out = [bed2d.copy() for _ in range(n_kernels)]
    windows_out = [windows.copy() for _ in range(n_kernels)]
    # For each position, we use the center of the BED interval
    positions["pos1"] = (positions.start1 + positions.end1) // 2
    positions["pos2"] = (positions.start2 + positions.end2) // 2
    # Use each kernel matrix available for the pattern
    for kernel_id, kernel_matrix in enumerate(cfg["kernels"]):
        cio.progress(kernel_id, len(cfg["kernels"]), f"Kernel: {kernel_id}\n")
        n_sub_mats = hic_genome.sub_mats.shape[0]
        # Retrieve input positions for each sub-matrix and convert
        # coordinates from whole genome to sub-matrix.
        sub_pos = [
            _get_chrom_pos(positions, hic_genome, m[1].chr1, m[1].chr2)
            for m in hic_genome.sub_mats.iterrows()
        ]
        # Apply the quantification procedure to all sub-matrices in parallel
        sub_mat_data = zip(
            hic_genome.sub_mats.iterrows(),
            [cfg for _ in range(n_sub_mats)],
            [kernel_matrix for _ in range(n_sub_mats)],
            [s[1] for s in sub_pos],
        )
        # Run quantification in parallel on different sub-matrices,
        # and show progress when gathering results
        sub_mat_results = []
        # Run in multiprocessing subprocesses
        if threads > 1:
            pool = mp.Pool(threads)
            dispatcher = pool.imap(_quantify_sub_mat, sub_mat_data, 1)
        else:
            dispatcher = map(_quantify_sub_mat, sub_mat_data)
        for s, result in enumerate(dispatcher):
            cio.progress(s, n_sub_mats, f"{result['chr1']}-{result['chr2']}")
            sub_mat_results.append(result)
        for i, r in enumerate(sub_mat_results):
            # If there were no patterns on that sub-matrix, just skip it
            if r["coords"] is None:
                continue
            sub_pat_idx = sub_pos[i][0]
            # Store this kernel's scores, p-values and windows at the input
            # coordinates; the best kernel per coordinate is selected below.
            try:
                bed2d_out[kernel_id].loc[sub_pat_idx, "score"] = (
                    r["coords"].score.values
                )
                bed2d_out[kernel_id].loc[sub_pat_idx, "pvalue"] = (
                    r["coords"].pvalue.values
                )
                windows_out[kernel_id][sub_pat_idx, :, :] = r["windows"]
            # Do nothing if no pattern was detected or the matrix is
            # smaller than the kernel (-> patterns is None)
            except AttributeError:
                pass
    # Select the best score for each coordinate (among the different kernels)
    bed2d = pd.concat(bed2d_out, axis=0).reset_index(drop=True)
    windows = np.concatenate(windows_out, axis=0)
    bed2d = (
        bed2d.sort_values("score", ascending=True)
        .groupby(["chrom1", "start1", "chrom2", "start2"], sort=False)
        .tail(1)
    )
    windows = windows[bed2d.index, :, :]
    bed2d = bed2d.reset_index(drop=True)
    bed2d["bin1"] = hic_genome.coords_to_bins(
        bed2d.loc[:, ["chrom1", "start1"]].rename(
            columns={"chrom1": "chrom", "start1": "pos"}
        )
    )
    bed2d["bin2"] = hic_genome.coords_to_bins(
        bed2d.loc[:, ["chrom2", "start2"]].rename(
            columns={"chrom2": "chrom", "start2": "pos"}
        )
    )
    bed2d["qvalue"] = fdr_correction(bed2d["pvalue"])
    bed2d = bed2d.loc[
        :,
        [
            "chrom1",
            "start1",
            "end1",
            "chrom2",
            "start2",
            "end2",
            "bin1",
            "bin2",
            "score",
            "pvalue",
            "qvalue",
        ],
    ]
    # Set p-values and q-values of invalid scores to nan
    bed2d.loc[np.isnan(bed2d.score), "pvalue"] = np.nan
    bed2d.loc[np.isnan(bed2d.score), "qvalue"] = np.nan
    # Sort by whole genome coordinates to match the input order
    bed2d = bed2d.sort_values(["bin1", "bin2"], ascending=True).reset_index(
        drop=True
    )
    cio.write_patterns(bed2d, prefix)
    cio.save_windows(windows, prefix, fmt=win_fmt)

    # Generate pileup visualisations if requested
    if plotting_enabled:
        # Compute and plot the pileup
        pileup_title = "pileup_of_{n}_{pattern}".format(
            pattern=cfg["name"], n=windows.shape[0]
        )
        windows_pileup = cid.pileup_patterns(windows)
        # Symmetrize the pileup for patterns detected on the diagonal
        if not cfg["max_dist"]:
            # Replace nan below the diagonal by 0
            windows_pileup = np.nan_to_num(windows_pileup)
            # Add the transpose, subtracting the diagonal counted twice
            windows_pileup += np.transpose(windows_pileup) - np.diag(
                np.diag(windows_pileup)
            )
        sys.stderr.write(f"Saving pileup plots in {prefix}.pdf\n")
        pileup_plot(windows_pileup, prefix, name=pileup_title)
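The best-score selection in cmd_quantify relies on a compact pandas idiom: sort ascending by score, group by coordinate, and keep each group's last row with tail(1). A toy illustration with made-up values:

import pandas as pd

# One row per (coordinate, kernel); the highest-scoring kernel should win.
scores = pd.DataFrame({
    "chrom1": ["c1", "c1", "c1", "c1"],
    "start1": [100, 100, 500, 500],
    "kernel_id": [0, 1, 0, 1],
    "score": [0.2, 0.7, 0.9, 0.4],
})
best = (
    scores.sort_values("score", ascending=True)
    .groupby(["chrom1", "start1"], sort=False)
    .tail(1)
)
# tail(1) after an ascending sort keeps the last (highest-scoring) row of
# each group, and preserves the original index; that preserved index is
# what lets the windows array be subset with `best.index` afterwards.
print(best)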