def load_ranges(modisco_dir):
    """Load the genomic ranges of the contribution file used for the modisco run."""
    modisco_dir = Path(modisco_dir)
    included_samples = load_included_samples(modisco_dir)

    kwargs = read_json(modisco_dir / "modisco-run.kwargs.json")
    d = ContribFile(kwargs["contrib_file"], included_samples)
    df = d.get_ranges()
    d.close()
    return df
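# Example usage of load_ranges -- a minimal sketch; the directory below is a hypothetical
# modisco output directory containing `modisco-run.kwargs.json` and the contribution file
# it references.
def _example_load_ranges():
    df = load_ranges("output/modisco/")  # hypothetical modisco output directory
    # `df` holds the genomic ranges stored in the contribution file referenced
    # by the modisco run kwargs
    print(df.head())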
@classmethod
def from_modisco_dir(cls, modisco_dir, ignore_include_samples=False):
    from bpnet.cli.modisco import load_included_samples, load_contrib_type
    from bpnet.utils import read_json

    if ignore_include_samples:
        include_samples = None
    else:
        include_samples = load_included_samples(modisco_dir)
        if include_samples.all():
            # All samples are included, so we can skip the subsetting
            include_samples = None

    modisco_kwargs = read_json(os.path.join(modisco_dir, "modisco-run.kwargs.json"))
    contrib_type = load_contrib_type(modisco_kwargs)

    return cls(modisco_kwargs["contrib_file"], include_samples,
               default_contrib_score=contrib_type)
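# Example usage of ContribFile.from_modisco_dir -- a minimal sketch; the path is hypothetical
# and is expected to contain `modisco-run.kwargs.json` plus the bookkeeping consumed by
# `load_included_samples`.
def _example_from_modisco_dir():
    cf = ContribFile.from_modisco_dir("output/modisco/")  # hypothetical modisco output directory
    ranges = cf.get_ranges()    # genomic ranges of the scored regions
    contrib = cf.get_contrib()  # contribution scores (score type inferred from the run kwargs)
    cf.close()
    return ranges, contrib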
def bpnet_contrib(model_dir,
                  output_file,
                  method="grad",
                  dataspec=None,
                  regions=None,
                  fasta_file=None,  # alternative to dataspec
                  shuffle_seq=False,
                  shuffle_regions=False,
                  max_regions=None,
                  # reference='zeroes',  # Currently the only option
                  # peak_width=1000,  # automatically inferred from 'config.gin.json'
                  # seq_width=None,
                  contrib_wildcard='*/profile/wn,*/counts/pre-act',  # which contrib. scores to compute
                  batch_size=512,
                  gpu=0,
                  memfrac_gpu=0.45,
                  num_workers=10,
                  storage_chunk_size=512,
                  exclude_chr='',
                  include_chr='',
                  overwrite=False,
                  skip_bias=False):
    """Compute contribution scores for a BPNet model and store them to an HDF5 file."""
    from bpnet.extractors import _chrom_sizes
    add_file_logging(os.path.dirname(output_file), logger, 'bpnet-contrib')
    if gpu is not None:
        create_tf_session(gpu, per_process_gpu_memory_fraction=memfrac_gpu)
    else:
        # Don't use any GPUs
        os.environ['CUDA_VISIBLE_DEVICES'] = ''

    if os.path.exists(output_file):
        if overwrite:
            os.remove(output_file)
        else:
            raise ValueError(f"File exists: {output_file}. Use overwrite=True to overwrite it")

    config = read_json(os.path.join(model_dir, 'config.gin.json'))
    seq_width = config['seq_width']
    peak_width = config['seq_width']
    # NOTE - seq_width has to be the same for the input and the target

    # # infer from the command line
    # if seq_width is None:
    #     logger.info("Using seq_width = peak_width")
    #     seq_width = peak_width
    # # make sure these are int's
    # seq_width = int(seq_width)
    # peak_width = int(peak_width)

    # Split the contribution-score wildcards
    contrib_wildcards = contrib_wildcard.split(",")

    # Allow chromosome inclusion / exclusion
    if exclude_chr:
        exclude_chr = exclude_chr.split(",")
    else:
        exclude_chr = None
    if include_chr:
        include_chr = include_chr.split(",")
    else:
        include_chr = None

    logger.info("Loading the config files")
    model_dir = Path(model_dir)

    logger.info("Creating the dataset")
    from bpnet.datasets import StrandedProfile, SeqClassification
    if fasta_file is not None:
        if regions is None:
            raise ValueError("fasta_file specified. Expecting regions to be specified as well")
        dl_valid = SeqClassification(fasta_file=fasta_file,
                                     intervals_file=regions,
                                     incl_chromosomes=include_chr,
                                     excl_chromosomes=exclude_chr,
                                     auto_resize_len=seq_width)
        chrom_sizes = _chrom_sizes(fasta_file)
    else:
        if dataspec is None:
            logger.info("Using the dataspec used to train the model")
            dataspec = model_dir / "dataspec.yml"

        ds = DataSpec.load(dataspec)
        dl_valid = StrandedProfile(ds,
                                   incl_chromosomes=include_chr,
                                   excl_chromosomes=exclude_chr,
                                   intervals_file=regions,
                                   peak_width=peak_width,
                                   shuffle=False,
                                   seq_width=seq_width)
        chrom_sizes = _chrom_sizes(ds.fasta_file)

    # Set up contribution-score trimming (not required currently)
    if seq_width > peak_width:
        # Trim: make sure we can nicely trim the peak
        logger.info("Trimming the output")
        assert (seq_width - peak_width) % 2 == 0
        trim_start = (seq_width - peak_width) // 2
        trim_end = seq_width - trim_start
        assert trim_end - trim_start == peak_width
    elif seq_width == peak_width:
        trim_start = 0
        trim_end = peak_width
    else:
        raise ValueError("seq_width < peak_width")

    seqmodel = SeqModel.from_mdir(model_dir)

    # Get all possible interpretation names and make sure they match the specified glob
    intp_names = [name for name, _ in seqmodel.get_intp_tensors(preact_only=False)
                  if fnmatch_any(name, contrib_wildcards)]
    logger.info("Using the following interpretation targets:")
    for n in intp_names:
        print(n)

    if max_regions is not None:
        if len(dl_valid) > max_regions:
            logger.info(f"Using {max_regions} regions instead of the original {len(dl_valid)}")
        else:
            logger.info(f"--max-regions={max_regions} is larger than the dataset size: {len(dl_valid)}. "
                        "Using the dataset size for max-regions")
            max_regions = len(dl_valid)
    else:
        max_regions = len(dl_valid)

    max_batches = int(np.ceil(max_regions / batch_size))

    writer = HDF5BatchWriter(output_file, chunk_size=storage_chunk_size)
    for i, batch in enumerate(tqdm(dl_valid.batch_iter(batch_size=batch_size,
                                                       shuffle=shuffle_regions,
                                                       num_workers=num_workers),
                                   total=max_batches)):
        # store the original batch containing 'inputs' and 'targets'
        if skip_bias:
            batch['inputs'] = {'seq': batch['inputs']['seq']}  # ignore all other inputs

        if max_batches > 0:
            if i >= max_batches:
                # stop once max_regions worth of batches have been written
                break

        if shuffle_seq:
            # Di-nucleotide shuffle the sequences
            batch['inputs']['seq'] = onehot_dinucl_shuffle(batch['inputs']['seq'])

        for name in intp_names:
            hyp_contrib = seqmodel.contrib_score(batch['inputs']['seq'],
                                                 name=name,
                                                 method=method,
                                                 batch_size=None)  # don't second-batch
            # put the contribution scores into the dictionary, trimming them so that
            # the output is always w.r.t. the peak center
            batch[f"/hyp_contrib/{name}"] = hyp_contrib[:, trim_start:trim_end]

        # Trim the sequence as well
        batch['inputs']['seq'] = batch['inputs']['seq'][:, trim_start:trim_end]

        # ? maybe it would be better to have an explicit ContribFileWriter;
        # that way the written schema would be fixed
        writer.batch_write(batch)

    # add chromosome sizes
    writer.f.attrs['chrom_sizes'] = json.dumps(chrom_sizes)
    writer.close()
    logger.info(f"Done. Contribution score file was saved to: {output_file}")
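# Example invocation of bpnet_contrib -- a minimal sketch; all paths are hypothetical.
# It assumes `model_dir` holds a trained BPNet model (with `config.gin.json` and
# `dataspec.yml`) and writes an HDF5 contribution-score file readable by ContribFile.
def _example_bpnet_contrib():
    bpnet_contrib(model_dir="output/bpnet_model/",         # hypothetical trained-model directory
                  output_file="output/contrib.scores.h5",  # hypothetical output HDF5 file
                  method="grad",                           # the default gradient-based scores
                  contrib_wildcard='*/profile/wn',         # only compute the profile contrib. scores
                  gpu=None,                                # run on CPU
                  max_regions=1000,                        # subsample regions for a quick test run
                  overwrite=True)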
def chip_nexus_analysis(modisco_dir, trim_frac=0.08, num_workers=20,
                        run_cwm_scan=False, force=False, footprint_width=200):
    """Compute all the modisco results specific to ChIP-nexus/exo data. Runs:
    - modisco_plot
    - modisco_report
    - modisco_table
    - modisco_export_patterns
    - cwm_scan
    - modisco_export_seqlets

    Note: each sub-command is only executed if it has not been run before.
    Use --force to override this. Whether a command has been run before is
    determined by checking if the following file exists:
    `{modisco_dir}/.modisco_report_all/{command}.done`.
    """
    plt.switch_backend('agg')
    from bpnet.utils import ConditionalRun

    modisco_dir = Path(modisco_dir)
    # figure out which contribution scores were used
    kwargs = read_json(modisco_dir / "modisco-run.kwargs.json")
    contrib_scores = kwargs["contrib_file"]

    mf = ModiscoFile(f"{modisco_dir}/modisco.h5")
    all_patterns = mf.pattern_names()
    mf.close()
    if len(all_patterns) == 0:
        print("No patterns found.")
        # Touch the output files for snakemake
        open(modisco_dir / 'modisco-chip.html', 'a').close()
        open(modisco_dir / 'seqlets/scored_regions.bed', 'a').close()
        return

    # class determining whether to run the command or not (poor-man's snakemake)
    cr = ConditionalRun("modisco_report_all", None, modisco_dir, force=force)

    sync = []
    # --------------------------------------------
    if (not cr.set_cmd('modisco_plot').done()
            or not cr.set_cmd('modisco_enrich_patterns').done()):
        # load the ContribFile once and pass it to all the functions
        logger.info("Loading ContribFile")
        contribsf = ContribFile.from_modisco_dir(modisco_dir)
        contribsf.cache()
    else:
        contribsf = None

    # --------------------------------------------
    # Basic reports
    if not cr.set_cmd('modisco_plot').done():
        modisco_plot(modisco_dir,
                     modisco_dir / 'plots',
                     heatmap_width=footprint_width,
                     figsize=(10, 10),
                     contribsf=contribsf)
        cr.write()
    sync.append("plots")

    if not cr.set_cmd('modisco_report').done():
        modisco_report(str(modisco_dir), str(modisco_dir))
        cr.write()
    sync.append("modisco-chip.html")

    if not cr.set_cmd('modisco_table').done():
        modisco_table(modisco_dir, contrib_scores, modisco_dir, report_url=None,
                      contribsf=contribsf, footprint_width=footprint_width)
        cr.write()
    sync.append("footprints.pkl")
    sync.append("pattern_table.*")

    if not cr.set_cmd('modisco_export_patterns').done():
        modisco_export_patterns(modisco_dir,
                                output_file=modisco_dir / 'patterns.pkl',
                                contribsf=contribsf)
        cr.write()
    sync.append("patterns.pkl")

    # --------------------------------------------
    # Finding new instances
    if run_cwm_scan:
        if not cr.set_cmd('cwm_scan').done():
            cwm_scan(modisco_dir,
                     modisco_dir / 'instances.bed.gz',
                     trim_frac=trim_frac,
                     contrib_file=None,
                     num_workers=num_workers)
            cr.write()

    # --------------------------------------------
    # Export bed-files and bigwigs
    # Seqlets
    if not cr.set_cmd('modisco_export_seqlets').done():
        modisco_export_seqlets(str(modisco_dir),
                               str(modisco_dir / 'seqlets'),
                               trim_frac=trim_frac)
        cr.write()
    sync.append("seqlets")

    # print the rsync command to run in order to sync the output
    # directories to the webserver
    logger.info("Run the following command to sync files to the webserver")
    dirs = " ".join(sync)
    print(f"rsync -av --progress {dirs} <output_dir>/")
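# Example usage of chip_nexus_analysis -- a minimal sketch; the directory is a hypothetical
# `bpnet.modisco-run` output. Sub-commands with an existing
# `.modisco_report_all/{command}.done` marker are skipped unless force=True.
def _example_chip_nexus_analysis():
    chip_nexus_analysis("output/modisco/",  # hypothetical modisco output directory
                        trim_frac=0.08,
                        run_cwm_scan=True,  # also scan for motif instances
                        force=False)        # re-use results of previously finished sub-commands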
def cwm_scan(modisco_dir,
             output_file,
             trim_frac=0.08,
             patterns='all',
             filters='match_weighted_p>=.2,contrib_weighted_p>=.01',
             contrib_file=None,
             add_profile_features=False,
             num_workers=10):
    """Get motif instances via CWM scanning."""
    from bpnet.modisco.utils import longer_pattern, shorten_pattern
    from bpnet.modisco.pattern_instances import annotate_profile_single
    add_file_logging(os.path.dirname(output_file), logger, 'cwm-scan')
    modisco_dir = Path(modisco_dir)

    valid_suffixes = ['.csv', '.csv.gz', '.tsv', '.tsv.gz', '.parq', '.bed', '.bed.gz']
    if not any(output_file.endswith(suffix) for suffix in valid_suffixes):
        raise ValueError(f"output_file doesn't have a valid file suffix. "
                         f"Valid file suffixes are: {valid_suffixes}")

    # Centroid matches path
    cm_path = modisco_dir / f'cwm-scan-seqlets.trim-frac={trim_frac:.2f}.csv.gz'

    # save the hyper-parameters
    kwargs_json_file = os.path.join(os.path.dirname(output_file), 'cwm-scan.kwargs.json')
    write_json(dict(modisco_dir=os.path.abspath(str(modisco_dir)),
                    output_file=str(output_file),
                    cwm_scan_seqlets_path=str(cm_path),
                    trim_frac=trim_frac,
                    patterns=patterns,
                    filters=filters,
                    contrib_file=contrib_file,
                    add_profile_features=add_profile_features,
                    num_workers=num_workers),
               str(kwargs_json_file))

    # figure out contrib_wildcard
    modisco_kwargs = read_json(os.path.join(modisco_dir, "modisco-run.kwargs.json"))
    contrib_type = load_contrib_type(modisco_kwargs)

    mf = ModiscoFile(modisco_dir / "modisco.h5")
    tasks = mf.tasks()
    # HACK prune the contribution suffix from the tasks (in case it's present)
    tasks = [t.split("/")[0] for t in tasks]
    logger.info(f"Using tasks: {tasks}")

    if contrib_file is None:
        cf = ContribFile.from_modisco_dir(modisco_dir)
        cf.cache()  # cache it since it can be re-used in `cwm_scan_seqlets`
    else:
        logger.info(f"Loading the contribution scores from: {contrib_file}")
        cf = ContribFile(contrib_file, default_contrib_score=contrib_type)

    if not cm_path.exists():
        logger.info(f"Generating centroid matches to {cm_path.resolve()}")
        cwm_scan_seqlets(modisco_dir,
                         output_file=cm_path,
                         trim_frac=trim_frac,
                         contribsf=cf if contrib_file is None else None,
                         num_workers=num_workers,
                         verbose=False)
    else:
        logger.info("Centroid matches already exist.")
    logger.info(f"Loading centroid matches from {cm_path.resolve()}")
    dfm_norm = pd.read_csv(cm_path)

    # get the raw data
    seq, contrib, ranges = cf.get_seq(), cf.get_contrib(), cf.get_ranges()

    logger.info("Scanning for patterns")
    dfl = []

    # patterns to scan. `longer_pattern` makes sure the patterns are in the long format
    scan_patterns = patterns.split(",") if patterns != 'all' else mf.pattern_names()
    scan_patterns = [longer_pattern(pn) for pn in scan_patterns]

    if add_profile_features:
        profile = cf.get_profiles()
        logger.info("Profile features will also be added to dfi")

    for pattern_name in tqdm(mf.pattern_names()):
        if pattern_name not in scan_patterns:
            # skip scanning this pattern
            continue
        pattern = mf.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        match, contribution = pattern.scan_contribution(contrib, hyp_contrib=None, tasks=tasks,
                                                        n_jobs=num_workers, verbose=False)
        seq_match = pattern.scan_seq(seq, n_jobs=num_workers, verbose=False)
        dfm = pattern.get_instances(tasks, match, contribution, seq_match,
                                    norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
                                    verbose=False, plot=False)
        for filt in filters.split(","):
            if len(filt) > 0:
                dfm = dfm.query(filt)

        if add_profile_features:
            dfm = annotate_profile_single(dfm, pattern_name, mf, profile,
                                          profile_width=70,
                                          trim_frac=trim_frac)
        dfm['pattern_short'] = shorten_pattern(pattern_name)

        # TODO - is it possible to write out the results incrementally?
        dfl.append(dfm)

    logger.info("Merging")
    # merge and write the results
    dfp = pd.concat(dfl)

    # append the ranges
    logger.info("Append ranges")
    ranges.columns = ["example_" + v for v in ranges.columns]
    dfp = dfp.merge(ranges, on="example_idx", how='left')

    # add the absolute coordinates
    dfp['pattern_start_abs'] = dfp['example_start'] + dfp['pattern_start']
    dfp['pattern_end_abs'] = dfp['example_start'] + dfp['pattern_end']

    logger.info("Table info")
    dfp.info()
    logger.info(f"Writing the resulting pd.DataFrame of shape {dfp.shape} to {output_file}")

    # order the first columns to comply with the BED format (chrom, start, end, name, score, strand, ...)
    bed_columns = ['example_chrom', 'pattern_start_abs', 'pattern_end_abs',
                   'pattern', 'contrib_weighted_p', 'strand', 'match_weighted_p']
    dfp = pd_first_cols(dfp, bed_columns)

    # write to the output file
    if output_file.endswith(".parq"):
        logger.info("Writing a parquet file")
        dfp.to_parquet(output_file, partition_on=['pattern_short'], engine='fastparquet')
    elif output_file.endswith(".csv.gz") or output_file.endswith(".csv"):
        logger.info("Writing a csv file")
        dfp.to_csv(output_file, compression='infer', index=False)
    elif output_file.endswith(".tsv.gz") or output_file.endswith(".tsv"):
        logger.info("Writing a tsv file")
        dfp.to_csv(output_file, sep='\t', compression='infer', index=False)
    elif output_file.endswith(".bed.gz") or output_file.endswith(".bed"):
        logger.info("Writing a BED file")
        # write only the first (and main) 7 columns
        dfp[bed_columns].to_csv(output_file, sep='\t', compression='infer',
                                index=False, header=False)
    else:
        logger.warning("File suffix not recognized. Using the gzipped csv format")
        dfp.to_csv(output_file, compression='gzip', index=False)
    logger.info("Done!")
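# Example usage of cwm_scan -- a minimal sketch; paths are hypothetical. The output-file
# suffix selects the writer (.parq, .csv[.gz], .tsv[.gz] or .bed[.gz]); a BED output keeps
# only the first seven columns.
def _example_cwm_scan():
    cwm_scan("output/modisco/",                # hypothetical modisco output directory
             "output/motif-instances.csv.gz",  # written as a gzipped csv based on the suffix
             patterns='all',                   # scan every discovered pattern
             filters='match_weighted_p>=.2,contrib_weighted_p>=.01',
             add_profile_features=False,
             num_workers=4)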