def test_HDF5BatchWriter_array(dl_batch, pred_batch_array, tmpdir):
    tmpfile = str(tmpdir.mkdir("example").join("out.h5"))
    batch = prepare_batch(dl_batch, pred_batch_array)
    writer = HDF5BatchWriter(tmpfile, chunk_size=4)
    writer.batch_write(batch)
    writer.batch_write(batch)
    writer.close()

    with HDF5Reader(tmpfile) as f:
        assert np.all(list(f.batch_iter(2))[0]['metadata']['gene_id'] ==
                      dl_batch['metadata']['gene_id'][:2])
        out = f.load_all()
        assert np.all(out['metadata']['gene_id'] ==
                      np.concatenate([dl_batch['metadata']['gene_id'],
                                      dl_batch['metadata']['gene_id']]))
        assert np.all(out['metadata']['ranges']["chr"] ==
                      np.concatenate([dl_batch['metadata']['ranges']['chr'],
                                      dl_batch['metadata']['ranges']['chr']]))
        assert np.all(out['metadata']['ranges']["start"] ==
                      np.concatenate([dl_batch['metadata']['ranges']['start'],
                                      dl_batch['metadata']['ranges']['start']]))
        assert np.all(out['preds'][:3] == pred_batch_array)
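# A minimal sketch (not from the source) of what the `dl_batch` / `prepare_batch`
# test fixtures are assumed to look like: `prepare_batch` is taken to merge the
# dataloader metadata with the model predictions into one nested dict of numpy
# arrays, which is the structure HDF5BatchWriter.batch_write() consumes above.
# All names and shapes below are illustrative assumptions, not the real fixtures.
import numpy as np


def prepare_batch_sketch(dl_batch, pred_batch_array):
    # keep the dataloader metadata as-is and attach predictions under 'preds'
    return {"metadata": dl_batch["metadata"],
            "preds": pred_batch_array}


example_dl_batch = {
    "metadata": {
        "gene_id": np.array(["g1", "g2", "g3"]),
        "ranges": {"chr": np.array(["chr1", "chr1", "chr2"]),
                   "start": np.array([10, 20, 30]),
                   "end": np.array([110, 120, 130]),
                   "id": np.array([0, 1, 2]),
                   "strand": np.array(["+", "-", "+"])},
    }
}
example_preds = np.random.rand(3, 3)  # 3 samples x 3 prediction columns
example_batch = prepare_batch_sketch(example_dl_batch, example_preds)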
def save(self, file_path, **kwargs):
    """Save the dataset to an HDF5 file.

    Any additional keyword arguments are forwarded to HDF5BatchWriter.
    """
    obj = HDF5BatchWriter(file_path=file_path, **kwargs)
    obj.batch_write(self.data)
    # Store the attrs
    for k, v in self.attrs.items():
        obj.f.attrs[k] = v
    obj.close()
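# Hedged usage sketch for the `save` method above, assuming `self` is a dataset-like
# object exposing `.data` (a nested dict of arrays) and `.attrs` (a plain dict).
# Extra keyword arguments go to HDF5BatchWriter, e.g. `chunk_size` as used elsewhere
# in this file. Reading the stored attributes back uses plain h5py; the `dataset`
# variable and file name are placeholders.
#
#   dataset.save("dataset.h5", chunk_size=512)
#
#   import h5py
#   with h5py.File("dataset.h5", "r") as f:
#       stored_attrs = dict(f.attrs)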
def test_MultipleBatchWriter(dl_batch, pred_batch_array, tmpdir):
    tmpdir = tmpdir.mkdir("example")
    h5_tmpfile = str(tmpdir.join("out.h5"))
    tsv_tmpfile = str(tmpdir.join("out.tsv"))
    batch = prepare_batch(dl_batch, pred_batch_array)
    writer = MultipleBatchWriter([TsvBatchWriter(tsv_tmpfile),
                                  HDF5BatchWriter(h5_tmpfile)])
    writer.batch_write(batch)
    writer.batch_write(batch)
    writer.close()
    assert os.path.exists(h5_tmpfile)
    assert os.path.exists(tsv_tmpfile)

    df = pd.read_csv(tsv_tmpfile, sep="\t")
    assert set(df.columns) == {'metadata/ranges/id',
                               'metadata/ranges/strand',
                               'metadata/ranges/chr',
                               'metadata/ranges/start',
                               'metadata/ranges/end',
                               'metadata/gene_id',
                               'preds/0',
                               'preds/1',
                               'preds/2'}
    assert list(df['metadata/ranges/id']) == [0, 1, 2, 0, 1, 2]
def bpnet_contrib(model_dir,
                  output_file,
                  method="grad",
                  dataspec=None,
                  regions=None,
                  fasta_file=None,  # alternative to dataspec
                  shuffle_seq=False,
                  shuffle_regions=False,
                  max_regions=None,
                  # reference='zeroes',  # Currently the only option
                  # peak_width=1000,  # automatically inferred from 'config.gin.json'
                  # seq_width=None,
                  contrib_wildcard='*/profile/wn,*/counts/pre-act',  # specifies which contrib. scores to compute
                  batch_size=512,
                  gpu=0,
                  memfrac_gpu=0.45,
                  num_workers=10,
                  storage_chunk_size=512,
                  exclude_chr='',
                  include_chr='',
                  overwrite=False,
                  skip_bias=False):
    """Run contribution scores for a BPNet model
    """
    from bpnet.extractors import _chrom_sizes
    add_file_logging(os.path.dirname(output_file), logger, 'bpnet-contrib')
    if gpu is not None:
        create_tf_session(gpu, per_process_gpu_memory_fraction=memfrac_gpu)
    else:
        # Don't use any GPUs
        os.environ['CUDA_VISIBLE_DEVICES'] = ''

    if os.path.exists(output_file):
        if overwrite:
            os.remove(output_file)
        else:
            raise ValueError(f"File exists {output_file}. Use overwrite=True to overwrite it")

    config = read_json(os.path.join(model_dir, 'config.gin.json'))
    seq_width = config['seq_width']
    peak_width = config['seq_width']
    # NOTE - seq_width has to be the same for the input and the target
    #
    # infer from the command line
    # if seq_width is None:
    #     logger.info("Using seq_width = peak_width")
    #     seq_width = peak_width
    #
    # # make sure these are int's
    # seq_width = int(seq_width)
    # peak_width = int(peak_width)

    # Split
    contrib_wildcards = contrib_wildcard.split(",")

    # Allow chr inclusion / exclusion
    if exclude_chr:
        exclude_chr = exclude_chr.split(",")
    else:
        exclude_chr = None
    if include_chr:
        include_chr = include_chr.split(",")
    else:
        include_chr = None

    logger.info("Loading the config files")
    model_dir = Path(model_dir)

    logger.info("Creating the dataset")
    from bpnet.datasets import StrandedProfile, SeqClassification
    if fasta_file is not None:
        if regions is None:
            raise ValueError("fasta_file specified. Expecting regions to be specified as well")
        dl_valid = SeqClassification(fasta_file=fasta_file,
                                     intervals_file=regions,
                                     incl_chromosomes=include_chr,
                                     excl_chromosomes=exclude_chr,
                                     auto_resize_len=seq_width)
        chrom_sizes = _chrom_sizes(fasta_file)
    else:
        if dataspec is None:
            logger.info("Using dataspec used to train the model")
            # Specify dataspec
            dataspec = model_dir / "dataspec.yml"

        ds = DataSpec.load(dataspec)
        dl_valid = StrandedProfile(ds,
                                   incl_chromosomes=include_chr,
                                   excl_chromosomes=exclude_chr,
                                   intervals_file=regions,
                                   peak_width=peak_width,
                                   shuffle=False,
                                   seq_width=seq_width)
        chrom_sizes = _chrom_sizes(ds.fasta_file)

    # Setup contribution score trimming (not required currently)
    if seq_width > peak_width:
        # Trim: make sure we can nicely trim the peak
        logger.info("Trimming the output")
        assert (seq_width - peak_width) % 2 == 0
        trim_start = (seq_width - peak_width) // 2
        trim_end = seq_width - trim_start
        assert trim_end - trim_start == peak_width
    elif seq_width == peak_width:
        trim_start = 0
        trim_end = peak_width
    else:
        raise ValueError("seq_width < peak_width")

    seqmodel = SeqModel.from_mdir(model_dir)

    # get all possible interpretation names
    # make sure they match the specified glob
    intp_names = [name for name, _ in seqmodel.get_intp_tensors(preact_only=False)
                  if fnmatch_any(name, contrib_wildcards)]
    logger.info("Using the following interpretation targets:")
    for n in intp_names:
        print(n)

    if max_regions is not None:
        if len(dl_valid) > max_regions:
            logging.info(f"Using {max_regions} regions instead of the original {len(dl_valid)}")
        else:
            logging.info(f"--max-regions={max_regions} is larger than the dataset size: {len(dl_valid)}. "
                         "Using the dataset size for max-regions")
            max_regions = len(dl_valid)
    else:
        max_regions = len(dl_valid)

    max_batches = np.ceil(max_regions / batch_size)

    writer = HDF5BatchWriter(output_file, chunk_size=storage_chunk_size)
    for i, batch in enumerate(tqdm(dl_valid.batch_iter(batch_size=batch_size,
                                                       shuffle=shuffle_regions,
                                                       num_workers=num_workers),
                                   total=max_batches)):
        # store the original batch containing 'inputs' and 'targets'
        if skip_bias:
            batch['inputs'] = {'seq': batch['inputs']['seq']}  # ignore all other inputs

        if max_batches > 0:
            if i > max_batches:
                break

        if shuffle_seq:
            # Di-nucleotide shuffle the sequences
            batch['inputs']['seq'] = onehot_dinucl_shuffle(batch['inputs']['seq'])

        for name in intp_names:
            hyp_contrib = seqmodel.contrib_score(batch['inputs']['seq'],
                                                 name=name,
                                                 method=method,
                                                 batch_size=None)  # don't second-batch
            # put contribution scores to the dictionary
            # also trim the contribution scores appropriately so that
            # the output will always be w.r.t. the peak center
            batch[f"/hyp_contrib/{name}"] = hyp_contrib[:, trim_start:trim_end]

        # Trim the sequence as well
        batch['inputs']['seq'] = batch['inputs']['seq'][:, trim_start:trim_end]

        # ? maybe it would be better to have an explicit ContribFileWriter;
        # that way the written schema would be fixed
        writer.batch_write(batch)

    # add chromosome sizes
    writer.f.attrs['chrom_sizes'] = json.dumps(chrom_sizes)
    writer.close()
    logger.info(f"Done. Contribution score file was saved to: {output_file}")
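# Hedged usage sketch for bpnet_contrib() above. The model directory path and GPU
# index are placeholders; the keyword arguments mirror the signature defined above.
# Reading the chromosome sizes back relies on them being stored as a JSON string in
# the HDF5 file attributes, as done at the end of bpnet_contrib().
#
#   bpnet_contrib(model_dir="models/my_bpnet_run",      # placeholder path
#                 output_file="contrib.scores.h5",
#                 method="grad",
#                 contrib_wildcard="*/profile/wn",
#                 batch_size=256,
#                 gpu=0,
#                 overwrite=True)
#
#   import json, h5py
#   with h5py.File("contrib.scores.h5", "r") as f:
#       chrom_sizes = json.loads(f.attrs["chrom_sizes"])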
model = get_model("DeepSEA/variantEffects")
dl_kwargs = {'fasta_file': fasta_file, 'num_chr_fasta': True}
output_dir = Path(args.output_dir)
output_name = os.path.basename(args.vcf).split('.vcf')[0]

if args.writer == "zarr":
    from kipoi.writers import ZarrBatchWriter, AsyncBatchWriter
    td = output_name + ".zarr"
    writer = SyncBatchWriter(AsyncBatchWriter(
        ZarrBatchWriter(str(output_dir / td), chunk_size=1024)))
elif args.writer == "lmdb":
    td = output_name + ".lmdb"
    writer = SyncBatchWriter(AsyncSyncPredictionsWriter(
        LmdbBatchWriter(str(output_dir / td), "DeepSea_veff", 274578419865)))
elif args.writer == "hdf5":
    td = output_name + ".hdf5"
    from kipoi.writers import HDF5BatchWriter
    writer = SyncBatchWriter(HDF5BatchWriter(str(output_dir / td)))

print("Start predictions..")
sp.score_variants(model=model,
                  input_vcf=args.vcf,
                  batch_size=16,
                  num_workers=10,
                  dl_args=dl_kwargs,
                  output_writers=writer)
def imp_score(model_dir, output_file, method="grad", split='all', batch_size=512,
              num_workers=10, h5_chunk_size=512, max_batches=-1, shuffle_seq=False,
              memfrac=0.45, exclude_chr='', overwrite=False, gpu=None):
    """Run importance scores for a BPNet model

    Args:
      model_dir: path to the model directory
      output_file: output file path (HDF5 format)
      method: which importance scoring method to use ('grad', 'deeplift' or 'ism')
      split: for which dataset split to compute the importance scores
      h5_chunk_size: hdf5 chunk size
      exclude_chr: comma-separated list of chromosomes to exclude
      overwrite: if True, overwrite the output directory
      gpu (int): which GPU to use locally. If None, GPU is not used
    """
    add_file_logging(os.path.dirname(output_file), logger, 'modisco-score')
    if gpu is not None:
        create_tf_session(gpu, per_process_gpu_memory_fraction=memfrac)
    else:
        # Don't use any GPUs
        os.environ['CUDA_VISIBLE_DEVICES'] = ''

    if os.path.exists(output_file):
        if overwrite:
            os.remove(output_file)
        else:
            raise ValueError(f"File exists {output_file}. Use overwrite=True to overwrite it")

    if exclude_chr:
        exclude_chr = exclude_chr.split(",")
    else:
        exclude_chr = []

    # load the config files
    logger.info("Loading the config files")
    model_dir = Path(model_dir)
    hp = HParams.load(model_dir / "hparams.yaml")
    ds = DataSpec.load(model_dir / "dataspec.yaml")
    tasks = list(ds.task_specs)

    # validate that the correct dataset was used
    if hp.data.name != 'get_StrandedProfile_datasets':
        logger.warn("hp.data.name != 'get_StrandedProfile_datasets'")

    if split == 'valid':
        assert len(exclude_chr) == 0
        incl_chromosomes = hp.data.kwargs['valid_chr']
        excl_chromosomes = None
    elif split == 'test':
        assert len(exclude_chr) == 0
        incl_chromosomes = hp.data.kwargs['test_chr']
        excl_chromosomes = None
    elif split == 'train':
        assert len(exclude_chr) == 0
        incl_chromosomes = None
        excl_chromosomes = (hp.data.kwargs['valid_chr'] + hp.data.kwargs['test_chr']
                            + hp.data.kwargs.get('exclude_chr', []))
    elif split == 'all':
        incl_chromosomes = None
        excl_chromosomes = hp.data.kwargs.get('exclude_chr', []) + exclude_chr
        logger.info(f"Excluding chromosomes: {excl_chromosomes}")
    else:
        raise ValueError("split needs to be from {train,valid,test,all}")

    logger.info("Creating the dataset")
    from basepair.datasets import StrandedProfile
    seq_len = hp.data.kwargs['peak_width']
    dl_valid = StrandedProfile(ds,
                               incl_chromosomes=incl_chromosomes,
                               excl_chromosomes=excl_chromosomes,
                               peak_width=seq_len,
                               shuffle=False,
                               target_transformer=None)

    bpnet = BPNet.from_mdir(model_dir)

    writer = HDF5BatchWriter(output_file, chunk_size=h5_chunk_size)
    for i, batch in enumerate(tqdm(dl_valid.batch_iter(batch_size=batch_size,
                                                       num_workers=num_workers))):
        if max_batches > 0 and i > max_batches:
            logging.info(f"max_batches: {max_batches} exceeded. Stopping the computation")
            break

        # append the bias model predictions
        # (batch['inputs'], batch['targets']) = bm((batch['inputs'], batch['targets']))

        # store the original batch containing 'inputs' and 'targets'
        wdict = batch

        if shuffle_seq:
            # Di-nucleotide shuffle the sequences
            if 'seq' in batch['inputs']:
                batch['inputs']['seq'] = onehot_dinucl_shuffle(batch['inputs']['seq'])
            else:
                batch['inputs'] = onehot_dinucl_shuffle(batch['inputs'])

        # loop through all tasks, pred_summary and strands
        for task_i, task in enumerate(tasks):
            for pred_summary in ['count', 'weighted']:
                # figure out the number of channels
                nstrands = batch['targets'][f'profile/{task}'].shape[-1]
                strand_hash = ["pos", "neg"]

                for strand_i in range(nstrands):
                    hyp_imp = bpnet.imp_score(batch['inputs'],
                                              task=task,
                                              strand=strand_hash[strand_i],
                                              method=method,
                                              pred_summary=pred_summary,
                                              batch_size=None)  # don't second-batch
                    # put importance scores to the dictionary
                    wdict[f"/hyp_imp/{task}/{pred_summary}/{strand_i}"] = hyp_imp

        writer.batch_write(wdict)
    writer.close()
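# Hedged usage sketch for imp_score() above. Paths are placeholders; the resulting
# HDF5 file contains the original 'inputs'/'targets' entries of each batch plus the
# '/hyp_imp/<task>/<pred_summary>/<strand>' datasets written in the loop above.
#
#   imp_score(model_dir="models/my_bpnet_run",          # placeholder path
#             output_file="imp_scores.valid.h5",
#             method="grad",
#             split="valid",
#             batch_size=256,
#             gpu=0,
#             overwrite=True)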
def imp_score_seqmodel(model_dir,
                       output_file,
                       dataspec=None,
                       peak_width=1000,
                       seq_width=None,
                       intp_pattern='*',  # specifies which imp. scores to compute
                       # skip_trim=False,  # skip trimming the output
                       method="deeplift",
                       batch_size=512,
                       max_batches=-1,
                       shuffle_seq=False,
                       memfrac=0.45,
                       num_workers=10,
                       h5_chunk_size=512,
                       exclude_chr='',
                       include_chr='',
                       overwrite=False,
                       skip_bias=False,
                       gpu=None):
    """Run importance scores for a BPNet model

    Args:
      model_dir: path to the model directory
      output_file: output file path (HDF5 format)
      method: which importance scoring method to use ('grad', 'deeplift' or 'ism')
      split: for which dataset split to compute the importance scores
      h5_chunk_size: hdf5 chunk size
      exclude_chr: comma-separated list of chromosomes to exclude
      overwrite: if True, overwrite the output directory
      skip_bias: if True, don't store the bias tracks in the output
      gpu (int): which GPU to use locally. If None, GPU is not used
    """
    add_file_logging(os.path.dirname(output_file), logger, 'modisco-score-seqmodel')
    if gpu is not None:
        create_tf_session(gpu, per_process_gpu_memory_fraction=memfrac)
    else:
        # Don't use any GPUs
        os.environ['CUDA_VISIBLE_DEVICES'] = ''

    if os.path.exists(output_file):
        if overwrite:
            os.remove(output_file)
        else:
            raise ValueError(f"File exists {output_file}. Use overwrite=True to overwrite it")

    if seq_width is None:
        logger.info("Using seq_width = peak_width")
        seq_width = peak_width

    # make sure these are int's
    seq_width = int(seq_width)
    peak_width = int(peak_width)

    # Split
    intp_patterns = intp_pattern.split(",")

    # Allow chr inclusion / exclusion
    if exclude_chr:
        exclude_chr = exclude_chr.split(",")
    else:
        exclude_chr = None
    if include_chr:
        include_chr = include_chr.split(",")
    else:
        include_chr = None

    logger.info("Loading the config files")
    model_dir = Path(model_dir)

    if dataspec is None:
        # Specify dataspec
        dataspec = model_dir / "dataspec.yaml"
    ds = DataSpec.load(dataspec)

    logger.info("Creating the dataset")
    from basepair.datasets import StrandedProfile
    dl_valid = StrandedProfile(ds,
                               incl_chromosomes=include_chr,
                               excl_chromosomes=exclude_chr,
                               peak_width=peak_width,
                               seq_width=seq_width,
                               shuffle=False,
                               taskname_first=True,  # Required to work nicely with imp-score
                               target_transformer=None)

    # Setup importance score trimming
    if seq_width > peak_width:
        # Trim: make sure we can nicely trim the peak
        logger.info("Trimming the output")
        assert (seq_width - peak_width) % 2 == 0
        trim_start = (seq_width - peak_width) // 2
        trim_end = seq_width - trim_start
        assert trim_end - trim_start == peak_width
    elif seq_width == peak_width:
        trim_start = 0
        trim_end = peak_width
    else:
        raise ValueError("seq_width < peak_width")

    seqmodel = SeqModel.from_mdir(model_dir)

    # get all possible interpretation names
    # make sure they match the specified glob
    intp_names = [name for name, _ in seqmodel.get_intp_tensors(preact_only=False)
                  if fnmatch_any(name, intp_patterns)]
    logger.info("Using the following interpretation targets:")
    for n in intp_names:
        print(n)

    writer = HDF5BatchWriter(output_file, chunk_size=h5_chunk_size)

    for i, batch in enumerate(tqdm(dl_valid.batch_iter(batch_size=batch_size,
                                                       num_workers=num_workers))):
        # store the original batch containing 'inputs' and 'targets'
        wdict = batch
        if skip_bias:
            wdict['inputs'] = {'seq': wdict['inputs']['seq']}  # ignore all other inputs

        if max_batches > 0 and i > max_batches:
            logging.info(f"max_batches: {max_batches} exceeded. Stopping the computation")
            break

        if shuffle_seq:
            # Di-nucleotide shuffle the sequences
            batch['inputs']['seq'] = onehot_dinucl_shuffle(batch['inputs']['seq'])

        for name in intp_names:
            hyp_imp = seqmodel.imp_score(batch['inputs']['seq'],
                                         name=name,
                                         method=method,
                                         batch_size=None)  # don't second-batch
            # put importance scores to the dictionary
            # also trim the importance scores appropriately so that
            # the output will always be w.r.t. the peak center
            wdict[f"/hyp_imp/{name}"] = hyp_imp[:, trim_start:trim_end]

        # trim the sequence as well
        if isinstance(wdict['inputs'], dict):
            wdict['inputs']['seq'] = wdict['inputs']['seq'][:, trim_start:trim_end]
        else:
            wdict['inputs'] = wdict['inputs'][:, trim_start:trim_end]

        writer.batch_write(wdict)
    writer.close()
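# A minimal, assumption-labelled sketch for inspecting the file written by
# imp_score_seqmodel() above. HDF5Reader.load() (used the same way in modisco_run
# below) returns the nested dict written by HDF5BatchWriter, so the hypothetical
# contribution scores appear under 'hyp_imp/<interpretation name>' and the trimmed
# one-hot sequences under 'inputs' (or 'inputs/seq'). The file name is a placeholder.
#
#   d = HDF5Reader.load("imp_scores.seqmodel.h5")        # placeholder path
#   one_hot = d['inputs']['seq'] if isinstance(d['inputs'], dict) else d['inputs']
#   hyp_imp = d['hyp_imp']                               # dict keyed by interpretation name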
def modisco_run(imp_scores,
                output_dir,
                null_imp_scores=None,
                hparams=None,
                override_hparams="",
                grad_type="weighted",
                subset_tasks=None,
                filter_subset_tasks=False,
                filter_npy=None,
                exclude_chr="",
                seqmodel=False,  # interpretation glob
                # hparams=None,
                num_workers=10,
                max_strand_distance=0.1,
                overwrite=False,
                skip_dist_filter=False,
                use_all_seqlets=False,
                merge_tasks=False,
                gpu=None):
    """Run modisco

    Args:
      imp_scores: path to the hdf5 file of importance scores
      null_imp_scores: path to the null importance scores
      grad_type: for which output to compute the importance scores
      hparams: None, a path to modisco.yaml, or a ModiscoHParams object with the modisco hyper-parameters
      override_hparams: hyper-parameters overriding the settings in the hparams file
      output_dir: output file directory
      filter_npy: path to a npy file containing a boolean vector used for subsetting
      exclude_chr: comma-separated list of chromosomes to exclude
      seqmodel: if True, the importance scores came from `imp-score-seqmodel`
      subset_tasks: comma-separated list of task names to use as a subset
      filter_subset_tasks: if True, run modisco only in the regions for that TF
      skip_dist_filter: if True, distances are not used to filter
      use_all_seqlets: if True, don't restrict the number of seqlets
      merge_tasks: if True, importance scores for the tasks will be merged
      gpu: which gpu to use. If None, don't use any GPUs

    Note: when using subset_tasks, modisco will run on all the importance scores.
      If you wish to run it only for the importance scores of a particular task,
      subset it to the peak regions of interest using `filter_npy`.
    """
    plt.switch_backend('agg')
    add_file_logging(output_dir, logger, 'modisco-run')
    import os
    if gpu is not None:
        create_tf_session(gpu)
    else:
        # Don't use any GPUs
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
    os.environ['MKL_THREADING_LAYER'] = 'GNU'

    # import theano
    import modisco
    import modisco.tfmodisco_workflow.workflow

    if seqmodel:
        assert '/' in grad_type

    if subset_tasks == '':
        logger.warn("subset_tasks == ''. Not using subset_tasks")
        subset_tasks = None

    if subset_tasks == 'all':
        # Use all tasks, i.e. don't subset
        subset_tasks = None

    if subset_tasks is not None:
        subset_tasks = subset_tasks.split(",")
        if len(subset_tasks) == 0:
            raise ValueError("Provide one or more subset_tasks. Found None")

    if filter_subset_tasks and subset_tasks is None:
        print("Using filter_subset_tasks=False since `subset_tasks` is None")
        filter_subset_tasks = False

    if exclude_chr:
        exclude_chr = exclude_chr.split(",")
    else:
        exclude_chr = []

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    output_path = output_dir / "modisco.h5"
    remove_exists(output_path, overwrite)

    output_distances = output_dir / "strand_distances.h5"
    remove_exists(output_distances, overwrite)

    if filter_npy is not None:
        filter_npy = os.path.abspath(filter_npy)

    # save the hyper-parameters
    write_json(dict(imp_scores=os.path.abspath(imp_scores),
                    grad_type=grad_type,
                    output_dir=str(output_dir),
                    subset_tasks=subset_tasks,
                    filter_subset_tasks=filter_subset_tasks,
                    hparams=hparams,
                    null_imp_scores=null_imp_scores,
                    # TODO - pack into hyper-parameters as well?
                    filter_npy=filter_npy,
                    exclude_chr=",".join(exclude_chr),
                    skip_dist_filter=skip_dist_filter,
                    use_all_seqlets=use_all_seqlets,
                    max_strand_distance=max_strand_distance,
                    gpu=gpu),
               os.path.join(output_dir, "kwargs.json"))

    print("-" * 40)
    # parse the hyper-parameters
    if hparams is None:
        print("Using default hyper-parameters")
        hp = ModiscoHParams()
    else:
        if isinstance(hparams, str):
            print(f"Loading hyper-parameters from file: {hparams}")
            hp = ModiscoHParams.load(hparams)
        else:
            assert isinstance(hparams, ModiscoHParams)
            hp = hparams
    if override_hparams:
        print(f"Overriding the following hyper-parameters: {override_hparams}")
    hp = tf.contrib.training.HParams(**hp.get_modisco_kwargs()).parse(override_hparams)

    if use_all_seqlets:
        hp.max_seqlets_per_metacluster = None

    # save the hyper-parameters
    print("Using the following hyper-parameters for modisco:")
    print("-" * 40)
    related_dump_yaml(ModiscoHParams(**hp.values()),
                      os.path.join(output_dir, "hparams.yaml"),
                      verbose=True)
    print("-" * 40)

    # TODO - replace with imp_scores
    d = HDF5Reader.load(imp_scores)
    if 'hyp_imp' not in d:
        # backcompatibility
        d['hyp_imp'] = d['grads']

    if seqmodel:
        tasks = list(d['targets'])
    else:
        tasks = list(d['targets']['profile'])

    if subset_tasks is not None:
        # validate that all the `subset_tasks` are present in `tasks`
        for st in subset_tasks:
            if st not in tasks:
                raise ValueError(f"subset task {st} not found in tasks: {tasks}")
        logger.info(f"Using the following tasks: {subset_tasks} "
                    f"instead of the original tasks: {tasks}")
        tasks = subset_tasks

    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']
    n = len(one_hot)

    # --------------------
    # apply filters
    if not skip_dist_filter:
        print("Using profile prediction for the strand filtering")
        grad_type_filtered = 'weighted'
        distances = np.array([np.array([correlation(np.ravel(d['hyp_imp'][task][grad_type_filtered][0][i]),
                                                    np.ravel(d['hyp_imp'][task][grad_type_filtered][1][i]))
                                        for i in range(n)])
                              for task in tasks
                              if len(d['hyp_imp'][task][grad_type_filtered]) == 2
                              ]).T.mean(axis=-1)  # average the distances across tasks

        dist_filter = distances < max_strand_distance
        print(f"Fraction of sequences kept: {dist_filter.mean()}")

        HDF5BatchWriter.dump(output_distances,
                             {"distances": distances,
                              "included_samples": dist_filter})
    else:
        dist_filter = np.ones((n, ), dtype=bool)

    # add also the filter numpy
    if filter_npy is not None:
        print(f"Loading a filter file from {filter_npy}")
        filter_vec = np.load(filter_npy)
        dist_filter = dist_filter & filter_vec

    if filter_subset_tasks:
        assert subset_tasks is not None
        interval_from_task = pd.Series(d['metadata']['interval_from_task'])
        print(f"Subsetting the intervals according to subset_tasks: {subset_tasks}")
        print(f"Number of original regions: {dist_filter.sum()}")
        dist_filter = dist_filter & interval_from_task.isin(subset_tasks).values
        print(f"Number of filtered regions after filter_subset_tasks: {dist_filter.sum()}")

    # filter by chromosome
    if exclude_chr:
        logger.info(f"Excluding chromosomes: {exclude_chr}")
        chromosomes = d['metadata']['range']['chr']
        dist_filter = dist_filter & (~pd.Series(chromosomes).isin(exclude_chr)).values

    # -------------------------------------------------------------
    # setup importance scores
    if seqmodel:
        thr_one_hot = one_hot[dist_filter]
        thr_hypothetical_contribs = {
            f"{task}/{gt}": d['hyp_imp'][task][gt.split("/")[0]][gt.split("/")[1]][dist_filter]
            for task in tasks
            for gt in grad_type.split(",")
        }
        thr_contrib_scores = {
            f"{task}/{gt}": thr_hypothetical_contribs[f"{task}/{gt}"] * thr_one_hot
            for task in tasks
            for gt in grad_type.split(",")
        }
        task_names = [f"{task}/{gt}" for task in tasks for gt in grad_type.split(",")]
    else:
        if merge_tasks:
            thr_one_hot = np.concatenate([one_hot[dist_filter]
                                          for task in tasks
                                          for gt in grad_type.split(",")])
            thr_hypothetical_contribs = {
                "merged": np.concatenate([mean(d['hyp_imp'][task][gt])[dist_filter]
                                          for task in tasks
                                          for gt in grad_type.split(",")])
            }
            thr_contrib_scores = {"merged": thr_hypothetical_contribs['merged'] * thr_one_hot}
            task_names = ['merged']
        else:
            thr_one_hot = one_hot[dist_filter]
            thr_hypothetical_contribs = {
                f"{task}/{gt}": mean(d['hyp_imp'][task][gt])[dist_filter]
                for task in tasks
                for gt in grad_type.split(",")
            }
            thr_contrib_scores = {
                f"{task}/{gt}": thr_hypothetical_contribs[f"{task}/{gt}"] * thr_one_hot
                for task in tasks
                for gt in grad_type.split(",")
            }
            task_names = [f"{task}/{gt}" for task in tasks for gt in grad_type.split(",")]

    if null_imp_scores is not None:
        logger.info(f"Using null_imp_scores: {null_imp_scores}")
        null_isf = ImpScoreFile(null_imp_scores)
        null_per_pos_scores = {f"{task}/{gt}": v.sum(axis=-1)
                               for gt in grad_type.split(",")
                               for task, v in null_isf.get_contrib(imp_score=gt).items()
                               if task in tasks}
    else:
        # default null distribution. Requires modisco 5.0
        logger.info("Using the default null_imp_scores")
        null_per_pos_scores = modisco.coordproducers.LaplaceNullDist(num_to_samp=10000)

    # -------------------------------------------------------------
    # run modisco
    tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
        # Modisco defaults
        sliding_window_size=hp.sliding_window_size,
        flank_size=hp.flank_size,
        target_seqlet_fdr=hp.target_seqlet_fdr,
        min_passing_windows_frac=hp.min_passing_windows_frac,
        max_passing_windows_frac=hp.max_passing_windows_frac,
        min_metacluster_size=hp.min_metacluster_size,
        max_seqlets_per_metacluster=hp.max_seqlets_per_metacluster,
        seqlets_to_patterns_factory=modisco.tfmodisco_workflow.seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory(
            trim_to_window_size=hp.trim_to_window_size,        # default: 30
            initial_flank_to_add=hp.initial_flank_to_add,      # default: 10
            kmer_len=hp.kmer_len,                              # default: 8
            num_gaps=hp.num_gaps,                              # default: 3
            num_mismatches=hp.num_mismatches,                  # default: 2
            n_cores=num_workers,
            final_min_cluster_size=hp.final_min_cluster_size)  # default: 30
    )(task_names=task_names,
      contrib_scores=thr_contrib_scores,  # -> task score
      hypothetical_contribs=thr_hypothetical_contribs,
      one_hot=thr_one_hot,
      null_per_pos_scores=null_per_pos_scores)

    # -------------------------------------------------------------
    # save the results
    grp = h5py.File(output_path)
    tfmodisco_results.save_hdf5(grp)
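# Hedged usage sketch for modisco_run() above, chaining it to an importance score
# file produced by imp_score() / imp_score_seqmodel(). All paths and task names are
# placeholders; the results land in <output_dir>/modisco.h5 as written at the end
# of the function.
#
#   modisco_run(imp_scores="imp_scores.valid.h5",        # placeholder path
#               output_dir="modisco_out/",
#               grad_type="weighted",
#               subset_tasks="Oct4,Sox2",                # illustrative task names
#               num_workers=10,
#               overwrite=True)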