def modisco_run(output_path,  # specified by bpnet_modisco_run
                task_names,
                contrib_scores,
                hypothetical_contribs,
                one_hot,
                null_per_pos_scores,
                # specified by gin-config
                workflow=gin.REQUIRED,  # TfModiscoWorkflow
                report=None):  # reports to use
    """Run the modisco workflow, save its results and optionally render a report.

    Args:
      output_path: path of the HDF5 file the results are written to. The
        rendered report (if any) is placed in the same directory.
      task_names, contrib_scores, hypothetical_contribs, one_hot,
        null_per_pos_scores: forwarded unchanged to `workflow` as keyword
        arguments.
      workflow: TfModiscoWorkflow objects
      report: path to the report ipynb. If None, no report is rendered.

    Raises:
      ValueError: if `report` is given but the file does not exist. This is
        checked before the workflow is run so that a wrong path fails fast.
    """
    import h5py

    if report is not None:
        # Validate the report path up-front instead of after the
        # (potentially long-running) workflow has finished.
        report = os.path.abspath(os.path.expanduser(report))
        if not os.path.exists(report):
            raise ValueError(f"Report file {report} doesn't exist")

    modisco_results = workflow(task_names=task_names,
                               contrib_scores=contrib_scores,
                               hypothetical_contribs=hypothetical_contribs,
                               one_hot=one_hot,
                               null_per_pos_scores=null_per_pos_scores)

    # save the results
    logger.info(f"Saving modisco file to {output_path}")
    # Open explicitly for writing: h5py >= 3.0 defaults to read-only mode.
    # The context manager flushes and closes the file even if saving fails.
    with h5py.File(output_path, "w") as grp:
        modisco_results.save_hdf5(grp)

    if report is not None:
        logger.info("Running the report")
        # Run the jupyter notebook
        modisco_dir = os.path.dirname(output_path)
        report_path = os.path.join(modisco_dir, os.path.basename(report))
        render_ipynb(report,
                     report_path,
                     params=dict(modisco_file=output_path,
                                 modisco_dir=modisco_dir))
        logger.info(f"Done rendering the report file: {report_path}")
def ipynb_render(input_ipynb, output_ipynb, params=""):
    """Render a notebook whose parameters are supplied as a single string.

    Args:
      input_ipynb: notebook passed to `render_ipynb` as the input.
      output_ipynb: notebook passed to `render_ipynb` as the output.
      params: string converted with `bpnet.utils.kwargs_str2kwargs` before
        being handed to `render_ipynb` (presumably `key=value` pairs -
        confirm against `kwargs_str2kwargs`).
    """
    from bpnet.utils import kwargs_str2kwargs, render_ipynb

    parsed_params = kwargs_str2kwargs(params)
    render_ipynb(input_ipynb, output_ipynb, parsed_params)
def train(
    output_dir,
    model=gin.REQUIRED,
    data=gin.REQUIRED,
    eval_metric=None,
    eval_train=False,
    eval_skip=None,
    trainer_cls=SeqModelTrainer,
    eval_report=None,
    # shared
    batch_size=256,
    # train-specific
    epochs=100,
    early_stop_patience=4,
    train_epoch_frac=1.0,
    valid_epoch_frac=1.0,
    train_samples_per_epoch=None,
    validation_samples=None,
    train_batch_sampler=None,
    stratified_sampler_p=None,
    tensorboard=True,
    seed=None,
    # specified by bpnet_train
    in_memory=False,
    num_workers=8,
    gpu=None,
    memfrac_gpu=None,
    cometml_experiment=None,
    wandb_run=None,
):
    """Main entry point to configure in the gin config

    Args:
      output_dir: directory handed to the trainer and to `log_gin_config`;
        the rendered evaluation report is also written here.
      model: compiled keras model
      data: tuple of (train, valid) Datasets. The validation entry may also
        be a list of (name, Dataset) tuples, in which case the names have to
        be unique.
      eval_metric: forwarded to `trainer.evaluate`.
      eval_train: if True, also compute the evaluation metrics for the final
        model on the training set
      eval_skip List[str]: datasets to skip during evaluation. None (the
        default) means that nothing is skipped.
      trainer_cls: class instantiated as
        `trainer_cls(model, train, valid, output_dir, cometml_experiment,
        wandb_run)`.
      eval_report: path to the ipynb report file. If None or an empty
        string, the report will not be generated.
      batch_size: batch size used for loading, training and evaluation.
      epochs, early_stop_patience, train_epoch_frac, valid_epoch_frac,
        train_samples_per_epoch, validation_samples, tensorboard: forwarded
        unchanged to `trainer.train`.
      train_batch_sampler: batch sampler forwarded to `trainer.train`.
        Mutually exclusive with `stratified_sampler_p`.
      stratified_sampler_p: if given, a `StratifiedRandomBatchSampler` is
        built from `train_dataset.get_targets()` using this as `p_vec`.
      seed: random seed to use (in numpy and tensorflow)
      in_memory: if True, load the training and validation datasets into
        memory (as `NumpyDataset`) and train/evaluate with a single worker.
      num_workers: number of workers used for data loading.
      gpu, memfrac_gpu: only forwarded to the evaluation report as
        parameters.
      cometml_experiment: if not None, `output_dir` is uploaded through it
        at the end.
      wandb_run: forwarded to `log_gin_config` and the trainer.

    Returns:
      The metrics returned by `trainer.evaluate`.

    Raises:
      ValueError: if the evaluation report doesn't exist, if the validation
        dataset names are not unique or if both `stratified_sampler_p` and
        `train_batch_sampler` are specified.
    """
    # from this point on, no configurable should be added. Save the gin config
    log_gin_config(output_dir, cometml_experiment, wandb_run)

    # Use a fresh list per call instead of a shared mutable default argument
    if eval_skip is None:
        eval_skip = []

    train_dataset, valid_dataset = data[0], data[1]

    if not eval_report:
        # An empty string would otherwise resolve to the current working
        # directory, pass the existence check and only fail after training.
        eval_report = None
    else:
        eval_report = os.path.abspath(os.path.expanduser(eval_report))
        if not os.path.exists(eval_report):
            raise ValueError(f"Evaluation report {eval_report} doesn't exist")

    if seed is not None:
        # Set the random seed
        import random
        random.seed(seed)
        np.random.seed(seed)
        try:
            import tensorflow as tf
            tf.set_random_seed(seed)
        except Exception:
            # Deliberately best-effort: training proceeds without a
            # tensorflow seed
            logger.info("Unable to set random seed for tensorflow")

    # make sure the validation dataset names are unique
    if isinstance(valid_dataset, list):
        dataset_names = []
        for d in valid_dataset:
            dataset_name = d[0]
            if dataset_name in dataset_names:
                raise ValueError("The dataset names are not unique")
            dataset_names.append(dataset_name)

    if stratified_sampler_p is not None and train_batch_sampler is not None:
        raise ValueError(
            "stratified_sampler_p and train_batch_sampler are mutually exclusive."
            " Please specify only one of them.")

    if stratified_sampler_p is not None:
        # train_batch_sampler is guaranteed to be None here (checked above)
        # HACK - there is no guarantee that train_dataset.get_targets() will exist
        # Maybe we have to introduce a ClassificationDataset instead which will
        # always implement get_targets()
        logger.info(
            f"Using stratified samplers with p: {stratified_sampler_p}")
        train_batch_sampler = samplers.StratifiedRandomBatchSampler(
            train_dataset.get_targets().max(axis=1),
            batch_size=batch_size,
            p_vec=stratified_sampler_p,
            verbose=True)

    # remember the requested number of workers; the report gets this value
    # even when in-memory training falls back to a single worker
    num_workers_orig = num_workers
    if in_memory:
        # load the training datasets to memory
        logger.info("Loading the training data into memory")
        train_dataset = NumpyDataset(
            train_dataset.load_all(batch_size=batch_size,
                                   num_workers=num_workers))
        logger.info("Loading the validation data into memory")
        if isinstance(valid_dataset, list):
            # appropriately handle the scenario where multiple
            # validation data may be provided as a list of (name, Dataset) tuples
            valid_dataset = [
                (name, NumpyDataset(dataset.load_all(batch_size=batch_size,
                                                     num_workers=num_workers)))
                for name, dataset in valid_dataset
            ]
        else:
            # only a single Dataset was provided
            valid_dataset = NumpyDataset(
                valid_dataset.load_all(batch_size=batch_size,
                                       num_workers=num_workers))
        num_workers = 1  # don't use multi-processing any more

    tr = trainer_cls(model, train_dataset, valid_dataset, output_dir,
                     cometml_experiment, wandb_run)

    tr.train(batch_size=batch_size,
             epochs=epochs,
             early_stop_patience=early_stop_patience,
             num_workers=num_workers,
             train_epoch_frac=train_epoch_frac,
             valid_epoch_frac=valid_epoch_frac,
             train_samples_per_epoch=train_samples_per_epoch,
             validation_samples=validation_samples,
             train_batch_sampler=train_batch_sampler,
             tensorboard=tensorboard)
    final_metrics = tr.evaluate(eval_metric,
                                batch_size=batch_size,
                                num_workers=num_workers,
                                eval_train=eval_train,
                                eval_skip=eval_skip,
                                save=True)
    logger.info("Done!")
    print("-" * 40)
    print("Final metrics: ")
    print(json.dumps(final_metrics, cls=NumpyAwareJSONEncoder, indent=2))

    if eval_report is not None:
        logger.info("Running the evaluation report")
        # Release the GPU
        K.clear_session()

        # remove memory
        del tr, train_dataset, valid_dataset, data
        gc.collect()

        # Run the jupyter notebook with the originally requested number of
        # workers
        render_ipynb(eval_report,
                     os.path.join(output_dir, os.path.basename(eval_report)),
                     params=dict(model_dir=os.path.abspath(output_dir),
                                 gpu=gpu,
                                 memfrac_gpu=memfrac_gpu,
                                 in_memory=in_memory,
                                 num_workers=num_workers_orig))

    # upload all files in output_dir to comet.ml
    # Note: wandb does this automatically
    if cometml_experiment is not None:
        logger.info("Uploading files to comet.ml")
        cometml_experiment.log_asset_folder(folder=output_dir)

    logger.info(
        f"Done training and evaluating the model. Model and metrics can be found in: {output_dir}"
    )

    return final_metrics
def modisco_report(modisco_dir, output_dir):
    """Render the modisco-chip template notebook into `output_dir`.

    Args:
      modisco_dir: passed to the notebook as its `modisco_dir` parameter.
      output_dir: directory in which `modisco-chip.ipynb` is written.
    """
    # The template is located relative to the module-level `this_path`
    template_ipynb = os.path.join(this_path, "../templates/modisco-chip.ipynb")
    rendered_ipynb = os.path.join(output_dir, "modisco-chip.ipynb")
    render_ipynb(template_ipynb,
                 rendered_ipynb,
                 params={"modisco_dir": modisco_dir})