def log_gin_config(output_dir, cometml_experiment=None, wandb_run=None, prefix=''):
    """Persist the operative gin configuration and report it to the trackers.

    The operative config string is printed to stdout, written to
    `<output_dir>/<prefix>config.gin`, converted to a dictionary with
    `gin2dict` and written to `<output_dir>/<prefix>config.gin.json`.
    If provided, the comet.ml experiment and the wandb run receive the
    dictionary as well.

    Args:
      output_dir: directory receiving both config files
      cometml_experiment: optional comet.ml experiment; the config dictionary
        is passed to its `log_parameters` method
      wandb_run: optional wandb run; its `config` is updated with the
        dictionary
      prefix: string prepended to both output file names
    """
    operative_config = gin.operative_config_str()

    # echo the config to stdout, framed by two rulers
    print("Used config: " + "-" * 40)
    print(operative_config)
    print("-" * 52)

    # raw gin text
    gin_path = os.path.join(output_dir, f"{prefix}config.gin")
    with open(gin_path, "w") as fout:
        fout.write(operative_config)

    # dictionary version of the same config
    config_dict = gin2dict(operative_config)
    json_path = os.path.join(output_dir, f"{prefix}config.gin.json")
    write_json(config_dict, json_path, sort_keys=True, indent=2)

    if cometml_experiment is not None:
        cometml_experiment.log_parameters(config_dict)

    if wandb_run is not None:
        # wandb receives the keys with "." replaced by "/"
        # (per the original note this allows displaying them on the dashboard)
        wandb_config = {}
        for key, value in config_dict.items():
            wandb_config[key.replace(".", "/")] = value
        wandb_run.config.update(wandb_config)
def evaluate(self, metric, batch_size=256, num_workers=8, eval_train=False, eval_skip=None, save=True, **kwargs):
    """Evaluate the model on the validation set(s).

    Every entry of `self.valid_dataset` has to be a tuple of either
    `(name, dataset)` or `(name, dataset, metric)`. Each dataset is passed to
    `self.seq_model.evaluate` and the results are collected by dataset name.

    Args:
      metric: a function accepting (y_true, y_pred) and returning the
        evaluation metric(s). NOTE: the current implementation does not use
        this argument. Two-element entries are evaluated with
        `eval_metric=None` and three-element entries with their own metric.
      batch_size: batch size forwarded to `self.seq_model.evaluate`
      num_workers: number of workers forwarded to `self.seq_model.evaluate`
      eval_train: if True, also compute the evaluation metrics on the
        training set (added under the name 'train')
      eval_skip: collection of dataset names to exclude from the evaluation.
        None (default) or an empty collection skips nothing.
      save: save the json file to `self.evaluation_path`
      **kwargs: ignored; a warning is logged if any are provided

    Returns:
      OrderedDict mapping the dataset name to the result of
      `self.seq_model.evaluate`.

    Raises:
      ValueError: if a dataset entry doesn't have 2 or 3 elements
    """
    if kwargs:
        logger.warning(f"Extra kwargs were provided to trainer.evaluate(): {kwargs}")

    # Save the complete model -> HACK
    self.seq_model.save(os.path.join(self.output_dir, 'seq_model.pkl'))

    # construct a list of datasets to evaluate
    if eval_train:
        eval_datasets = [('train', self.train_dataset)] + self.valid_dataset
    else:
        eval_datasets = self.valid_dataset

    # skip some datasets for evaluation. The name is always the first element,
    # hence both (name, dataset) and (name, dataset, metric) entries can be skipped.
    if eval_skip is not None:
        try:
            if len(eval_skip) > 0:
                logger.info(f"Using eval_skip: {eval_skip}")
                eval_datasets = [d for d in eval_datasets if d[0] not in eval_skip]
        except (TypeError, IndexError, KeyError):
            # best-effort: keep all the datasets if the entries can't be inspected
            logger.warning(f"eval datasets don't contain tuples. Unable to skip them using {eval_skip}")

    metric_res = OrderedDict()
    for d in eval_datasets:
        if len(d) == 2:
            dataset_name, dataset = d
            eval_metric = None  # Ignore the provided metric
        elif len(d) == 3:
            # specialized evaluation metric was passed
            dataset_name, dataset, eval_metric = d
        else:
            raise ValueError("Valid dataset needs to be a list of tuples of 2 or 3 elements"
                             "(name, dataset) or (name, dataset, metric)")
        logger.info(f"Evaluating dataset: {dataset_name}")
        metric_res[dataset_name] = self.seq_model.evaluate(dataset,
                                                           eval_metric=eval_metric,
                                                           num_workers=num_workers,
                                                           batch_size=batch_size)

    if save:
        write_json(metric_res, self.evaluation_path, indent=2)
        logger.info("Saved metrics to {}".format(self.evaluation_path))

    # report the (flattened) metrics to the experiment trackers
    if self.cometml_experiment is not None:
        self.cometml_experiment.log_metrics(flatten(metric_res, separator='/'), prefix="eval/")

    if self.wandb_run is not None:
        self.wandb_run.summary.update(flatten(dict_prefix_key(metric_res, prefix="eval/"), separator='/'))

    return metric_res
def bpnet_train(dataspec,
                output_dir,
                premade='bpnet9',
                config=None,
                override='',
                gpu=0,
                memfrac_gpu=0.45,
                num_workers=8,
                vmtouch=False,
                in_memory=False,
                wandb_project="",
                cometml_project="",
                run_id=None,
                note_params="",
                overwrite=False):
    """Train a model using gin-config

    Files written directly by this function into the run directory returned
    by `start_experiment`:
      bpnet-train.kwargs.json - arguments this function was called with
      input-config.gin - copy of `config` (only if `config` was specified)
      dataspec.yml - dataspec dumped through `related_dump_yaml`

    Further files are produced by `start_experiment` and by the final `train`
    call. According to the original documentation these include:
      train.log - log file
      model.h5 - Keras model HDF5 file
      the serialized SeqModel (pickle file). This is the main trained model.
      eval-report.ipynb/.html - evaluation report containing training loss
        curves and some example model predictions. You can specify your own
        ipynb using `--override='report_template.name="my-template.ipynb"'`.

    Args:
      dataspec: path to the dataspec file loaded with `DataSpec.load`
      output_dir: parent output directory; the run is stored in a
        sub-directory chosen by `start_experiment`
      premade: name of the premade config passed to `_get_gin_files`
      config: optional path to a gin config file
      override: semi-colon separated gin bindings applied after the bindings
        inferred from the dataspec
      gpu: gpu to use. If None, no tensorflow session is created here.
      memfrac_gpu: per-process gpu memory fraction
      num_workers: forwarded to `train`
      vmtouch: if True and the `vmtouch` executable is available, call
        `ds.touch_all_files()`
      in_memory: forwarded to `train`
      wandb_project, cometml_project, run_id, note_params, overwrite:
        forwarded to `start_experiment`

    Returns:
      The return value of `train`.

    Raises:
      ValueError: if more than one bias track specification is present or if
        the dataspec doesn't contain any task
    """
    cometml_experiment, wandb_run, output_dir = start_experiment(
        output_dir=output_dir,
        cometml_project=cometml_project,
        wandb_project=wandb_project,
        run_id=run_id,
        note_params=note_params,
        overwrite=overwrite)

    # remember the executed command
    write_json(
        {
            "dataspec": dataspec,
            "output_dir": output_dir,
            "premade": premade,
            "config": config,
            "override": override,
            "gpu": gpu,
            "memfrac_gpu": memfrac_gpu,
            "num_workers": num_workers,
            "vmtouch": vmtouch,
            "in_memory": in_memory,
            "wandb_project": wandb_project,
            "cometml_project": cometml_project,
            "run_id": run_id,
            "note_params": note_params,
            "overwrite": overwrite
        },
        os.path.join(output_dir, 'bpnet-train.kwargs.json'),
        indent=2)

    # copy dataspec.yml and input config file over
    if config is not None:
        shutil.copyfile(config, os.path.join(output_dir, 'input-config.gin'))

    # parse and validate the dataspec
    ds = DataSpec.load(dataspec)
    related_dump_yaml(ds.abspath(), os.path.join(output_dir, 'dataspec.yml'))

    if vmtouch:
        if shutil.which('vmtouch') is None:
            logger.warning(
                "vmtouch is currently not installed. "
                "--vmtouch disabled. Please install vmtouch to enable it")
        else:
            # use vmtouch to load all file to memory
            ds.touch_all_files()

    # --------------------------------------------
    # Parse the config file
    if gpu is not None:
        logger.info(f"Using gpu: {gpu}, memory fraction: {memfrac_gpu}")
        create_tf_session(gpu, per_process_gpu_memory_fraction=memfrac_gpu)

    gin_files = _get_gin_files(premade, config)

    # infer different hyper-parameters from the dataspec file
    if len(ds.bias_specs) > 0:
        use_bias = True
        if len(ds.bias_specs) > 1:
            # TODO - allow multiple bias track
            # - split the heads separately
            raise ValueError("Only a single bias track is currently supported")

        _, bias = next(iter(ds.bias_specs.items()))
        n_bias_tracks = len(bias.tracks)
    else:
        use_bias = False
        n_bias_tracks = 0

    tasks = list(ds.task_specs)
    if not tasks:
        raise ValueError(f"No tasks were found in the dataspec: {dataspec}")
    # TODO - handle multiple track widths?
    # the number of tracks of the first task is used for all the tasks
    _, first_task_spec = next(iter(ds.task_specs.items()))
    tracks_per_task = len(first_task_spec.tracks)

    # figure out the right hyper-parameters
    dataspec_bindings = [
        f'dataspec="{dataspec}"',
        f'use_bias={use_bias}',
        f'n_bias_tracks={n_bias_tracks}',
        f'tracks_per_task={tracks_per_task}',
        f'tasks={tasks}'
    ]

    gin.parse_config_files_and_bindings(
        gin_files,
        bindings=dataspec_bindings + override.split(";"),
        # NOTE: custom files were inserted right after
        # the user's config file and before the `override`
        # parameters specified at the command-line
        # This allows the user to disable the bias correction
        # despite being specified in the config file
        skip_unknown=False)

    # --------------------------------------------
    # Remember the parsed configs
    cli_params = dict(premade=premade,
                      config=config,
                      override=override,
                      gin_files=gin_files,
                      gpu=gpu)

    # comet - log environment
    if cometml_experiment is not None:
        # log other parameters
        cometml_experiment.log_parameters(cli_params, prefix='cli/')

    # wandb - log environment
    if wandb_run is not None:
        # store general configs
        wandb_run.config.update(dict_prefix_key(cli_params, prefix='cli/'))

    return train(
        output_dir=output_dir,
        cometml_experiment=cometml_experiment,
        wandb_run=wandb_run,
        num_workers=num_workers,
        in_memory=in_memory,
        # to execute the sub-notebook
        memfrac_gpu=memfrac_gpu,
        gpu=gpu)
def start_experiment(output_dir,
                     cometml_project="",
                     wandb_project="",
                     run_id=None,
                     note_params="",
                     extra_kwargs=None,
                     overwrite=False):
    """Start a model training experiment.

    This will create a new output directory (`<output_dir>/<run_id>`), attach
    a log file to it and setup the experiment management handles.

    Args:
      output_dir: parent directory of the run directory
      cometml_project: comet.ml project in the form `<workspace>/<project>`.
        An empty string disables comet.ml.
      wandb_project: wandb project in the form `<entity>/<project>`.
        An empty string disables wandb. wandb is also skipped (with a
        warning) if the `wandb` module is not available.
      run_id: name of the run sub-directory. If None, it is derived from the
        wandb run directory, the comet.ml experiment id or a random uuid4
        (in this order of preference). If given, it is also passed to
        `wandb.init` as `resume`.
      note_params: string parsed with `kv_string2dict` and stored to
        `note_params.json`
      extra_kwargs: currently unused
      overwrite: if True, remove an existing run directory containing
        `config.gin`; otherwise such a directory raises a ValueError

    Returns:
      Tuple of (cometml_experiment, wandb_run, output_dir) where the first two
      elements are None if the respective service is not used and
      `output_dir` is the run directory.

    Raises:
      ImportError: if comet.ml was requested but could not be imported
      ValueError: if `wandb_project` doesn't contain "/" or if the run
        directory already contains `config.gin` and `overwrite` is False
    """
    sys.path.append(os.getcwd())

    # --- comet.ml ---
    if cometml_project:
        logger.info("Using comet.ml")
        if Experiment is None:
            raise ImportError("Comet.ml could not be imported")
        workspace, project_name = cometml_project.split("/")
        cometml_experiment = Experiment(project_name=project_name, workspace=workspace)
        # TODO - get the experiment id
        # specify output_dir to that directory
    else:
        cometml_experiment = None

    # --- wandb ---
    if wandb_project:
        if "/" not in wandb_project:
            raise ValueError(
                f"wandb_project needs to have the form <entity>/<project>. Got: {wandb_project}")
        entity, project = wandb_project.split("/")
        if wandb is None:
            logger.warning("wandb not installed. Not using it")
            wandb_run = None
        else:
            logger.info("Using wandb. Running wandb.init()")
            wandb._set_stage_dir("./")  # Don't prepend wandb to output file
            if run_id is not None:
                wandb.init(project=project,
                           dir=output_dir,
                           entity=entity,
                           reinit=True,
                           resume=run_id)
            else:
                # automatically set the output
                wandb.init(project=project,
                           entity=entity,
                           reinit=True,
                           dir=output_dir)
            wandb_run = wandb.run
            if wandb_run is None:
                logger.warning("Wandb run is None")
            print(wandb_run)
    else:
        wandb_run = None

    # update the output directory
    if run_id is None:
        if wandb_run is not None:
            run_id = os.path.basename(wandb_run.dir)
        elif cometml_experiment is not None:
            run_id = cometml_experiment.id
        else:
            # random run_id
            run_id = str(uuid4())
    output_dir = os.path.join(output_dir, run_id)

    # TODO - the wandb run directory and output_dir should be the same
    # in order for snakemake to work correctly. Setting
    # `wandb_run._dir = os.path.normpath(output_dir)` doesn't work.

    # -----------------------------
    if os.path.exists(os.path.join(output_dir, 'config.gin')):
        if overwrite:
            logger.info(
                f"config.gin already exists in the output "
                f"directory {output_dir}. Removing the whole directory.")
            shutil.rmtree(output_dir)
        else:
            raise ValueError(f"Output directory {output_dir} shouldn't exist!")
    # make the output directory (it may exist as long as it has no config.gin)
    os.makedirs(output_dir, exist_ok=True)

    # add logging to the file
    add_file_logging(output_dir, logger)

    # write note_params.json
    if note_params:
        logger.info(f"note_params: {note_params}")
        note_params_dict = kv_string2dict(note_params)
    else:
        note_params_dict = dict()
    write_json(note_params_dict,
               os.path.join(output_dir, "note_params.json"),
               sort_keys=True,
               indent=2)

    if cometml_experiment is not None:
        cometml_experiment.log_parameters(note_params_dict)
        cometml_experiment.log_parameters(dict(output_dir=output_dir), prefix='cli/')

        exp_url = f"https://www.comet.ml/{cometml_experiment.workspace}/{cometml_experiment.project_name}/{cometml_experiment.id}"
        logger.info("Comet.ml url: " + exp_url)
        # write the information about comet.ml experiment
        write_json(
            {
                "url": exp_url,
                "key": cometml_experiment.id,
                "project": cometml_experiment.project_name,
                "workspace": cometml_experiment.workspace
            },
            os.path.join(output_dir, "cometml.json"),
            sort_keys=True,
            indent=2)

    if wandb_run is not None:
        wandb_run.config.update(note_params_dict)
        # write the information about the wandb run
        write_json(
            {
                "url": wandb_run.get_url(),
                "key": wandb_run.id,
                "project": wandb_run.project,
                "path": wandb_run.path,
                "group": wandb_run.group
            },
            os.path.join(output_dir, "wandb.json"),
            sort_keys=True,
            indent=2)
        wandb_run.config.update(
            dict_prefix_key(dict(output_dir=output_dir), prefix='cli/'))

    return cometml_experiment, wandb_run, output_dir
def cwm_scan(modisco_dir,
             output_file,
             trim_frac=0.08,
             patterns='all',
             filters='match_weighted_p>=.2,contrib_weighted_p>=.01',
             contrib_file=None,
             add_profile_features=False,
             num_workers=10):
    """Get motif instances via CWM scanning.

    Args:
      modisco_dir: directory containing `modisco.h5` and
        `modisco-run.kwargs.json`
      output_file: output path. Its suffix determines the file format and has
        to be one of: .csv, .csv.gz, .tsv, .tsv.gz, .parq, .bed, .bed.gz
      trim_frac: value passed to `trim_seq_ic` when trimming the patterns and
        to `cwm_scan_seqlets`
      patterns: comma-separated list of pattern names to scan or 'all'
      filters: comma-separated list of expressions applied to the instances
        of each pattern via `DataFrame.query`
      contrib_file: optional contribution score file. If None, it is inferred
        from `modisco_dir`.
      add_profile_features: if True, annotate the instances using
        `annotate_profile_single`
      num_workers: number of parallel workers

    Raises:
      ValueError: if `output_file` doesn't have a valid suffix or if no
        pattern was scanned
    """
    from bpnet.modisco.utils import longer_pattern, shorten_pattern
    from bpnet.modisco.pattern_instances import annotate_profile_single

    # allow path-like objects; the suffix checks below require a string
    output_file = str(output_file)
    add_file_logging(os.path.dirname(output_file), logger, 'cwm-scan')
    modisco_dir = Path(modisco_dir)

    valid_suffixes = [
        '.csv',
        '.csv.gz',
        '.tsv',
        '.tsv.gz',
        '.parq',
        '.bed',
        '.bed.gz',
    ]
    if not output_file.endswith(tuple(valid_suffixes)):
        raise ValueError(
            f"output_file doesn't have a valid file suffix. Valid file suffixes are: {valid_suffixes}"
        )

    # Centroid matches path
    cm_path = modisco_dir / f'cwm-scan-seqlets.trim-frac={trim_frac:.2f}.csv.gz'

    # save the hyper-parameters
    kwargs_json_file = os.path.join(os.path.dirname(output_file),
                                    'cwm-scan.kwargs.json')
    write_json(
        dict(modisco_dir=os.path.abspath(str(modisco_dir)),
             output_file=output_file,
             cwm_scan_seqlets_path=str(cm_path),
             trim_frac=trim_frac,
             patterns=patterns,
             filters=filters,
             contrib_file=contrib_file,
             add_profile_features=add_profile_features,
             num_workers=num_workers), str(kwargs_json_file))

    # figure out contrib_wildcard
    modisco_kwargs = read_json(
        os.path.join(modisco_dir, "modisco-run.kwargs.json"))
    contrib_type = load_contrib_type(modisco_kwargs)

    mf = ModiscoFile(modisco_dir / "modisco.h5")
    tasks = mf.tasks()
    # HACK prune the tasks of contribution (in case it's present)
    tasks = [t.split("/")[0] for t in tasks]

    logger.info(f"Using tasks: {tasks}")

    if contrib_file is None:
        cf = ContribFile.from_modisco_dir(modisco_dir)
        # cache it since it can be re-used in `modisco_centroid_seqlet_matches`
        cf.cache()
    else:
        logger.info(f"Loading the contribution scores from: {contrib_file}")
        cf = ContribFile(contrib_file, default_contrib_score=contrib_type)

    if not cm_path.exists():
        logger.info(f"Generating centroid matches to {cm_path.resolve()}")
        cwm_scan_seqlets(modisco_dir,
                         output_file=cm_path,
                         trim_frac=trim_frac,
                         contribsf=cf if contrib_file is None else None,
                         num_workers=num_workers,
                         verbose=False)
    else:
        logger.info("Centroid matches already exist.")
    logger.info(f"Loading centroid matches from {cm_path.resolve()}")
    dfm_norm = pd.read_csv(cm_path)

    # get the raw data
    seq, contrib, ranges = cf.get_seq(), cf.get_contrib(), cf.get_ranges()

    logger.info("Scanning for patterns")
    dfl = []

    # patterns to scan. `longer_pattern` makes sure the patterns are in the long format
    # NOTE: strings have to be compared by value (`!=`) and not by identity (`is not`)
    scan_patterns = patterns.split(",") if patterns != 'all' else mf.pattern_names()
    scan_patterns = [longer_pattern(pn) for pn in scan_patterns]

    if add_profile_features:
        profile = cf.get_profiles()
        logger.info("Profile features will also be added to dfi")

    for pattern_name in tqdm(mf.pattern_names()):
        if pattern_name not in scan_patterns:
            # skip scanning that patterns
            continue
        pattern = mf.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        match, contribution = pattern.scan_contribution(contrib,
                                                        hyp_contrib=None,
                                                        tasks=tasks,
                                                        n_jobs=num_workers,
                                                        verbose=False)
        seq_match = pattern.scan_seq(seq, n_jobs=num_workers, verbose=False)
        dfm = pattern.get_instances(
            tasks,
            match,
            contribution,
            seq_match,
            norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
            verbose=False,
            plot=False)
        # apply the (non-empty) filter expressions one after another
        for filt in filters.split(","):
            if len(filt) > 0:
                dfm = dfm.query(filt)

        if add_profile_features:
            dfm = annotate_profile_single(dfm,
                                          pattern_name,
                                          mf,
                                          profile,
                                          profile_width=70,
                                          trim_frac=trim_frac)
        dfm['pattern_short'] = shorten_pattern(pattern_name)

        # TODO - is it possible to write out the results incrementally?
        dfl.append(dfm)

    if not dfl:
        raise ValueError(
            f"No patterns were scanned. Requested patterns: {patterns}")

    logger.info("Merging")
    # merge and write the results
    dfp = pd.concat(dfl)

    # append the ranges
    logger.info("Append ranges")
    ranges.columns = ["example_" + v for v in ranges.columns]
    dfp = dfp.merge(ranges, on="example_idx", how='left')

    # add the absolute coordinates
    dfp['pattern_start_abs'] = dfp['example_start'] + dfp['pattern_start']
    dfp['pattern_end_abs'] = dfp['example_start'] + dfp['pattern_end']

    logger.info("Table info")
    dfp.info()
    logger.info(
        f"Writing the resuling pd.DataFrame of shape {dfp.shape} to {output_file}"
    )

    # set the first 7 columns to comply to bed6 format (chrom, start, end, name, score, strand, ...)
    bed_columns = [
        'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'pattern',
        'contrib_weighted_p', 'strand', 'match_weighted_p'
    ]
    dfp = pd_first_cols(dfp, bed_columns)

    # write the table in the format given by the file suffix
    if output_file.endswith(".parq"):
        logger.info("Writing a parquet file")
        dfp.to_parquet(output_file,
                       partition_on=['pattern_short'],
                       engine='fastparquet')
    elif output_file.endswith((".csv.gz", ".csv")):
        logger.info("Writing a csv file")
        dfp.to_csv(output_file, compression='infer', index=False)
    elif output_file.endswith((".tsv.gz", ".tsv")):
        logger.info("Writing a tsv file")
        dfp.to_csv(output_file, sep='\t', compression='infer', index=False)
    elif output_file.endswith((".bed.gz", ".bed")):
        logger.info("Writing a BED file")
        # write only the first (and main) 7 columns
        dfp[bed_columns].to_csv(output_file,
                                sep='\t',
                                compression='infer',
                                index=False,
                                header=False)
    else:
        # defensive fallback; not reachable as long as the suffix validation
        # above and these branches stay in sync
        logger.warning("File suffix not recognized. Using .csv.gz file format")
        dfp.to_csv(output_file, compression='gzip', index=False)
    logger.info("Done!")
def bpnet_modisco_run(
        contrib_file,
        output_dir,
        null_contrib_file=None,
        premade='modisco-50k',
        config=None,
        override='',
        contrib_wildcard="*/profile/wn",  # on which contribution scores to run modisco
        only_task_regions=False,
        filter_npy=None,
        exclude_chr="",
        num_workers=10,
        gpu=None,  # no need to use a gpu by default
        memfrac_gpu=0.45,
        overwrite=False,
):
    """Run TF-MoDISco on the contribution scores stored in the contribution score file
    generated by `bpnet contrib`.

    Args:
      contrib_file: path to the contribution score file
      output_dir: directory receiving `modisco.h5`, the kwargs json file,
        the gin config and `modisco-run.subset-contrib-file.npy`
      null_contrib_file: optional contribution score file used to compute the
        null scores. If None, `modisco.coordproducers.LaplaceNullDist` is used.
      premade: name of the premade config passed to `_get_gin_files`
      config: optional path to a gin config file
      override: semi-colon separated gin bindings
      contrib_wildcard: comma-separated list of `<task>/<head>/<summary>`
        entries specifying the contribution scores to use. `*` as the task
        expands to all the tasks of the contribution file.
      only_task_regions: if True, use only the regions whose
        `interval_from_task` is one of the tasks in `contrib_wildcard`.
        Ignored (with a warning) if `contrib_wildcard` covers all tasks.
      filter_npy: optional path to a numpy file with a boolean mask of the
        regions to use
      exclude_chr: chromosomes to exclude; either a comma-separated string or
        a list of chromosome names
      num_workers: number of workers (set as a gin binding)
      gpu: gpu to use. If None, the gpus are hidden through
        `CUDA_VISIBLE_DEVICES`.
      memfrac_gpu: per-process gpu memory fraction
      overwrite: passed to `remove_exists` for the already existing output files

    Raises:
      ValueError: if `contrib_wildcard` is malformed or contains an unknown
        task, or if `only_task_regions` is used with a contribution file
        where `interval_from_task` is empty for all the ranges
    """
    add_file_logging(output_dir, logger, 'modisco-run')
    if gpu is not None:
        logger.info(f"Using gpu: {gpu}, memory fraction: {memfrac_gpu}")
        create_tf_session(gpu, per_process_gpu_memory_fraction=memfrac_gpu)
    else:
        # Don't use any GPU's
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        os.environ['MKL_THREADING_LAYER'] = 'GNU'

    # imported only after the environment variables were set
    import modisco

    # parse and validate contrib_wildcard up-front
    wildcards = []
    for w in contrib_wildcard.split(","):
        parts = w.split("/")
        if len(parts) != 3:
            raise ValueError(
                f"contrib_wildcard entries need to have the form <task>/<head>/<summary>. Got: {w}")
        wildcards.append(tuple(parts))

    if filter_npy is not None:
        filter_npy = os.path.abspath(str(filter_npy))
    if config is not None:
        config = os.path.abspath(str(config))

    # setup output file paths
    output_path = os.path.abspath(os.path.join(output_dir, "modisco.h5"))
    remove_exists(output_path, overwrite=overwrite)

    output_filter_npy = os.path.abspath(
        os.path.join(output_dir, 'modisco-run.subset-contrib-file.npy'))
    remove_exists(output_filter_npy, overwrite=overwrite)
    kwargs_json_file = os.path.join(output_dir, "modisco-run.kwargs.json")
    remove_exists(kwargs_json_file, overwrite=overwrite)
    if config is not None:
        config_output_file = os.path.join(output_dir,
                                          'modisco-run.input-config.gin')
        remove_exists(config_output_file, overwrite=overwrite)
        shutil.copyfile(config, config_output_file)

    # save the hyper-parameters
    write_json(
        dict(contrib_file=os.path.abspath(contrib_file),
             output_dir=str(output_dir),
             null_contrib_file=null_contrib_file,
             config=str(config),
             override=override,
             contrib_wildcard=contrib_wildcard,
             only_task_regions=only_task_regions,
             filter_npy=str(filter_npy),
             exclude_chr=exclude_chr,
             num_workers=num_workers,
             overwrite=overwrite,
             output_filter_npy=output_filter_npy,
             gpu=gpu,
             memfrac_gpu=memfrac_gpu), kwargs_json_file)

    # setup the gin config using premade, config and override
    cli_bindings = [f'num_workers={num_workers}']
    gin.parse_config_files_and_bindings(
        _get_gin_files(premade, config),
        bindings=cli_bindings + override.split(";"),
        # NOTE: custom files were inserted right after
        # the user's config file and before the `override`
        # parameters specified at the command-line
        skip_unknown=False)
    log_gin_config(output_dir, prefix='modisco-run.')
    # --------------------------------------------

    # load the contribution file
    logger.info(f"Loading the contribution file: {contrib_file}")
    cf = ContribFile(contrib_file)
    tasks = cf.get_tasks()

    # figure out subset_tasks. `*` anywhere in contrib_wildcard means all tasks
    # (subset_tasks=None), independent of the order of the entries
    use_all_tasks = False
    named_tasks = set()
    for wc_task, _, _ in wildcards:
        if wc_task == '*':
            use_all_tasks = True
        else:
            if wc_task not in tasks:
                raise ValueError(f"task {wc_task} not found in tasks: {tasks}")
            named_tasks.add(wc_task)
    subset_tasks = None if use_all_tasks else sorted(named_tasks)

    # --------------------------------------------
    # subset the intervals
    logger.info("Loading ranges")
    ranges = cf.get_ranges()
    # include all samples at the beginning
    include_samples = np.ones(len(cf)).astype(bool)

    # --only-task-regions
    if only_task_regions:
        if subset_tasks is None:
            logger.warning(
                "contrib_wildcard contains all tasks (specified by */<head>/<summary>). Not using --only-task-regions"
            )
        elif np.all(ranges['interval_from_task'] == ''):
            raise ValueError(
                "Contribution file wasn't created from multiple set of peaks. "
                "E.g. interval_from_task='' for all ranges. Please disable --only-task-regions"
            )
        else:
            logger.info("Subsetting ranges according to `interval_from_task`")
            include_samples = include_samples & ranges[
                'interval_from_task'].isin(subset_tasks).values
            logger.info(
                f"Using {include_samples.sum()} / {len(include_samples)} regions after --only-task-regions subset"
            )

    # --exclude-chr
    # `isin` requires a list-like object, hence a (comma-separated) string is split first
    if isinstance(exclude_chr, str):
        exclude_chr_list = [c.strip() for c in exclude_chr.split(",") if c.strip()]
    else:
        exclude_chr_list = list(exclude_chr) if exclude_chr else []
    if exclude_chr_list:
        logger.info(f"Excluding chromosomes: {exclude_chr}")
        chromosomes = ranges['chr']
        include_samples = include_samples & (
            ~pd.Series(chromosomes).isin(exclude_chr_list)).values
        logger.info(
            f"Using {include_samples.sum()} / {len(include_samples)} regions after --exclude-chr subset"
        )

    # -- filter-npy
    if filter_npy is not None:
        logger.info(f"Loading a filter file from {filter_npy}")
        include_samples = include_samples & np.load(filter_npy)
        logger.info(
            f"Using {include_samples.sum()} / {len(include_samples)} regions after --filter-npy subset"
        )

    # store the subset-contrib-file.npy
    logger.info(
        f"Saving the included samples from ContribFile to {output_filter_npy}")
    np.save(output_filter_npy, include_samples)
    # --------------------------------------------

    # convert to indices
    idx = np.arange(len(include_samples))[include_samples]
    seqs = cf.get_seq(idx=idx)

    # fetch the contribution scores from the importance score file
    # expand * to use all possible values
    # TODO - allow this to be done also for all the heads?
    hyp_contrib = {}
    task_names = []
    for wc_task, head, head_summary in wildcards:
        # NOTE: the task of the *current* wildcard entry decides the expansion
        use_tasks = tasks if wc_task == '*' else [wc_task]
        for task in use_tasks:
            key = f"{task}/{head}/{head_summary}"
            task_names.append(key)
            hyp_contrib[key] = cf._subset(cf.data[f'/hyp_contrib/{key}'],
                                          idx=idx)
    contrib = {k: v * seqs for k, v in hyp_contrib.items()}

    if null_contrib_file is not None:
        logger.info(f"Using null-contrib-file: {null_contrib_file}")
        null_cf = ContribFile(null_contrib_file)
        null_seqs = null_cf.get_seq()
        null_per_pos_scores = {
            key: null_seqs * null_cf.data[f'/hyp_contrib/{key}'][:]
            for key in task_names
        }
    else:
        # default Null distribution. Requires modisco 5.0
        logger.info("Using default null_contrib_scores")
        null_per_pos_scores = modisco.coordproducers.LaplaceNullDist(
            num_to_samp=10000)

    # run modisco.
    # NOTE: `workflow` and `report` parameters are provided by gin config files
    modisco_run(task_names=task_names,
                output_path=output_path,
                contrib_scores=contrib,
                hypothetical_contribs=hyp_contrib,
                one_hot=seqs,
                null_per_pos_scores=null_per_pos_scores)

    logger.info(
        f"bpnet modisco-run finished. modisco.h5 and other files can be found in: {output_dir}"
    )