if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Run motif simulation')
    parser.add_argument('exp', help='which experiment to run the simulation for')
    parser.add_argument('--gpu', default=0, type=int, help='Which GPU to use')
    args = parser.parse_args()

    exp = args.exp
    # imp_score = args.imp_score
    ddir = get_data_dir()

    # load the model
    logger.info("Loading model")
    if args.gpu is not None:
        create_tf_session(args.gpu)
    model_dir = models_dir / exp
    bpnet = BPNetSeqModel.from_mdir(model_dir)

    output_dir = model_dir / 'perturbation-analysis'

    pairs = get_motif_pairs(motifs)

    dfi = multiple_load_instances({task: model_dir / f'deeplift/{task}/out/profile/wn/instances.parq'
                                   for task in bpnet.tasks},
                                  motifs=motifs)
    dfi = filter_nonoverlapping_intervals(dfi)
    # dfi = dfi.iloc[:1000]

    tasks = bpnet.tasks
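
# A minimal invocation sketch (hypothetical script name; adjust to your setup):
#
#   python run_motif_simulation.py my-experiment --gpu 0
#
# This loads the trained BPNet model from `models_dir / exp`, the per-task
# DeepLIFT instance tables, and prepares the motif-pair perturbation analysis.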
Example #2
def evaluate(model_dir,
             output_dir=None,
             gpu=0,
             exclude_metrics=False,
             splits=['train', 'valid'],
             model_path=None,
             data=None,
             hparams=None,
             dataspec=None,
             preprocessor=None):
    """
    Args:
      model_dir: path to the model directory
      output_dir: where to write the evaluation outputs (default: `<model_dir>/eval`)
      gpu: which GPU to use. If None, no GPU session is created
      exclude_metrics: if True, skip the metrics computed via model.evaluate(..)
      splits: for which data splits to compute the evaluation metrics
    """
    if gpu is not None:
        create_tf_session(gpu)
    if dataspec is not None:
        ds = DataSpec.load(dataspec)
    else:
        ds = DataSpec.load(os.path.join(model_dir, "dataspec.yaml"))
    if hparams is not None:
        hp = HParams.load(hparams)
    else:
        hp = HParams.load(os.path.join(model_dir, "hparams.yaml"))
    if model_path is not None:
        model = load_model(model_path)
    else:
        model = load_model(os.path.join(model_dir, "model.h5"))
    if output_dir is None:
        output_dir = os.path.join(model_dir, "eval")
    train, valid, test = load_data(model_dir,
                                   dataspec=dataspec,
                                   hparams=hparams,
                                   data=data,
                                   preprocessor=preprocessor)
    data = dict(train=train, valid=valid, test=test)

    metrics = {}
    profile_metrics = []
    os.makedirs(os.path.join(output_dir, "plots"),
                exist_ok=True)
    for split in tqdm(splits):
        y_pred = model.predict(data[split][0])
        y_true = data[split][1]
        if not exclude_metrics:
            eval_metrics_values = model.evaluate(data[split][0],
                                                 data[split][1])
            eval_metrics = dict(zip(_listify(model.metrics_names),
                                    _listify(eval_metrics_values)))
            eval_metrics = {split + "/" + k.replace("_", "/"): v
                            for k, v in eval_metrics.items()}
            metrics = {**eval_metrics, **metrics}
        for task in ds.task_specs:
            # Counts

            yp = y_pred[ds.task2idx(task, "counts")].sum(axis=-1)
            yt = y_true["counts/" + task].sum(axis=-1)
            # compute the correlation
            rp = pearsonr(yt, yp)[0]
            rs = spearmanr(yt, yp)[0]
            metrics = {**metrics,
                       split + f"/counts/{task}/pearsonr": rp,
                       split + f"/counts/{task}/spearmanr": rs,
                       }

            fig = plt.figure(figsize=(5, 5))
            plt.scatter(yp, yt, alpha=0.5)
            plt.xlabel("Predicted")
            plt.ylabel("Observed")
            plt.title(f"R_pearson={rp:.2f}, R_spearman={rs:.2f}")
            plt.savefig(os.path.join(output_dir, f"plots/counts.{split}.{task}.png"))
            plt.close(fig)  # close the figure so figures don't accumulate across the loop

            # Profile
            yp = softmax(y_pred[ds.task2idx(task, "profile")])
            yt = y_true["profile/" + task]
            df = eval_profile(yt, yp,
                              pos_min_threshold=hp.evaluate.pos_min_threshold,
                              neg_max_threshold=hp.evaluate.neg_max_threshold,
                              required_min_pos_counts=hp.evaluate.required_min_pos_counts,
                              binsizes=hp.evaluate.binsizes)
            df['task'] = task
            df['split'] = split
            # Evaluate for the smallest binsize
            auprc_min = df[df.binsize == min(hp.evaluate.binsizes)].iloc[0].auprc
            metrics[split + f'/profile/{task}/auprc'] = auprc_min
            profile_metrics.append(df)

    # Write the count metrics
    write_json(metrics, os.path.join(output_dir, "metrics.json"))

    # write the profile metrics
    dfm = pd.concat(profile_metrics)
    dfm.to_csv(os.path.join(output_dir, "profile_metrics.tsv"),
               sep='\t', index=False)
    return dfm, metrics
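
# Minimal usage sketch (hypothetical model directory; assumes it contains
# model.h5, hparams.yaml and dataspec.yaml as loaded above):
#
#   dfm, metrics = evaluate("models/my-model", gpu=0, splits=['valid'])
#
# Metric keys follow the `<split>/<head>/<task>/<metric>` convention built above;
# e.g. a Keras metric named "profile_mytask_loss" evaluated on the validation
# split is stored as "valid/profile/mytask/loss".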
Example #3
def imp_score_seqmodel(model_dir,
                       output_file,
                       dataspec=None,
                       peak_width=1000,
                       seq_width=None,
                       intp_pattern='*',  # specifies which imp. scores to compute
                       # skip_trim=False,  # skip trimming the output
                       method="deeplift",
                       batch_size=512,
                       max_batches=-1,
                       shuffle_seq=False,
                       memfrac=0.45,
                       num_workers=10,
                       h5_chunk_size=512,
                       exclude_chr='',
                       include_chr='',
                       overwrite=False,
                       skip_bias=False,
                       gpu=None):
    """Run importance scores for a BPNet model

    Args:
      model_dir: path to the model directory
      output_file: output file path (HDF5 format)
      method: which importance scoring method to use ('grad', 'deeplift' or 'ism')
      intp_pattern: comma-separated glob patterns selecting which
        interpretation targets (importance scores) to compute
      h5_chunk_size: hdf5 chunk size.
      exclude_chr: comma-separated list of chromosomes to exclude
      include_chr: comma-separated list of chromosomes to include
      overwrite: if True, overwrite the output file
      skip_bias: if True, don't store the bias tracks in the output
      gpu (int): which GPU to use locally. If None, GPU is not used
    """
    add_file_logging(os.path.dirname(output_file), logger, 'modisco-score-seqmodel')
    if gpu is not None:
        create_tf_session(gpu, per_process_gpu_memory_fraction=memfrac)
    else:
        # Don't use any GPUs
        os.environ['CUDA_VISIBLE_DEVICES'] = ''

    if os.path.exists(output_file):
        if overwrite:
            os.remove(output_file)
        else:
            raise ValueError(f"File exists {output_file}. Use overwrite=True to overwrite it")

    if seq_width is None:
        logger.info("Using seq_width = peak_width")
        seq_width = peak_width

    # make sure these are int's
    seq_width = int(seq_width)
    peak_width = int(peak_width)

    # Split
    intp_patterns = intp_pattern.split(",")

    # Allow chr inclusion / exclusion
    if exclude_chr:
        exclude_chr = exclude_chr.split(",")
    else:
        exclude_chr = None
    if include_chr:
        include_chr = include_chr.split(",")
    else:
        include_chr = None

    logger.info("Loading the config files")
    model_dir = Path(model_dir)

    if dataspec is None:
        # Specify dataspec
        dataspec = model_dir / "dataspec.yaml"
    ds = DataSpec.load(dataspec)

    logger.info("Creating the dataset")
    from basepair.datasets import StrandedProfile
    dl_valid = StrandedProfile(ds,
                               incl_chromosomes=include_chr,
                               excl_chromosomes=exclude_chr,
                               peak_width=peak_width,
                               seq_width=seq_width,
                               shuffle=False,
                               taskname_first=True,  # Required to work nicely with imp-score
                               target_transformer=None)

    # Setup importance score trimming
    if seq_width > peak_width:
        # Trim
        # make sure we can nicely trim the peak
        logger.info("Trimming the output")
        assert (seq_width - peak_width) % 2 == 0
        trim_start = (seq_width - peak_width) // 2
        trim_end = seq_width - trim_start
        assert trim_end - trim_start == peak_width
    elif seq_width == peak_width:
        trim_start = 0
        trim_end = peak_width
    else:
        raise ValueError("seq_width < peak_width")

    seqmodel = SeqModel.from_mdir(model_dir)

    # get all possible interpretation names
    # make sure they match the specified glob
    intp_names = [name for name, _ in seqmodel.get_intp_tensors(preact_only=False)
                  if fnmatch_any(name, intp_patterns)]
    logger.info(f"Using the following interpretation targets:")
    for n in intp_names:
        print(n)

    writer = HDF5BatchWriter(output_file, chunk_size=h5_chunk_size)
    for i, batch in enumerate(tqdm(dl_valid.batch_iter(batch_size=batch_size, num_workers=num_workers))):
        # store the original batch containing 'inputs' and 'targets'
        wdict = batch
        if skip_bias:
            wdict['inputs'] = {'seq': wdict['inputs']['seq']}  # ignore all other inputs

        if max_batches > 0 and i >= max_batches:
            logger.info(f"max_batches ({max_batches}) reached. Stopping the computation")
            break

        if shuffle_seq:
            # Di-nucleotide shuffle the sequences
            batch['inputs']['seq'] = onehot_dinucl_shuffle(batch['inputs']['seq'])
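            # (dinucleotide shuffling preserves the dinucleotide frequencies of the
            # original sequence, giving a null input with matched low-order statistics)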

        for name in intp_names:
            hyp_imp = seqmodel.imp_score(batch['inputs']['seq'],
                                         name=name,
                                         method=method,
                                         batch_size=None)  # don't second-batch

            # put importance scores to the dictionary
            # also trim the importance scores appropriately so that
            # the output will always be w.r.t. the peak center
            wdict[f"/hyp_imp/{name}"] = hyp_imp[:, trim_start:trim_end]

        # trim the sequence as well
        if isinstance(wdict['inputs'], dict):
            # Trim the sequence
            wdict['inputs']['seq'] = wdict['inputs']['seq'][:, trim_start:trim_end]
        else:
            wdict['inputs'] = wdict['inputs'][:, trim_start:trim_end]

        writer.batch_write(wdict)
    writer.close()
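
# Minimal usage sketch (hypothetical paths):
#
#   imp_score_seqmodel("models/my-model", "imp_scores_seqmodel.h5",
#                      peak_width=1000, seq_width=1200, method="deeplift")
#
# This writes one `/hyp_imp/<interpretation-target>` array per target, trimmed
# to the central `peak_width` bp, alongside the trimmed one-hot sequences.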
Example #4
def imp_score(model_dir,
              output_file,
              method="grad",
              split='all',
              batch_size=512,
              num_workers=10,
              h5_chunk_size=512,
              max_batches=-1,
              shuffle_seq=False,
              memfrac=0.45,
              exclude_chr='',
              overwrite=False,
              gpu=None):
    """Run importance scores for a BPNet model

    Args:
      model_dir: path to the model directory
      output_file: output file path (HDF5 format)
      method: which importance scoring method to use ('grad', 'deeplift' or 'ism')
      split: for which dataset split to compute the importance scores
      h5_chunk_size: hdf5 chunk size.
      exclude_chr: comma-separated list of chromosomes to exclude
      overwrite: if True, overwrite the output file
      gpu (int): which GPU to use locally. If None, GPU is not used
    """
    add_file_logging(os.path.dirname(output_file), logger, 'modisco-score')
    if gpu is not None:
        create_tf_session(gpu, per_process_gpu_memory_fraction=memfrac)
    else:
        # Don't use any GPUs
        os.environ['CUDA_VISIBLE_DEVICES'] = ''

    if os.path.exists(output_file):
        if overwrite:
            os.remove(output_file)
        else:
            raise ValueError(f"File exists {output_file}. Use overwrite=True to overwrite it")

    if exclude_chr:
        exclude_chr = exclude_chr.split(",")
    else:
        exclude_chr = []
    # load the config files
    logger.info("Loading the config files")
    model_dir = Path(model_dir)
    hp = HParams.load(model_dir / "hparams.yaml")
    ds = DataSpec.load(model_dir / "dataspec.yaml")
    tasks = list(ds.task_specs)
    # validate that the correct dataset was used
    if hp.data.name != 'get_StrandedProfile_datasets':
        logger.warning("hp.data.name != 'get_StrandedProfile_datasets'")

    if split == 'valid':
        assert len(exclude_chr) == 0
        incl_chromosomes = hp.data.kwargs['valid_chr']
        excl_chromosomes = None
    elif split == 'test':
        assert len(exclude_chr) == 0
        incl_chromosomes = hp.data.kwargs['test_chr']
        excl_chromosomes = None
    elif split == 'train':
        assert len(exclude_chr) == 0
        incl_chromosomes = None
        excl_chromosomes = hp.data.kwargs['valid_chr'] + hp.data.kwargs['test_chr'] + hp.data.kwargs.get('exclude_chr', [])
    elif split == 'all':
        incl_chromosomes = None
        excl_chromosomes = hp.data.kwargs.get('exclude_chr', []) + exclude_chr
        logger.info("Excluding chromosomes: {excl_chromosomes}")
    else:
        raise ValueError("split needs to be from {train,valid,test,all}")

    logger.info("Creating the dataset")
    from basepair.datasets import StrandedProfile
    seq_len = hp.data.kwargs['peak_width']
    dl_valid = StrandedProfile(ds,
                               incl_chromosomes=incl_chromosomes,
                               excl_chromosomes=excl_chromosomes,
                               peak_width=seq_len,
                               shuffle=False,
                               target_transformer=None)

    bpnet = BPNet.from_mdir(model_dir)

    writer = HDF5BatchWriter(output_file, chunk_size=h5_chunk_size)
    for i, batch in enumerate(tqdm(dl_valid.batch_iter(batch_size=batch_size, num_workers=num_workers))):
        if max_batches > 0 and i >= max_batches:
            logger.info(f"max_batches ({max_batches}) reached. Stopping the computation")
            break
        # append the bias model predictions
        # (batch['inputs'], batch['targets']) = bm((batch['inputs'], batch['targets']))

        # store the original batch containing 'inputs' and 'targets'
        wdict = batch

        if shuffle_seq:
            # Di-nucleotide shuffle the sequences
            if 'seq' in batch['inputs']:
                batch['inputs']['seq'] = onehot_dinucl_shuffle(batch['inputs']['seq'])
            else:
                batch['inputs'] = onehot_dinucl_shuffle(batch['inputs'])

        # loop through all tasks, pred_summary and strands
        for task_i, task in enumerate(tasks):
            for pred_summary in ['count', 'weighted']:
                # figure out the number of channels
                nstrands = batch['targets'][f'profile/{task}'].shape[-1]
                strand_hash = ["pos", "neg"]

                for strand_i in range(nstrands):
                    hyp_imp = bpnet.imp_score(batch['inputs'],
                                              task=task,
                                              strand=strand_hash[strand_i],
                                              method=method,
                                              pred_summary=pred_summary,
                                              batch_size=None)  # don't second-batch
                    # put importance scores to the dictionary
                    wdict[f"/hyp_imp/{task}/{pred_summary}/{strand_i}"] = hyp_imp
        writer.batch_write(wdict)
    writer.close()
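
# Minimal usage sketch (hypothetical paths):
#
#   imp_score("models/my-model", "imp_scores.h5",
#             method="deeplift", split="valid", gpu=0)
#
# This writes one `/hyp_imp/<task>/<pred_summary>/<strand_i>` array per task,
# summary ('count' or 'weighted') and strand into the HDF5 output.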
Example #5
def modisco_run(
    imp_scores,
    output_dir,
    null_imp_scores=None,
    hparams=None,
    override_hparams="",
    grad_type="weighted",
    subset_tasks=None,
    filter_subset_tasks=False,
    filter_npy=None,
    exclude_chr="",
    seqmodel=False,  # interpretation glob
    # hparams=None,
    num_workers=10,
    max_strand_distance=0.1,
    overwrite=False,
    skip_dist_filter=False,
    use_all_seqlets=False,
    merge_tasks=False,
    gpu=None,
):
    """
    Run modisco

    Args:
      imp_scores: path to the hdf5 file of importance scores
      null_imp_scores: Path to the null importance scores
      grad_type: for which output to compute the importance scores
      hparams: None, modisco hyper - parameeters: either a path to modisco.yaml or
        a ModiscoHParams object
      override_hparams: hyper - parameters overriding the settings in the hparams file
      output_dir: output file directory
      filter_npy: path to a npy file containing a boolean vector used for subsetting
      exclude_chr: comma-separated list of chromosomes to exclude
      seqmodel: If enabled, then the importance scores came from `imp-score-seqmodel`
      subset_tasks: comma-separated list of task names to use as a subset
      filter_subset_tasks: if True, run modisco only in the regions for that TF
      hparams: hyper - parameter file
      summary: which summary statistic to use for the profile gradients
      skip_dist_filter: if True, distances are not used to filter
      use_all_seqlets: if True, don't restrict the number of seqlets
      split: On which data split to compute the results
      merge_task: if True, importance scores for the tasks will be merged
      gpu: which gpu to use. If None, don't use any GPU's

    Note: when using subset_tasks, modisco will run on all the importance scores. If you wish
      to run it only for the importance scores for a particular task you should subset it to
      the peak regions of interest using `filter_npy`
    """
    plt.switch_backend('agg')
    add_file_logging(output_dir, logger, 'modisco-run')
    import os
    if gpu is not None:
        create_tf_session(gpu)
    else:
        # Don't use any GPUs
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
    os.environ['MKL_THREADING_LAYER'] = 'GNU'
    # import theano
    import modisco
    import modisco.tfmodisco_workflow.workflow

    if seqmodel:
        assert '/' in grad_type

    if subset_tasks == '':
        logger.warning("subset_tasks == ''. Not using subset_tasks")
        subset_tasks = None

    if subset_tasks == 'all':
        # Use all tasks, i.e. don't subset
        subset_tasks = None

    if subset_tasks is not None:
        subset_tasks = subset_tasks.split(",")
        if len(subset_tasks) == 0:
            raise ValueError("Provide one or more subset_tasks. Found None")

    if filter_subset_tasks and subset_tasks is None:
        print("Using filter_subset_tasks=False since `subset_tasks` is None")
        filter_subset_tasks = False

    if exclude_chr:
        exclude_chr = exclude_chr.split(",")
    else:
        exclude_chr = []

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    output_path = output_dir / "modisco.h5"
    remove_exists(output_path, overwrite)

    output_distances = output_dir / "strand_distances.h5"
    remove_exists(output_distances, overwrite)

    if filter_npy is not None:
        filter_npy = os.path.abspath(filter_npy)

    # save the hyper-parameters
    write_json(
        dict(
            imp_scores=os.path.abspath(imp_scores),
            grad_type=grad_type,
            output_dir=str(output_dir),
            subset_tasks=subset_tasks,
            filter_subset_tasks=filter_subset_tasks,
            hparams=hparams,
            null_imp_scores=null_imp_scores,
            # TODO - pack into hyper-parameters as well?
            filter_npy=filter_npy,
            exclude_chr=",".join(exclude_chr),
            skip_dist_filter=skip_dist_filter,
            use_all_seqlets=use_all_seqlets,
            max_strand_distance=max_strand_distance,
            gpu=gpu),
        os.path.join(output_dir, "kwargs.json"))

    print("-" * 40)
    # parse the hyper-parameters
    if hparams is None:
        print(f"Using default hyper-parameters")
        hp = ModiscoHParams()
    else:
        if isinstance(hparams, str):
            print(f"Loading hyper-parameters from file: {hparams}")
            hp = ModiscoHParams.load(hparams)
        else:
            assert isinstance(hparams, ModiscoHParams)
            hp = hparams
    if override_hparams:
        print(f"Overriding the following hyper-parameters: {override_hparams}")
    hp = tf.contrib.training.HParams(
        **hp.get_modisco_kwargs()).parse(override_hparams)
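    # `override_hparams` uses the tf HParams string syntax of comma-separated
    # name=value pairs, e.g. "sliding_window_size=21,flank_size=10"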

    if use_all_seqlets:
        hp.max_seqlets_per_metacluster = None

    # save the hyper-parameters
    print("Using the following hyper-parameters for modisco:")
    print("-" * 40)
    related_dump_yaml(ModiscoHParams(**hp.values()),
                      os.path.join(output_dir, "hparams.yaml"),
                      verbose=True)
    print("-" * 40)

    # TODO - replace with imp_scores
    d = HDF5Reader.load(imp_scores)
    if 'hyp_imp' not in d:
        # backcompatibility
        d['hyp_imp'] = d['grads']

    if seqmodel:
        tasks = list(d['targets'])
    else:
        tasks = list(d['targets']['profile'])

    if subset_tasks is not None:
        # validate that all the `subset_tasks`
        # are present in `tasks`
        for st in subset_tasks:
            if st not in tasks:
                raise ValueError(
                    f"subset task {st} not found in tasks: {tasks}")
        logger.info(
            f"Using the following tasks: {subset_tasks} instead of the original tasks: {tasks}"
        )
        tasks = subset_tasks

    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']

    n = len(one_hot)

    # --------------------
    # apply filters
    if not skip_dist_filter:
        print("Using profile prediction for the strand filtering")
        grad_type_filtered = 'weighted'
        distances = np.array([
            np.array([
                correlation(
                    np.ravel(d['hyp_imp'][task][grad_type_filtered][0][i]),
                    np.ravel(d['hyp_imp'][task][grad_type_filtered][1][i]))
                for i in range(n)
            ]) for task in tasks
            if len(d['hyp_imp'][task][grad_type_filtered]) == 2
        ]).T.mean(axis=-1)  # average the distances across tasks

        dist_filter = distances < max_strand_distance
        print(f"Fraction of sequences kept: {dist_filter.mean()}")

        HDF5BatchWriter.dump(output_distances, {
            "distances": distances,
            "included_samples": dist_filter
        })
    else:
        dist_filter = np.ones((n, ), dtype=bool)

    # add also the filter numpy
    if filter_npy is not None:
        print(f"Loading a filter file from {filter_npy}")
        filter_vec = np.load(filter_npy)
        dist_filter = dist_filter & filter_vec

    if filter_subset_tasks:
        assert subset_tasks is not None
        interval_from_task = pd.Series(d['metadata']['interval_from_task'])
        print(
            f"Subsetting the intervals according to subset_tasks: {subset_tasks}"
        )
        print(f"Number of original regions: {dist_filter.sum()}")
        dist_filter = dist_filter & interval_from_task.isin(
            subset_tasks).values
        print(
            f"Number of filtered regions after filter_subset_tasks: {dist_filter.sum()}"
        )

    # filter by chromosome
    if exclude_chr:
        logger.info(f"Excluding chromosomes: {exclude_chr}")
        chromosomes = d['metadata']['range']['chr']
        dist_filter = dist_filter & (
            ~pd.Series(chromosomes).isin(exclude_chr)).values
    # -------------------------------------------------------------
    # setup importance scores

    if seqmodel:
        thr_one_hot = one_hot[dist_filter]
        thr_hypothetical_contribs = {
            f"{task}/{gt}":
            d['hyp_imp'][task][gt.split("/")[0]][gt.split("/")[1]][dist_filter]
            for task in tasks for gt in grad_type.split(",")
        }
        thr_contrib_scores = {
            f"{task}/{gt}":
            thr_hypothetical_contribs[f"{task}/{gt}"] * thr_one_hot
            for task in tasks for gt in grad_type.split(",")
        }
        task_names = [
            f"{task}/{gt}" for task in tasks for gt in grad_type.split(",")
        ]

    else:
        if merge_tasks:
            thr_one_hot = np.concatenate([
                one_hot[dist_filter] for task in tasks
                for gt in grad_type.split(",")
            ])
            thr_hypothetical_contribs = {
                "merged":
                np.concatenate([
                    mean(d['hyp_imp'][task][gt])[dist_filter] for task in tasks
                    for gt in grad_type.split(",")
                ])
            }

            thr_contrib_scores = {
                "merged": thr_hypothetical_contribs['merged'] * thr_one_hot
            }
            task_names = ['merged']
        else:
            thr_one_hot = one_hot[dist_filter]
            thr_hypothetical_contribs = {
                f"{task}/{gt}": mean(d['hyp_imp'][task][gt])[dist_filter]
                for task in tasks for gt in grad_type.split(",")
            }
            thr_contrib_scores = {
                f"{task}/{gt}":
                thr_hypothetical_contribs[f"{task}/{gt}"] * thr_one_hot
                for task in tasks for gt in grad_type.split(",")
            }
            task_names = [
                f"{task}/{gt}" for task in tasks for gt in grad_type.split(",")
            ]

    if null_imp_scores is not None:
        logger.info(f"Using null_imp_scores: {null_imp_scores}")
        null_isf = ImpScoreFile(null_imp_scores)
        null_per_pos_scores = {
            f"{task}/{gt}": v.sum(axis=-1)
            for gt in grad_type.split(",")
            for task, v in null_isf.get_contrib(imp_score=gt).items()
            if task in tasks
        }
    else:
        # default Null distribution. Requires modisco 5.0
        logger.info(f"Using default null_imp_scores")
        null_per_pos_scores = modisco.coordproducers.LaplaceNullDist(
            num_to_samp=10000)

    # -------------------------------------------------------------
    # run modisco
    tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
        # Modisco defaults
        sliding_window_size=hp.sliding_window_size,
        flank_size=hp.flank_size,
        target_seqlet_fdr=hp.target_seqlet_fdr,
        min_passing_windows_frac=hp.min_passing_windows_frac,
        max_passing_windows_frac=hp.max_passing_windows_frac,
        min_metacluster_size=hp.min_metacluster_size,
        max_seqlets_per_metacluster=hp.max_seqlets_per_metacluster,
        seqlets_to_patterns_factory=modisco.tfmodisco_workflow.
        seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory(
            trim_to_window_size=hp.trim_to_window_size,  # default: 30
            initial_flank_to_add=hp.initial_flank_to_add,  # default: 10
            kmer_len=hp.kmer_len,  # default: 8
            num_gaps=hp.num_gaps,  # default: 3
            num_mismatches=hp.num_mismatches,  # default: 2
            n_cores=num_workers,
            final_min_cluster_size=hp.final_min_cluster_size)  # default: 30
    )(
        task_names=task_names,
        contrib_scores=thr_contrib_scores,  # -> task score
        hypothetical_contribs=thr_hypothetical_contribs,
        one_hot=thr_one_hot,
        null_per_pos_scores=null_per_pos_scores)
    # -------------------------------------------------------------
    # save the results
    grp = h5py.File(output_path, 'w')
    tfmodisco_results.save_hdf5(grp)
    grp.close()
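
# Minimal usage sketch (hypothetical paths; assumes `imp_scores.h5` was
# produced by `imp_score` above):
#
#   modisco_run("imp_scores.h5", "modisco_out/",
#               grad_type="weighted", subset_tasks="mytask",
#               filter_subset_tasks=True, gpu=0)
#
# The discovered motifs are written to modisco_out/modisco.h5.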