Example No. 1
import matplotlib.pyplot as plt
from collections import OrderedDict
from pathlib import Path
from basepair.utils import write_pkl
# `add_file_logging` and `logger` are assumed to come from the surrounding module

def modisco_table(modisco_dir, output_dir, report_url=None, impsf=None):
    """Write the pattern table to as .html and .csv
    """
    plt.switch_backend('agg')
    from basepair.modisco.table import ModiscoData, modisco_table, write_modisco_table
    from basepair.modisco.motif_clustering import hirearchically_reorder_table
    add_file_logging(output_dir, logger, 'modisco-table')
    print("Loading required data")
    data = ModiscoData.load(modisco_dir, imp_scores_h5=None, impsf=impsf)

    print("Generating the table")
    df = modisco_table(data)

    print("Writing the results")
    write_modisco_table(df, output_dir, report_url, 'pattern_table')

    print("Writing clustered table")
    write_modisco_table(hirearchically_reorder_table(df, data.tasks),
                        output_dir, report_url, 'pattern_table.sorted')

    print("Writing footprints")
    profiles = OrderedDict([(pattern, {
        task: data.get_profile_wide(pattern, task).mean(axis=0)
        for task in data.tasks
    }) for pattern in data.mr.patterns()])
    write_pkl(profiles, Path(output_dir) / 'footprints.pkl')
    print("Done!")
Example No. 2
import os
import pandas as pd
from basepair.utils import write_pkl
# assumed module-level helpers: add_file_logging, logger, load_modisco_results,
# HDF5Reader, mean, find_instances, labelled_seqlets2df

def modisco_score(modisco_dir,
                  imp_scores,
                  output_tsv,
                  output_seqlets_pkl=None,
                  seqlet_len=25,
                  n_cores=1,
                  method="rank",
                  trim_pattern=False):
    """Find seqlet instances using modisco
    """
    add_file_logging(os.path.dirname(output_tsv), logger, 'modisco-score')
    mr, tasks, grad_type = load_modisco_results(modisco_dir)

    # load importance scores we want to score
    d = HDF5Reader.load(imp_scores)
    if 'hyp_imp' not in d:
        # backwards compatibility: older files store the scores under 'grads'
        d['hyp_imp'] = d['grads']

    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']
    hypothetical_contribs = {
        f"{task}/{gt}": mean(d['hyp_imp'][task][gt])
        for task in tasks for gt in grad_type.split(",")
    }
    contrib_scores = {
        f"{task}/{gt}": hypothetical_contribs[f"{task}/{gt}"] * one_hot
        for task in tasks for gt in grad_type.split(",")
    }

    seqlets = find_instances(mr,
                             tasks,
                             contrib_scores,
                             hypothetical_contribs,
                             one_hot,
                             seqlet_len=seqlet_len,
                             n_cores=n_cores,
                             method=method,
                             trim_pattern=trim_pattern)
    if len(seqlets) == 0:
        print("ERROR: no seqlets found!!")
        return [], None

    if output_seqlets_pkl:
        write_pkl(seqlets, output_seqlets_pkl)
    df = labelled_seqlets2df(seqlets)

    dfm = pd.DataFrame(d['metadata']['range'])
    dfm.columns = ["example_" + v for v in dfm.columns]

    df = df.merge(dfm,
                  left_on="example_idx",
                  how='left',
                  right_on="example_id")

    df.to_csv(output_tsv, sep='\t')

    return seqlets, df
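A usage sketch with hypothetical paths; the TSV can later be re-read with pandas (note the index column written by `to_csv`):

seqlets, df = modisco_score(
    modisco_dir="output/modisco",          # hypothetical paths throughout
    imp_scores="output/imp_scores.h5",
    output_tsv="output/instances.tsv",
    output_seqlets_pkl="output/seqlets.pkl",
    seqlet_len=25,
    n_cores=4,
)

import pandas as pd
df = pd.read_csv("output/instances.tsv", sep="\t", index_col=0)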
Example No. 3
from pathlib import Path
from tqdm import tqdm
# `logger` and `ModiscoResult` are assumed to come from the surrounding module

def modisco_enrich_patterns(patterns_pkl_file,
                            modisco_dir,
                            output_file,
                            impsf=None):
    """Add stacked_seqlet_imp to pattern `attrs`

    Args:
      patterns_pkl: patterns.pkl file path
      modisco_dir: modisco directory containing
      output_file: output file path for patterns.pkl
    """
    from basepair.utils import read_pkl, write_pkl
    from basepair.cli.imp_score import ImpScoreFile
    from basepair.modisco.core import StackedSeqletImp

    logger.info("Loading patterns")
    modisco_dir = Path(modisco_dir)
    patterns = read_pkl(patterns_pkl_file)

    mr = ModiscoResult(modisco_dir / 'modisco.h5')
    mr.open()

    if impsf is None:
        imp_file = ImpScoreFile.from_modisco_dir(modisco_dir)
        logger.info("Loading ImpScoreFile into memory")
        imp_file.cache()
    else:
        logger.info("Using the provided ImpScoreFile")
        imp_file = impsf

    logger.info("Extracting profile and importance scores")
    extended_patterns = []
    for p in tqdm(patterns):
        p = p.copy()
        profile_width = p.len_profile()
        # get the shifted seqlets
        seqlets = [
            s.pattern_align(**p.attrs['align'])
            for s in mr._get_seqlets(p.name)
        ]

        # keep only valid seqlets
        valid_seqlets = [
            s for s in seqlets if s.valid_resize(profile_width,
                                                 imp_file.get_seqlen() + 1)
        ]
        # extract the importance scores
        p.attrs['stacked_seqlet_imp'] = imp_file.extract(
            valid_seqlets, profile_width=profile_width)

        p.attrs['n_seqlets'] = mr.n_seqlets(*p.name.split("/"))
        extended_patterns.append(p)

    write_pkl(extended_patterns, output_file)
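A usage sketch with hypothetical paths; the enriched patterns can be read back with the matching `read_pkl` from `basepair.utils`:

from basepair.utils import read_pkl

modisco_enrich_patterns("output/patterns.pkl",        # hypothetical inputs
                        "output/modisco",
                        "output/patterns_enriched.pkl")
patterns = read_pkl("output/patterns_enriched.pkl")
print(patterns[0].attrs['n_seqlets'])  # per-pattern seqlet count added above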
Example No. 4
from pathlib import Path
from tqdm import tqdm
# `logger` and `ModiscoResult` are assumed to come from the surrounding module

def modisco_export_patterns(modisco_dir, output_file, impsf=None):
    """Export patterns to a pkl file. Don't cluster them

    Adds `stacked_seqlet_imp` and `n_seqlets` to pattern `attrs`

    Args:
      patterns_pkl: patterns.pkl file path
      modisco_dir: modisco directory containing
      output_file: output file path for patterns.pkl
    """
    from basepair.utils import read_pkl, write_pkl
    from basepair.cli.imp_score import ImpScoreFile
    from basepair.modisco.core import StackedSeqletImp

    logger.info("Loading patterns")
    modisco_dir = Path(modisco_dir)

    mr = ModiscoResult(modisco_dir / 'modisco.h5')
    mr.open()
    patterns = [mr.get_pattern(pname) for pname in mr.patterns()]

    if impsf is None:
        imp_file = ImpScoreFile.from_modisco_dir(modisco_dir)
        logger.info("Loading ImpScoreFile into memory")
        imp_file.cache()
    else:
        logger.info("Using the provided ImpScoreFile")
        imp_file = impsf

    logger.info("Extracting profile and importance scores")
    extended_patterns = []
    for p in tqdm(patterns):
        p = p.copy()

        # get the shifted seqlets
        valid_seqlets = mr._get_seqlets(p.name)

        # extract the importance scores
        sti = imp_file.extract(valid_seqlets, profile_width=None)
        sti.dfi = mr.get_seqlet_intervals(p.name, as_df=True)
        p.attrs['stacked_seqlet_imp'] = sti
        p.attrs['n_seqlets'] = mr.n_seqlets(*p.name.split("/"))
        extended_patterns.append(p)

    write_pkl(extended_patterns, output_file)
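When running several of these commands in a row, the `impsf` argument lets one importance-score file be loaded once and shared; a sketch with a hypothetical path:

from basepair.cli.imp_score import ImpScoreFile

imp_file = ImpScoreFile.from_modisco_dir("output/modisco")  # hypothetical path
imp_file.cache()  # pull the scores into memory once
modisco_export_patterns("output/modisco", "output/patterns_raw.pkl", impsf=imp_file)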
Example No. 5
import os
import pandas as pd
# `ParturbationDataset` (spelling as in the source module) is assumed module-level

def generate_motif_data(dfab,
                        ref,
                        single_mut,
                        double_mut,
                        pairs,
                        output_dir,
                        tasks,
                        profile_width=200,
                        save=False,
                        pseudo_count_quantile=0.2,
                        profile_slice=slice(82, 119)):
    import gc
    from basepair.utils import write_pkl
    from basepair.exp.chipnexus.spacing import remove_edge_instances
    from basepair.exp.chipnexus.perturb.scores import (
        ism_compute_features_tidy, compute_features_tidy, SCORES,
        max_profile_count)
    if save:
        c_output_dir = os.path.join(output_dir, 'motif_pair_lpdata')
        os.makedirs(c_output_dir, exist_ok=True)

    dfabf_ism_l = []
    dfabf_l = []
    for motif_pair in pairs:
        motif_pair_name = "<>".join(motif_pair)
        dfab_subset = remove_edge_instances(
            dfab[dfab.motif_pair == motif_pair_name],
            profile_width=profile_width)
        pdata = ParturbationDataset(dfab_subset,
                                    ref,
                                    single_mut,
                                    double_mut,
                                    profile_width=profile_width)
        output = pdata.load_all(num_workers=0)
        output['dfab'] = dfab_subset

        # Compute the directionality and epistasis scores
        # Epistasis:
        o = {motif_pair_name: output}
        dfabf_ism_l.append(ism_compute_features_tidy(o, tasks))
        # Directional:
        dfabf_l.append(
            compute_features_tidy(o,
                                  tasks,
                                  SCORES,
                                  pseudo_count_quantile=pseudo_count_quantile,
                                  profile_slice=profile_slice))

        # motif_pair_lpdata[motif_pair_name] = output
        # sort_idx = np.argsort(pdata.dfab.center_diff)
        if save:
            write_pkl(output,
                      os.path.join(c_output_dir, motif_pair_name + '.pkl'))
        del o
        del output
        del pdata
        del dfab_subset
        print("Garbage collect")
        gc.collect()

    return pd.concat(dfabf_ism_l, axis=0), pd.concat(dfabf_l, axis=0)
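A call sketch; `dfab`, `ref`, `single_mut` and `double_mut` are assumed to be produced earlier by the perturbation pipeline, and the motif-pair and task names here are purely illustrative:

pairs = [("Oct4", "Sox2")]          # illustrative motif pair
tasks = ["Oct4", "Sox2"]            # illustrative task names
dfabf_ism, dfabf = generate_motif_data(
    dfab, ref, single_mut, double_mut, pairs,
    output_dir="output/perturb",    # hypothetical output directory
    tasks=tasks,
    profile_width=200,
    save=True,
)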
Example No. 6
    # excerpt from a larger script: `args`, `models_dir`, `exp`, `all_motif_seqs`,
    # `center_coords`, `repeat` and the helpers used below are defined earlier
    if args.gpu is not None:
        create_tf_session(args.gpu)
    # create the output path
    cache_path = f"{models_dir}/{exp}/motif-simulation/spacing;correct={args.correct}.pkl"
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)

    # load the model
    logger.info("Loading model")
    model_dir = models_dir / exp
    bpnet = BPNetSeqModel.from_mdir(model_dir)

    logger.info("Creating the output directory")

    df_d = {}
    res_dict_d = {}
    for central_motif_name, central_motif in all_motif_seqs.items():
        logger.info(f"Runnig script for {central_motif_name}")
        # get the motifs
        res_dict = OrderedDict([(motif, generate_sim(bpnet, central_motif, side_motif, list(range(511, 511 + 150, 1)),
                                                     center_coords=center_coords,
                                                     repeat=repeat,
                                                     correct=args.correct,
                                                     importance=[]))  # e.g. ['counts/pre-act', 'profile/wn']
                                for motif, side_motif in all_motif_seqs.items()])
        df = pd.concat([v[0].assign(motif=k) for k, v in res_dict.items()])  # stack the dataframes
        df_d[central_motif_name] = df
        res_dict_d[central_motif_name] = res_dict

    # Store all the results
    write_pkl((df_d, res_dict_d), cache_path)
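The per-motif stacking idiom used for `df` above can be shown standalone; the frames here are toy data:

import pandas as pd

res = {
    "motifA": pd.DataFrame({"distance": [10, 20], "score": [0.5, 0.7]}),
    "motifB": pd.DataFrame({"distance": [10, 20], "score": [0.1, 0.9]}),
}
# tag each frame with the key it came from, then stack into one long frame
df = pd.concat([v.assign(motif=k) for k, v in res.items()])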
Example No. 7
import os
import h5py
import numpy as np
import pandas as pd
from basepair.utils import write_pkl
# assumed module-level helpers: read_json, HDF5Reader, find_instances, labelled_seqlets2df

def modisco_score_single_binary(modisco_dir,
                                output_tsv,
                                output_seqlets_pkl=None,
                                seqlet_len=25,
                                n_cores=1,
                                method="rank",
                                trim_pattern=False):
    """
    Equivalent of modisco_score
    """
    import modisco
    from modisco.tfmodisco_workflow import workflow

    kwargs = read_json(os.path.join(modisco_dir, "kwargs.json"))
    d = HDF5Reader.load(kwargs['imp_scores'])  # deeplift hdffile
    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']
    tasks = list(d['grads'].keys())
    grad_type = list(d['grads'][tasks[0]].keys())[0]
    if kwargs.get("filter_npy", None) is not None:
        included_samples = np.load(kwargs["filter_npy"])
    else:
        included_samples = np.arange(len(one_hot))  # default: keep all examples

    hypothetical_contribs = {
        f"{task}":
        d['grads'][task][gt]['hyp_contrib_scores'][included_samples]
        for task in tasks for gt in grad_type.split(",")
    }
    contrib_scores = {
        f"{task}": d['grads'][task][gt]['contrib_scores'][included_samples]
        for task in tasks for gt in grad_type.split(",")
    }

    print(tasks)
    track_set = workflow.prep_track_set(
        task_names=tasks,
        contrib_scores=contrib_scores,
        hypothetical_contribs=hypothetical_contribs,
        one_hot=one_hot[included_samples])

    with h5py.File(os.path.join(modisco_dir, "results.hdf5"), "r") as grp:
        mr = workflow.TfModiscoResults.from_hdf5(grp, track_set=track_set)

    seqlets = find_instances(mr,
                             tasks,
                             contrib_scores,
                             hypothetical_contribs,
                             one_hot[included_samples],
                             seqlet_len=seqlet_len,
                             n_cores=n_cores,
                             method=method,
                             trim_pattern=trim_pattern)

    if output_seqlets_pkl:
        write_pkl(seqlets, output_seqlets_pkl)
    df = labelled_seqlets2df(seqlets)

    dfm = pd.DataFrame(d['metadata']['range'])
    dfm.columns = ["example_" + v for v in dfm.columns]
    dfm['example_id'] = d['metadata']['interval_from_task']

    df = df.merge(dfm,
                  left_on="example_idx",
                  how='left',
                  right_on="example_id")

    df.to_csv(output_tsv, sep='\t')

    return seqlets, df
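The `filter_npy` entry read from kwargs.json above is a saved numpy index array; a hypothetical way to create one before running modisco:

import numpy as np

# hypothetical: restrict scoring to the first 1000 examples
np.save("output/filter.npy", np.arange(1000))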
Example No. 8
# method on a model class (note the `self` parameter)
def save(self, file_path):
    """Save the model to a file
    """
    from basepair.utils import write_pkl
    write_pkl(self, file_path)
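Since the object is pickled whole, it can be restored with the matching `read_pkl` from `basepair.utils`; `model` here stands for any instance of the class defining `save`:

from basepair.utils import read_pkl

model.save("output/model.pkl")          # hypothetical file path
model = read_pkl("output/model.pkl")    # returns the pickled object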