Example #1
def load_ranges(modisco_dir):
    """Load the genomic ranges of the contribution scores used for the modisco run."""
    modisco_dir = Path(modisco_dir)
    included_samples = load_included_samples(modisco_dir)

    kwargs = read_json(modisco_dir / "modisco-run.kwargs.json")
    d = ContribFile(kwargs["contrib_file"], included_samples)
    df = d.get_ranges()
    d.close()
    return df
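A minimal usage sketch for the helper above; the directory path is a placeholder assumption, not taken from the source:

# Placeholder path to a finished TF-MoDISco run directory (assumption for illustration).
ranges_df = load_ranges("output/modisco/profile")
# One row per example: the genomic range of each contribution-score region.
print(ranges_df.head())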
Example #2
    def from_modisco_dir(cls, modisco_dir, ignore_include_samples=False):
        from bpnet.cli.modisco import load_included_samples, load_contrib_type
        from bpnet.utils import read_json
        if ignore_include_samples:
            include_samples = None
        else:
            include_samples = load_included_samples(modisco_dir)
            if include_samples.all():
                # All samples are included, so no subsetting is needed
                include_samples = None

        modisco_kwargs = read_json(
            os.path.join(modisco_dir, "modisco-run.kwargs.json"))
        contrib_type = load_contrib_type(modisco_kwargs)

        return cls(modisco_kwargs["contrib_file"],
                   include_samples,
                   default_contrib_score=contrib_type)
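A hedged usage sketch for the classmethod above; the path is a placeholder assumption, and the accessor calls (cache, get_seq, get_contrib, get_ranges, close) are the ones used elsewhere in this listing:

# Placeholder modisco output directory (assumption for illustration).
cf = ContribFile.from_modisco_dir("output/modisco/profile")
cf.cache()                  # optionally keep the arrays in memory for repeated access
seq = cf.get_seq()          # one-hot encoded sequences
contrib = cf.get_contrib()  # contribution scores for the default contribution type
ranges = cf.get_ranges()    # genomic coordinates of each example
cf.close()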
Example #3
def bpnet_contrib(
        model_dir,
        output_file,
        method="grad",
        dataspec=None,
        regions=None,
        fasta_file=None,  # alternative to dataspec
        shuffle_seq=False,
        shuffle_regions=False,
        max_regions=None,
        # reference='zeroes', # Currently the only option
        # peak_width=1000,  # automatically inferred from 'config.gin.json'
        # seq_width=None,
        contrib_wildcard='*/profile/wn,*/counts/pre-act',  # specifies which contrib. scores to compute
        batch_size=512,
        gpu=0,
        memfrac_gpu=0.45,
        num_workers=10,
        storage_chunk_size=512,
        exclude_chr='',
        include_chr='',
        overwrite=False,
        skip_bias=False):
    """Run contribution scores for a BPNet model
    """
    from bpnet.extractors import _chrom_sizes
    add_file_logging(os.path.dirname(output_file), logger, 'bpnet-contrib')
    if gpu is not None:
        create_tf_session(gpu, per_process_gpu_memory_fraction=memfrac_gpu)
    else:
        # Don't use any GPUs
        os.environ['CUDA_VISIBLE_DEVICES'] = ''

    if os.path.exists(output_file):
        if overwrite:
            os.remove(output_file)
        else:
            raise ValueError(
                f"File exists {output_file}. Use overwrite=True to overwrite it"
            )

    config = read_json(os.path.join(model_dir, 'config.gin.json'))
    seq_width = config['seq_width']
    peak_width = config['seq_width']

    # NOTE - seq_width has to be the same for the input and the target
    #
    # infer from the command line
    # if seq_width is None:
    #     logger.info("Using seq_width = peak_width")
    #     seq_width = peak_width

    # # make sure these are int's
    # seq_width = int(seq_width)
    # peak_width = int(peak_width)

    # Split the comma-separated wildcard into a list of glob patterns
    contrib_wildcards = contrib_wildcard.split(",")

    # Allow chr inclusion / exclusion
    if exclude_chr:
        exclude_chr = exclude_chr.split(",")
    else:
        exclude_chr = None
    if include_chr:
        include_chr = include_chr.split(",")
    else:
        include_chr = None

    logger.info("Loading the config files")
    model_dir = Path(model_dir)

    logger.info("Creating the dataset")
    from bpnet.datasets import StrandedProfile, SeqClassification
    if fasta_file is not None:
        if regions is None:
            raise ValueError(
                "fasta_file specified. Expecting regions to be specified as well"
            )
        dl_valid = SeqClassification(
            fasta_file=fasta_file,
            intervals_file=regions,
            incl_chromosomes=include_chr,
            excl_chromosomes=exclude_chr,
            auto_resize_len=seq_width,
        )
        chrom_sizes = _chrom_sizes(fasta_file)
    else:
        if dataspec is None:
            logger.info("Using dataspec used to train the model")
            # Specify dataspec
            dataspec = model_dir / "dataspec.yml"

        ds = DataSpec.load(dataspec)
        dl_valid = StrandedProfile(ds,
                                   incl_chromosomes=include_chr,
                                   excl_chromosomes=exclude_chr,
                                   intervals_file=regions,
                                   peak_width=peak_width,
                                   shuffle=False,
                                   seq_width=seq_width)
        chrom_sizes = _chrom_sizes(ds.fasta_file)

    # Set up contribution-score trimming (not required currently)
    if seq_width > peak_width:
        # Trim
        # make sure we can nicely trim the peak
        logger.info("Trimming the output")
        assert (seq_width - peak_width) % 2 == 0
        trim_start = (seq_width - peak_width) // 2
        trim_end = seq_width - trim_start
        assert trim_end - trim_start == peak_width
    elif seq_width == peak_width:
        trim_start = 0
        trim_end = peak_width
    else:
        raise ValueError("seq_width < peak_width")

    seqmodel = SeqModel.from_mdir(model_dir)

    # get all possible interpretation names
    # make sure they match the specified glob
    intp_names = [
        name for name, _ in seqmodel.get_intp_tensors(preact_only=False)
        if fnmatch_any(name, contrib_wildcards)
    ]
    logger.info(f"Using the following interpretation targets:")
    for n in intp_names:
        print(n)

    if max_regions is not None:
        if len(dl_valid) > max_regions:
            logger.info(
                f"Using {max_regions} regions instead of the original {len(dl_valid)}"
            )
        else:
            logger.info(
                f"--max-regions={max_regions} is larger than the dataset size: {len(dl_valid)}. "
                "Using the dataset size for max-regions")
            max_regions = len(dl_valid)
    else:
        max_regions = len(dl_valid)

    max_batches = np.ceil(max_regions / batch_size)

    writer = HDF5BatchWriter(output_file, chunk_size=storage_chunk_size)
    for i, batch in enumerate(
            tqdm(dl_valid.batch_iter(batch_size=batch_size,
                                     shuffle=shuffle_regions,
                                     num_workers=num_workers),
                 total=max_batches)):
        # store the original batch containing 'inputs' and 'targets'
        if skip_bias:
            batch['inputs'] = {
                'seq': batch['inputs']['seq']
            }  # ignore all other inputs

        # stop once the requested number of batches has been processed
        if max_batches > 0 and i >= max_batches:
            break

        if shuffle_seq:
            # Di-nucleotide shuffle the sequences
            batch['inputs']['seq'] = onehot_dinucl_shuffle(
                batch['inputs']['seq'])

        for name in intp_names:
            hyp_contrib = seqmodel.contrib_score(
                batch['inputs']['seq'],
                name=name,
                method=method,
                batch_size=None)  # the data is already batched; don't batch again

            # put contribution scores to the dictionary
            # also trim the contribution scores appropriately so that
            # the output will always be w.r.t. the peak center
            batch[f"/hyp_contrib/{name}"] = hyp_contrib[:, trim_start:trim_end]

        # Trim the sequence to the same window as the contribution scores
        batch['inputs']['seq'] = batch['inputs']['seq'][:, trim_start:trim_end]

        # ? maybe it would be better to have an explicit ContribFileWriter;
        # that way the written schema would be fixed
        writer.batch_write(batch)

    # add chromosome sizes
    writer.f.attrs['chrom_sizes'] = json.dumps(chrom_sizes)
    writer.close()
    logger.info(f"Done. Contribution score file was saved to: {output_file}")
Example #4
def chip_nexus_analysis(modisco_dir,
                        trim_frac=0.08,
                        num_workers=20,
                        run_cwm_scan=False,
                        force=False,
                        footprint_width=200):
    """Compute all the results for modisco specific for ChIP-nexus/exo data. Runs:
    - modisco_plot
    - modisco_report
    - modisco_table
    - modisco_export_patterns
    - cwm_scan
    - modisco_export_seqlets

    Note:
      All the sub-commands are only executed if they have not been run before. Use --force to override this.
      Whether a command has been run before is determined by checking if the following file exists:
        `{modisco_dir}/.modisco_report_all/{command}.done`.
    """
    plt.switch_backend('agg')
    from bpnet.utils import ConditionalRun

    modisco_dir = Path(modisco_dir)
    # figure out the contribution scores used
    kwargs = read_json(modisco_dir / "modisco-run.kwargs.json")
    contrib_scores = kwargs["contrib_file"]

    mf = ModiscoFile(f"{modisco_dir}/modisco.h5")
    all_patterns = mf.pattern_names()
    mf.close()
    if len(all_patterns) == 0:
        print("No patterns found.")
        # Touch modisco-chip.html for snakemake
        open(modisco_dir / 'modisco-chip.html', 'a').close()
        open(modisco_dir / 'seqlets/scored_regions.bed', 'a').close()
        return

    # class determining whether to run the command or not (poor-man's snakemake)
    cr = ConditionalRun("modisco_report_all", None, modisco_dir, force=force)

    sync = []
    # --------------------------------------------
    if (not cr.set_cmd('modisco_plot').done()
            or not cr.set_cmd('modisco_enrich_patterns').done()):
        # load ContribFile and pass it to all the functions
        logger.info("Loading ContribFile")
        contribsf = ContribFile.from_modisco_dir(modisco_dir)
        contribsf.cache()
    else:
        contribsf = None
    # --------------------------------------------
    # Basic reports
    if not cr.set_cmd('modisco_plot').done():
        modisco_plot(modisco_dir,
                     modisco_dir / 'plots',
                     heatmap_width=footprint_width,
                     figsize=(10, 10),
                     contribsf=contribsf)
        cr.write()
    sync.append("plots")

    if not cr.set_cmd('modisco_report').done():
        modisco_report(str(modisco_dir), str(modisco_dir))
        cr.write()
    sync.append("modisco-chip.html")

    if not cr.set_cmd('modisco_table').done():
        modisco_table(modisco_dir,
                      contrib_scores,
                      modisco_dir,
                      report_url=None,
                      contribsf=contribsf,
                      footprint_width=footprint_width)
        cr.write()
    sync.append("footprints.pkl")
    sync.append("pattern_table.*")

    if not cr.set_cmd('modisco_export_patterns').done():
        modisco_export_patterns(modisco_dir,
                                output_file=modisco_dir / 'patterns.pkl',
                                contribsf=contribsf)
        cr.write()
    sync.append("patterns.pkl")

    # --------------------------------------------
    # Finding new instances
    if run_cwm_scan:
        if not cr.set_cmd('cwm_scan').done():
            cwm_scan(modisco_dir,
                     modisco_dir / 'instances.bed.gz',
                     trim_frac=trim_frac,
                     contrib_file=None,
                     num_workers=num_workers)
            cr.write()

    # --------------------------------------------
    # Export bed-files and bigwigs

    # Seqlets
    if not cr.set_cmd('modisco_export_seqlets').done():
        modisco_export_seqlets(str(modisco_dir),
                               str(modisco_dir / 'seqlets'),
                               trim_frac=trim_frac)
        cr.write()
    sync.append("seqlets")

    # print the rsync command to run in order to sync the output
    # directories to the webserver
    logger.info("Run the following command to sync files to the webserver")
    dirs = " ".join(sync)
    print(f"rsync -av --progress {dirs} <output_dir>/")
Example #5
def cwm_scan(modisco_dir,
             output_file,
             trim_frac=0.08,
             patterns='all',
             filters='match_weighted_p>=.2,contrib_weighted_p>=.01',
             contrib_file=None,
             add_profile_features=False,
             num_workers=10):
    """Get motif instances via CWM scanning.
    """
    from bpnet.modisco.utils import longer_pattern, shorten_pattern
    from bpnet.modisco.pattern_instances import annotate_profile_single
    add_file_logging(os.path.dirname(output_file), logger, 'cwm-scan')
    modisco_dir = Path(modisco_dir)

    valid_suffixes = [
        '.csv',
        '.csv.gz',
        '.tsv',
        '.tsv.gz',
        '.parq',
        '.bed',
        '.bed.gz',
    ]
    if not any([output_file.endswith(suffix) for suffix in valid_suffixes]):
        raise ValueError(
            f"output_file doesn't have a valid file suffix. Valid file suffixes are: {valid_suffixes}"
        )

    # Centroid matches path
    cm_path = modisco_dir / f'cwm-scan-seqlets.trim-frac={trim_frac:.2f}.csv.gz'

    # save the hyper-parameters
    kwargs_json_file = os.path.join(os.path.dirname(output_file),
                                    'cwm-scan.kwargs.json')
    write_json(
        dict(modisco_dir=os.path.abspath(str(modisco_dir)),
             output_file=str(output_file),
             cwm_scan_seqlets_path=str(cm_path),
             trim_frac=trim_frac,
             patterns=patterns,
             filters=filters,
             contrib_file=contrib_file,
             add_profile_features=add_profile_features,
             num_workers=num_workers), str(kwargs_json_file))

    # figure out contrib_wildcard
    modisco_kwargs = read_json(
        os.path.join(modisco_dir, "modisco-run.kwargs.json"))
    contrib_type = load_contrib_type(modisco_kwargs)

    mf = ModiscoFile(modisco_dir / "modisco.h5")
    tasks = mf.tasks()
    # HACK: strip the contribution-score suffix from the task names (in case it is present)
    tasks = [t.split("/")[0] for t in tasks]

    logger.info(f"Using tasks: {tasks}")

    if contrib_file is None:
        cf = ContribFile.from_modisco_dir(modisco_dir)
        cf.cache()  # cache it since it is re-used in `cwm_scan_seqlets` below
    else:
        logger.info(f"Loading the contribution scores from: {contrib_file}")
        cf = ContribFile(contrib_file, default_contrib_score=contrib_type)

    if not cm_path.exists():
        logger.info(f"Generating centroid matches to {cm_path.resolve()}")
        cwm_scan_seqlets(modisco_dir,
                         output_file=cm_path,
                         trim_frac=trim_frac,
                         contribsf=cf if contrib_file is None else None,
                         num_workers=num_workers,
                         verbose=False)
    else:
        logger.info("Centroid matches already exist.")
    logger.info(f"Loading centroid matches from {cm_path.resolve()}")
    dfm_norm = pd.read_csv(cm_path)

    # get the raw data
    seq, contrib, ranges = cf.get_seq(), cf.get_contrib(), cf.get_ranges()

    logger.info("Scanning for patterns")
    dfl = []

    # patterns to scan. `longer_pattern` makes sure the patterns are in the long format
    scan_patterns = patterns.split(",") if patterns != 'all' else mf.pattern_names()
    scan_patterns = [longer_pattern(pn) for pn in scan_patterns]

    if add_profile_features:
        profile = cf.get_profiles()
        logger.info("Profile features will also be added to dfi")

    for pattern_name in tqdm(mf.pattern_names()):
        if pattern_name not in scan_patterns:
            # skip patterns that were not requested for scanning
            continue
        pattern = mf.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        match, contribution = pattern.scan_contribution(contrib,
                                                        hyp_contrib=None,
                                                        tasks=tasks,
                                                        n_jobs=num_workers,
                                                        verbose=False)
        seq_match = pattern.scan_seq(seq, n_jobs=num_workers, verbose=False)
        dfm = pattern.get_instances(
            tasks,
            match,
            contribution,
            seq_match,
            norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
            verbose=False,
            plot=False)
        for filt in filters.split(","):
            if len(filt) > 0:
                dfm = dfm.query(filt)

        if add_profile_features:
            dfm = annotate_profile_single(dfm,
                                          pattern_name,
                                          mf,
                                          profile,
                                          profile_width=70,
                                          trim_frac=trim_frac)
        dfm['pattern_short'] = shorten_pattern(pattern_name)

        # TODO - is it possible to write out the results incrementally?
        dfl.append(dfm)

    logger.info("Merging")
    # merge and write the results
    dfp = pd.concat(dfl)

    # append the ranges
    logger.info("Append ranges")
    ranges.columns = ["example_" + v for v in ranges.columns]
    dfp = dfp.merge(ranges, on="example_idx", how='left')

    # add the absolute coordinates
    dfp['pattern_start_abs'] = dfp['example_start'] + dfp['pattern_start']
    dfp['pattern_end_abs'] = dfp['example_start'] + dfp['pattern_end']

    logger.info("Table info")
    dfp.info()
    logger.info(
        f"Writing the resuling pd.DataFrame of shape {dfp.shape} to {output_file}"
    )

    # reorder the first 7 columns into a BED-like layout (chrom, start, end, name, score, strand, ...)
    bed_columns = [
        'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'pattern',
        'contrib_weighted_p', 'strand', 'match_weighted_p'
    ]
    dfp = pd_first_cols(dfp, bed_columns)

    # write to a parquet file
    if output_file.endswith(".parq"):
        logger.info("Writing a parquet file")
        dfp.to_parquet(output_file,
                       partition_on=['pattern_short'],
                       engine='fastparquet')
    elif output_file.endswith(".csv.gz") or output_file.endswith(".csv"):
        logger.info("Writing a csv file")
        dfp.to_csv(output_file, compression='infer', index=False)
    elif output_file.endswith(".tsv.gz") or output_file.endswith(".tsv"):
        logger.info("Writing a tsv file")
        dfp.to_csv(output_file, sep='\t', compression='infer', index=False)
    elif output_file.endswith(".bed.gz") or output_file.endswith(".bed"):
        logger.info("Writing a BED file")
        # write only the first (and main) 7 columns
        dfp[bed_columns].to_csv(output_file,
                                sep='\t',
                                compression='infer',
                                index=False,
                                header=False)
    else:
        logger.warn("File suffix not recognized. Using .csv.gz file format")
        dfp.to_csv(output_file, compression='gzip', index=False)
    logger.info("Done!")