def load_instances(parq_file, motifs=None, dedup=True):
    """Load pattern instances from a parquet file (or pass-through DataFrame).

    Args:
      parq_file: parquet file of motif instances, or an already-loaded
        pd.DataFrame of instances (used as-is).
      motifs: dictionary of motifs of interest.
        key=custom motif name, value=short pattern name (e.g. 'm0_p3').
        When given, only the corresponding patterns are loaded/kept.
      dedup: if True, drop instances duplicated on
        (pattern, example_chrom, pattern_start_abs, pattern_end_abs, strand)
        and print how many were removed.

    Returns:
      pd.DataFrame of instances with added columns: 'pattern_short',
      'pattern_start_abs', 'pattern_end_abs' and, when `motifs` is given,
      'pattern_name'.
    """
    if motifs is not None:
        # Long-form pattern names (e.g. 'metacluster_0/pattern_3') to keep.
        incl_motifs = {longer_pattern(m) for m in motifs.values()}
    else:
        incl_motifs = None

    if isinstance(parq_file, pd.DataFrame):
        dfi = parq_file
    else:
        if motifs is not None:
            from fastparquet import ParquetFile

            # Selectively load only the relevant patterns
            pf = ParquetFile(str(parq_file))
            if 'dir0' in pf.cats:
                # Hive-partitioned layout: pattern name is split across the
                # dir0 ('pattern=<metacluster>') and dir1 (<pattern>) columns.
                metaclusters = list(
                    {'pattern=' + x.split("/")[0]
                     for x in incl_motifs})
                patterns = list({x.split("/")[1] for x in incl_motifs})
                dfi = pf.to_pandas(
                    filters=[("dir0", "in",
                              metaclusters), ("dir1", "in", patterns)])
                # Reconstruct the full 'metacluster/pattern' name.
                dfi['pattern'] = dfi['dir0'].str.replace(
                    "pattern=", "").astype(str) + "/" + dfi['dir1'].astype(str)
                del dfi['dir0']
                del dfi['dir1']
            else:
                dfi = pf.to_pandas(filters=[('pattern', 'in',
                                             list(incl_motifs))])

        else:
            dfi = pd.read_parquet(str(parq_file), engine='fastparquet')
            if 'pattern' not in dfi:
                # assumes a hive-stored file
                dfi['pattern'] = dfi['dir0'].str.replace(
                    "pattern=", "").astype(str) + "/" + dfi['dir1'].astype(str)

    # filter
    if motifs is not None:
        # .copy() so the column assignments below don't write into a slice
        # of the original frame (avoids SettingWithCopyWarning).
        dfi = dfi[dfi.pattern.isin(
            incl_motifs)].copy()  # NOTE this should already be removed
        dfi['pattern_short'] = dfi['pattern'].map(
            {k: shorten_pattern(k)
             for k in incl_motifs})
        dfi['pattern_name'] = dfi['pattern_short'].map(
            {v: k
             for k, v in motifs.items()})
    else:
        dfi['pattern_short'] = dfi['pattern'].map(
            {k: shorten_pattern(k)
             for k in dfi.pattern.unique()})

    # add some columns: genome-absolute instance coordinates
    dfi['pattern_start_abs'] = dfi['example_start'] + dfi['pattern_start']
    dfi['pattern_end_abs'] = dfi['example_start'] + dfi['pattern_end']

    if dedup:
        # deduplicate
        dfi_dedup = dfi.drop_duplicates([
            'pattern', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs',
            'strand'
        ])

        # number of removed duplicates (guard against an empty frame so the
        # percentage computation can't raise ZeroDivisionError)
        d = len(dfi) - len(dfi_dedup)
        pct = d / len(dfi) * 100 if len(dfi) else 0.0
        print("number of de-duplicated instances:", d, f"({pct}%)")

        # use de-duplicated instances from now on
        dfi = dfi_dedup
    return dfi
# Exemplo n.º 2
# 0
def pattern_url(shortpattern, report_url):
    """Render an HTML anchor linking the short pattern name to its report section."""
    anchor = report_url + "#" + longer_pattern(shortpattern)
    link = a(shortpattern, href=anchor)
    return link.to_html()
# Exemplo n.º 3
# 0
 def parse_pattern_name(self, pattern_name):
     """Split 'task/pattern' into (task, pattern), expanding a short
     pattern name (no '/') to its long form via longer_pattern()."""
     task, remainder = pattern_name.split("/", 1)
     name = remainder if "/" in remainder else longer_pattern(remainder)
     return task, name