Python Scanner.count примеры, gimmemotifs.scanner.Scanner.count Python примеры использования

Пример #1

0

Показать файл

Файл: maelstrom.py Проект: NHLBI-BCB/gimmemotifs

def scan_to_table(input_table, genome, data_dir, scoring, pwmfile=None):
    """Scan the regions listed in a table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of a tab-separated table. The first column is used as the
        index and its values are the regions handed to the scanner.

    genome : str
        Genome passed to ``check_threshold`` and ``Scanner.set_genome``.

    data_dir : str
        Passed to ``check_threshold`` together with ``genome`` and
        ``scoring``.

    scoring : str
        "count" uses ``Scanner.count`` with the threshold as cutoff; any
        other value uses ``Scanner.best_score``.

    pwmfile : str, optional
        Motif file. When omitted, the "motif_db" entry of the default
        parameters is resolved relative to the configured motif directory.

    Returns
    -------
    pandas.DataFrame
        Scan results with the input table's index as rows and motif ids as
        columns.

    Raises
    ------
    ValueError
        If no motif file is given and no default database is configured.
    """
    threshold = check_threshold(data_dir, genome, scoring)

    config = MotifConfig()

    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    df = pd.read_table(input_table, index_col=0)
    regions = list(df.index)
    s = Scanner()
    s.set_motifs(pwmfile)
    s.set_genome(genome)

    if scoring == "count":
        scores = list(s.count(regions, cutoff=threshold))
    else:
        scores = list(s.best_score(regions))

    # Read the ids inside a context manager so the motif file handle is
    # closed deterministically instead of being leaked.
    with open(pwmfile) as handle:
        motif_names = [m.id for m in read_motifs(handle)]
    return pd.DataFrame(scores, index=df.index, columns=motif_names)

Пример #2

0

Показать файл

Файл: maelstrom.py Проект: YichaoOU/gimmemotifs

def scan_to_table(input_table, genome, data_dir, scoring, pwmfile=None):
    """Build a region-by-motif table of scan results.

    Parameters
    ----------
    input_table : str
        Filename of a tab-separated table whose first column (used as the
        index) holds the regions to scan.

    genome : str
        Genome passed to ``check_threshold`` and ``Scanner.set_genome``.

    data_dir : str
        Passed to ``check_threshold`` together with ``genome`` and
        ``scoring``.

    scoring : str
        "count" produces rows from ``Scanner.count`` (threshold used as
        cutoff); anything else produces rows from ``Scanner.best_score``.

    pwmfile : str, optional
        Motif file. Defaults to the "motif_db" entry of the default
        parameters, joined onto the configured motif directory.

    Returns
    -------
    pandas.DataFrame
        Rows follow the index of the input table; columns are motif ids.

    Raises
    ------
    ValueError
        If neither ``pwmfile`` nor a default motif database is available.
    """
    threshold = check_threshold(data_dir, genome, scoring)

    config = MotifConfig()

    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    df = pd.read_table(input_table, index_col=0)
    regions = list(df.index)
    s = Scanner()
    s.set_motifs(pwmfile)
    s.set_genome(genome)

    if scoring == "count":
        row_iter = s.count(regions, cutoff=threshold)
    else:
        row_iter = s.best_score(regions)
    scores = list(row_iter)

    # The original passed a bare open() result to read_motifs and never
    # closed it; the context manager releases the handle.
    with open(pwmfile) as handle:
        motif_names = [m.id for m in read_motifs(handle)]
    return pd.DataFrame(scores, index=df.index, columns=motif_names)

Пример #3

0

Показать файл

Файл: maelstrom.py Проект: quan-xu/gimmemotifs

def scan_to_table(input_table,
                  genome,
                  data_dir,
                  scoring,
                  pwmfile=None,
                  ncpus=None):
    """Scan regions in input table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of input table. A name ending in "feather" is read with
        ``pandas.read_feather`` (regions taken from the first column);
        anything else is read as a tab-separated table with "#" comments
        (regions taken from the index).

    genome : str
        Genome passed to ``Scanner.set_genome`` and, for counting, to
        ``Scanner.set_threshold``.

    data_dir : str
        Not used by this function; kept in the signature for callers.

    scoring : str
        "count" creates a count table; any other value creates a table of
        best scores.

    pwmfile : str, optional
        Motif file. Defaults to the "motif_db" entry of the default
        parameters, joined onto the configured motif directory.

    ncpus : int, optional
        Forwarded to ``Scanner(ncpus=...)``.

    Returns
    -------
    pandas.DataFrame
        Scan results indexed by region, with motif ids as columns.

    Raises
    ------
    ValueError
        If no motif file is given and no default database is configured.
    """
    config = MotifConfig()

    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    logger.info("reading table")
    if input_table.endswith("feather"):
        df = pd.read_feather(input_table)
        idx = df.iloc[:, 0].values
    else:
        df = pd.read_table(input_table, index_col=0, comment="#")
        idx = df.index

    regions = list(idx)
    s = Scanner(ncpus=ncpus)
    s.set_motifs(pwmfile)
    s.set_genome(genome)

    if scoring == "count":
        logger.info("setting threshold")
        s.set_threshold(fpr=FPR, genome=genome)
        logger.info("creating count table")
        scores = list(s.count(regions))
        logger.info("done")
    else:
        s.set_threshold(threshold=0.0)
        logger.info("creating score table")
        scores = list(s.best_score(regions))
        logger.info("done")

    # Use a context manager so the motif file handle is closed rather than
    # leaked by a bare open() call.
    with open(pwmfile) as handle:
        motif_names = [m.id for m in read_motifs(handle)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)

Пример #4

0

Показать файл

def scan_to_table(
    input_table, genome, scoring, pfmfile=None, ncpus=None, zscore=True, gc=True
):
    """Scan regions in input table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of input table. Can be either a text-separated tab file or a
        feather file.

    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.

    scoring : str
        "count" or "score"

    pfmfile : str, optional
        Specify a PFM file for scanning.

    ncpus : int, optional
        If defined this specifies the number of cores to use.

    zscore : bool, optional
        Forwarded to ``Scanner.best_score`` when a score table is created.

    gc : bool, optional
        Forwarded to ``Scanner.set_background`` and ``Scanner.best_score``.

    Returns
    -------
    table : pandas.DataFrame
        DataFrame with motif ids as column names and regions as index. Values
        are either counts or scores depending on the 'scoring' parameter.
    """
    config = MotifConfig()

    if pfmfile is None:
        default_db = config.get_default_params().get("motif_db", None)
        if default_db is None:
            raise ValueError("no pfmfile given and no default database specified")
        pfmfile = os.path.join(config.get_motif_dir(), default_db)

    logger.info("reading table")
    if input_table.endswith("feather"):
        idx = pd.read_feather(input_table).iloc[:, 0].values
    else:
        idx = pd.read_table(input_table, index_col=0, comment="#").index

    regions = list(idx)

    # The background size is the median sequence length, estimated from at
    # most 1000 randomly drawn regions.
    check_regions = regions
    if len(regions) >= 1000:
        check_regions = np.random.choice(regions, size=1000, replace=False)
    seq_lengths = [
        len(seq) for seq in as_fasta(check_regions, genome=genome).seqs
    ]
    size = int(np.median(seq_lengths))

    scanner = Scanner(ncpus=ncpus)
    scanner.set_motifs(pfmfile)
    scanner.set_genome(genome)
    scanner.set_background(genome=genome, gc=gc, size=size)

    if scoring == "count":
        logger.info("setting threshold")
        scanner.set_threshold(fpr=FPR)
        logger.info("creating count table")
        scores = list(scanner.count(regions))
        logger.info("done")
    else:
        scanner.set_threshold(threshold=0.0)
        if zscore:
            suffix = " (z-score" + (", GC%" if gc else "") + ")"
        else:
            suffix = " (logodds)"
        logger.info("creating score table" + suffix)
        scores = list(scanner.best_score(regions, zscore=zscore, gc=gc))
        logger.info("done")

    motif_names = [motif.id for motif in read_motifs(pfmfile)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)

Пример #5

0

Показать файл

def moap(inputfile, method="classic", scoring="score", outfile=None, motiffile=None, pwmfile=None, genome=None, cutoff=0.95):
    """Run a single motif activity prediction algorithm.

    Parameters
    ----------
    inputfile : str
        File with regions (chr:start-end) in first column and either cluster
        name in second column or a table with values.

    method : str, optional
        Motif activity method to use. This function dispatches on 'classic',
        'ks', 'mwu', 'rf', 'lasso', 'lightning', 'mara' and 'more'.
        Default is 'classic'.

    scoring : str, optional
        Either 'score' or 'count'

    outfile : str, optional
        Name of outputfile to save the fitted activity values.

    motiffile : str, optional
        Table with motif scan results. First column should be exactly the same
        regions as in the inputfile.

    pwmfile : str, optional
        File with motifs in pwm format. Required when motiffile is not
        supplied.

    genome : str, optional
        Genome name, as indexed by gimme. Required when motiffile is not
        supplied

    cutoff : float, optional
        Cutoff for motif scanning

    Returns
    -------
    pandas DataFrame with motif activity

    Raises
    ------
    ValueError
        If ``scoring`` or ``method`` is not valid, the input table has the
        wrong shape or dtypes for the method, no genome is given while
        scanning is needed, or no usable motif file can be found.
    """

    if scoring not in ['score', 'count']:
        raise ValueError("valid values are 'score' and 'count'")

    config = MotifConfig()

    # read data
    df = pd.read_table(inputfile, index_col=0)

    if method in CLUSTER_METHODS:
        if df.shape[1] != 1:
            raise ValueError("1 column expected for {}".format(method))
    else:
        if np.dtype('object') in set(df.dtypes):
            raise ValueError(
                    "columns should all be numeric for {}".format(method))
        if method not in VALUE_METHODS:
            raise ValueError("method {} not valid".format(method))

    if motiffile is None:
        if genome is None:
            raise ValueError("need a genome")
        # check pwmfile
        if pwmfile is None:
            pwmfile = config.get_default_params().get("motif_db", None)
            if pwmfile is not None:
                pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

        if pwmfile is None:
            raise ValueError("no pwmfile given and no default database specified")

        if not os.path.exists(pwmfile):
            raise ValueError("{} does not exist".format(pwmfile))

        # Parse the motif file once, inside a context manager, so the handle
        # is closed; the ids double as the readability check.
        try:
            with open(pwmfile) as handle:
                motif_names = [m.id for m in read_motifs(handle)]
        except Exception:
            sys.stderr.write("can't read motifs from {}".format(pwmfile))
            raise

        # initialize scanner
        s = Scanner()
        sys.stderr.write(pwmfile + "\n")
        s.set_motifs(pwmfile)
        s.set_genome(genome)

        # scan for motifs
        sys.stderr.write("scanning for motifs\n")
        if method == 'classic' or scoring == "count":
            scores = list(s.count(list(df.index), cutoff=cutoff))
        else:
            scores = list(s.best_score(list(df.index)))

        motifs = pd.DataFrame(scores, index=df.index, columns=motif_names)
    else:
        motifs = pd.read_table(motiffile, index_col=0)

    motifs = motifs.loc[df.index]

    if method == "ks":
        clf = KSMoap()
    elif method == "mwu":
        clf = MWMoap()
    elif method == "rf":
        clf = RFMoap()
    elif method == "lasso":
        clf = LassoMoap()
    elif method == "lightning":
        clf = LightningMoap()
    elif method == "mara":
        clf = MaraMoap()
    elif method == "more":
        clf = MoreMoap()
    elif method == "classic":
        clf = ClassicMoap()
    else:
        # Fail with a clear message instead of calling .fit on None.
        raise ValueError("method {} not valid".format(method))

    clf.fit(motifs, df)

    if outfile:
        # Header comments and the activity table go through one handle.
        with open(outfile, "w") as f:
            f.write("# maelstrom - GimmeMotifs version {}\n".format(GM_VERSION))
            f.write("# method: {} with motif {}\n".format(method, scoring))
            if genome:
                f.write("# genome: {}\n".format(genome))
            if motiffile:
                f.write("# motif table: {}\n".format(motiffile))
            f.write("# {}\n".format(clf.act_description))
            clf.act_.to_csv(f, sep="\t")

    return clf.act_

Пример #6

0

Показать файл

Файл: moap.py Проект: Jessica-2019/gimmemotifs

def moap(inputfile,
         method="hypergeom",
         scoring=None,
         outfile=None,
         motiffile=None,
         pwmfile=None,
         genome=None,
         fpr=0.01,
         ncpus=None,
         subsample=None):
    """Run a single motif activity prediction algorithm.

    Parameters
    ----------
    inputfile : str
        File with regions (chr:start-end) in first column and either cluster
        name in second column or a table with values.

    method : str, optional
        Motif activity method to use. Any of 'hypergeom', 'lasso',
        'lightningclassification', 'lightningregressor', 'bayesianridge',
        'rf', 'xgboost'. Default is 'hypergeom'.

    scoring : str, optional
        Either 'score' or 'count'

    outfile : str, optional
        Name of outputfile to save the fitted activity values. If the file
        already exists and its shape matches the expected output, it is
        returned without refitting.

    motiffile : str, optional
        Table with motif scan results. First column should be exactly the same
        regions as in the inputfile.

    pwmfile : str, optional
        File with motifs in pwm format. Required when motiffile is not
        supplied.

    genome : str, optional
        Genome name, as indexed by gimme. Required when motiffile is not
        supplied

    fpr : float, optional
        FPR for motif scanning

    ncpus : int, optional
        Number of threads to use. Default is the number specified in the config.

    subsample : float, optional
        When given, ``int(subsample * number_of_rows)`` rows of the input
        are sampled before fitting.

    Returns
    -------
    pandas DataFrame with motif activity

    Raises
    ------
    ValueError
        If ``scoring`` is invalid, the input table has the wrong shape or
        dtypes for the method, or no genome is given while scanning is
        needed.
    """

    if scoring and scoring not in ['score', 'count']:
        raise ValueError("valid values are 'score' and 'count'")

    # NOTE(review): `config` is not used below; the call is kept in case
    # constructing MotifConfig has required side effects -- confirm.
    config = MotifConfig()

    if inputfile.endswith("feather"):
        df = pd.read_feather(inputfile)
        df = df.set_index(df.columns[0])
    else:
        # read data
        df = pd.read_table(inputfile, index_col=0, comment="#")

    clf = Moap.create(method, ncpus=ncpus)

    if clf.ptype == "classification":
        if df.shape[1] != 1:
            raise ValueError("1 column expected for {}".format(method))
    else:
        if np.dtype('object') in set(df.dtypes):
            raise ValueError(
                "columns should all be numeric for {}".format(method))

    if motiffile is None:
        if genome is None:
            raise ValueError("need a genome")

        pwmfile = pwmfile_location(pwmfile)
        # Parse the motif file once; the ids are reused for the column
        # names of the scan table.
        try:
            motif_names = [m.id for m in read_motifs(pwmfile)]
        except Exception:
            sys.stderr.write("can't read motifs from {}".format(pwmfile))
            raise

        # initialize scanner
        s = Scanner(ncpus=ncpus)
        sys.stderr.write(pwmfile + "\n")
        s.set_motifs(pwmfile)
        s.set_genome(genome)
        s.set_background(genome=genome)

        # scan for motifs
        sys.stderr.write("scanning for motifs\n")
        if method == 'classic' or scoring == "count":
            s.set_threshold(fpr=fpr)
            scores = list(s.count(list(df.index)))
        else:
            scores = list(s.best_score(list(df.index), normalize=True))

        motifs = pd.DataFrame(scores, index=df.index, columns=motif_names)
    else:
        motifs = pd.read_table(motiffile, index_col=0, comment="#")

    if outfile and os.path.exists(outfile):
        out = pd.read_table(outfile, index_col=0, comment="#")
        ncols = df.shape[1]
        if ncols == 1:
            ncols = len(df.iloc[:, 0].unique())

        if out.shape[0] == motifs.shape[1] and out.shape[1] == ncols:
            # Logger.warn is a deprecated alias of Logger.warning.
            logger.warning("%s output already exists... skipping", method)
            return out

    if subsample is not None:
        n = int(subsample * df.shape[0])
        logger.debug("Subsampling %d regions", n)
        df = df.sample(n)

    motifs = motifs.loc[df.index]

    if method == "lightningregressor":
        # os.path.dirname(None) raises TypeError, so fall back to a path
        # relative to the working directory when no outfile is given.
        outdir = os.path.dirname(outfile) if outfile else ""
        tmpname = os.path.join(outdir, ".lightning.tmp")
        try:
            clf.fit(motifs, df, tmpdir=tmpname)
        finally:
            # Clean up even when fitting fails; skip if nothing was created.
            if os.path.isdir(tmpname):
                shutil.rmtree(tmpname)
    else:
        clf.fit(motifs, df)

    if outfile:
        # Header comments and the activity table go through one handle.
        with open(outfile, "w") as f:
            f.write(
                "# maelstrom - GimmeMotifs version {}\n".format(__version__))
            f.write("# method: {} with motif {}\n".format(method, scoring))
            if genome:
                f.write("# genome: {}\n".format(genome))
            if motiffile:
                f.write("# motif table: {}\n".format(motiffile))
            f.write("# {}\n".format(clf.act_description))
            clf.act_.to_csv(f, sep="\t")

    return clf.act_

Пример #7

0

Показать файл

Файл: maelstrom.py Проект: simonvh/gimmemotifs

def scan_to_table(input_table, genome, scoring, pwmfile=None, ncpus=None):
    """Scan regions in input table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of input table. Can be either a text-separated tab file or a
        feather file.

    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.

    scoring : str
        "count" or "score"

    pwmfile : str, optional
        Specify a PFM file for scanning.

    ncpus : int, optional
        If defined this specifies the number of cores to use.

    Returns
    -------
    table : pandas.DataFrame
        DataFrame with motif ids as column names and regions as index. Values
        are either counts or scores depending on the 'scoring' parameter.

    Raises
    ------
    ValueError
        If no motif file is given and no default database is configured.
    """
    config = MotifConfig()

    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    logger.info("reading table")
    if input_table.endswith("feather"):
        df = pd.read_feather(input_table)
        idx = df.iloc[:, 0].values
    else:
        df = pd.read_table(input_table, index_col=0, comment="#")
        idx = df.index

    regions = list(idx)
    s = Scanner(ncpus=ncpus)
    s.set_motifs(pwmfile)
    s.set_genome(genome)
    s.set_background(genome=genome)

    if scoring == "count":
        logger.info("setting threshold")
        s.set_threshold(fpr=FPR)
        logger.info("creating count table")
        scores = list(s.count(regions))
        logger.info("done")
    else:
        s.set_threshold(threshold=0.0)
        logger.info("creating score table")
        scores = list(s.best_score(regions, normalize=True))
        logger.info("done")

    motif_names = [m.id for m in read_motifs(pwmfile)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)

Пример #8

0

Показать файл

def command_scan(inputfile, pwmfile, nreport=1, cutoff=0.9, bed=False,
        scan_rc=True, table=False, score_table=False, moods=False,
        pvalue=None, bgfile=None, genome=None):
    """Scan the input with motifs and yield formatted output lines.

    This is a generator. Depending on the flags it yields:

    * ``table``: a header line with the motif ids followed by one
      tab-separated line of counts per sequence;
    * ``score_table`` (only checked when ``table`` is false): the same
      header followed by one line of best scores per sequence;
    * otherwise: one ``format_line`` result per individual match.

    With ``moods`` set, results come from ``scan_it_moods`` instead of the
    ``Scanner`` (except for ``score_table``, which always uses the
    ``Scanner``).
    """
    motifs = pwmfile_to_motifs(pwmfile)

    if genome is not None:
        index_dir = os.path.join(MotifConfig().get_index_dir(), genome)
    else:
        index_dir = None

    # initialize scanner
    scanner = Scanner()
    scanner.set_motifs(pwmfile)

    fa = as_fasta(inputfile, index_dir)

    # The match iterator is always set up first; the table modes below
    # create their own iterator and do not consume this one.
    if moods:
        result_it = scan_it_moods(inputfile, motifs, cutoff, bgfile, nreport, scan_rc, pvalue, table)
    else:
        result_it = scanner.scan(fa, nreport, scan_rc, cutoff)

    if table:
        # header
        yield "\t{}".format("\t".join(motif.id for motif in motifs))

        if moods:
            count_rows = scan_it_moods(inputfile, motifs, cutoff, bgfile, nreport, scan_rc, pvalue, table)
            for seq_id, counts in count_rows:
                yield "{}\t{}".format(seq_id, "\t".join(str(c) for c in counts))
        else:
            # counts table
            count_rows = scanner.count(fa, nreport, scan_rc, cutoff)
            for row_nr, counts in enumerate(count_rows):
                yield "{}\t{}".format(
                    fa.ids[row_nr], "\t".join(str(c) for c in counts))
        return

    if score_table:
        score_rows = scanner.best_score(fa, scan_rc)
        # header
        yield "\t{}".format("\t".join(motif.id for motif in motifs))
        # score table
        for row_nr, row in enumerate(score_rows):
            yield "{}\t{}".format(
                fa.ids[row_nr], "\t".join(str(v) for v in row))
        return

    if moods:
        for motif, hits_by_seq in result_it:
            for seq_id, matches in hits_by_seq.items():
                for pos, score, strand in matches:
                    yield format_line(fa, seq_id, motif,
                                      score, pos, strand, bed=bed)
        return

    for row_nr, per_motif in enumerate(result_it):
        seq_id = fa.ids[row_nr]
        for motif, matches in zip(motifs, per_motif):
            for score, pos, strand in matches:
                yield format_line(fa, seq_id, motif,
                                  score, pos, strand, bed=bed)

Пример #9

0

Показать файл

Файл: moap.py Проект: NHLBI-BCB/gimmemotifs

def moap(inputfile,
         method="classic",
         scoring="score",
         outfile=None,
         motiffile=None,
         pwmfile=None,
         genome=None,
         cutoff=0.95):
    """Run a single motif activity prediction algorithm.

    Parameters
    ----------
    inputfile : str
        File with regions (chr:start-end) in first column and either cluster
        name in second column or a table with values.

    method : str, optional
        Motif activity method to use. This function dispatches on 'classic',
        'ks', 'mwu', 'rf', 'lasso', 'lightning', 'mara' and 'more'.
        Default is 'classic'.

    scoring : str, optional
        Either 'score' or 'count'

    outfile : str, optional
        Name of outputfile to save the fitted activity values.

    motiffile : str, optional
        Table with motif scan results. First column should be exactly the same
        regions as in the inputfile.

    pwmfile : str, optional
        File with motifs in pwm format. Required when motiffile is not
        supplied.

    genome : str, optional
        Genome name, as indexed by gimme. Required when motiffile is not
        supplied

    cutoff : float, optional
        Cutoff for motif scanning

    Returns
    -------
    pandas DataFrame with motif activity

    Raises
    ------
    ValueError
        If ``scoring`` or ``method`` is not valid, the input table has the
        wrong shape or dtypes for the method, no genome is given while
        scanning is needed, or no usable motif file can be found.
    """

    if scoring not in ['score', 'count']:
        raise ValueError("valid values are 'score' and 'count'")

    config = MotifConfig()

    # read data
    df = pd.read_table(inputfile, index_col=0)

    if method in CLUSTER_METHODS:
        if df.shape[1] != 1:
            raise ValueError("1 column expected for {}".format(method))
    else:
        if np.dtype('object') in set(df.dtypes):
            raise ValueError(
                "columns should all be numeric for {}".format(method))
        if method not in VALUE_METHODS:
            raise ValueError("method {} not valid".format(method))

    if motiffile is None:
        if genome is None:
            raise ValueError("need a genome")
        # check pwmfile
        if pwmfile is None:
            pwmfile = config.get_default_params().get("motif_db", None)
            if pwmfile is not None:
                pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

        if pwmfile is None:
            raise ValueError(
                "no pwmfile given and no default database specified")

        if not os.path.exists(pwmfile):
            raise ValueError("{} does not exist".format(pwmfile))

        # Parse the motif file once, inside a context manager, so the handle
        # is closed; the ids double as the readability check.
        try:
            with open(pwmfile) as handle:
                motif_names = [m.id for m in read_motifs(handle)]
        except Exception:
            sys.stderr.write("can't read motifs from {}".format(pwmfile))
            raise

        # initialize scanner
        s = Scanner()
        sys.stderr.write(pwmfile + "\n")
        s.set_motifs(pwmfile)
        s.set_genome(genome)

        # scan for motifs
        sys.stderr.write("scanning for motifs\n")
        if method == 'classic' or scoring == "count":
            scores = list(s.count(list(df.index), cutoff=cutoff))
        else:
            scores = list(s.best_score(list(df.index)))

        motifs = pd.DataFrame(scores, index=df.index, columns=motif_names)
    else:
        motifs = pd.read_table(motiffile, index_col=0)

    if method == "ks":
        clf = KSMoap()
    elif method == "mwu":
        clf = MWMoap()
    elif method == "rf":
        clf = RFMoap()
    elif method == "lasso":
        clf = LassoMoap()
    elif method == "lightning":
        clf = LightningMoap()
    elif method == "mara":
        clf = MaraMoap()
    elif method == "more":
        clf = MoreMoap()
    elif method == "classic":
        clf = ClassicMoap()
    else:
        # Fail with a clear message instead of calling .fit on None.
        raise ValueError("method {} not valid".format(method))

    clf.fit(motifs, df)

    if outfile:
        # Header comments and the activity table go through one handle.
        with open(outfile, "w") as f:
            f.write(
                "# maelstrom - GimmeMotifs version {}\n".format(GM_VERSION))
            f.write("# method: {} with motif {}\n".format(method, scoring))
            if genome:
                f.write("# genome: {}\n".format(genome))
            if motiffile:
                f.write("# motif table: {}\n".format(motiffile))
            f.write("# {}\n".format(clf.act_description))
            clf.act_.to_csv(f, sep="\t")

    return clf.act_

Python Scanner.count примеры использования