예제 #1
0
def sample(hyperparameters, rho, K, F):
    """Draw one sample of every variable in the model.

    alpha, beta, gamma and lambda_ are drawn from Gamma distributions and
    tau and omega from Dirichlet distributions, all parameterised by
    attributes of `hyperparameters`. pi is built by stick-breaking from
    Beta(1, gamma) draws, theta holds one Dirichlet(alpha * pi) draw per
    entry of `rho`, and phi has K+1 rows: row 0 is drawn with parameter
    lambda_ * omega, rows 1..K with parameter beta * tau.

    :param hyperparameters: Object providing the a_*/b_* attributes read
        below.
    :param rho: One sequence per gene; every element is passed to
        `bernoulli`.
    :param K: Length of pi and number of rows of phi after row 0.
    :param F: Number of columns of phi (presumably has to match the length
        of tau and omega - confirm against callers).
    :returns: A `Sample` namedtuple with fields alpha, beta, gamma, lambda_,
        tau, omega, pi_bar, pi, theta, phi and sites, where sites[g] is the
        tuple (v_g, z_g, x_g) for gene g.
    """
    hp = hyperparameters
    num_genes = len(rho)

    # The draws below are made in a fixed order (alpha, beta, gamma,
    # lambda_, tau, omega, pi_bar) so the random stream is consumed in the
    # same sequence on every call.
    alpha = GammaDist(hp.a_alpha, hp.b_alpha).sample()
    beta = GammaDist(hp.a_beta, hp.b_beta).sample()
    gamma = GammaDist(hp.a_gamma, hp.b_gamma).sample()
    lambda_ = GammaDist(hp.a_lambda, hp.b_lambda).sample()

    tau = DirichletDist(hp.a_tau).sample()
    omega = DirichletDist(hp.a_omega).sample()

    pi_bar = BetaDist(numpy.ones(K), gamma * numpy.ones(K)).sample()

    # Stick-breaking: component k takes the fraction pi_bar[k] of whatever
    # the earlier components left over; the last component takes the rest.
    pi = numpy.empty_like(pi_bar)
    for k in xrange(K - 1):
        stick_left = (1. - pi_bar[:k]).prod()
        pi[k] = pi_bar[k] * stick_left
    pi[-1] = 1. - pi[:-1].sum()
    # Rounding can push the remainder slightly below zero; clamp it.
    if pi[-1] < 0.:
        pi[-1] = 0.

    theta = numpy.random.dirichlet(alpha * pi, size=num_genes)
    phi = numpy.empty((K + 1, F))
    phi[0] = numpy.random.dirichlet(lambda_ * omega)
    phi[1:] = numpy.random.dirichlet(beta * tau, size=K)

    # Sample the correct number of sites for each gene. Within a gene all of
    # v is drawn first, then all of z, then all of x.
    sites = []
    for theta_g, rho_g in zip(theta, rho):
        v_g = [bernoulli(rho_gi) for rho_gi in rho_g]
        # z is 0 when v is falsy, otherwise a 1-based draw from theta_g.
        z_g = [
            (discrete_sample(theta_g) + 1) if v_gi else 0
            for v_gi in v_g
        ]
        x_g = [discrete_sample(phi[z_gi]) for z_gi in z_g]
        sites.append((v_g, z_g, x_g))

    result_type = namedtuple('Sample', 'alpha beta gamma lambda_ tau omega pi_bar pi theta phi sites')

    return result_type(
        alpha=alpha,
        beta=beta,
        gamma=gamma,
        lambda_=lambda_,
        tau=tau,
        omega=omega,
        pi_bar=pi_bar,
        pi=pi,
        theta=theta,
        phi=phi,
        sites=sites,
    )
예제 #2
0
def convert_base(b):
    """Map a nucleotide letter to its integer code.

    'a'/'A' -> 0, 'c'/'C' -> 1, 'g'/'G' -> 2, 't'/'T' -> 3.

    :param b: The base to convert.
    :returns: The integer code, 0 to 3.
    :raises RuntimeError: If `b` is not one of the eight letters above.
    """
    # The position of each pair in this tuple is the code that is returned.
    letter_pairs = (('a', 'A'), ('c', 'C'), ('g', 'G'), ('t', 'T'))
    for code, variants in enumerate(letter_pairs):
        if b in variants:
            return code
    raise RuntimeError('Unknown base: %s' % str(b))


def convert_seq(seq):
    """Convert a sequence of bases to a list of integer codes.

    Applies `convert_base` to every element of `seq`, in order.

    :param seq: Iterable of bases (for example a string).
    :returns: A list with one integer code per base.
    :raises RuntimeError: Propagated from `convert_base` for an unknown base.
    """
    # Build a real list rather than calling map(): on Python 3 map() returns
    # a lazy iterator, which is always truthy and can only be consumed once.
    return [convert_base(base) for base in seq]


# Default path of the TRANSFAC matrix sites file, used as the default argument
# of load_transfac_sites. NOTE(review): this is a hard-coded absolute path on
# a Windows C: drive - presumably the original author's machine.
sites_filename = os.path.join('c:\\', 'Dev', 'MyProjects', 'Vincent',
                              'transfac_matrix_sites.txt')

# Record grouping a matrix, its name and its sequences.
TransfacSiteSet = namedtuple('TransfacSiteSet', 'matrix name seqs')


def load_transfac_sites(sites_filename=sites_filename):
    """Load the TRANSFAC site sets from a file, keyed by matrix.

    Opens `sites_filename`, hands the open file to `read_sites` and wraps
    every (matrix, name, seqs) triple it produces in a `TransfacSiteSet`.

    :param sites_filename: Path of the sites file; defaults to the
        module-level `sites_filename`.
    :returns: A dict mapping each matrix to its `TransfacSiteSet`.
    """
    # Use a context manager so the file is closed even if parsing fails. The
    # dict is built inside the block because read_sites may read lazily.
    with open(sites_filename) as sites_file:
        return dict(
            (matrix, TransfacSiteSet(matrix=matrix, name=name, seqs=seqs))
            for matrix, name, seqs in read_sites(sites_file)
        )


def convert_transfac_sites(sites):
    result = dict()
    for matrix, site_set in sites.iteritems():
        try:
            converted_seqs = map(convert_seq,
                                 site_set.seqs)  # convert to feature values
            if not converted_seqs:  # if no sequences ignore
                continue
예제 #3
0
            else:
                seqs.append(line)

def convert_base(b):
    """Return the integer code of a nucleotide letter.

    'a'/'A' -> 0, 'c'/'C' -> 1, 'g'/'G' -> 2, 't'/'T' -> 3.

    :param b: The base to convert.
    :returns: The integer code, 0 to 3.
    :raises RuntimeError: If `b` is not one of the eight letters above.
    """
    # Each pair's index in this tuple is the code returned for its letters.
    letter_pairs = (('a', 'A'), ('c', 'C'), ('g', 'G'), ('t', 'T'))
    for code, variants in enumerate(letter_pairs):
        if b in variants:
            return code
    raise RuntimeError('Unknown base: %s' % str(b))

def convert_seq(seq):
    """Convert a sequence of bases to a list of integer codes.

    Applies `convert_base` to every element of `seq`, in order.

    :param seq: Iterable of bases (for example a string).
    :returns: A list with one integer code per base.
    :raises RuntimeError: Propagated from `convert_base` for an unknown base.
    """
    # Build a real list rather than calling map(): on Python 3 map() returns
    # a lazy iterator, which is always truthy and can only be consumed once.
    return [convert_base(base) for base in seq]

# Default path of the TRANSFAC matrix sites file, used as the default argument
# of load_transfac_sites. NOTE(review): this is a hard-coded absolute path on
# a Windows C: drive - presumably the original author's machine.
sites_filename = os.path.join('c:\\', 'Dev', 'MyProjects', 'Vincent', 'transfac_matrix_sites.txt')

# Record grouping a matrix, its name and its sequences.
TransfacSiteSet = namedtuple('TransfacSiteSet', 'matrix name seqs')
def load_transfac_sites(sites_filename=sites_filename):
    """Load the TRANSFAC site sets from a file, keyed by matrix.

    Opens `sites_filename`, hands the open file to `read_sites` and wraps
    every (matrix, name, seqs) triple it produces in a `TransfacSiteSet`.

    :param sites_filename: Path of the sites file; defaults to the
        module-level `sites_filename`.
    :returns: A dict mapping each matrix to its `TransfacSiteSet`.
    """
    # Use a context manager so the file is closed even if parsing fails. The
    # dict is built inside the block because read_sites may read lazily.
    with open(sites_filename) as sites_file:
        return dict(
            (matrix, TransfacSiteSet(matrix=matrix, name=name, seqs=seqs))
            for matrix, name, seqs in read_sites(sites_file)
        )


def convert_transfac_sites(sites):
    result = dict()
    for matrix, site_set in sites.iteritems():
        try:
            converted_seqs = map(convert_seq, site_set.seqs) # convert to feature values
            if not converted_seqs: # if no sequences ignore
                continue
예제 #4
0
파일: meme.py 프로젝트: JohnReid/STEME
"""
Code to run MEME algorithm.
"""


import subprocess
import re
import time
import warnings
from cookbook.interval import Interval
from stempy import ensure_dir_exists, logging, os, parse_options
from cookbook.named_tuple import namedtuple


# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Record type with one field per space-separated name below.
# NOTE(review): the field names suggest statistics for one MEME start before
# and after EM - confirm against the code that fills it in.
Start = namedtuple(
    'Start', 'w0 nsites0 cons0 cons w nsites sig em_time niters cons_after_em')


# Manual toggle: change ``False`` to ``True`` to use the debug build of the
# MEME binary (a warning is emitted when it is selected). Both paths are
# hard-coded absolute paths.
if False:
    warnings.warn('Using debug MEME')
    _meme_binary = '/home/john/local/debug/bin/meme.bin'
else:
    _meme_binary = '/home/john/local/bin/meme.bin'


def run_meme(fasta, options, extra_args=None):
    """
    Runs MEME.
    """

    # set up command line
예제 #5
0
파일: spacing.py 프로젝트: JohnReid/STEME
"""
Code to analyse distances (spacings) between pairs of occurrences of TFs.
"""

import logging
logger = logging.getLogger(__name__)

from cookbook.named_tuple import namedtuple
import pyicl
import numpy
import pylab
from collections import defaultdict
from .scan import footprint
from scipy.special import gammaln

# Record for one occurrence of a pair: its spacing plus seq, pos and strand.
PairOccurrence = namedtuple('PairOccurrence', 'spacing seq pos strand')
# Record describing a spacing between a primary and a secondary item.
# NOTE(review): presumably same_strand/upstream are booleans and distance is
# in base pairs - confirm against the code that constructs these.
Spacing = namedtuple(
    'Spacing', 'primary secondary same_strand upstream distance')


def add_max_distance_option(parser):
    """Register the -d / --max-distance option on `parser`.

    The option takes an int, defaults to 30 and is shown in the help text
    under the metavar MAX_DISTANCE.

    :param parser: Option parser exposing an `add_option` method.
    """
    help_text = (
        "Only look for occurrences of motifs up to "
        "MAX_DISTANCE base pairs apart"
    )
    option_settings = dict(
        type=int,
        default=30,
        help=help_text,
        metavar="MAX_DISTANCE",
    )
    parser.add_option("-d", "--max-distance", **option_settings)
예제 #6
0
파일: test_em.py 프로젝트: JohnReid/STEME
#

"""
Test read sequences.
"""

#
# Trickery to find update path to import stempy from
#
from setup_environment import init_test_env, fasta_dir
init_test_env(__file__)

import stempy, os
from cookbook.named_tuple import namedtuple

# Record describing one starting point handed to the algorithm under test.
Start = namedtuple('Start', 'seed num_sites score model best_w_mers')

# Default options, with output redirected to a test-specific directory.
options = stempy.get_default_options()
options.output_dir = os.path.join('output', 'test-em')
# The seed string; W is its length.
seed = 'CACTTT'
W = len(seed)

# read the sequences and build STEME object from index
fasta = os.path.join(fasta_dir(), 'em-1-test.fa')
algorithm = stempy.Algorithm(options)
algorithm._initialise(fasta)
motif_finder = algorithm.create_motif_finder()

# Build a model of width W and seed it.
# NOTE(review): the meaning of the second argument (True) to bs.seed is not
# visible here - confirm against stempy.
model = algorithm.create_model_of_input(W)
model.bs.seed(seed, True)
# Start with 10 sites, a zero score and an empty instance vector.
start = Start(seed=seed, num_sites=10, score=0., model=model, best_w_mers=stempy.InstanceVec())
예제 #7
0
파일: scan.py 프로젝트: JohnReid/STEME
logger = logging.getLogger(__name__)

from . import html_copy_static
import os
import pyicl
import pylab
import numpy
import bisect
from cookbook.named_tuple import namedtuple
from collections import defaultdict
from itertools import ifilter
from cookbook.pylab_utils import pylab_context_ioff, create_format_cycler
# from cookbook.pylab_utils import simple_marker_styles


# Record holding a sequence's name and length.
SeqInfo = namedtuple('SeqInfo', 'name length')
# Record for one motif occurrence; ``pos`` and ``wmer`` are what footprint()
# reads. NOTE(review): the eight fields presumably correspond to the eight
# comma-separated fields checked in parse_occurrence - confirm.
Occurrence = namedtuple(
    'Occurrence', 'motif wmer seq pos strand Z score pvalue')


def footprint(occ):
    """Return the footprint (interval) of the occurrence.

    The interval is built from occ.pos and occ.pos + len(occ.wmer).

    :param occ: Object with `pos` and `wmer` attributes.
    :returns: A `pyicl.IntInterval` constructed from those two bounds.
    """
    start = occ.pos
    end = start + len(occ.wmer)
    return pyicl.IntInterval(start, end)


def parse_occurrence(line):
    """Parse one occurrence in the format outputted by steme-pwm-scan.
    """
    fields = line.strip().split(',')
    if 8 != len(fields):