예제 #1
0
def load_seqs(filename):
    """
    Load the sequences in a FASTA file and convert them.

    Each record produced by C{sequences_from_fasta} is unpacked as
    C{(description, (sequence, tally))}.

    @param filename: Name of the FASTA file, passed to C{sequences_from_fasta}.
    @return: A tuple C{(numpy_seqs, tally)}. C{numpy_seqs} maps each
        description to C{hmm.preprocess_sequence(seq_to_numpy(sequence))}.
        C{tally} is the sum of the per-sequence tallies, each converted with
        C{N.array}; when no sequences are loaded it is the int 0, the start
        value of C{sum}.
    """
    logging.info('Loading sequences: %s', filename)
    sequences = dict(sequences_from_fasta(filename))

    # dict.items()/values() and generator expressions behave the same on
    # Python 2 and Python 3, unlike dict.iteritems() and itertools.imap which
    # exist only on Python 2.
    numpy_seqs = dict(
        (desc, hmm.preprocess_sequence(seq_to_numpy(seq)))
        for desc, (seq, _) in sequences.items()
    )
    total_tally = sum(N.array(seq_tally) for _, seq_tally in sequences.values())
    num_bases = sum(len(seq) for seq, _ in sequences.values())

    logging.info('Loaded %d sequences with %d bases', len(sequences), num_bases)
    return numpy_seqs, total_tally
예제 #2
0
    def test_traits(self):
        """
        Check models built from C{PssmTraits} for orders 1 and 2.

        A model is built for each order from a fixed list of emission
        distributions, converted with C{hmm.model_states_2_model}, and then:
          - for every state C{n} and observation C{o}, C{B[n, o]} must pass
            C{check_is_close_2} against the entry of C{B} at the pair returned
            by C{traits.get_non_reverse_complement(n, o)};
          - the Viterbi state sequence for C{test_seq} must be
            C{num_background_states + i} at every position C{i}.
        """
        from hmm.pssm import create_background_model, PssmTraits, seq_to_numpy
        from infpy import check_is_close_2

        p_binding_site = .01
        num_background_states = 2
        # one distribution per position; each puts all of its mass on a
        # single symbol
        emission_dists = [
            [1., 0., 0., 0.],
            [0., 1., 0., 0.],
            [0., 1., 0., 0.],
            [1., 0., 0., 0.],
            [0., 0., 1., 0.],
            [0., 0., 0., 1.],
            [0., 0., 0., 1.],
            [0., 0., 0., 1.],
            [0., 0., 1., 0.],
            [0., 1., 0., 0.],
            [1., 0., 0., 0.],
            [0., 1., 0., 0.],
            [0., 0., 0., 1.],
        ]
        K = len(emission_dists)
        # matches dist above (presumably with columns ordered a, c, g, t)
        test_seq = 'accagtttgcact'
        test_seq_order_0 = seq_to_numpy(test_seq)

        # for various different orders
        for order in (1, 2):

            # build a model of distribution above
            traits = PssmTraits(
                K,
                p_binding_site,
                order,
                num_background_states,
                create_background_model,
                emission_dists=emission_dists,
            )
            model = traits.new_model()
            converted = hmm.model_states_2_model(model)
            B = converted.B

            # check the reverse complement states are correct
            # (range works on both Python 2 and 3; xrange is Python 2 only)
            for n in range(model.N):
                for o in range(model.M):
                    rc_state, rc_obs = traits.get_non_reverse_complement(n, o)
                    mapped_prob = B[rc_state, rc_obs]
                    prob = B[n, o]
                    assert check_is_close_2(mapped_prob, prob), (
                        '%d,%d %d,%d: %f %f' %
                        (rc_state, rc_obs, n, o, mapped_prob, prob))

            # check viterbi gives correct result; the first value returned by
            # viterbi() is not needed here
            test_seq_order_n = converted.converter.to_order_n(test_seq_order_0)
            _, states = converted.viterbi(test_seq_order_n)
            for i, state in enumerate(states):
                assert state == num_background_states + i
예제 #3
0
def evaluate_model(model, sequence):
    """
    Decide whether the model finds a hit in the sequence.

    @param model: A C{(hmm, traits)} pair. The first item must provide
        C{viterbi}; the second must provide C{background_states} and C{K}.
    @param sequence: The sequence to test; it is converted with
        C{seq_to_numpy} before being decoded.
    @return: True if more than C{traits.K / 2} states in the Viterbi state
        sequence are not in C{traits.background_states}.
    """
    model_hmm, traits = model
    _, path = model_hmm.viterbi(seq_to_numpy(sequence))
    # count the states on the decoded path that are not background states
    num_non_background = sum(
        1 for state in path if state not in traits.background_states)
    # a hit needs strictly more than K/2 such states
    return num_non_background > traits.K / 2
예제 #4
0
def load_seqs(filename):
    """
    Load the sequences in a FASTA file and convert them.

    Each record produced by C{sequences_from_fasta} is unpacked as
    C{(description, (sequence, tally))}.

    @param filename: Name of the FASTA file, passed to C{sequences_from_fasta}.
    @return: A tuple C{(numpy_seqs, tally)}. C{numpy_seqs} maps each
        description to C{hmm.preprocess_sequence(seq_to_numpy(sequence))}.
        C{tally} is the sum of the per-sequence tallies, each converted with
        C{N.array}; when no sequences are loaded it is the int 0, the start
        value of C{sum}.
    """
    logging.info('Loading sequences: %s', filename)
    sequences = dict(sequences_from_fasta(filename))

    # dict.items()/values() and generator expressions behave the same on
    # Python 2 and Python 3, unlike dict.iteritems() and itertools.imap which
    # exist only on Python 2.
    numpy_seqs = dict(
        (desc, hmm.preprocess_sequence(seq_to_numpy(seq)))
        for desc, (seq, _) in sequences.items()
    )
    total_tally = sum(N.array(seq_tally) for _, seq_tally in sequences.values())
    num_bases = sum(len(seq) for seq, _ in sequences.values())

    logging.info('Loaded %d sequences with %d bases', len(sequences), num_bases)
    return numpy_seqs, total_tally
예제 #5
0
def evaluate_model(model, sequence):
    """
    Decide whether the model finds a hit in the sequence.

    @param model: A C{(hmm, traits)} pair. The first item must provide
        C{viterbi}; the second must provide C{background_states} and C{K}.
    @param sequence: The sequence to test; it is converted with
        C{seq_to_numpy} before being decoded.
    @return: True if more than C{traits.K / 2} states in the Viterbi state
        sequence are not in C{traits.background_states}.
    """
    model_hmm, traits = model
    _, path = model_hmm.viterbi(seq_to_numpy(sequence))
    # count the states on the decoded path that are not background states
    num_non_background = sum(
        1 for state in path if state not in traits.background_states)
    # a hit needs strictly more than K/2 such states
    return num_non_background > traits.K / 2
예제 #6
0
    def test_traits(self):
        """
        Check models built from C{PssmTraits} for orders 1 and 2.

        A model is built for each order from a fixed list of emission
        distributions, converted with C{hmm.model_states_2_model}, and then:
          - for every state C{n} and observation C{o}, C{B[n, o]} must pass
            C{check_is_close_2} against the entry of C{B} at the pair returned
            by C{traits.get_non_reverse_complement(n, o)};
          - the Viterbi state sequence for C{test_seq} must be
            C{num_background_states + i} at every position C{i}.
        """
        from hmm.pssm import create_background_model, PssmTraits, seq_to_numpy
        from infpy import check_is_close_2

        p_binding_site = .01
        num_background_states = 2
        # one distribution per position; each puts all of its mass on a
        # single symbol
        emission_dists = [
            [1., 0., 0., 0.],
            [0., 1., 0., 0.],
            [0., 1., 0., 0.],
            [1., 0., 0., 0.],
            [0., 0., 1., 0.],
            [0., 0., 0., 1.],
            [0., 0., 0., 1.],
            [0., 0., 0., 1.],
            [0., 0., 1., 0.],
            [0., 1., 0., 0.],
            [1., 0., 0., 0.],
            [0., 1., 0., 0.],
            [0., 0., 0., 1.],
        ]
        K = len(emission_dists)
        # matches dist above (presumably with columns ordered a, c, g, t)
        test_seq = 'accagtttgcact'
        test_seq_order_0 = seq_to_numpy(test_seq)

        # for various different orders
        for order in (1, 2):

            # build a model of distribution above
            traits = PssmTraits(
                K,
                p_binding_site,
                order,
                num_background_states,
                create_background_model,
                emission_dists=emission_dists,
            )
            model = traits.new_model()
            converted = hmm.model_states_2_model(model)
            B = converted.B

            # check the reverse complement states are correct
            # (range works on both Python 2 and 3; xrange is Python 2 only)
            for n in range(model.N):
                for o in range(model.M):
                    rc_state, rc_obs = traits.get_non_reverse_complement(n, o)
                    mapped_prob = B[rc_state, rc_obs]
                    prob = B[n, o]
                    assert check_is_close_2(mapped_prob, prob), (
                        '%d,%d %d,%d: %f %f' %
                        (rc_state, rc_obs, n, o, mapped_prob, prob))

            # check viterbi gives correct result; the first value returned by
            # viterbi() is not needed here
            test_seq_order_n = converted.converter.to_order_n(test_seq_order_0)
            _, states = converted.viterbi(test_seq_order_n)
            for i, state in enumerate(states):
                assert state == num_background_states + i
예제 #7
0
            '# site sequence;sequence;start;is reverse complement; state sequence\n'
        )
        for seq_idx, seq_sites in enumerate(sites):
            for site, states, start, is_rev_comp in seq_sites:
                if is_rev_comp:
                    site = hmm.pssm.rev_comp(site)
                f.write('%s;%d;%d;%d;%s\n' %
                        (hmm.pssm.numpy_to_seq(site), seq_idx, start,
                         is_rev_comp, states))


# Script entry point: define the site and the parameters used by gen_sequence().
if '__main__' == __name__:
    from hmm.pssm import PssmTraits, create_background_model, seq_to_numpy, random_sequence, information_content
    from random import random

    # the site that gen_sequence() plants, and its length
    site = seq_to_numpy('aaactcaa')
    K = len(site)
    # 'ttgagttt' is 'aaactcaa' reversed and complemented
    rev_comp_site = seq_to_numpy('ttgagttt')
    # NOTE(review): presumably the number of sequences to generate; its use is
    # not visible in this part of the file
    num_seqs = 60
    seq_length = K + 30  # length passed to random_sequence() by gen_sequence()
    start = 20  # offset at which gen_sequence() writes the site

    def gen_sequence():
        """
        Generate a random sequence with the site written into it.

        @return: The result of C{random_sequence(seq_length)} with the C{K}
            positions starting at C{start} overwritten by C{site} when
            C{random()} is above .5 and by C{rev_comp_site} otherwise.
        """
        generated = random_sequence(seq_length)
        # choose the orientation only after the sequence has been generated
        planted = site if random() > .5 else rev_comp_site
        generated[start:start + K] = planted
        return generated

    p_binding_site = .01
예제 #8
0
option_parser.add_option(
    "--threshold-graph",
    dest="threshold_graph",
    help="file to write an image showing how # seqs with site varies by threshold."
)
# Debugging aid: uncomment to run with a fixed command line.
# sys.argv='dummy.py -m /home/reid/T00759.pssm -s /home/reid/T00759.fa --threshold-graph test.png'.split()
options, args = option_parser.parse_args()

# Log the value and help text of every option that has a destination attribute.
for option in option_parser.option_list:
    if option.dest:
        logging.info('%s: %s (%s)', option.dest, str(getattr(options, option.dest)), option.help)


# Load the sequences and convert each one with seq_to_numpy.
logging.info('Loading sequences: %s', options.sequences_file)
sequences = dict(sequences_from_fasta(options.sequences_file))
# dict.items() works on both Python 2 and 3; dict.iteritems() exists only on
# Python 2.
numpy_seqs = dict((desc, seq_to_numpy(seq)) for desc, seq in sequences.items())
logging.info('Loaded %d sequences', len(sequences))


# Parse the PSSMs. The list is built inside the with-block so that everything
# is read from the file before it is closed; previously the file object was
# never closed explicitly.
logging.info('Parsing PSSMs: %s', options.models_file)
with open(options.models_file) as models_handle:
    pssms = list(parse_models(models_handle))


# Build one model per parsed PSSM.
logging.info('Building models')
models = [
    build_hmm_from_semi_parsed(parsed, p_binding_site=options.p_binding_site)
    for parsed in pssms
]

logging.info('Analysing sequences')
p_binding_sites = []
예제 #9
0
파일: find_pssms.py 프로젝트: JohnReid/HMM
        f.write('# site sequence;sequence;start;is reverse complement; state sequence\n')
        for seq_idx, seq_sites in enumerate(sites):
            for site, states, start, is_rev_comp in seq_sites:
                if is_rev_comp:
                    site = hmm.pssm.rev_comp(site)
                f.write('%s;%d;%d;%d;%s\n' % (hmm.pssm.numpy_to_seq(site), seq_idx, start, is_rev_comp, states))





# Script entry point: define the site and the parameters used by gen_sequence().
if '__main__' == __name__:
    from hmm.pssm import PssmTraits, create_background_model, seq_to_numpy, random_sequence, information_content
    from random import random

    # the site that gen_sequence() plants, and its length
    site = seq_to_numpy('aaactcaa')
    K = len(site)
    # 'ttgagttt' is 'aaactcaa' reversed and complemented
    rev_comp_site = seq_to_numpy('ttgagttt')
    # NOTE(review): presumably the number of sequences to generate; its use is
    # not visible in this part of the file
    num_seqs = 60
    seq_length = K + 30  # length passed to random_sequence() by gen_sequence()
    start = 20  # offset at which gen_sequence() writes the site
    def gen_sequence():
        """
        Generate a random sequence with the site written into it.

        @return: The result of C{random_sequence(seq_length)} with the C{K}
            positions starting at C{start} overwritten by C{site} when
            C{random()} is above .5 and by C{rev_comp_site} otherwise.
        """
        generated = random_sequence(seq_length)
        # choose the orientation only after the sequence has been generated
        planted = site if random() > .5 else rev_comp_site
        generated[start:start + K] = planted
        return generated

    p_binding_site = .01
    order = 1