Exemplo n.º 1
0
    def __call__(self, sequences):
        """
        Run the motif finding algorithm.

        Counts every K-mer of length ``self.init_K_mer_length`` in
        ``sequences`` (collapsed with its reverse complement), evaluates the
        counted K-mers with ``self.yield_evaluations``, builds a model from
        the best-scoring one and trains that model with Baum-Welch.

        :param sequences: Sized collection of sequences. It is traversed more
            than once and ``len()`` is taken of it and of every element, so a
            one-shot iterator will not work.
        :returns: The model after Baum-Welch training.
        :raises ValueError: If ``self.yield_evaluations`` yields nothing
            (raised by ``max`` on an empty iterable).
        """

        preprocessed_sequences = hmm.preprocess_sequences(sequences)

        # Total number of bases over all the sequences.
        # NOTE(review): assumes there is at least one base; otherwise the
        # division below fails.
        num_bases = sum(len(s) for s in sequences)

        # Find all K-mers collapsed with their reverse complements.
        logging.info("Finding all %d-mers in sequences", self.init_K_mer_length)
        start = time.time()
        nmer_counts = hmm.ReverseComplementCollapsingCounter(self.init_K_mer_length)
        hmm.count_mers(sequences, n=self.init_K_mer_length, callback=nmer_counts)
        logging.info("Took %f seconds to find %d-mers", time.time() - start, self.init_K_mer_length)

        # Expected number of sites over all sequences divided by the number of
        # bases. float() forces true division: with an integer
        # expected_sites_per_sequence, Python 2 would otherwise floor the
        # result (almost always to 0).
        expected_sites = self.expected_sites_per_sequence * len(sequences)
        p_binding_site = expected_sites / float(num_bases)
        logging.info("Found %d %d-mers", nmer_counts.num_counts(), self.init_K_mer_length)

        # Each evaluation is indexed as (K-mer, score): element 1 ranks the
        # candidates and element 0 of the winner seeds the model below.
        start = time.time()
        best_starting_point = max(
            self.yield_evaluations(nmer_counts, preprocessed_sequences),
            key=lambda evaluation: evaluation[1],
        )
        logging.info("Evaluation took %f seconds", time.time() - start)
        # Lazy logging arguments, consistent with every other call here.
        logging.info("Best starting point: %s: %f", *best_starting_point)

        model = self.model_for_initialisation_K_mer(best_starting_point[0], p_binding_site)
        logging.info("Running Baum-Welch")
        start = time.time()
        LL, num_iterations = model.baum_welch(preprocessed_sequences)
        logging.info("Baum-Welch took %f seconds", time.time() - start)
        logging.info("Achieved LL: %f in %d iterations", LL, num_iterations)

        return model
Exemplo n.º 2
0
    # Build a second score table from pssm_scores; what "complementary" means
    # is defined by hmm.calculate_complementary_scores.
    logging.info('Calculating complementary scores')
    comp_scores = hmm.calculate_complementary_scores(pssm_scores)

    # Small hand-written input: 0, 1, 2, 3 repeated four times, then a single
    # trailing 4 (presumably the code for an unknown base - TODO confirm).
    logging.info('Preprocessing sequence')
    seq = hmm.preprocess_sequence(
        N.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4]))

    # Log the result of scoring the short sequence with each score table.
    logging.info('Scoring sequence')
    logging.info(hmm.score_sequence(pssm_scores, seq))
    logging.info('Scoring sequence with complementary scores')
    logging.info(hmm.score_sequence(comp_scores, seq))

    # Log hmm.max_score_in_sequence on the short sequence for each table.
    logging.info(hmm.max_score_in_sequence(pssm_scores, seq))
    logging.info(hmm.max_score_in_sequence(comp_scores, seq))

    # Benchmark input: 100 sequences of 10000 random integers each.
    # NOTE(review): R is presumably numpy.random, in which case
    # random_integers(0, 4) draws from 0..4 inclusive - confirm.
    long_seqs = [R.random_integers(0, 4, size=10000) for i in xrange(100)]
    long_seqs_preprocessed = hmm.preprocess_sequences(long_seqs)

    # Time both batch calls together and log the elapsed wall-clock seconds.
    logging.info('Starting to time max scores on long sequences.')
    start = time.time()
    max_scores = hmm.max_scores_in_sequences(pssm_scores,
                                             long_seqs_preprocessed)
    max_comp_scores = hmm.max_scores_in_sequences(comp_scores,
                                                  long_seqs_preprocessed)
    logging.info(
        'Max scores (and complementary scores) over sequences took %f secs' %
        (time.time() - start))

    # Second benchmark input: one sequence of 1000000 random integers.
    long_seq = R.random_integers(0, 4, size=1000000)
    long_seq_preprocessed = hmm.preprocess_sequence(long_seq)

    logging.info('Starting to time max scores on long sequence.')
Exemplo n.º 3
0
    # Turn the nucleotide distributions into a table of log scores, then build
    # a second table from it; what "complementary" means is defined by
    # hmm.calculate_complementary_scores.
    pssm_scores = hmm.calculate_log_scores(nucleo_dists)
    logging.info('Calculating complementary scores')
    comp_scores = hmm.calculate_complementary_scores(pssm_scores)

    # Small hand-written input: 0, 1, 2, 3 repeated four times, then a single
    # trailing 4 (presumably the code for an unknown base - TODO confirm).
    logging.info('Preprocessing sequence')
    seq = hmm.preprocess_sequence(N.array([0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,4]))

    # Log the result of scoring the short sequence with each score table.
    logging.info('Scoring sequence')
    logging.info(hmm.score_sequence(pssm_scores, seq))
    logging.info('Scoring sequence with complementary scores')
    logging.info(hmm.score_sequence(comp_scores, seq))

    # Log hmm.max_score_in_sequence on the short sequence for each table.
    logging.info(hmm.max_score_in_sequence(pssm_scores, seq))
    logging.info(hmm.max_score_in_sequence(comp_scores, seq))

    # Benchmark input: 100 sequences of 10000 random integers each.
    # NOTE(review): R is presumably numpy.random, in which case
    # random_integers(0, 4) draws from 0..4 inclusive - confirm.
    long_seqs = [R.random_integers(0,4,size=10000) for i in xrange(100)]
    long_seqs_preprocessed = hmm.preprocess_sequences(long_seqs)

    # Time both batch calls together and log the elapsed wall-clock seconds.
    logging.info('Starting to time max scores on long sequences.')
    start = time.time()
    max_scores = hmm.max_scores_in_sequences(pssm_scores, long_seqs_preprocessed)
    max_comp_scores = hmm.max_scores_in_sequences(comp_scores, long_seqs_preprocessed)
    logging.info('Max scores (and complementary scores) over sequences took %f secs' % (time.time()-start))

    # Second benchmark input: one sequence of 1000000 random integers.
    long_seq = R.random_integers(0,4,size=1000000)
    long_seq_preprocessed = hmm.preprocess_sequence(long_seq)

    # Time the single-sequence calls for both tables together; the elapsed
    # time therefore also includes the two logging calls in between.
    logging.info('Starting to time max scores on long sequence.')
    start = time.time()
    logging.info(hmm.max_score_in_sequence(pssm_scores, long_seq_preprocessed))
    logging.info(hmm.max_score_in_sequence(comp_scores, long_seq_preprocessed))
    logging.info('Max scores (and complementary scores) for %d bases took %f secs' % (len(long_seq), time.time()-start))