def evaluate_gap_position(
    L_mer,
    gap_index,
    sequences,
    bg_L_mer_scores,
    pssm_scores,
    comp_pssm_scores,
    options
):
    """
    Score an L-mer when a gap is placed at one particular position.

    A gapped distribution is built from ``L_mer`` (using
    ``options.pseudo_count_for_L_mer_scoring`` as the pseudo-count), turned
    into log scores, and scored against ``sequences`` together with its
    complementary PSSM. Those two score vectors are stacked with the
    caller-supplied ``pssm_scores`` and ``comp_pssm_scores`` and the
    element-wise maximum over the four is taken.

    :param L_mer: The L-mer to evaluate.
    :param gap_index: Position of the gap, passed to ``nucleo_dist_from_mer``.
    :param sequences: Sequences handed to ``hmm.max_scores_in_sequences``.
    :param bg_L_mer_scores: Background scores handed to
        ``hmm.max_scores_in_sequences``.
    :param pssm_scores: Scores already computed for the PSSM without a gap
        (presumably one entry per sequence - confirm against the caller).
    :param comp_pssm_scores: Same as ``pssm_scores`` but for the
        complementary PSSM.
    :param options: Object providing ``pseudo_count_for_L_mer_scoring``.
    :returns: Sum of the non-negative best scores, divided by ``len(L_mer)``
        and by the number of best scores.
    """
    gapped_dist = nucleo_dist_from_mer(
        L_mer,
        options.pseudo_count_for_L_mer_scoring,
        gap_index=gap_index
    )
    gapped_pssm = hmm.calculate_log_scores(gapped_dist)
    gapped_comp_pssm = hmm.calculate_complementary_scores(gapped_pssm)

    # Score the gapped PSSM first, then its complement.
    gapped_scores = []
    for strand_pssm in (gapped_pssm, gapped_comp_pssm):
        gapped_scores.append(
            hmm.max_scores_in_sequences(strand_pssm, sequences, bg_L_mer_scores)
        )

    # One row per PSSM variant: ungapped, ungapped complement, gapped,
    # gapped complement. The best variant is picked column by column.
    stacked = numpy.array([pssm_scores, comp_pssm_scores] + gapped_scores)
    best = stacked.max(axis=0)

    # A negative best score is treated as "no site found" and is zeroed so
    # that it contributes nothing to the total.
    below_zero = best < 0.
    best[below_zero] = 0.

    result = best.sum() / len(L_mer) / len(best)
    logging.debug(
        'Evaluated: %s; gap: %d; score: %f',
        numpy_to_seq(L_mer),
        gap_index,
        result
    )
    return result
def evaluate_L_mer(L_mer, sequences, bg_L_mer_scores, gap_positions, options):
    """
    Evaluate an L-mer over every candidate gap position and return the best.

    The scores for the L-mer without a gap (and for its complement) are
    computed once here and then passed to ``evaluate_gap_position`` for each
    entry of ``gap_positions``.

    :param L_mer: The L-mer to evaluate.
    :param sequences: Sequences handed to ``hmm.max_scores_in_sequences``.
    :param bg_L_mer_scores: Background scores handed to
        ``hmm.max_scores_in_sequences``.
    :param gap_positions: Iterable of gap indices to try.
    :param options: Object providing ``pseudo_count_for_L_mer_scoring``.
    :returns: The largest ``(score, gap_index)`` tuple (ordinary tuple
        ordering, so ties on score are broken by the larger gap index).
    :raises ValueError: From ``max`` when ``gap_positions`` is empty.
    """
    ungapped_dist = nucleo_dist_from_mer(
        L_mer,
        options.pseudo_count_for_L_mer_scoring,
        gap_index=None
    )
    ungapped_pssm = hmm.calculate_log_scores(ungapped_dist)
    ungapped_comp_pssm = hmm.calculate_complementary_scores(ungapped_pssm)
    ungapped_scores = hmm.max_scores_in_sequences(
        ungapped_pssm, sequences, bg_L_mer_scores
    )
    ungapped_comp_scores = hmm.max_scores_in_sequences(
        ungapped_comp_pssm, sequences, bg_L_mer_scores
    )

    # Pair every score with its gap index so that max() reports where the
    # best score was found.
    scored_positions = [
        (
            evaluate_gap_position(
                L_mer,
                gap_index,
                sequences,
                bg_L_mer_scores,
                ungapped_scores,
                ungapped_comp_scores,
                options
            ),
            gap_index
        )
        for gap_index in gap_positions
    ]
    return max(scored_positions)
def evaluate_initialisation_K_mer(self, K_mer, sequences):
    """
    Rate how suitable ``K_mer`` is as a starting point.

    Every PSSM returned by ``self.pssms_for_K_mer(K_mer)`` is scored against
    ``sequences``; the element-wise maximum over those PSSMs is taken, any
    value below ``(self.K + 1) * log(0.25)`` is raised to that floor, and
    the resulting values are summed.

    :param K_mer: The K-mer to evaluate.
    :param sequences: Sequences handed to ``hmm.max_scores_in_sequences``.
    :returns: Sum of the floored best scores.
    """
    scores_by_pssm = []
    for candidate_pssm in self.pssms_for_K_mer(K_mer):
        scores_by_pssm.append(
            hmm.max_scores_in_sequences(candidate_pssm, sequences)
        )

    # Best score over all candidate PSSMs, column by column.
    best = numpy.array(scores_by_pssm).max(axis=0)

    # Floor: the log-probability of K + 1 positions each with
    # probability 0.25.
    floor = (self.K + 1) * math.log(0.25)
    too_low = best < floor
    best[too_low] = floor

    return best.sum()
N.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4])) logging.info('Scoring sequence') logging.info(hmm.score_sequence(pssm_scores, seq)) logging.info('Scoring sequence with complementary scores') logging.info(hmm.score_sequence(comp_scores, seq)) logging.info(hmm.max_score_in_sequence(pssm_scores, seq)) logging.info(hmm.max_score_in_sequence(comp_scores, seq)) long_seqs = [R.random_integers(0, 4, size=10000) for i in xrange(100)] long_seqs_preprocessed = hmm.preprocess_sequences(long_seqs) logging.info('Starting to time max scores on long sequences.') start = time.time() max_scores = hmm.max_scores_in_sequences(pssm_scores, long_seqs_preprocessed) max_comp_scores = hmm.max_scores_in_sequences(comp_scores, long_seqs_preprocessed) logging.info( 'Max scores (and complementary scores) over sequences took %f secs' % (time.time() - start)) long_seq = R.random_integers(0, 4, size=1000000) long_seq_preprocessed = hmm.preprocess_sequence(long_seq) logging.info('Starting to time max scores on long sequence.') start = time.time() logging.info(hmm.max_score_in_sequence(pssm_scores, long_seq_preprocessed)) logging.info(hmm.max_score_in_sequence(comp_scores, long_seq_preprocessed)) logging.info( 'Max scores (and complementary scores) for %d bases took %f secs' %
# Demo / timing script: derive complementary scores from ``pssm_scores``
# (defined before this point), score one short sequence, then time the
# max-score search over many sequences and over one very long sequence.
logging.info('Calculating complementary scores')
comp_scores = hmm.calculate_complementary_scores(pssm_scores)

logging.info('Preprocessing sequence')
seq = hmm.preprocess_sequence(
    N.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4])
)

logging.info('Scoring sequence')
logging.info(hmm.score_sequence(pssm_scores, seq))
logging.info('Scoring sequence with complementary scores')
logging.info(hmm.score_sequence(comp_scores, seq))
logging.info(hmm.max_score_in_sequence(pssm_scores, seq))
logging.info(hmm.max_score_in_sequence(comp_scores, seq))

# 100 random sequences of 10000 symbols each.
# ``range`` replaces the Python-2-only ``xrange`` so this also runs on
# Python 3; for 100 items the behaviour is the same on both.
# NOTE(review): R is presumably numpy.random; if so, random_integers is
# deprecated in favour of randint(0, 5, size=...) - confirm before changing.
long_seqs = [R.random_integers(0, 4, size=10000) for i in range(100)]
long_seqs_preprocessed = hmm.preprocess_sequences(long_seqs)

logging.info('Starting to time max scores on long sequences.')
start = time.time()
max_scores = hmm.max_scores_in_sequences(pssm_scores, long_seqs_preprocessed)
max_comp_scores = hmm.max_scores_in_sequences(comp_scores, long_seqs_preprocessed)
# Arguments are passed separately so logging interpolates them itself; the
# elapsed time is still measured at the point of the call.
logging.info(
    'Max scores (and complementary scores) over sequences took %f secs',
    time.time() - start
)

# A single sequence of one million symbols.
long_seq = R.random_integers(0, 4, size=1000000)
long_seq_preprocessed = hmm.preprocess_sequence(long_seq)

logging.info('Starting to time max scores on long sequence.')
start = time.time()
logging.info(hmm.max_score_in_sequence(pssm_scores, long_seq_preprocessed))
logging.info(hmm.max_score_in_sequence(comp_scores, long_seq_preprocessed))
logging.info(
    'Max scores (and complementary scores) for %d bases took %f secs',
    len(long_seq),
    time.time() - start
)