Exemplo n.º 1
0
def evaluate_gap_position(
    L_mer,
    gap_index,
    sequences,
    bg_L_mer_scores,
    pssm_scores,
    comp_pssm_scores,
    options
):
    """
    Evaluate an L-mer with a gap placed at one particular position.

    A distribution is built from ``L_mer`` with the gap at ``gap_index``
    (via ``nucleo_dist_from_mer``), turned into a log-score PSSM and its
    complementary PSSM, and both are scored against ``sequences``. Those
    two score arrays are combined with the caller-supplied ``pssm_scores``
    and ``comp_pssm_scores`` by taking the element-wise maximum.

    Args:
        L_mer: The L-mer to evaluate; its length normalises the score.
        gap_index: Position of the gap, forwarded to ``nucleo_dist_from_mer``.
        sequences: Sequences passed to ``hmm.max_scores_in_sequences``.
        bg_L_mer_scores: Background scores passed to the same call.
        pssm_scores: Scores already computed by the caller for one PSSM
            (presumably the ungapped one - confirm against the caller).
        comp_pssm_scores: Scores already computed by the caller for the
            matching complementary PSSM.
        options: Object providing ``pseudo_count_for_L_mer_scoring``.

    Returns:
        The sum of the clamped best scores, divided by ``len(L_mer)`` and
        then by the number of best-score entries.
    """
    gapped_dist = nucleo_dist_from_mer(
        L_mer,
        options.pseudo_count_for_L_mer_scoring,
        gap_index=gap_index
    )
    gapped_pssm = hmm.calculate_log_scores(gapped_dist)
    gapped_comp_pssm = hmm.calculate_complementary_scores(gapped_pssm)

    # Start from the caller's scores, then append the two gapped variants
    # in the same order: forward PSSM first, complementary PSSM second.
    candidates = [pssm_scores, comp_pssm_scores]
    for model in (gapped_pssm, gapped_comp_pssm):
        candidates.append(
            hmm.max_scores_in_sequences(model, sequences, bg_L_mer_scores)
        )

    # Element-wise best across the four candidate score arrays.
    best = numpy.array(candidates).max(axis=0)

    # A negative best score means no site was found there: count it as zero.
    not_found = best < 0.
    best[not_found] = 0.

    total = best.sum()
    score = total / len(L_mer) / len(best)
    logging.debug(
        'Evaluated: %s; gap: %d; score: %f',
        numpy_to_seq(L_mer),
        gap_index,
        score
    )
    return score
Exemplo n.º 2
0
def evaluate_L_mer(L_mer, sequences, bg_L_mer_scores, gap_positions, options):
    """
    Find the best-scoring gap position for an L-mer.

    The L-mer is first scored without a gap (``gap_index=None``) using its
    log-score PSSM and the complementary PSSM. Those scores are then handed
    to ``evaluate_gap_position`` for every candidate in ``gap_positions``.

    Args:
        L_mer: The L-mer to evaluate.
        sequences: Sequences passed to ``hmm.max_scores_in_sequences``.
        bg_L_mer_scores: Background scores passed to the same call.
        gap_positions: Iterable of gap indices to try.
        options: Object providing ``pseudo_count_for_L_mer_scoring``.

    Returns:
        The largest ``(score, gap_index)`` tuple under normal tuple ordering.

    Raises:
        ValueError: If ``gap_positions`` yields nothing (from ``max``).
    """
    ungapped_dist = nucleo_dist_from_mer(
        L_mer,
        options.pseudo_count_for_L_mer_scoring,
        gap_index=None
    )
    ungapped_pssm = hmm.calculate_log_scores(ungapped_dist)
    ungapped_comp_pssm = hmm.calculate_complementary_scores(ungapped_pssm)

    # These do not depend on the gap position, so compute them once.
    pssm_scores = hmm.max_scores_in_sequences(
        ungapped_pssm, sequences, bg_L_mer_scores
    )
    comp_pssm_scores = hmm.max_scores_in_sequences(
        ungapped_comp_pssm, sequences, bg_L_mer_scores
    )

    scored_gaps = []
    for gap_index in gap_positions:
        gap_score = evaluate_gap_position(
            L_mer,
            gap_index,
            sequences,
            bg_L_mer_scores,
            pssm_scores,
            comp_pssm_scores,
            options
        )
        # Score goes first so that max() ranks by score before gap index.
        scored_gaps.append((gap_score, gap_index))
    return max(scored_gaps)
Exemplo n.º 3
0
 def evaluate_initialisation_K_mer(self, K_mer, sequences):
     """
     Rate how suitable K_mer is as a starting point.

     Every PSSM returned by ``self.pssms_for_K_mer(K_mer)`` is scored
     against ``sequences``; the element-wise maximum over those PSSMs is
     floored at a null score and summed.

     Returns the sum of the floored best scores.
     """
     per_pssm_scores = []
     for pssm in self.pssms_for_K_mer(K_mer):
         per_pssm_scores.append(hmm.max_scores_in_sequences(pssm, sequences))
     best = numpy.array(per_pssm_scores).max(axis=0)

     # Floor value: (K + 1) * log(0.25). Presumably the log-likelihood of
     # K + 1 positions under a uniform four-letter model - TODO confirm.
     floor = (self.K + 1) * math.log(0.25)
     below_floor = best < floor
     best[below_floor] = floor
     return best.sum()
Exemplo n.º 4
0
        N.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4]))

    logging.info('Scoring sequence')
    logging.info(hmm.score_sequence(pssm_scores, seq))
    logging.info('Scoring sequence with complementary scores')
    logging.info(hmm.score_sequence(comp_scores, seq))

    logging.info(hmm.max_score_in_sequence(pssm_scores, seq))
    logging.info(hmm.max_score_in_sequence(comp_scores, seq))

    long_seqs = [R.random_integers(0, 4, size=10000) for i in xrange(100)]
    long_seqs_preprocessed = hmm.preprocess_sequences(long_seqs)

    logging.info('Starting to time max scores on long sequences.')
    start = time.time()
    max_scores = hmm.max_scores_in_sequences(pssm_scores,
                                             long_seqs_preprocessed)
    max_comp_scores = hmm.max_scores_in_sequences(comp_scores,
                                                  long_seqs_preprocessed)
    logging.info(
        'Max scores (and complementary scores) over sequences took %f secs' %
        (time.time() - start))

    long_seq = R.random_integers(0, 4, size=1000000)
    long_seq_preprocessed = hmm.preprocess_sequence(long_seq)

    logging.info('Starting to time max scores on long sequence.')
    start = time.time()
    logging.info(hmm.max_score_in_sequence(pssm_scores, long_seq_preprocessed))
    logging.info(hmm.max_score_in_sequence(comp_scores, long_seq_preprocessed))
    logging.info(
        'Max scores (and complementary scores) for %d bases took %f secs' %
Exemplo n.º 5
0
    # NOTE(review): this is the interior of a function whose header is not
    # visible here; pssm_scores is defined before this excerpt. N and R are
    # presumably numpy and numpy.random - confirm against the imports.

    # Derive the complementary score matrix from pssm_scores.
    logging.info('Calculating complementary scores')
    comp_scores = hmm.calculate_complementary_scores(pssm_scores)
    logging.info('Preprocessing sequence')
    # Short hand-written sequence: 0,1,2,3 repeated four times, then a 4.
    seq = hmm.preprocess_sequence(N.array([0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,4]))

    # Log the result of scoring the short sequence with each score matrix.
    logging.info('Scoring sequence')
    logging.info(hmm.score_sequence(pssm_scores, seq))
    logging.info('Scoring sequence with complementary scores')
    logging.info(hmm.score_sequence(comp_scores, seq))

    # Log the maximum score in the short sequence for each score matrix.
    logging.info(hmm.max_score_in_sequence(pssm_scores, seq))
    logging.info(hmm.max_score_in_sequence(comp_scores, seq))

    # Timing data: 100 random sequences of 10000 values each, drawn with
    # bounds 0 and 4, then preprocessed in one batch.
    long_seqs = [R.random_integers(0,4,size=10000) for i in xrange(100)]
    long_seqs_preprocessed = hmm.preprocess_sequences(long_seqs)

    # Time both max-score passes together. Preprocessing happened before
    # the clock started, so it is not part of the measurement.
    logging.info('Starting to time max scores on long sequences.')
    start = time.time()
    # The two results are not used again in the visible part of the function.
    max_scores = hmm.max_scores_in_sequences(pssm_scores, long_seqs_preprocessed)
    max_comp_scores = hmm.max_scores_in_sequences(comp_scores, long_seqs_preprocessed)
    logging.info('Max scores (and complementary scores) over sequences took %f secs' % (time.time()-start))

    # Same timing exercise on a single random sequence of 1000000 values.
    long_seq = R.random_integers(0,4,size=1000000)
    long_seq_preprocessed = hmm.preprocess_sequence(long_seq)

    logging.info('Starting to time max scores on long sequence.')
    start = time.time()
    # Here the elapsed time also includes the two logging.info calls below.
    logging.info(hmm.max_score_in_sequence(pssm_scores, long_seq_preprocessed))
    logging.info(hmm.max_score_in_sequence(comp_scores, long_seq_preprocessed))
    logging.info('Max scores (and complementary scores) for %d bases took %f secs' % (len(long_seq), time.time()-start))