Exemplo n.º 1
0
    def _model_for_L_mer(self, L_mer, gap_index, p_binding_site):
        """
        Build a model whose emissions are seeded from the given L-mer.

        The L-mer is converted with seq_to_numpy and turned into a nucleotide
        distribution by nucleo_dist_from_mer (using the pseudo-count from
        self.options.pseudo_count_for_model_initialisation). That distribution
        overwrites part of an otherwise uniform (self.options.K x 4) emission
        matrix. The resulting PSSM is added to a simple background model and
        returned via hmm.as_model.

        :param L_mer: The seed L-mer (anything accepted by len() and seq_to_numpy).
        :param gap_index: Gap position, forwarded to self._make_builder and
            nucleo_dist_from_mer.
        :param p_binding_site: Forwarded to single_gap.add_to_simple_background_model.
        :returns: The object returned by hmm.as_model.
        """
        # where the L-mer starts in the model, plus the builder that creates the model
        width = len(L_mer)
        offset, model_builder = self._make_builder(gap_index, width)

        # emission distribution derived from the seed
        seeded_rows = nucleo_dist_from_mer(
            seq_to_numpy(L_mer),
            self.options.pseudo_count_for_model_initialisation,
            gap_index=gap_index
        )

        # every position starts uniform over the 4 columns; the seeded rows replace a slice
        emissions = numpy.ones((self.options.K, 4)) / 4.
        # NOTE(review): the slice is width+1 rows long - presumably nucleo_dist_from_mer
        # returns one extra row for the gap position; confirm against that helper.
        emissions[offset:offset + width + 1] = seeded_rows

        # build the PSSM and embed it in the background model
        pssm, in_states, out_states = model_builder.create(p_gap=.5, emissions=emissions)
        with_background = single_gap.add_to_simple_background_model(
            model=pssm,
            in_states=in_states,
            out_states=out_states,
            p_binding_site=p_binding_site
        )
        return hmm.as_model(with_background)
Exemplo n.º 2
0
 def _distance_to_passed(self, k_mer):
     """
     Return the smallest distance between k_mer and any entry of self.passed.

     Each entry of self.passed is converted with seq_to_numpy before being
     handed to self.distance. min() raises ValueError if self.passed is empty.
     """
     distances = [
         self.distance(k_mer, seq_to_numpy(previous))
         for previous in self.passed
     ]
     return min(distances)
Exemplo n.º 3
0
def most_frequent_k_mers(sequences, K, max_to_return):
    """
    Collect the first max_to_return (K-mer, count) pairs from yield_k_mers.

    NOTE(review): the name suggests yield_k_mers yields K-mers in descending
    frequency order - this function only takes the leading items, so confirm
    that ordering in yield_k_mers.

    :param sequences: Forwarded to yield_k_mers.
    :param K: Forwarded to yield_k_mers.
    :param max_to_return: Number of pairs to keep; a negative value never
        matches the stop condition, so every pair is returned.
    :returns: A list of (K-mer, count) tuples.
    """
    collected = []
    # taken_so_far equals the number of pairs already appended
    for taken_so_far, (K_mer, count) in enumerate(yield_k_mers(sequences, K)):
        if taken_so_far == max_to_return:
            break
        collected.append((K_mer, count))
    return collected


if '__main__' == __name__:
    #
    # Test distance metric between K-mers
    #
    from gapped_pssms.sequence import seq_to_numpy
    distance = K_mer_distance()
    # each entry is (first K-mer, second K-mer, expected distance)
    K_mers = [
      ('acgtacgtg', 'gacgtacgt', 1), # 1 shift
      ('acgtacgtg', 'ggacgtacg', 2), # 2 shifts
      ('ggacgtacg', 'acgtacgtg', 2), # 2 shifts other way
      ('aaaaaaaaa', 'aaaaaaaaa', 0), # identical
      ('aaaaaaaaa', 'ttttttttt', 0), # reverse complement
      ('acgtacgtc', 'aaaaaaaaa', 7), # very different
    ]
    for k1, k2, d in K_mers:
        n1 = seq_to_numpy(k1)
        n2 = seq_to_numpy(k2)
        assert d == distance(n1, n2) # make sure is correct distance
        # Parenthesised single-argument form: prints the same text under the
        # Python 2 print statement and parses as a function call on Python 3.
        print('%s - %s : distance=%d' % (k1, k2, d))
        assert d == distance(n2, n1) # make sure is symmetrical
Exemplo n.º 4
0
def generate_seeds(
  sequences,
  preprocessed_sequences,
  options
):
    """
    Generate a list of candidate L-mers and score them to find the best seed L-mer and gap position.

    Steps:

      1. Obtain a background model: load it from options.bg_model_filename if that
         file exists, otherwise learn it from the sequences (and save it when a
         filename was supplied).
      2. Obtain candidate L-mers: the single options.force_seed if set, otherwise
         the result of hmm.top_mers_by_sequence_membership.
      3. Choose gap positions: options.force_gap for every candidate if set
         (score 0.0), otherwise each candidate that passes the distance filter is
         scored by evaluate_L_mer and the seeds are sorted by score, highest first.

    :param sequences: Sequences used for the background model and for evaluate_L_mer.
    :param preprocessed_sequences: Passed to hmm.top_mers_by_sequence_membership.
    :param options: Settings object. Attributes read here: bg_model_filename,
        bg_model_num_mosaics, bg_model_order, force_seed, force_gap, L,
        allowed_shifts, shift_cost, max_L_mers_to_evaluate, seed_filter_distance.
    :returns: (L_mer_seeds, bg_model) where L_mer_seeds is a list of tuples
        (L-mer as string, count, number of sequences, gap index, score). The list
        is empty if no candidate L-mer was evaluated.
    """
    # if we have been given a background model filename and it exists then load it.
    if options.bg_model_filename is not None and os.path.exists(options.bg_model_filename):
        logging.info("Loading supplied background model from %s", options.bg_model_filename)
        # NOTE: unpickling can execute arbitrary code - only use trusted model files.
        # The file mode is deliberately unchanged so previously saved models still load;
        # the with-block makes sure the handle is closed.
        with open(options.bg_model_filename) as bg_model_file:
            bg_model = cPickle.load(bg_model_file)
        converted_seqs = [bg_model.converter.to_order_n(s) for s in sequences]
    else:
        logging.info("Learning new background model")
        bg_model, converted_seqs = learn_bg_model(
          sequences,
          num_mosaics=options.bg_model_num_mosaics,
          order=options.bg_model_order
        )
        if options.bg_model_filename:
            logging.info("Saving background model to %s", options.bg_model_filename)
            with open(options.bg_model_filename, 'w') as bg_model_file:
                cPickle.dump(bg_model, bg_model_file)

    if options.force_seed:
        logging.info('Forcing seed to be: %s', options.force_seed)
        # a forced seed is reported with count and # sequences both equal to len(sequences)
        L_mers = [(seq_to_numpy(options.force_seed), len(sequences), len(sequences))]
    else:
        # Find best candidate L-mers
        start = time.time()
        num_L_mers_to_find = 3 * options.max_L_mers_to_evaluate
        logging.info('Finding best %d candidate %d-mers to seed HMM emissions', num_L_mers_to_find, options.L)
        L_mers = hmm.top_mers_by_sequence_membership(
          preprocessed_sequences,
          k=options.L,
          n=num_L_mers_to_find
        )
        logging.info('Finding top %d %d-mers took %f seconds', len(L_mers), options.L, time.time()-start)

    if options.force_gap:
        logging.info('Forcing gap at position: %d', options.force_gap)
        L_mer_seeds = [
          (numpy_to_seq(L_mer), L_mer_count, L_mer_num_seqs, options.force_gap, 0.0)
          for L_mer, L_mer_count, L_mer_num_seqs in L_mers
        ]
    else:
        # Everything the evaluation needs is set up here, so it is available
        # whether or not the seed was forced. Previously these names were only
        # assigned when no seed was forced, so forcing a seed without forcing
        # a gap failed with an unbound local variable.

        # Calculate log likelihood of L-mers under background model.
        bg_L_mer_scores = calculate_k_mer_scores(bg_model, converted_seqs, options.L)
        distance = K_mer_distance(allowed_shifts=options.allowed_shifts, shift_cost=options.shift_cost)
        L_mer_seeds = list()
        # floor division keeps these integers under true division as well
        gap_end_offset = options.L // 5 + 1

        # Evaluate L-mers
        if -1 == options.seed_filter_distance:
            # -1 selects a default minimum distance derived from L
            min_distance = options.L // 4 + 1
        else:
            min_distance = options.seed_filter_distance
        logging.info('Positioning gaps up to %d bases from end of K-mers', gap_end_offset)
        gap_positions = range(gap_end_offset, options.L+1-gap_end_offset)
        logging.info('Filtering K-mers that are not %d away from previously evaluated.', min_distance)
        logging.info('Evaluating up to %d L-mers.', options.max_L_mers_to_evaluate)
        L_mer_filter = DistanceFilter(distance, min_distance=min_distance)
        discarded = 0
        evaluated = 0
        for L_mer, L_mer_count, L_mer_num_seqs in L_mers:
            # NOTE(review): the filter is consulted before the check for the value 4
            # (presumably an unknown base - confirm); the order is kept as-is in case
            # the filter records what it has seen.
            if not L_mer_filter(L_mer) or 4 in L_mer:
                logging.debug('Discarding: %s; count: %d; # sequences: %d', numpy_to_seq(L_mer), L_mer_count, L_mer_num_seqs)
                discarded += 1
            else:
                score, gap_index = evaluate_L_mer(L_mer, sequences, bg_L_mer_scores, gap_positions, options)
                evaluated += 1
                logging.info(
                    'Evaluated (%3d/%d): %s; gap: %d; count: %d; # sequences: %d; score: %f',
                    evaluated, options.max_L_mers_to_evaluate, numpy_to_seq(L_mer), gap_index, L_mer_count, L_mer_num_seqs, score
                )
                L_mer_seeds.append((numpy_to_seq(L_mer), L_mer_count, L_mer_num_seqs, gap_index, score))
                if len(L_mer_seeds) == options.max_L_mers_to_evaluate:
                    break
        L_mer_seeds.sort(key=lambda x: -x[4]) # sort by score, highest first
        logging.info('Discarded %d L-mers using edit distance', discarded)
        if L_mer_seeds:
            logging.info('Evaluated %d L-mers: scores range from %f to %f', evaluated, L_mer_seeds[-1][4], L_mer_seeds[0][4])
        else:
            # nothing survived filtering (or there were no candidates): there is no score range to report
            logging.warning('Evaluated 0 L-mers: no seeds were generated.')
    return L_mer_seeds, bg_model