Пример #1
0
#
# Copyright John Reid 2008
#

"""
Code to test the k-mer counting functionality.
"""

if __name__ == '__main__':
    import hmm
    from gapped_pssms.sequence import seq_to_numpy

    # Small hand-written inputs for the call to
    # hmm.top_mers_by_sequence_membership below.
    sequences = ['aacc', 'ggtt', 'aaaaaaaaaaaa']

    # Convert every string with seq_to_numpy; the results are kept in a list
    # because they are both printed and passed on to hmm.
    seqs = [seq_to_numpy(sequence) for sequence in sequences]

    # Single-argument parenthesised prints emit the same text as the print
    # statement form.
    print('Sequences:')
    converted_lines = [str(converted) for converted in seqs]
    print('\n'.join(converted_lines))
    print('')

    # Request up to n=10 results for k=3 and print one result per line.
    top_k_mers = hmm.top_mers_by_sequence_membership(seqs, k=3, n=10)
    result_lines = [str(entry) for entry in top_k_mers]
    print('\n'.join(result_lines))
Пример #2
0
def generate_seeds(
  sequences,
  preprocessed_sequences,
  options
):
    """
    Generate a list of candidate L-mers and score them to find the best seed L-mer and gap position.

    The background model is loaded from options.bg_model_filename when that file
    exists; otherwise it is learnt with learn_bg_model and, if a filename was
    given, pickled to it.

    Candidate L-mers are either the single forced seed (options.force_seed) or
    the result of hmm.top_mers_by_sequence_membership on preprocessed_sequences
    (3 * options.max_L_mers_to_evaluate candidates of length options.L).

    If options.force_gap is set, every candidate is returned with that gap
    position and a score of 0.0. Otherwise candidates are filtered with a
    DistanceFilter, scored with evaluate_L_mer (at most
    options.max_L_mers_to_evaluate of them) and sorted by score, highest first.

    sequences: the sequences; passed to the background model code and to
        evaluate_L_mer, and len(sequences) is used as the counts for a forced seed.
    preprocessed_sequences: passed unchanged to hmm.top_mers_by_sequence_membership.
    options: object providing bg_model_filename, bg_model_num_mosaics,
        bg_model_order, force_seed, force_gap, L, allowed_shifts, shift_cost,
        max_L_mers_to_evaluate and seed_filter_distance.

    Returns (L_mer_seeds, bg_model) where L_mer_seeds is a list of tuples
    (L-mer as returned by numpy_to_seq, count, # sequences, gap position, score).
    The list may be empty if no candidate survived filtering.
    """
    # if we have been given a background model filename and it exists then load it.
    bg_model_filename = options.bg_model_filename
    if bg_model_filename is not None and os.path.exists(bg_model_filename):
        logging.info("Loading supplied background model from %s", bg_model_filename)
        # NOTE(review): unpickling can execute arbitrary code - only point this
        # at trusted files. The file mode is left as originally written so that
        # previously saved models still load.
        with open(bg_model_filename) as model_file:
            bg_model = cPickle.load(model_file)
        converted_seqs = [bg_model.converter.to_order_n(s) for s in sequences]
    else:
        logging.info("Learning new background model")
        bg_model, converted_seqs = learn_bg_model(
          sequences,
          num_mosaics=options.bg_model_num_mosaics,
          order=options.bg_model_order
        )
        if bg_model_filename:
            logging.info("Saving background model to %s", bg_model_filename)
            # Close the file explicitly so the pickle is flushed to disk.
            with open(bg_model_filename, 'w') as model_file:
                cPickle.dump(bg_model, model_file)

    if options.force_seed:
        logging.info('Forcing seed to be: %s', options.force_seed)
        L_mers = [(seq_to_numpy(options.force_seed), len(sequences), len(sequences))]
    else:
        # Find best candidate L-mers
        start = time.time()
        num_L_mers_to_find = 3 * options.max_L_mers_to_evaluate
        logging.info('Finding best %d candidate %d-mers to seed HMM emissions', num_L_mers_to_find, options.L)
        L_mers = hmm.top_mers_by_sequence_membership(
          preprocessed_sequences,
          k=options.L,
          n=num_L_mers_to_find
        )
        logging.info('Finding top %d %d-mers took %f seconds', len(L_mers), options.L, time.time()-start)

    if options.force_gap:
        logging.info('Forcing gap at position: %d', options.force_gap)
        L_mer_seeds = [
          (numpy_to_seq(L_mer), L_mer_count, L_mer_num_seqs, options.force_gap, 0.0)
          for L_mer, L_mer_count, L_mer_num_seqs in L_mers
        ]
    else:
        # Everything the evaluation needs is set up here, not in the candidate
        # search above, so that a forced seed without a forced gap can still be
        # evaluated.

        # Calculate log likelihood of L-mers under background model.
        bg_L_mer_scores = calculate_k_mer_scores(bg_model, converted_seqs, options.L)
        distance = K_mer_distance(allowed_shifts=options.allowed_shifts, shift_cost=options.shift_cost)
        gap_end_offset = options.L // 5 + 1
        L_mer_seeds = list()

        # Evaluate L-mers
        # A seed_filter_distance of -1 selects a default derived from L.
        if -1 == options.seed_filter_distance:
            min_distance = options.L // 4 + 1
        else:
            min_distance = options.seed_filter_distance
        logging.info('Positioning gaps up to %d bases from end of K-mers', gap_end_offset)
        gap_positions = range(gap_end_offset, options.L+1-gap_end_offset)
        logging.info('Filtering K-mers that are not %d away from previously evaluated.', min_distance)
        logging.info('Evaluating up to %d L-mers.', options.max_L_mers_to_evaluate)
        L_mer_filter = DistanceFilter(distance, min_distance=min_distance)
        discarded = 0
        evaluated = 0
        for L_mer, L_mer_count, L_mer_num_seqs in L_mers:
            # The filter is always consulted first, as in the original ordering.
            # NOTE(review): 4 is presumably the code for an unknown base - confirm.
            if not L_mer_filter(L_mer) or 4 in L_mer:
                logging.debug('Discarding: %s; count: %d; # sequences: %d', numpy_to_seq(L_mer), L_mer_count, L_mer_num_seqs)
                discarded += 1
            else:
                score, gap_index = evaluate_L_mer(L_mer, sequences, bg_L_mer_scores, gap_positions, options)
                evaluated += 1
                L_mer_seq = numpy_to_seq(L_mer)
                logging.info(
                    'Evaluated (%3d/%d): %s; gap: %d; count: %d; # sequences: %d; score: %f',
                    evaluated, options.max_L_mers_to_evaluate, L_mer_seq, gap_index, L_mer_count, L_mer_num_seqs, score
                )
                L_mer_seeds.append((L_mer_seq, L_mer_count, L_mer_num_seqs, gap_index, score))
                if len(L_mer_seeds) == options.max_L_mers_to_evaluate:
                    break
        L_mer_seeds.sort(key=lambda x: -x[4]) # sort by score, highest first
        logging.info('Discarded %d L-mers using edit distance', discarded)
        if L_mer_seeds:
            logging.info('Evaluated %d L-mers: scores range from %f to %f', evaluated, L_mer_seeds[-1][4], L_mer_seeds[0][4])
        else:
            # Nothing was evaluated, so there is no score range to report.
            logging.warning('Evaluated 0 L-mers: no candidate L-mers passed the filter')
    return L_mer_seeds, bg_model
Пример #3
0
#
# Copyright John Reid 2008
#
"""
Code to test the k-mer counting functionality.
"""

if __name__ == '__main__':
    import hmm
    from gapped_pssms.sequence import seq_to_numpy

    # Inputs handed to seq_to_numpy and then to
    # hmm.top_mers_by_sequence_membership.
    sequences = [
        'aacc',
        'ggtt',
        'aaaaaaaaaaaa',
    ]

    # Build the converted sequences as a list, since they are used twice below.
    seqs = [seq_to_numpy(raw) for raw in sequences]

    # Each print takes exactly one parenthesised argument, which writes the
    # same output as the statement form.
    print('Sequences:')
    print('\n'.join(map(str, seqs)))
    print('')

    # Query with k=3, n=10 and write one result per line.
    top_k_mers = hmm.top_mers_by_sequence_membership(
        seqs,
        k=3,
        n=10
    )
    print('\n'.join(map(str, top_k_mers)))