def _model_for_L_mer(self, L_mer, gap_index, p_binding_site):
    """
    Build a model whose emissions are seeded from the given L-mer.

    L_mer: the seed sequence (converted with seq_to_numpy before use).
    gap_index: passed to the builder and to nucleo_dist_from_mer.
    p_binding_site: passed through to single_gap.add_to_simple_background_model.

    Returns the result of hmm.as_model() applied to the PSSM states embedded
    in a simple background model.
    """
    # the builder lays out the states; seed_start is the first emission row
    # that the seed's distribution is written to
    n_bases = len(L_mer)
    seed_start, builder = self._make_builder(gap_index, n_bases)

    # every row starts uniform over the 4 nucleotides...
    emissions = numpy.ones((self.options.K, 4)) / 4.

    # ...then the seed's distribution overwrites n_bases + 1 rows
    # (presumably the extra row is the gap position - confirm against
    # nucleo_dist_from_mer)
    seed_end = seed_start + n_bases + 1
    emissions[seed_start:seed_end] = nucleo_dist_from_mer(
        seq_to_numpy(L_mer),
        self.options.pseudo_count_for_model_initialisation,
        gap_index=gap_index
    )

    # create the PSSM states and embed them in a simple background model
    pssm, in_states, out_states = builder.create(p_gap=.5, emissions=emissions)
    with_background = single_gap.add_to_simple_background_model(
        model=pssm,
        in_states=in_states,
        out_states=out_states,
        p_binding_site=p_binding_site
    )
    return hmm.as_model(with_background)
def _distance_to_passed(self, k_mer):
    """
    Return the smallest distance from k_mer to any entry of self.passed.

    Each entry of self.passed is converted with seq_to_numpy before being
    compared using self.distance. Raises ValueError if self.passed is empty.
    """
    distances = [
        self.distance(k_mer, seq_to_numpy(passed_k_mer))
        for passed_k_mer in self.passed
    ]
    return min(distances)
def most_frequent_k_mers(sequences, K, max_to_return):
    """
    Return up to max_to_return (K-mer, count) pairs from yield_k_mers(sequences, K).

    The pairs are returned in the order the generator produces them (presumably
    most frequent first, given the function name - confirm against yield_k_mers).
    A negative max_to_return places no limit on the number of pairs returned.
    """
    from itertools import islice

    k_mers = yield_k_mers(sequences, K)
    if max_to_return >= 0:
        # islice stops as soon as the limit is reached, so no K-mer beyond the
        # limit is ever requested from the generator
        k_mers = islice(k_mers, max_to_return)
    return [(K_mer, count) for K_mer, count in k_mers]


if '__main__' == __name__:

    #
    # Test distance metric between K-mers
    #
    from gapped_pssms.sequence import seq_to_numpy

    distance = K_mer_distance()

    # each entry is (K-mer, K-mer, expected distance between them)
    K_mers = [
        ('acgtacgtg', 'gacgtacgt', 1),  # 1 shift
        ('acgtacgtg', 'ggacgtacg', 2),  # 2 shifts
        ('ggacgtacg', 'acgtacgtg', 2),  # 2 shifts other way
        ('aaaaaaaaa', 'aaaaaaaaa', 0),  # identical
        ('aaaaaaaaa', 'ttttttttt', 0),  # reverse complement
        ('acgtacgtc', 'aaaaaaaaa', 7),  # very different
    ]
    for k1, k2, d in K_mers:
        n1 = seq_to_numpy(k1)
        n2 = seq_to_numpy(k2)
        assert d == distance(n1, n2)  # make sure is correct distance
        # single parenthesised argument: prints identically under the Python 2
        # print statement and the Python 3 print function
        print('%s - %s : distance=%d' % (k1, k2, d))
        assert d == distance(n2, n1)  # make sure is symmetrical
def _load_or_learn_bg_model(sequences, options):
    """
    Load the background model named by options.bg_model_filename if that file
    exists, otherwise learn a new one from the sequences.

    A newly learnt model is pickled to options.bg_model_filename when a
    filename was supplied.

    Returns (bg_model, converted_seqs) where converted_seqs are the sequences
    as converted by the background model.
    """
    filename = options.bg_model_filename
    if filename is not None and os.path.exists(filename):
        logging.info("Loading supplied background model from %s", filename)
        # NOTE: unpickling can execute arbitrary code - only use trusted files.
        with open(filename) as model_file:
            bg_model = cPickle.load(model_file)
        converted_seqs = [bg_model.converter.to_order_n(s) for s in sequences]
    else:
        logging.info("Learning new background model")
        bg_model, converted_seqs = learn_bg_model(
            sequences,
            num_mosaics=options.bg_model_num_mosaics,
            order=options.bg_model_order
        )
        if filename:
            logging.info("Saving background model to %s", filename)
            # file modes are kept as they were so previously saved models
            # still load
            with open(filename, 'w') as model_file:
                cPickle.dump(bg_model, model_file)
    return bg_model, converted_seqs


def generate_seeds(
    sequences,
    preprocessed_sequences,
    options
):
    """
    Generate a list of candidate L-mers and score them to find the best seed
    L-mer and gap position.

    sequences: used to learn/convert for the background model and to evaluate
        each candidate L-mer.
    preprocessed_sequences: passed to hmm.top_mers_by_sequence_membership to
        find the candidate L-mers.
    options: reads bg_model_filename, bg_model_num_mosaics, bg_model_order,
        force_seed, force_gap, L, max_L_mers_to_evaluate, allowed_shifts,
        shift_cost and seed_filter_distance.

    Returns (L_mer_seeds, bg_model). Each seed is a tuple
    (L-mer converted by numpy_to_seq, count, # sequences, gap index, score).
    When options.force_gap is set every candidate gets that gap index and a
    score of 0.0 and the seeds are left in candidate order; otherwise the
    seeds are sorted by score, highest first.
    """
    bg_model, converted_seqs = _load_or_learn_bg_model(sequences, options)

    # Choose the candidates: tuples of (L-mer as numpy, count, # sequences)
    if options.force_seed:
        logging.info('Forcing seed to be: %s', options.force_seed)
        L_mers = [(seq_to_numpy(options.force_seed), len(sequences), len(sequences))]
    else:
        # Find best candidate L-mers
        start = time.time()
        num_L_mers_to_find = 3 * options.max_L_mers_to_evaluate
        logging.info('Finding best %d candidate %d-mers to seed HMM emissions', num_L_mers_to_find, options.L)
        L_mers = hmm.top_mers_by_sequence_membership(
            preprocessed_sequences,
            k=options.L,
            n=num_L_mers_to_find
        )
        logging.info('Finding top %d %d-mers took %f seconds', len(L_mers), options.L, time.time()-start)

    if options.force_gap:
        # nothing to evaluate: every candidate takes the forced gap position
        logging.info('Forcing gap at position: %d', options.force_gap)
        L_mer_seeds = [
            (numpy_to_seq(L_mer), L_mer_count, L_mer_num_seqs, options.force_gap, 0.0)
            for L_mer, L_mer_count, L_mer_num_seqs in L_mers
        ]
        return L_mer_seeds, bg_model

    # Everything needed to evaluate gap positions is set up here, whether or
    # not the seed was forced.

    # Calculate log likelihood of L-mers under background model.
    bg_L_mer_scores = calculate_k_mer_scores(bg_model, converted_seqs, options.L)
    distance = K_mer_distance(allowed_shifts=options.allowed_shifts, shift_cost=options.shift_cost)

    # floor division keeps these integral, as range() requires
    gap_end_offset = options.L // 5 + 1
    if -1 == options.seed_filter_distance:
        min_distance = options.L // 4 + 1
    else:
        min_distance = options.seed_filter_distance
    logging.info('Positioning gaps up to %d bases from end of K-mers', gap_end_offset)
    gap_positions = range(gap_end_offset, options.L+1-gap_end_offset)
    logging.info('Filtering K-mers that are not %d away from previously evaluated.', min_distance)
    logging.info('Evaluating up to %d L-mers.', options.max_L_mers_to_evaluate)

    # Evaluate L-mers
    L_mer_filter = DistanceFilter(distance, min_distance=min_distance)
    L_mer_seeds = []
    discarded = 0
    evaluated = 0
    for L_mer, L_mer_count, L_mer_num_seqs in L_mers:
        # the filter is consulted first, as before, in case it records what it
        # has seen; presumably 4 encodes an unknown base - confirm against
        # seq_to_numpy
        if not L_mer_filter(L_mer) or 4 in L_mer:
            logging.debug('Discarding: %s; count: %d; # sequences: %d', numpy_to_seq(L_mer), L_mer_count, L_mer_num_seqs)
            discarded += 1
            continue

        score, gap_index = evaluate_L_mer(L_mer, sequences, bg_L_mer_scores, gap_positions, options)
        evaluated += 1
        logging.info(
            'Evaluated (%3d/%d): %s; gap: %d; count: %d; # sequences: %d; score: %f',
            evaluated, options.max_L_mers_to_evaluate, numpy_to_seq(L_mer), gap_index, L_mer_count, L_mer_num_seqs, score
        )
        L_mer_seeds.append((numpy_to_seq(L_mer), L_mer_count, L_mer_num_seqs, gap_index, score))
        if len(L_mer_seeds) == options.max_L_mers_to_evaluate:
            break

    L_mer_seeds.sort(key=lambda x: -x[4])  # sort by score, highest first
    logging.info('Discarded %d L-mers using edit distance', discarded)
    if L_mer_seeds:
        logging.info('Evaluated %d L-mers: scores range from %f to %f', evaluated, L_mer_seeds[-1][4], L_mer_seeds[0][4])
    else:
        # there is no score range to report when every candidate was discarded
        logging.warning('No L-mers were evaluated.')

    return L_mer_seeds, bg_model