Exemplo n.º 1
0
def evaluate_mosaics(max_mosaics=6, max_order=3):
    """
    Score mosaic models of varying size and Markov order on chip-chip fragments.

    For every order in ``0..max_order`` and every mosaic count in
    ``1..max_mosaics`` a fresh model is built for each (training, test) pair
    returned by ``data.training_test_sequences()``, trained with
    ``baum_welch`` on the training sequences, and the values of
    ``model.LL(s)`` over the test sequences are accumulated
    (presumably log likelihoods - confirm against ``hmm``).

    Returns a list of ``(order, num_mosaics, LL)`` tuples, grouped by order
    and, within an order, by increasing number of mosaics.
    """
    from gapped_pssms import data

    sequences = data.training_test_sequences()
    mosaic_sizes = range(1, max_mosaics + 1)
    orders = range(max_order + 1)

    # NOTE(review): this list is built but never read again in this function;
    # the order-n conversion below works from ``sequences`` directly.
    preprocessed_sequences = []
    for training, test in sequences:
        preprocessed_training = [hmm.preprocess_sequence(s) for s in training]
        preprocessed_test = [hmm.preprocess_sequence(s) for s in test]
        preprocessed_sequences.append((preprocessed_training, preprocessed_test))

    result = []
    for order in orders:
        converter = hmm.MarkovOrderConverter(alphabet_size=4, order=order)

        # Convert every training/test split with this order's converter.
        order_n_seqs = []
        for training, test in sequences:
            order_n_seqs.append((
                [converter.to_order_n(s) for s in training],
                [converter.to_order_n(s) for s in test],
            ))

        for num_mosaics in mosaic_sizes:
            total_ll = 0.
            for training_seqs, test_seqs in order_n_seqs:
                # A new, untrained model per split so nothing leaks between folds.
                mosaic_definition = create_mosaic_model(
                    num_mosaics=num_mosaics,
                    p_transition=0.,
                    alphabet_size=4,
                    order=order,
                    dirichlet_prior_strength=10.)
                model = hmm.as_model(mosaic_definition)
                model.baum_welch(training_seqs)
                total_ll += sum(model.LL(s) for s in test_seqs)
            logging.info('Order: %d; # mosaics: %d; LL: %f', order,
                         num_mosaics, total_ll)
            result.append((order, num_mosaics, total_ll))
    return result
Exemplo n.º 2
0
def load_seqs(filename):
    """
    Load and convert sequences from fasta file.

    Returns a pair ``(numpy_seqs, tally)``: ``numpy_seqs`` maps each
    description yielded by ``sequences_from_fasta`` to its preprocessed
    sequence, and ``tally`` is the sum of the per-sequence tallies after each
    has been passed through ``N.array`` (the int ``0`` when the file yields
    no sequences, because that is where ``sum`` starts).
    """
    logging.info('Loading sequences: %s', filename)
    sequences = dict(sequences_from_fasta(filename))

    # Each value is a (sequence, tally) pair; only the sequence is converted here.
    numpy_seqs = {}
    for desc, (seq, _) in sequences.iteritems():
        numpy_seqs[desc] = hmm.preprocess_sequence(seq_to_numpy(seq))

    total_tally = sum(
        N.array(seq_tally) for _, seq_tally in sequences.itervalues())

    num_bases = sum(len(seq) for seq, _ in sequences.itervalues())
    logging.info('Loaded %d sequences with %d bases', len(sequences), num_bases)
    return numpy_seqs, total_tally
Exemplo n.º 3
0
def evaluate_mosaics(max_mosaics=6, max_order=3):
    """
    Score mosaic models of varying size and Markov order on chip-chip fragments.

    For every order in ``0..max_order`` and every mosaic count in
    ``1..max_mosaics`` a fresh model is built for each (training, test) pair
    returned by ``data.training_test_sequences()``, trained with
    ``baum_welch`` on the training sequences, and the values of
    ``model.LL(s)`` over the test sequences are accumulated
    (presumably log likelihoods - confirm against ``hmm``).

    Returns a list of ``(order, num_mosaics, LL)`` tuples, grouped by order
    and, within an order, by increasing number of mosaics.
    """
    from gapped_pssms import data

    sequences = data.training_test_sequences()
    mosaic_sizes = range(1, max_mosaics + 1)
    orders = range(max_order + 1)

    # NOTE(review): this list is built but never read again in this function;
    # the order-n conversion below works from ``sequences`` directly.
    preprocessed_sequences = []
    for training, test in sequences:
        preprocessed_training = [hmm.preprocess_sequence(s) for s in training]
        preprocessed_test = [hmm.preprocess_sequence(s) for s in test]
        preprocessed_sequences.append((preprocessed_training, preprocessed_test))

    result = []
    for order in orders:
        converter = hmm.MarkovOrderConverter(alphabet_size=4, order=order)

        # Convert every training/test split with this order's converter.
        order_n_seqs = []
        for training, test in sequences:
            order_n_seqs.append((
                [converter.to_order_n(s) for s in training],
                [converter.to_order_n(s) for s in test],
            ))

        for num_mosaics in mosaic_sizes:
            total_ll = 0.
            for training_seqs, test_seqs in order_n_seqs:
                # A new, untrained model per split so nothing leaks between folds.
                mosaic_definition = create_mosaic_model(
                    num_mosaics=num_mosaics,
                    p_transition=0.,
                    alphabet_size=4,
                    order=order,
                    dirichlet_prior_strength=10.)
                model = hmm.as_model(mosaic_definition)
                model.baum_welch(training_seqs)
                total_ll += sum(model.LL(s) for s in test_seqs)
            logging.info('Order: %d; # mosaics: %d; LL: %f', order, num_mosaics, total_ll)
            result.append((order, num_mosaics, total_ll))
    return result
Exemplo n.º 4
0
def load_seqs(filename):
    """
    Load and convert sequences from fasta file.

    Returns a pair ``(numpy_seqs, tally)``: ``numpy_seqs`` maps each
    description yielded by ``sequences_from_fasta`` to its preprocessed
    sequence, and ``tally`` is the sum of the per-sequence tallies after each
    has been passed through ``N.array`` (the int ``0`` when the file yields
    no sequences, because that is where ``sum`` starts).
    """
    logging.info('Loading sequences: %s', filename)
    sequences = dict(sequences_from_fasta(filename))

    # Each value is a (sequence, tally) pair; only the sequence is converted here.
    numpy_seqs = {}
    for desc, (seq, _) in sequences.iteritems():
        numpy_seqs[desc] = hmm.preprocess_sequence(seq_to_numpy(seq))

    total_tally = sum(
        N.array(seq_tally) for _, seq_tally in sequences.itervalues())

    num_bases = sum(len(seq) for seq, _ in sequences.itervalues())
    logging.info('Loaded %d sequences with %d bases', len(sequences), num_bases)
    return numpy_seqs, total_tally
Exemplo n.º 5
0
        [.05, .85, .05, .05],
        [.85, .05, .05, .05],
        [.05, .85, .05, .05],
        [.85, .05, .05, .05],
        [.05, .85, .05, .05],
        [.85, .05, .05, .05],
        [.05, .85, .05, .05],
        [.85, .05, .05, .05],
        [.05, .85, .05, .05],
    ])
    logging.info('Calculating log scores')
    pssm_scores = hmm.calculate_log_scores(nucleo_dists)
    logging.info('Calculating complementary scores')
    comp_scores = hmm.calculate_complementary_scores(pssm_scores)
    logging.info('Preprocessing sequence')
    seq = hmm.preprocess_sequence(
        N.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4]))

    logging.info('Scoring sequence')
    logging.info(hmm.score_sequence(pssm_scores, seq))
    logging.info('Scoring sequence with complementary scores')
    logging.info(hmm.score_sequence(comp_scores, seq))

    logging.info(hmm.max_score_in_sequence(pssm_scores, seq))
    logging.info(hmm.max_score_in_sequence(comp_scores, seq))

    long_seqs = [R.random_integers(0, 4, size=10000) for i in xrange(100)]
    long_seqs_preprocessed = hmm.preprocess_sequences(long_seqs)

    logging.info('Starting to time max scores on long sequences.')
    start = time.time()
    max_scores = hmm.max_scores_in_sequences(pssm_scores,
Exemplo n.º 6
0
    def __call__(self, sequences, bg_model=None):
        """
        Run the motif finding algorithm.

        Generates seeds from the sequences, tries the first few of them with
        ``self.try_seed``, scores the attempts that succeeded, logs and
        examines the ones that survive the score filter, and returns the
        outcome of every successful attempt.

        :param sequences: The sequences to search. Must support ``len()`` and
            repeated iteration; each element must support ``len()``.
        :param bg_model: Never read. The name is rebound to the background
            model returned by ``generate_seeds``.
            NOTE(review): confirm whether a caller-supplied model was meant
            to be used.
        :returns: A list with the ``try_seed`` result for every seed whose
            result was truthy. Inside this method such a result is unpacked
            as ``(model, builder, num_sites, num_seqs_with_site)``.
        :raises ValueError: If ``options.max_L_mers_to_evaluate`` is smaller
            than ``options.num_pssms``.

        Side effects visible here: sets ``self.p_binding_site`` and calls
        ``self.examine_model`` with paths under ``options.output_dir`` for
        each retained PSSM.
        """
        logging.info('Looking for at least %d PSSMs', self.options.num_pssms)

        if self.options.max_L_mers_to_evaluate < self.options.num_pssms:
            raise ValueError('Cannot find any more PSSMs than L-mers are evaluated.')

        num_bases = sum(len(s) for s in sequences)
        logging.info('Running single gap algorithm on %d sequences with %d bases', len(sequences), num_bases)

        preprocessed_sequences = [hmm.preprocess_sequence(s) for s in sequences]
        start = time.clock()
        # The bg_model argument is overwritten here without having been used.
        seeds, bg_model = generate_seeds(
          sequences,
          preprocessed_sequences,
          self.options
        )
        logging.info('Generating %d seeds took %.1f seconds', len(seeds), time.clock() - start)

        # try the best few seeds
        start = time.clock()
        # Use the configured number of seeds when it is truthy, otherwise
        # fall back to twice the number of PSSMs requested.
        num_to_examine = self.options.num_seeds_to_examine and self.options.num_seeds_to_examine or 2 * self.options.num_pssms
        logging.info('Examining %d/%d seeds', num_to_examine, len(seeds))
        # NOTE(review): raises ZeroDivisionError when the sequences contain no bases.
        p_one_per_seq = len(preprocessed_sequences) / float(num_bases) # expect one per sequence
        self.p_binding_site = p_one_per_seq * self.options.p_binding_site_scale
        logging.info(
            'HMM p(binding site) parameter estimated as %f (1 site/seq) and adjusted to %f by scaling parameter',
            p_one_per_seq,
            self.p_binding_site,
        )
        # zip() stops at the shorter input, so at most num_to_examine seeds
        # (and never more than len(seeds)) are tried.
        results = list(
          (seed, self.try_seed(seed, len(sequences), num_bases, preprocessed_sequences))
          for seed, i in zip(seeds, xrange(num_to_examine))
        )

        # keep only those results that succeeded
        # (i.e. those for which try_seed returned something truthy)
        results = filter(lambda x: x[1], results)
        logging.info('Got PSSMs for %d seeds in %.1f seconds', len(results), time.clock() - start)

        # define a function that scores results
        def score_result(result):
            # result is a (seed, try_seed output) pair; the score is the
            # geometric mean of two emission-based scores and the fraction
            # of sequences that have at least one site.
            seed, result = result
            L_mer, count, num_seqs, gap, score = seed
            model, builder, num_sites, num_seqs_with_site = result
            emissions, gap_probs = builder.get_emissions_and_gap_probabilities(model, offset=1)
            return geometric_mean((
              calculate_first_order_entropy_score(emissions),
              calculate_information_content_score(emissions),
              num_seqs_with_site / float(len(sequences))
            ))

        # sort by scores
        # (highest first; entries with equal scores fall back to comparing
        # the (seed, result) pairs themselves)
        scored_results = [(score_result(result), result) for result in results]
        scored_results.sort(reverse=True)

        # remove those PSSMs that do not score highly enough
        # (but never drop below options.num_pssms entries)
        logging.info('Removing PSSMs with low scores.')
        while len(scored_results) > self.options.num_pssms and scored_results[-1][0] < self.options.pssm_score_threshold:
            scored_results.pop()
        logging.info('%d PSSMs scored highly enough', len(scored_results))

        # examine results
        for i, (score, (seed, result)) in enumerate(scored_results):
            logging.info('************** PSSM %d **************', i)
            L_mer, count, num_seqs, gap, seed_score = seed
            model, builder, num_sites, num_seqs_with_site = result
            logging.info(
              'Seed %s with gap at %d had %d hits in %d/%d sequences',
              L_mer,
              gap,
              count,
              num_seqs,
              len(sequences)
            )
            logging.info('Seed score: %f', seed_score)
            # Output paths are <output_dir>/<tag>-NNN and <output_dir>/<tag>-NNN.pssm.
            image_file = os.path.join(self.options.output_dir, '%s-%03d' % (self.options.tag, i))
            pssm_def_file = os.path.join(self.options.output_dir, '%s-%03d.pssm' % (self.options.tag, i))
            logging.info(
              'HMM found %d sites. %d/%d sequences have at least one site',
              num_sites,
              num_seqs_with_site,
              len(sequences)
            )
            self.examine_model(model, builder, sequences, image_file, pssm_def_file)
            logging.info('Score: %g', score)
            # NOTE(review): emissions and gap_probs are not used after this call.
            emissions, gap_probs = builder.get_emissions_and_gap_probabilities(model, offset=1)

        # NOTE(review): this returns from `results`, not `scored_results`, so
        # the list is in seed order and still contains entries that were
        # popped by the score filter above - confirm this is intended.
        return [r[1] for r in results]
Exemplo n.º 7
0
AATACTATTACTATACCCACGACCTCCAGAAATTCACTGGATAACCAGTAAGACAACTTCTACTCATTTCTTCATATTCC
TACTTATTCAAGTTGTAGCCTTCATAGTTGATAAAAAATCAGCACACATTAAGAAAACAATAACAGAACTATTTTCTTCA
CATGACTTTTATTCCTTAATCCAGACTGTTAAAAGGACTGCAAGACAAATTGTTTTTCAATCAGATTTTTTTCTCCACCA
GATGTCTATGTGAATTTCATATTGTTTTAGACAAAAATGCTCATTCCTTCGGTCTAAGTACTATGTCATATTTTGTTTTT
TCAAGCCTTCAAATTTTGTGCTGGTGGTTACTTCATATACATTCTATGGTTAATCTTTAAAGAGAAGTTTTAAAAGTCTG
ATTCAAAATTTCAGTTCACTCGCTATGTATTTTAAAAATTAAAATTTATGAAATTCAATTTTAAAAATCTAAAAGTTATC
TAAAAAGGTCTATGACTTATCAAATTTCAATAAGCTGACTGTTAGCAGTATTAAAAAATATTAAATATGCTAACANNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATACATAAAGGGAATAGGCAGAGTTCACAGATT
AATATTTCTTACCTCTACAATAAGAAGAAATACCTTGTTCTATGAGCAGCTGCCATACTTTCAGACATGTTTCTGACTTT
TAGATAATTAACAAATCCTCTGAAGAAAAGGAGCAGGCCTGAGAAGGTTGAAATAATATGGATATACTATGTTTTTATAC
AGAAAAGGGCAAGATAAATTTAAAGTAGACAATTATAAACANNNNNNNNNNNNNNNNNGGA""".replace('\n', '')

def convert_seq(seq):
    """
    Convert a nucleotide string into a numpy array of ordinals.

    The string is wrapped in a ``corebio.seq.Seq`` using corebio's
    ``reduced_nucleic_alphabet`` and the result of its ``ords()`` method is
    handed to ``numpy.array``.
    """
    nucleic_seq = corebio.seq.Seq(seq, alphabet=corebio.seq.reduced_nucleic_alphabet)
    return numpy.array(nucleic_seq.ords())

# Convert and preprocess the old and the new sequence.
old_pp = hmm.preprocess_sequence(convert_seq(old_seq))
new_pp = hmm.preprocess_sequence(convert_seq(new_seq))

#meme_dir = '/home/reid/Analysis/GappedPssms/MEME/x-validate'
#pssm_file = os.path.join(meme_dir, 'T00671-1.pssm')
pssm_file = '/home/john/Analysis/GappedPssms/MEME/x-validate/vm-T00671-motif-h2-v9-x1.pssm'

# Read every model while the file is open; list() drains parse_models inside
# the with-block so the handle is closed deterministically instead of being
# left to the garbage collector.
with open(pssm_file) as pssm_stream:
    semi_parsed_models = list(parse_models(pssm_stream))

# Only a single model per file is supported.
if len(semi_parsed_models) > 1:
    print >> sys.stderr, 'For the moment we can only handle one model at a time.'
    sys.exit(-1)
# NOTE(review): an empty file still raises IndexError here, as before.
parsed = semi_parsed_models[0]
logging.info(str(parsed))

# Build the HMM from the parsed definition and wrap it in a classifier.
model, traits = build_hmm_from_semi_parsed(parsed)
classifier = make_classifier(model)

def test_seq(seq):
Exemplo n.º 8
0
        [.85, .05, .05, .05],
        [.05, .85, .05, .05],
        [.85, .05, .05, .05],
        [.05, .85, .05, .05],
        [.85, .05, .05, .05],
        [.05, .85, .05, .05],
        [.85, .05, .05, .05],
        [.05, .85, .05, .05],
      ]
    )
    logging.info('Calculating log scores')
    pssm_scores = hmm.calculate_log_scores(nucleo_dists)
    logging.info('Calculating complementary scores')
    comp_scores = hmm.calculate_complementary_scores(pssm_scores)
    logging.info('Preprocessing sequence')
    seq = hmm.preprocess_sequence(N.array([0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,4]))

    logging.info('Scoring sequence')
    logging.info(hmm.score_sequence(pssm_scores, seq))
    logging.info('Scoring sequence with complementary scores')
    logging.info(hmm.score_sequence(comp_scores, seq))

    logging.info(hmm.max_score_in_sequence(pssm_scores, seq))
    logging.info(hmm.max_score_in_sequence(comp_scores, seq))

    long_seqs = [R.random_integers(0,4,size=10000) for i in xrange(100)]
    long_seqs_preprocessed = hmm.preprocess_sequences(long_seqs)

    logging.info('Starting to time max scores on long sequences.')
    start = time.time()
    max_scores = hmm.max_scores_in_sequences(pssm_scores, long_seqs_preprocessed)