def create_data(*seqs):
    """Build a ``stempy.Data`` object from the given sequences.

    Each positional argument is appended to a fresh ``stempy.StringSet``,
    an index is built over that set with ``stempy.build_index`` and the
    index is wrapped in ``stempy.Data``.  Progress is reported through
    ``logging.info`` at every stage.

    :param seqs: The sequences to add, in order.
    :returns: The ``stempy.Data`` object created from the index.
    """
    sequences = stempy.StringSet()
    for sequence in seqs:
        logging.info('Adding sequence "%s" to string set.', sequence)
        sequences.append(sequence)

    logging.info('Building index.')
    built_index = stempy.build_index(sequences)

    logging.info('Creating data object.')
    data = stempy.Data(built_index)
    return data
instance_finder.instances.sort()

# Expected instances in the sequences, as logged by an earlier run.
# NOTE(review): this comment used to say "at least 13", but the assertion at
# the end of this section requires exactly 13.
#2012-06-16 11:32:58,686 - INFO - seq= 5; pos= 67; strand=+; W-mer=AACCTCGAGAG; Z=0.749857
#2012-06-16 11:32:58,686 - INFO - seq= 0; pos= 48; strand=+; W-mer=AACCTAAGAAA; Z=0.814953
#2012-06-16 11:32:58,686 - INFO - seq= 3; pos= 51; strand=+; W-mer=AAACTGTGGCT; Z=0.819370
#2012-06-16 11:32:58,686 - INFO - seq= 5; pos= 79; strand=+; W-mer=AAGCTAAAGAG; Z=0.827948
#2012-06-16 11:32:58,687 - INFO - seq= 3; pos= 36; strand=-; W-mer=AAGCTTATCAG; Z=0.862206
#2012-06-16 11:32:58,687 - INFO - seq= 5; pos= 97; strand=-; W-mer=GAACTGGGGAT; Z=0.912242
#2012-06-16 11:32:58,687 - INFO - seq= 2; pos= 47; strand=+; W-mer=AAACTTGGGAA; Z=0.919355
#2012-06-16 11:32:58,687 - INFO - seq= 1; pos= 6; strand=+; W-mer=AACCTTAGACG; Z=0.963969
#2012-06-16 11:32:58,687 - INFO - seq= 6; pos= 46; strand=-; W-mer=AAGCTGGGGAC; Z=0.968255
#2012-06-16 11:32:58,687 - INFO - seq= 9; pos= 73; strand=-; W-mer=GACCTGATGAG; Z=0.968813
#2012-06-16 11:32:58,687 - INFO - seq= 5; pos= 16; strand=-; W-mer=AACCTGAGCCG; Z=0.974733
#2012-06-16 11:32:58,687 - INFO - seq= 6; pos= 73; strand=+; W-mer=AACCTTAGGCG; Z=0.984093
#2012-06-16 11:32:58,687 - INFO - seq= 3; pos= 10; strand=+; W-mer=AACCTTAGGAT; Z=0.984245
#

#
# Print the instances
#
for instance in instance_finder.instances:
    # Convert the global position into a (sequence, offset) pair for display.
    seq, pos = data.pos_localise(instance.global_pos)
    W_mer = data.get_W_mer(W, instance.global_pos)
    if instance.rev_comp:
        # Show the W-mer as read on the reverse strand.
        W_mer = stempy.reverse_complement(W_mer)
    logging.info(
        'seq=%5d; pos=%6d; strand=%s; W-mer=%s; Z=%4f',
        seq,
        pos,
        # A conditional expression replaces the fragile ``and ... or`` idiom.
        '-' if instance.rev_comp else '+',
        W_mer,
        instance.Z,
    )
logging.info('Found %d instances', len(instance_finder.instances))
assert 13 == len(instance_finder.instances)
# Background model: add pseudo-counts to the frequencies, compute the
# likelihoods from the Markov model and build the background model for
# width W from them.
freqs_with_pseudo_counts = freqs.add_pseudo_counts(options.back_dist_prior)
lls = mm.calculate_likelihoods(data)
bg_model = stempy.create_bg_model_from_base_likelihoods(W, data, lls, freqs_with_pseudo_counts)

# binding site model
# (starts from a uniform PSSM of width W and is then seeded with `seed`)
bs_model = stempy.PssmBindingSiteModel(stempy.initialise_uniform_pssm(W, options.alphabet_size))
bs_model.seed(seed)

# whole model
model = stempy.Model(data, bs_model, bg_model, _lambda=0.)

# Time a search for every instance whose Z exceeds the threshold.
Z_threshold = .3
with Timer(msg='find instances with Z>%f' % Z_threshold):
    instance_finder = stempy.FindInstances(data, model, Z_threshold)
    instance_finder()
    logging.info('Found %d instances', len(instance_finder.instances))

# Time a search for a fixed number of best W-mers.
num_W_mers_to_find = 10000
with Timer(msg='find %d best W-mers' % num_W_mers_to_find):
    w_mer_finder = stempy.create_best_w_mer_finder(data, model, num_W_mers_to_find)
    w_mer_finder()
    logging.info('Found %d instances', len(w_mer_finder.best_w_mers))


def global_overlap(pos1, pos2, W):
    """Return True when the two positions are fewer than W apart.

    Presumably this means two W-mers of width W starting at these global
    positions overlap -- confirm against the callers.
    """
    return abs(pos1 - pos2) < W


def get_non_overlapping(instances, W):
    # Sort in place, then reverse, leaving `instances` in descending order.
    instances.sort()
    instances.reverse()
for seq in seqs: logging.info('Adding sequence "%s" to string set.', seq) string_set.append(seq) logging.info('Building index.') index = stempy.build_index(string_set) logging.info('Creating data object.') return stempy.Data(index) def feq(x, y, eps=1e-4): return fabs(x - y < eps) seq = "ACGTACACAC" data = create_data(seq) logging.info('Creating Markov model.') mm, freqs = stempy.create_markov_model_order_3(data, 1.) logging.info('Calculating likelihoods.') lls = mm.calculate_likelihoods(data) base_probs = map(exp, base_lls(lls[0])) logging.info(', '.join('%.5f' % p for p in base_probs)) assert feq(stempy.W_mer_log_likelihood(lls[0], 0, 1), (seq.count('A')+1.)/(len(seq)+4.)) assert feq(base_probs[1], (seq.count('AC')+1.)/(seq.count('A')+4.)) assert feq(base_probs[2], (seq.count('ACG')+1.)/(seq.count('AC')-1.+4.)) # -1. because last 'AC' has no following character assert feq(base_probs[3], (seq.count('ACGT')+1.)/(seq.count('ACG')+4.)) assert feq(base_probs[4], (seq.count('CGTA')+1.)/(seq.count('CGT')+4.)) # check the bg model from likelihoods bg_model = stempy.create_bg_model_from_base_likelihoods(4, data, lls, freqs) seq = "AAAC"
init_test_env(__file__, level=logging.INFO)

import os, stempy

#
# First run STEME
#
options = stempy.get_default_options()
options.output_dir = os.path.join('output', 'test-meme-like-output')
# Fix the motif width to 8 by making the minimum and maximum equal.
options.min_w = options.max_w = 8
options.meme_like_output = 'meme.out'
algorithm = stempy.Algorithm(options)
fasta = os.path.join(os.path.dirname(__file__), 'fasta', 'T00759-tiny.fa')
algorithm(fasta)
logging.info('Showing MEME output from %s', algorithm.meme_like_output_file)
# NOTE(review): relies on a `cat` executable being available and on the path
# needing no shell quoting.
os.system('cat %s' % algorithm.meme_like_output_file)

#
# Test BioPython parser
#
from Bio import Motif
# Parse inside a `with` block so the output file is closed once the motifs
# have been read; it was previously opened and never closed.
with open(algorithm.meme_like_output_file) as meme_file:
    motifs = list(Motif.parse(meme_file, "MEME"))

#
# Doesn't quite work with pycogent yet. Pycogent expects a summary section
# that contains sites in all the sequences
#
#from cogent import LoadSeqs
fasta_file = os.path.join(fasta_dir(), 'random-seqs-4-sites.fasta')
options = stempy.get_default_options()
options.output_dir = os.path.join('output', 'test-num-sites')
options.bg_model_order = 0
# Fix the motif width to W by making the minimum and maximum equal.
options.min_w = options.max_w = W
options.min_num_sites = 2
options.max_num_sites = 10
options.meme_like_output = 'test-num-sites.txt'
meme_output = os.path.join(options.output_dir, options.meme_like_output)

#
# Run the STEME algorithm
#
algorithm = stempy.Algorithm(options)
algorithm(fasta_file)

#
# Make sure we choose a motif that predicted 4 sites
#
predicted_sites = parse_meme_output_for_sites(meme_output)
for seq, sites in predicted_sites.iteritems():
    # Find the index of the input sequence whose id starts with the name used
    # in the MEME-like output.
    for i, _id in enumerate(algorithm.input_sequences.ids):
        if _id.startswith(seq):
            break
    else:
        # Without this branch `i` would silently keep the last index (or be
        # undefined when there are no sequences) and the sites below would be
        # reported against the wrong sequence.
        raise ValueError('No input sequence id starts with "%s"' % seq)
    for site in sites:
        global_pos = algorithm.input_sequences.data.pos_globalise(i, site.first)
        logging.info(
            '%2d %3d %s %s',
            i,
            site.first,
            algorithm.input_sequences.data.get_W_mer(W, global_pos),
            seq,
        )
# NOTE(review): this counts the entries of `predicted_sites` (one per sequence
# name), not the individual sites -- confirm that this is the intended check.
assert 4 == len(predicted_sites)
import pkg_resources
from optparse import OptionParser
from stempy.scan import load_occurrences_from_stream
from stempy.spacing import add_max_distance_option, count_all_pairs, spacing_idx

# Build an options object from the parser's defaults and set max_distance on
# it directly; no command line is parsed in this section.
parser = OptionParser()
options = parser.get_default_values()
options.max_distance = 4

#
# Load the occurrences and associated sequence lengths,
# they will come sorted by position
#
logging.info('Loading occurrences')
# Both inputs are read as resource streams shipped inside the 'stempy' package.
occurrences, seq_infos, motifs = load_occurrences_from_stream(
    pkg_resources.resource_stream('stempy', 'test/spacing/steme-pwm-scan.out'),
    pkg_resources.resource_stream('stempy', 'test/spacing/steme-pwm-scan.seqs'),
)

#
# Iterate through the occurrences counting spacings
#
logging.info(
    'Examining spacings of up to %d b.p. between %d occurrences of %d motifs in %d sequences',
    options.max_distance, len(occurrences), len(motifs), len(seq_infos)
)
spacings = count_all_pairs(occurrences, seq_infos, ignore_close_to_end=True, options=options)
# input_sequences = stempy.SequenceSet(fasta.encode(sys.stdin.encoding or 'ascii'), options) # # Initialise the background # bg_manager = stempy.get_background_manager(input_sequences, input_sequences.mm, options) # # Create the model # model = input_sequences.create_model(bg_manager.get_bg_model(W), W) model.bs.pssm.log_probs.values()[:] = log_pwm model.bs.recalculate() model.lambda_ = lambda_ # # Create the instance finder and find the instances # instance_finder = stempy.FindInstances(input_sequences.data, model, Z_threshold) instance_finder() instance_finder.instances.sort() logging.info('Found %d instances', len(instance_finder.instances))
init_test_env(__file__, level=logging.INFO)

import stempy, os
from stempy.planted_sites import parse_fasta_for_sites, parse_meme_output_for_sites, calculate_positives_and_negatives
from infpy.roc import RocCalculator
from optparse import OptionParser

# Command line: a single boolean flag, --run-meme.
parser = OptionParser()
parser.add_option("--run-meme", action='store_true', help="Run MEME as well on the sequences")
cmd_line_options, args = parser.parse_args()

#
# The data sets with minimum sensitivity and specificity values required.
#
# Each entry is (data set name, threshold, threshold); presumably the order is
# (name, minimum sensitivity, minimum specificity) -- TODO confirm.
if is_debug_python():
    logging.info('Detected debug version of Python, only using smallest data set.')
    data_sets = [
        #('random-seqs-03-050' , .13, .86),
        ('random-seqs-with-Ns-03-050', .13, .86),
    ]
else:
    # NOTE(review): 'random-seqs-05-100' appears twice below with the same
    # thresholds -- confirm whether the repetition is intended.
    data_sets = [
        ('random-seqs-03-050' , .13, .86),
        ('random-seqs-with-Ns-03-050', .13, .86),
        ('random-seqs-05-100' , .40, .89),
        ('random-seqs-with-Ns-05-100', .60, .90),
        ('random-seqs-05-100' , .40, .89), # cannot achieve (.6,.9) stats when finding starts up-front
        ('random-seqs-10-100' , .60, .91),
        ('random-seqs-with-Ns-10-100', .20, .98), # lower specificity with Ns
        ('random-seqs-30-200' , .46, .99),
        ('random-seqs-with-Ns-30-200', .46, .99),