def run(options):
    """Measure EM timing/stability across a range of epsilon values.

    Farms one task per (seed, num_sites, epsilon) start out to IPython
    engines via ``test_stability_and_speed`` and records the results in
    the ``/EtaStability/timings`` table of the HDF5 file.

    Relies on module-level names not visible in this chunk: ``filename``,
    ``Timings``, ``epsilons``, ``test_data``, ``test_stability_and_speed``.
    """
    #
    # Create HDF5 table
    #
    h5file = tables.openFile(filename, mode="a", title="Eta stability data")
    try:
        # Reuse the existing table if a previous run created it.
        timings_table = h5file.root.EtaStability.timings
    except tables.NoSuchNodeError:
        group = h5file.createGroup(
            "/", 'EtaStability', 'Data about stability of epsilon')  # Create a new group under "/" (root)
        # Create one table on it
        timings_table = h5file.createTable(
            group, 'timings', Timings, "Timings table")

    #
    # Examine data sets
    #
    data_sets = test_data.data_sets_for_options(options)
    fasta_filenames = [test_data.fasta_filenames[ds] for ds in data_sets]
    for ds, fasta in zip(data_sets, fasta_filenames):
        num_seqs, num_bases = test_data.get_data_set_size(ds)
        logging.info(
            'Analysing data set: %16s; # seqs=%5d; # bases=%7d; %s',
            ds, num_seqs, num_bases, fasta)

    #
    # set up parallel stuff
    #
    from IPython.kernel import client
    tc = client.TaskClient()

    #
    # pass tasks to engines
    #
    logging.info('Passing tasks to engines')
    task_ids = []
    task_args = dict()      # task_id -> the args tuple the task was queued with
    task_data_set = dict()  # task_id -> name of the data set the task belongs to
    for data_set, fasta in zip(data_sets, fasta_filenames):
        for seed, num_sites, score in test_data.starts[data_set]:
            # NOTE(review): the same `options` object is mutated here and
            # passed to every task; all tasks share its final field values
            # unless the task client serializes a copy per task — confirm.
            options.max_num_sites = num_sites
            for epsilon in epsilons:
                # only pass task if we don't have data in the table already.
                # epsilon is a float, so match it within a +/- 1e-4 window
                # rather than testing exact equality.
                where_list = timings_table.getWhereList(
                    '(dataset=="%s") & (seed=="%s") & (nsites==%d) & (epsilon>%f) & (epsilon<%f)' % (
                        data_set,
                        seed,
                        math.trunc(
                            num_sites),
                        epsilon - 1e-4,
                        epsilon + 1e-4
                    )
                )
                if options.force or 0 == len(where_list):
                    # remove data if we already have it and are forcing new
                    # calculation
                    if len(where_list):
                        if 1 != len(where_list):
                            raise ValueError(
                                'Expecting to just find one existing row for this start.')
                        timings_table.removeRows(where_list[0])
                    options.output_dir = os.path.abspath(
                        os.path.join('output', 'epsilon-stability',
                                     '%s-%03d-%.1f' % (seed, num_sites, epsilon)))
                    # Create the output directory if it does not exist yet.
                    os.path.exists(options.output_dir) or os.makedirs(
                        options.output_dir)
                    args = (fasta, seed, num_sites, epsilon, options)
                    # print fasta, seed, num_sites, score, epsilon, options
                    task = client.MapTask(test_stability_and_speed, (
                        fasta, seed, num_sites, score, epsilon, options))
                    task_id = tc.run(task, block=False)
                    task_ids.append(task_id)
                    task_data_set[task_id] = data_set
                    task_args[task_id] = args

    #
    # Get results from engines
    #
    logging.info('Blocking on %d results...', len(task_ids))
    timings = timings_table.row
    # Fill the table with data
    for task_id in task_ids:
        # Each task returns (wall-clock duration, consensus after EM, #iterations).
        duration, post_EM_consensus, num_iters = tc.get_task_result(
            task_id, block=True)
        fasta, seed, num_sites, epsilon, options = task_args[task_id]
        data_set = task_data_set[task_id]
        timings['dataset'] = data_set
        timings['seed'] = seed
        timings['nsites'] = num_sites
        timings['epsilon'] = epsilon
        timings['niters'] = num_iters
        timings['duration'] = duration
        timings['consensus'] = post_EM_consensus
        logging.info(
            '%20s; nsites=%3d; epsilon=%.1f; iters=%5d; elapsed=%7.1fs; per iteration=%6.2fs; %20s; %s',
            seed, num_sites, epsilon, num_iters, duration,
            duration / num_iters, post_EM_consensus, data_set
        )
        timings.append()

    h5file.close()  # Close (and flush) the HDF5 file
def run(options):
    """Run MEME EM on each recorded start and store the results in HDF5.

    Farms one ``run_meme_on_start`` task per (seed, num_sites) start out to
    IPython engines and records each result in the ``/MEME/starts`` table of
    the HDF5 file.

    Relies on module-level names not visible in this chunk: ``filename``,
    ``MemeEM``, ``test_data``, ``run_meme_on_start``.

    :param options: parsed command-line options; reads ``force``-like data-set
        selection via ``test_data`` and mutates ``options.output_dir`` per task.
    """
    #
    # Create HDF5 table
    #
    h5file = tables.openFile(filename, mode="a", title="STEM/MEME data")
    try:
        # Reuse the existing table if a previous run created it.
        meme_em_table = h5file.root.MEME.starts
    except tables.NoSuchNodeError:
        meme_group = h5file.createGroup(
            "/", 'MEME', 'Data about MEME runs')  # Create a new group under "/" (root)
        # Create one table on it
        meme_em_table = h5file.createTable(
            meme_group, 'starts', MemeEM, "Info on MEME starts.")

    #
    # Examine data sets
    #
    data_sets = test_data.data_sets_for_options(options)
    fasta_filenames = [test_data.fasta_filenames[ds] for ds in data_sets]
    for ds, fasta in zip(data_sets, fasta_filenames):
        num_seqs, num_bases = test_data.get_data_set_size(ds)
        logging.info(
            'Analysing data set: %16s; # seqs=%5d; # bases=%7d; %s',
            ds, num_seqs, num_bases, fasta)

    #
    # set up parallel stuff
    #
    from IPython.kernel import client
    tc = client.TaskClient()

    #
    # pass tasks to engines
    #
    logging.info('Passing tasks to engines')
    task_ids = []
    task_args = dict()      # task_id -> the args tuple the task was queued with
    task_data_set = dict()  # task_id -> name of the data set the task belongs to
    for data_set, fasta in zip(data_sets, fasta_filenames):
        for seed, num_sites, score in test_data.starts[data_set]:
            # only pass task if we don't have data in the table already
            if 0 == len(
                meme_em_table.getWhereList(
                    '(dataset=="%s") & (cons0=="%s") & (nsites0==%d)' % (
                        data_set, seed, math.trunc(num_sites)
                    )
                )
            ):
                # NOTE(review): the same `options` object is mutated here and
                # passed to every task; all tasks share its final
                # `output_dir` unless the task client serializes a copy per
                # task — confirm.
                options.output_dir = os.path.abspath(
                    os.path.join('output', 'meme-em',
                                 '%s-%03d' % (seed, num_sites)))
                # Create the output directory if it does not exist yet.
                os.path.exists(options.output_dir) or os.makedirs(
                    options.output_dir)
                args = (fasta, seed, num_sites, options)
                task = client.MapTask(
                    run_meme_on_start, (fasta, seed, num_sites, score, options))
                task_id = tc.run(task, block=False)
                task_ids.append(task_id)
                task_data_set[task_id] = data_set
                task_args[task_id] = args

    #
    # Get results from engines
    #
    logging.info('Blocking on %d results...', len(task_ids))
    meme_em = meme_em_table.row
    # Fill the table with data
    for task_id in task_ids:
        start = tc.get_task_result(task_id, block=True)
        fasta, seed, num_sites, options = task_args[task_id]
        # Sanity check: the result must correspond to the seed we queued.
        assert seed == start.cons0
        data_set = task_data_set[task_id]
        meme_em['dataset'] = data_set
        meme_em['cons0'] = start.cons0
        meme_em['nsites0'] = start.nsites0
        meme_em['niters'] = start.niters
        meme_em['em_time'] = start.em_time
        meme_em['cons'] = start.cons_after_em
        meme_em['nsites'] = start.nsites
        meme_em['sig'] = start.sig
        # Fixed log label: the second field is start.nsites (post-EM), not
        # nsites0 — the old format string labelled both as "nsites0".
        logging.info(
            '%s: cons0=%20s; nsites0=%3d; niters=%4d; elapsed=%7.1fs; per iteration=%6.2fs; cons=%20s; nsites=%3d; sig=%e',
            data_set, start.cons0, start.nsites0, start.niters,
            start.em_time, start.em_time / start.niters,
            start.cons, start.nsites, start.sig
        )
        meme_em.append()

    h5file.close()  # Close (and flush) the HDF5 file
def hamming_distance(s1, s2):
    "@return: The Hamming distance between s1 and s2."
    # Sequences must be the same length for a Hamming distance to be defined.
    assert len(s1) == len(s2)
    return sum(ch1 != ch2 for ch1, ch2 in zip(s1, s2))


def output_filename(name):
    # Resolve `name` inside the script's output directory (set up below).
    return os.path.join(output_dir, name)


# Parse command-line options; the positional argument is the HDF5 file to read.
parser = OptionParser()
test_data.add_options(parser)
options, args = parser.parse_args()
if len(args) < 1:
    raise RuntimeError('USAGE: %s <h5 file>' % sys.argv[0])
data_sets = test_data.data_sets_for_options(options)
h5_filename = args[0]
if not data_sets:
    raise ValueError('No data sets specified in options.')

# Ensure the output directory for the STEM-vs-MEME comparison exists.
output_dir = os.path.join('output', 'STEM-vs-MEME')
os.path.exists(output_dir) or os.makedirs(output_dir)

# Open the HDF5 file read-only (default mode) for analysis.
h5file = tables.openFile(h5_filename)

#
# Get HDF5 tables
#
meme_em_table = h5file.root.MEME.starts
stem_em_table = h5file.root.EtaStability.timings