def run(options):
    """Run the epsilon-stability experiment described by *options*.

    Opens (or creates) the ``/EtaStability/timings`` table in the HDF5
    results file, farms one ``test_stability_and_speed`` task per
    (seed, num_sites, epsilon) start out to the IPython engines, and
    appends one timing row per completed task.

    NOTE(review): relies on module-level names (``tables``, ``filename``,
    ``test_data``, ``epsilons``, ``Timings``, ``test_stability_and_speed``)
    defined elsewhere in this module — confirm they are in scope.
    """
    #
    # Create HDF5 table
    #
    h5file = tables.openFile(filename, mode="a", title="Eta stability data")
    # Close (and flush) the HDF5 file even if a task or query raises,
    # so the results file is never left open/unflushed.
    try:
        try:
            timings_table = h5file.root.EtaStability.timings
        except tables.NoSuchNodeError:
            # Create a new group under "/" (root)
            group = h5file.createGroup(
                "/", 'EtaStability', 'Data about stability of epsilon')
            # Create one table on it
            timings_table = h5file.createTable(
                group, 'timings', Timings, "Timings table")

        #
        # Examine data sets
        #
        data_sets = test_data.data_sets_for_options(options)
        fasta_filenames = [test_data.fasta_filenames[ds] for ds in data_sets]
        for ds, fasta in zip(data_sets, fasta_filenames):
            num_seqs, num_bases = test_data.get_data_set_size(ds)
            logging.info(
                'Analysing data set: %16s; # seqs=%5d; # bases=%7d; %s',
                ds, num_seqs, num_bases, fasta)

        #
        # set up parallel stuff
        #
        from IPython.kernel import client
        tc = client.TaskClient()

        #
        # pass tasks to engines
        #
        logging.info('Passing tasks to engines')
        task_ids = []
        task_args = dict()
        task_data_set = dict()
        for data_set, fasta in zip(data_sets, fasta_filenames):
            for seed, num_sites, score in test_data.starts[data_set]:
                options.max_num_sites = num_sites
                for epsilon in epsilons:
                    # Only pass task if we don't have data in the table
                    # already.  Epsilon is a float, so match it within a
                    # +/- 1e-4 window rather than testing exact equality.
                    where_list = timings_table.getWhereList(
                        '(dataset=="%s") & (seed=="%s") & (nsites==%d) & (epsilon>%f) & (epsilon<%f)' % (
                            data_set,
                            seed,
                            math.trunc(num_sites),
                            epsilon - 1e-4,
                            epsilon + 1e-4
                        )
                    )
                    if options.force or 0 == len(where_list):
                        # Remove data if we already have it and are forcing
                        # new calculation.
                        if len(where_list):
                            if 1 != len(where_list):
                                raise ValueError(
                                    'Expecting to just find one existing row for this start.')
                            timings_table.removeRows(where_list[0])
                        options.output_dir = os.path.abspath(
                            os.path.join('output', 'epsilon-stability',
                                         '%s-%03d-%.1f' % (seed, num_sites, epsilon)))
                        if not os.path.exists(options.output_dir):
                            os.makedirs(options.output_dir)
                        # Remember the arguments we need to fill the table
                        # row when the result comes back (score is only
                        # needed by the task itself).
                        args = (fasta, seed, num_sites, epsilon, options)
                        task = client.MapTask(
                            test_stability_and_speed,
                            (fasta, seed, num_sites, score, epsilon, options))
                        task_id = tc.run(task, block=False)
                        task_ids.append(task_id)
                        task_data_set[task_id] = data_set
                        task_args[task_id] = args

        #
        # Get results from engines
        #
        logging.info('Blocking on %d results...', len(task_ids))
        timings = timings_table.row
        # Fill the table with data
        for task_id in task_ids:
            duration, post_EM_consensus, num_iters = tc.get_task_result(
                task_id, block=True)
            fasta, seed, num_sites, epsilon, options = task_args[task_id]
            data_set = task_data_set[task_id]
            timings['dataset'] = data_set
            timings['seed'] = seed
            timings['nsites'] = num_sites
            timings['epsilon'] = epsilon
            timings['niters'] = num_iters
            timings['duration'] = duration
            timings['consensus'] = post_EM_consensus
            logging.info(
                '%20s; nsites=%3d; epsilon=%.1f; iters=%5d; elapsed=%7.1fs; per iteration=%6.2fs; %20s; %s',
                seed, num_sites, epsilon, num_iters,
                duration, duration / num_iters,
                post_EM_consensus, data_set
            )
            timings.append()
    finally:
        h5file.close()  # Close (and flush) the HDF5 file
def run(options):
    """Run MEME EM on every recorded start and store the results.

    Opens (or creates) the ``/MEME/starts`` table in the HDF5 results
    file, farms one ``run_meme_on_start`` task per (seed, num_sites)
    start out to the IPython engines, and appends one row per completed
    start describing the EM run (iterations, time, final consensus, ...).

    NOTE(review): relies on module-level names (``tables``, ``filename``,
    ``test_data``, ``MemeEM``, ``run_meme_on_start``) defined elsewhere
    in this module — confirm they are in scope.
    """
    #
    # Create HDF5 table
    #
    h5file = tables.openFile(filename, mode="a", title="STEM/MEME data")
    # Close (and flush) the HDF5 file even if a task or query raises,
    # so the results file is never left open/unflushed.
    try:
        try:
            meme_em_table = h5file.root.MEME.starts
        except tables.NoSuchNodeError:
            # Create a new group under "/" (root)
            meme_group = h5file.createGroup(
                "/", 'MEME', 'Data about MEME runs')
            # Create one table on it
            meme_em_table = h5file.createTable(
                meme_group, 'starts', MemeEM, "Info on MEME starts.")

        #
        # Examine data sets
        #
        data_sets = test_data.data_sets_for_options(options)
        fasta_filenames = [test_data.fasta_filenames[ds] for ds in data_sets]
        for ds, fasta in zip(data_sets, fasta_filenames):
            num_seqs, num_bases = test_data.get_data_set_size(ds)
            logging.info(
                'Analysing data set: %16s; # seqs=%5d; # bases=%7d; %s',
                ds, num_seqs, num_bases, fasta)

        #
        # set up parallel stuff
        #
        from IPython.kernel import client
        tc = client.TaskClient()

        #
        # pass tasks to engines
        #
        logging.info('Passing tasks to engines')
        task_ids = []
        task_args = dict()
        task_data_set = dict()
        for data_set, fasta in zip(data_sets, fasta_filenames):
            for seed, num_sites, score in test_data.starts[data_set]:
                # only pass task if we don't have data in the table already
                if 0 == len(
                    meme_em_table.getWhereList(
                        '(dataset=="%s") & (cons0=="%s") & (nsites0==%d)' % (
                            data_set, seed, math.trunc(num_sites)
                        )
                    )
                ):
                    options.output_dir = os.path.abspath(
                        os.path.join('output', 'meme-em',
                                     '%s-%03d' % (seed, num_sites)))
                    if not os.path.exists(options.output_dir):
                        os.makedirs(options.output_dir)
                    # Remember the arguments we need to fill the table row
                    # when the result comes back (score is only needed by
                    # the task itself).
                    args = (fasta, seed, num_sites, options)
                    task = client.MapTask(
                        run_meme_on_start,
                        (fasta, seed, num_sites, score, options))
                    task_id = tc.run(task, block=False)
                    task_ids.append(task_id)
                    task_data_set[task_id] = data_set
                    task_args[task_id] = args

        #
        # Get results from engines
        #
        logging.info('Blocking on %d results...', len(task_ids))
        meme_em = meme_em_table.row
        # Fill the table with data
        for task_id in task_ids:
            start = tc.get_task_result(task_id, block=True)
            fasta, seed, num_sites, options = task_args[task_id]
            # Sanity check: the result must describe the start we submitted.
            assert seed == start.cons0
            data_set = task_data_set[task_id]
            meme_em['dataset'] = data_set
            meme_em['cons0'] = start.cons0
            meme_em['nsites0'] = start.nsites0
            meme_em['niters'] = start.niters
            meme_em['em_time'] = start.em_time
            meme_em['cons'] = start.cons_after_em
            meme_em['nsites'] = start.nsites
            meme_em['sig'] = start.sig
            # BUGFIX: the second field printed start.nsites but was
            # labelled "nsites0" — corrected the label to "nsites".
            logging.info(
                '%s: cons0=%20s; nsites0=%3d; niters=%4d; elapsed=%7.1fs; per iteration=%6.2fs; cons=%20s; nsites=%3d; sig=%e',
                data_set, start.cons0, start.nsites0, start.niters,
                start.em_time, start.em_time / start.niters,
                start.cons, start.nsites, start.sig
            )
            meme_em.append()
    finally:
        h5file.close()  # Close (and flush) the HDF5 file
epsilon_hamming[epsilon_index].append(hamming) epsilon_fraction_mismatch[epsilon_index].append( hamming / float(len(meme_row['cons']))) epsilon_mismatches[epsilon_index].append(hamming > 0) rel_speed = stem_row['duration'] / meme_row['em_time'] iter_rel_speed = rel_speed / \ stem_row['niters'] * meme_row['niters'] stem_num_iters.append(stem_row['niters']) meme_num_iters.append(meme_row['niters']) epsilon_rel_speed[epsilon_index].append(rel_speed) epsilon_iter_rel_speed[epsilon_index].append(iter_rel_speed) if epsilon_index == default_epsilon_index: width_iter_rel_speed[W].append(iter_rel_speed) num_sites_iter_rel_speed[ meme_row['nsites']].append(iter_rel_speed) num_seqs, num_bases = test_data.get_data_set_size(data_set) stem_runtime_by_size[num_bases].append( np.log10(stem_row['duration'])) meme_runtime_by_size[num_bases].append( np.log10(meme_row['em_time'])) stem_itertime_by_size[num_bases].append( np.log10(stem_row['duration']) - np.log10(stem_row['niters'])) meme_itertime_by_size[num_bases].append( np.log10(meme_row['em_time']) - np.log10(meme_row['niters'])) epsilon_indices = epsilon_mismatches.keys() epsilon_indices.sort() epsilon_range = np.arange(len(epsilon_indices)) epsilons = map(index_to_epsilon, epsilon_indices) str_epsilons = ['%.1f' % e for e in epsilons]