def allocate_chunks(results_dir,
                    trial_no,
                    rgs_selected=None,
                    respect_rungroup_barriers=True,
                    runs_selected=None,
                    stripe=False,
                    max_size=1000,
                    integrated=False):
    """Group the DIALS output files of one trial into stripes or chunks.

    The directory layout walked is
    ``<results_dir>/<run>/<trial>_<rungroup>/out``, where ``<run>`` starts
    with ``r`` followed by digits and ``<trial>`` is ``trial_no`` zero-padded
    to at least three digits.

    Args:
        results_dir: Directory containing the per-run subdirectories.
        trial_no: Integer trial number.
        rgs_selected: Optional container of rungroup identifiers. When it is
            non-empty, a ``<trial>_rg...`` directory is used only if the last
            five characters of its name are in this container.
        respect_rungroup_barriers: If true, every rungroup becomes its own
            batch (keyed by the rungroup name); otherwise all rungroups are
            pooled into a single batch keyed ``"all"``.
        runs_selected: Optional container of run directory names; when it is
            non-empty, other runs are ignored.
        stripe: If true, files are dealt out round-robin (``[i::n]``);
            otherwise they are cut into contiguous slices.
        max_size: Upper bound on the number of experiment files used to
            decide how many chunks a batch is split into.
        integrated: Selects the reflection file suffix,
            ``_integrated.pickle`` when true and ``_indexed.pickle`` when
            false.

    Returns:
        dict mapping each batch name to a list of
        ``(experiment_paths, reflection_paths)`` tuples, one per chunk.

    Raises:
        Sorry: if no batch contains any experiment file.
    """
    refl_ending = "_integrated.pickle" if integrated else "_indexed.pickle"
    expt_ending = "_refined_experiments.json"
    trial = "%03d" % trial_no
    # Single-argument print(...) parses identically as a Python 2 print
    # statement and as a Python 3 function call.
    print("processing trial %s" % trial)

    # startswith() instead of a fixed six-character slice, so that trial
    # numbers above 999 (four or more digits) are still recognized.
    trg_prefix = trial + "_rg"

    rgs = {}  # rungroup name -> runs in which it was found
    for run in os.listdir(results_dir):
        if not (run.startswith("r") and run.split("r")[1].isdigit()):
            continue
        if runs_selected and run not in runs_selected:
            continue
        run_dir = os.path.join(results_dir, run)
        if not os.path.isdir(run_dir):
            # A plain file that merely looks like a run name cannot be listed.
            continue
        trgs = [
            trg for trg in os.listdir(run_dir)
            if trg.startswith(trg_prefix)
            and (not rgs_selected or trg[-5:] in rgs_selected)
        ]
        for rg in set(trg.split("_")[1] for trg in trgs):
            rgs.setdefault(rg, []).append(run)

    if respect_rungroup_barriers:
        batchable = {rg: {rg: runs} for rg, runs in rgs.items()}
    else:
        batchable = {"all": rgs}

    # For either grouping, iterate over the top-level keys in batchable and
    # collect the files belonging to each batch.
    batch_chunk_nums_sizes = {}
    batch_contents = {}
    for batch, rungroups in batchable.items():
        # Work on explicit (run, rungroup) pairs: keying a dict by run alone
        # would keep only one rungroup for a run that has several.
        run_rg_pairs = sorted(
            (run, rg) for rg, runs in rungroups.items() for run in runs)
        n_img = 0
        batch_contents[batch] = []
        for run, rg in run_rg_pairs:
            out_dir = os.path.join(results_dir, run, trial + "_" + rg, "out")
            try:
                contents = sorted(os.listdir(out_dir))
            except OSError:
                print("skipping run %s missing out directory" % run)
                continue
            batch_contents[batch].extend(
                os.path.join(out_dir, c) for c in contents)
            n_img += sum(1 for c in contents if c.endswith(expt_ending))
        if n_img == 0:
            print("no images found for %s" % batch)
            del batch_contents[batch]
            continue
        # Explicit float division: with Python 2 integer division the ceil
        # would be a no-op, giving zero chunks whenever n_img < max_size and
        # a ZeroDivisionError on the following line.
        n_chunks = int(math.ceil(float(n_img) / max_size))
        chunk_size = int(math.ceil(float(n_img) / n_chunks))
        batch_chunk_nums_sizes[batch] = (n_chunks, chunk_size)

    if not batch_contents:
        raise Sorry("no DIALS integration results found.")

    batch_chunks = {}
    for batch, (num, size) in batch_chunk_nums_sizes.items():
        contents = batch_contents[batch]
        expts = [c for c in contents if c.endswith(expt_ending)]
        refls = [c for c in contents if c.endswith(refl_ending)]
        expts, refls = match_dials_files(expts, refls, expt_ending, refl_ending)
        if stripe:
            chunks = [(expts[i::num], refls[i::num]) for i in range(num)]
            print("striped %d experiments in %s with %d experiments per stripe and %d stripes" %
                  (len(expts), batch, len(chunks[0][0]), len(chunks)))
        else:
            chunks = [
                (expts[i * size:(i + 1) * size], refls[i * size:(i + 1) * size])
                for i in range(num)
            ]
            print("chunked %d experiments in %s with %d experiments per chunk and %d chunks" %
                  (len(expts), batch, len(chunks[0][0]), len(chunks)))
        batch_chunks[batch] = chunks
    return batch_chunks
def allocate_chunks_per_rungroup(results_dir,
                                 trial_no,
                                 rgs_selected=None,
                                 runs_selected=None,
                                 stripe=False,
                                 max_size=1000,
                                 integrated=False):
    """Group the DIALS output files of one trial into chunks, per rungroup.

    The directory layout walked is
    ``<results_dir>/<run>/<trial>_<rungroup>/out``, where ``<run>`` starts
    with ``r`` followed by digits and ``<trial>`` is ``trial_no`` zero-padded
    to at least three digits. Files from different rungroups are never mixed.

    Args:
        results_dir: Directory containing the per-run subdirectories.
        trial_no: Integer trial number.
        rgs_selected: Optional container of rungroup identifiers. When it is
            non-empty, a ``<trial>_rg...`` directory is used only if the last
            five characters of its name are in this container.
        runs_selected: Optional container of run directory names; when it is
            non-empty, other runs are ignored.
        stripe: If true, files are dealt out round-robin (``[i::n]``);
            otherwise they are cut into contiguous slices.
        max_size: Upper bound on the number of experiment files used to
            decide how many chunks a rungroup is split into.
        integrated: Selects the reflection file suffix,
            ``_integrated.pickle`` when true and ``_indexed.pickle`` when
            false.

    Returns:
        dict mapping each rungroup name to a list of
        ``(experiment_paths, reflection_paths)`` tuples, one per chunk.

    Raises:
        Sorry: if no rungroup contains any experiment file.
    """
    refl_ending = "_integrated.pickle" if integrated else "_indexed.pickle"
    expt_ending = "_refined_experiments.json"
    trial = "%03d" % trial_no
    # Single-argument print(...) parses identically as a Python 2 print
    # statement and as a Python 3 function call.
    print("processing trial %s" % trial)

    # startswith() instead of a fixed six-character slice, so that trial
    # numbers above 999 (four or more digits) are still recognized.
    trg_prefix = trial + "_rg"

    rgs = {}  # rungroup name -> runs in which it was found
    for run in os.listdir(results_dir):
        if not (run.startswith("r") and run.split("r")[1].isdigit()):
            continue
        if runs_selected and run not in runs_selected:
            continue
        run_dir = os.path.join(results_dir, run)
        if not os.path.isdir(run_dir):
            # A plain file that merely looks like a run name cannot be listed.
            continue
        trgs = [
            trg for trg in os.listdir(run_dir)
            if trg.startswith(trg_prefix)
            and (not rgs_selected or trg[-5:] in rgs_selected)
        ]
        for rg in set(trg.split("_")[1] for trg in trgs):
            rgs.setdefault(rg, []).append(run)

    rg_ch_nums_sizes = {}
    rg_contents = {}
    for rg, runs in rgs.items():
        n_img = 0
        trg = trial + "_" + rg
        rg_contents[rg] = []
        # Runs and directory listings are sorted so that chunk membership
        # does not depend on the order os.listdir happens to return.
        for run in sorted(runs):
            out_dir = os.path.join(results_dir, run, trg, "out")
            try:
                contents = sorted(os.listdir(out_dir))
            except OSError:
                print("skipping run %s missing out directory" % run)
                continue
            rg_contents[rg].extend(os.path.join(out_dir, c) for c in contents)
            n_img += sum(1 for c in contents if c.endswith(expt_ending))
        if n_img == 0:
            print("no images found for %s" % rg)
            del rg_contents[rg]
            continue
        # Explicit float division: with Python 2 integer division the ceil
        # would be a no-op, giving zero chunks whenever n_img < max_size and
        # a ZeroDivisionError on the following line.
        n_chunks = int(math.ceil(float(n_img) / max_size))
        chunk_size = int(math.ceil(float(n_img) / n_chunks))
        rg_ch_nums_sizes[rg] = (n_chunks, chunk_size)

    if not rg_contents:
        raise Sorry("no DIALS integration results found.")

    rg_chunks = {}
    for rg, (num, size) in rg_ch_nums_sizes.items():
        contents = rg_contents[rg]
        expts = [c for c in contents if c.endswith(expt_ending)]
        refls = [c for c in contents if c.endswith(refl_ending)]
        expts, refls = match_dials_files(expts, refls, expt_ending, refl_ending)
        if stripe:
            chunks = [(expts[i::num], refls[i::num]) for i in range(num)]
            print("striped %d experiments in %s with %d experiments per stripe and %d stripes" %
                  (len(expts), rg, len(chunks[0][0]), len(chunks)))
        else:
            chunks = [
                (expts[i * size:(i + 1) * size], refls[i * size:(i + 1) * size])
                for i in range(num)
            ]
            print("chunked %d experiments in %s with %d experiments per chunk and %d chunks" %
                  (len(expts), rg, len(chunks[0][0]), len(chunks)))
        rg_chunks[rg] = chunks
    return rg_chunks