def main(args): config = workloadAnalysis_pd.Config(args).config comm = MPI.COMM_WORLD mpi_rank = comm.Get_rank() mpi_size = comm.Get_size() datasets = {} types = {} fds = [] for fname in config.files: print "reading data from %s" % fname fd = h5py.File(fname, 'r') for dset in fd: data = fd[dset][:] if dset not in datasets: datasets[dset] = [] if dset not in types: types[dset] = data.dtype if data.dtype != types[dset]: print "ERROR! FAIL! dtypes for %s do not match!" % dset, data.dtype, types[dset] datasets[dset].append(data) fd.close() fd = h5py.File(config.output, 'w') for dset in datasets: data = np.concatenate(datasets[dset]) keys = [] print data for col in data.dtype.names: if not col.endswith("_sum") and not col.endswith("_count") and not col.endswith("_histogram"): keys.append(col) print "merging %s on " % dset, keys data = workloadAnalysis_pd.mpiMergeDataset(data, keys, comm, mpi_rank, mpi_size) print data outdata = fd.create_dataset(dset, (data.size,), dtype=data.dtype) outdata[:] = data[:] fd.close()
def main(args): config = workloadAnalysis_pd.Config(args).config comm = MPI.COMM_WORLD mpi_rank = comm.Get_rank() mpi_size = comm.Get_size() print config dataset = [] for filename in config.files: fd = h5py.File(filename, 'r') dset = fd[config.summary][:] dataset.append(dset) fd.close() dataset = np.concatenate(dataset) login_mask = np.zeros(dataset.size, dtype=bool) gens = {} print "TOTAL CPU_SUM: ", np.sum(dataset['cpu_sum']) ## perform any needed generalizations for gen in config.generalizations: print "Starting on generalization: " + gen['name'] col = 'generalization_%s' % gen['name'] gens[col] = dataset[gen['column']].copy() overall_mask = np.zeros(dataset.size, dtype=bool) for catname in gen['eval_order']: regexes = gen['categories'][catname] mask = np.zeros(dataset.size, dtype=bool) sys.stdout.write("starting on category:%s; " % catname) for regex in regexes: reg = re.compile(regex) regMatch = np.vectorize(lambda x: bool(reg.match(x))) if col in gens: mask |= regMatch(gens[col]) else: mask |= regMatch(dataset[col]) sys.stdout.write("%d " % np.sum(mask)) overall_mask |= mask print "; (%d)" % np.sum(overall_mask) gens[col][mask] = "category_%s" % catname if gen['other_category']: overall_mask = np.invert(overall_mask) gens[col][overall_mask] = "category_other" print "have %s entries: " % col, np.unique(gens[col]) print print "types: ", dataset.dtype print if mpi_rank == 0: fd = h5py.File(config.output, "w") index_cols = [] for analysis in config.analyses: analysis_data = dataset.copy() index_cols = [] for axis in analysis['axes']: for idxcol in axis: if idxcol not in index_cols: index_cols.append(idxcol) remove_list = [] for col in analysis_data.dtype.names: if col.endswith("_sum") or col.endswith("_count") or col.endswith("_histogram") or col in index_cols: pass else: remove_list.append(col) ## copy generalizations into dataset dtype_list = [] for key in analysis_data.dtype.fields: if key not in remove_list: dtype_list.append( (key, analysis_data.dtype.fields[key][0],) ) dtype_list.extend([ (key, gens[key].dtype,) for key in gens ]) new_dataset = np.zeros(analysis_data.size, dtype=np.dtype(dtype_list)) for col in analysis_data.dtype.names: if col in remove_list: continue new_dataset[col] = analysis_data[col] for col in gens: new_dataset[col] = gens[col][:] analysis_data = new_dataset print "making most detailed version of %s analysis using " % analysis['name'], index_cols count = analysis_data.size print index_cols print analysis_data.dtype.names analysis_data = workloadAnalysis_pd.mpiMergeDataset(analysis_data, index_cols, comm, mpi_rank, mpi_size) print "done. went from %d to %d entries" % (count, analysis_data.size) print "[%d] cpu_sum after: " % mpi_rank, np.sum(analysis_data['cpu_sum']) for col in index_cols: print col, np.unique(analysis_data[col]) print "starting axis summarizations:" for idx,axis in enumerate(analysis['axes']): dtype_list = [] for (colidx,name) in enumerate(analysis_data.dtype.names): if name in axis or name.endswith("_sum") or name.endswith("_count") or name.endswith("_histogram"): dtype_list.append( (name, analysis_data.dtype[colidx],) ) axis_data = np.zeros(analysis_data.size, dtype=np.dtype(dtype_list)) for name in axis_data.dtype.names: axis_data[name][:] = analysis_data[name].copy() axis_data = workloadAnalysis_pd.mpiMergeDataset(axis_data, axis, comm, mpi_rank, mpi_size) print "[%d] %s cpu_sum after axis summarization: " % (mpi_rank, analysis['axisNames'][idx]), np.sum(axis_data['cpu_sum']) print "completed axis %s with %d records, writing data" % (analysis['axisNames'][idx], axis_data.size) if mpi_rank == 0: dset = fd.create_dataset('%s_%s' % (analysis['name'], analysis['axisNames'][idx]), (axis_data.size,), dtype=axis_data.dtype) dset[:] = axis_data[:] print "completed writing data out." if mpi_rank == 0: fd.close()