def scan_for_properties(pathname):
    properties = []

    if os.path.isdir(pathname):
        pathnames = scan_for_regular_files(pathname)
        for pathname in pathnames:
            properties += scan_for_properties(pathname)
    else:
        # See if we can open the file as a LUE dataset. If not, issue a
        # warning. If so, obtain the internal paths of properties.
        try:
            dataset = lue.open_dataset(pathname, lue.access_flag.ro)
            properties += [
                Property(pathname, property_pathname)
                for property_pathname in
                    scan_phenomena_for_properties(dataset.phenomena)]
            properties += [
                Property(pathname, property_pathname)
                for property_pathname in
                    scan_universes_for_properties(dataset.universes)]
        except RuntimeError:
            print("Skipping non-LUE file {}".format(pathname))

    return properties
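# Usage sketch for scan_for_properties() (the helper function and
# directory name below are hypothetical): pass a directory to scan it
# recursively, or a single file; non-LUE files are reported and skipped.
def _print_scanned_properties(pathname="raster_data"):
    for prop in scan_for_properties(pathname):
        print(prop)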
def describe_datasets(pathnames, indent=0):
    print_message(indent, "datasets")
    indent += 1

    for pathname in pathnames:
        dataset = lue.open_dataset(pathname)
        describe_dataset(dataset, indent)
def get_nrrow_nrcol_west_south_north_east(hdf5file, phenomena_name):
    dataset = lue.open_dataset(hdf5file, "r")
    phenomenon = dataset.phenomena[phenomena_name]
    pset = phenomenon.property_sets["area"]

    nr_rows = pset["band_1"].space_discretization.values[:][0][0]
    nr_cols = pset["band_1"].space_discretization.values[:][0][1]

    # The space domain stores each box as (west, south, east, north)
    nl_west = pset.domain.space.items[:][0][0]
    nl_south = pset.domain.space.items[:][0][1]
    nl_east = pset.domain.space.items[:][0][2]
    nl_north = pset.domain.space.items[:][0][3]

    return nr_rows, nr_cols, nl_west, nl_south, nl_north, nl_east
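# Sketch of how the returned extent is typically used (cf. initial()
# further below): derive the cell size and raster shape. The dataset and
# phenomenon names, and the helper name, are hypothetical.
def _example_extent_usage():
    nr_rows, nr_cols, west, south, north, east = \
        get_nrrow_nrcol_west_south_north_east("routes.lue", "exposure")
    cellsize = (east - west) / nr_cols
    # Assuming square cells, the north-south extent agrees with it
    assert abs((north - south) / nr_rows - cellsize) < 1e-6
    return nr_rows, nr_cols, cellsize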
def assertDatasetIsValid(self, dataset):
    """
    Validate *dataset*
    """
    if isinstance(dataset, str):
        dataset_pathname = dataset
        self.assertTrue(os.path.exists(dataset_pathname))
        dataset = lue.open_dataset(dataset_pathname)

    try:
        lue.assert_is_valid(dataset, fail_on_warning=True)
    except RuntimeError as exception:
        self.fail("dataset {} is not valid\n{}".format(
            dataset.pathname, exception))
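# Usage sketch: the helper accepts either an open dataset or a pathname,
# so a test can validate a file it has just written, e.g.:
#
#     self.assertDatasetIsValid("outlets2.lue")
#     self.assertDatasetIsValid(lue.open_dataset("outlets2.lue"))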
def post_process_benchmarks(lue_pathname):
    lue_dataset = lue.open_dataset(lue_pathname)
    lue_benchmark = lue_dataset.phenomena["benchmark"]

    lue_meta_information = \
        lue_benchmark.collection_property_sets["meta_information"]
    lue_name = lue_meta_information.properties["name"]
    lue_system_name = lue_meta_information.properties["system_name"]

    benchmark_name = lue_name.value[:]
    assert len(benchmark_name) == 1
    benchmark_name = benchmark_name[0]

    time_point = "todo"

    system_name = lue_system_name.value[:]
    assert len(system_name) == 1
    system_name = system_name[0]

    lue_measurement = lue_benchmark.property_sets["measurement"]
    lue_nr_localities = lue_measurement.properties["nr_localities"]
    lue_nr_threads = lue_measurement.properties["nr_threads"]
    lue_work_size = lue_measurement.properties["work_size"]
    lue_duration = lue_measurement.properties["duration"]

    nr_localities = lue_nr_localities.value[:]
    nr_measurements = len(nr_localities)
    nr_threads = lue_nr_threads.value[:]
    assert len(nr_threads) == nr_measurements
    work_size = lue_work_size.value[:]
    assert len(work_size) == nr_measurements
    duration = lue_duration.value[:]
    assert len(duration) == nr_measurements
    nr_durations = len(duration[0])

    # Set up data frames
    # The (default) index is the index of the benchmark
    environment = pd.DataFrame({
        "nr_localities": nr_localities,
        "nr_threads": nr_threads,
        "work_size": work_size,
    })

    # Per benchmark a series. Each series contains all duration
    # measurements. These series are concatenated into one long series
    # containing the durations for all benchmarks. The index contains
    # the index of the benchmark.
    durations = [
        pd.Series(duration[b], index=nr_durations * [b])
        for b in range(nr_measurements)
    ]
    durations = pd.DataFrame({"duration": pd.concat(durations)})

    nr_equal_work_sizes = \
        (environment["work_size"] == environment["work_size"][0]).sum()
    constant_work_size = nr_equal_work_sizes == nr_measurements

    if constant_work_size:
        post_process_strong_scaling_benchmarks(
            benchmark_name, time_point, system_name, environment, durations)
    else:
        post_process_weak_scaling_benchmarks(
            benchmark_name, time_point, system_name, environment, durations)
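# The strong/weak dispatch above rests on one rule: a constant work size
# across all measurements means a strong scaling experiment, a varying
# work size means a weak scaling one. A standalone sketch of that rule
# (helper name hypothetical):
def _classify_scaling(work_sizes):
    return "strong" if len(set(work_sizes)) == 1 else "weak"

assert _classify_scaling([1000, 1000, 1000]) == "strong"
assert _classify_scaling([1000, 2000, 4000]) == "weak"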
def initial(self):
    self.startmap = scalar(0)
    self.startmap_home = scalar(0)
    self.lue_name = os.path.join(
        "LUE", "exposure_{0}.lue".format(str(self.currentSampleNumber())))
    self.currentDate = self.startDate
    #pcraster.setrandomseed(self.currentSampleNumber()*1000)
    #random.seed(self.currentSampleNumber()*1000)

    # Randomly sample working locations
    self.workdf = self.workdf.sample(
        frac=1, replace=True,
        random_state=self.currentSampleNumber() * 1000)
    self.work_realisation = os.path.join(
        str(self.currentSampleNumber()), "work_realisation.csv")
    self.workloc = os.path.join(
        str(self.currentSampleNumber()), "work_loc.map")  # for testing

    # Save each realisation
    self.workdf.to_csv(self.work_realisation, header=False)
    cmd = "col2map -S -s, --clone {0} -x 2 -y 3 -v 1 {1} {2}".format(
        self.road_length_5000_file, self.work_realisation, self.workloc)
    subprocess.check_call(cmd, shell=True)

    # Get the extent from the HDF5 file
    nr_rows, nr_cols, nl_west, nl_south, nl_north, nl_east = \
        get_nrrow_nrcol_west_south_north_east(
            self.hdf5file, self.phenomena_name)
    cellsize = (nl_east - nl_west) / nr_cols
    self.window_size_x = int(nr_rows)
    self.window_size_y = int(nr_cols)
    print("input dataset:", nl_west, nl_south, nl_north, nl_east,
        nr_rows, nr_cols, cellsize, self.window_size_x, self.window_size_y)

    # Create the new dataset if it does not exist yet
    if not os.path.isfile(self.lue_name):
        dataset = lue.create_dataset(self.lue_name)

        # Add phenomenon
        phenomenon_exposure = dataset.add_phenomenon(self.lue_phenomena)

        # Add property sets
        ps_points = create_propertyset(
            phenomenon_exposure, self.lue_ps_points)
        ps_areas = create_propertyset(phenomenon_exposure, self.lue_ps_area)

        # Load properties. IDs for the properties are necessary for now.
        ids_front = ps_points.reserve(self.nr_locations)
        ids_area = ps_areas.reserve(self.nr_locations)

        # Assign a unique ID
        ids_front[:] = range(0, self.nr_locations)
        ids_area[:] = range(0, self.nr_locations)

        load_route_LUE(
            ps_areas, self.nr_locations, self.homedf, self.workdf,
            self.window_size_x, self.window_size_y, self.lue_p_area_route)

        # Load work and home locations into LUE
        load_home_work_LUE(
            ps_points, self.nr_locations, self.homedf, self.workdf,
            self.lue_p_points_home, self.lue_p_points_home_rowcol,
            self.lue_p_points_work, self.lue_p_points_work_rowcol,
            nl_west, nl_north, cellsize)

        lue.assert_is_valid(dataset)

    # Open LUE for use
    dataset = lue.open_dataset(self.lue_name, "w")
    phenomenon = dataset.phenomena[self.lue_phenomena]
    self.route_set = phenomenon.property_sets[self.lue_ps_area]
    self.pslocations = phenomenon.property_sets[self.lue_ps_points]
    self.timestep = 1

    #self.exposure_Map = scalar(1)
    # self.array0 = numpy.zeros(
    #     (self.window_size_x * self.window_size_y,), dtype=numpy.float32)
    # self.clone_array_home = self.array0.reshape(
    #     self.window_size_x, self.window_size_y)
    # for i in range(1, self.nr_locations):
    #     home_loc_row = int(self.pslocations[
    #         self.lue_p_points_home_rowcol].values[i][:][0])
    #     home_loc_col = int(self.pslocations[
    #         self.lue_p_points_home_rowcol].values[i][:][1])
    #     # w_loc_row = self.pslocations[
    #     #     self.lue_p_points_work_rowcol].values[i][:][0]
    #     # w_loc_col = self.pslocations[
    #     #     self.lue_p_points_work_rowcol].values[i][:][1]
    #     # home_loc_row1 = self.pslocations[
    #     #     self.lue_p_points_home].values[i][:][0]
    #     # home_loc_col2 = self.pslocations[
    #     #     self.lue_p_points_home].values[i][:][1]
    #     # print(home_loc_row, home_loc_col, home_loc_row1, home_loc_col2,
    #     #     w_loc_col, w_loc_row)
    #     self.clone_array_home[home_loc_row, home_loc_col] = 1
    # self.startcell_home = numpy2pcr(Boolean, self.clone_array_home, 0.0)

    # For homemakers
    self.test_dest = scalar(0)
def test_case_study(self):
    # Time series as implemented here:
    # - Discharge at catchment outlets
    #   - Located at fixed points in space
    #   - Variable number of outlets per time cell
    #   - Presence of outlets is discretized within multiple time boxes
    # - Time domain contains time cells
    # - Space domain contains space points
    # - Property values are same_shape::constant_shape (shape of value is
    #   related to what is stored per cell)
    # - Property values are not discretized
    # - Per time cell the set of active objects is tracked
    #   - Use this approach if the active set is variable within a
    #     time box
    #   - Additional storage required for tracking active sets,
    #     compared to Time series I
    #   + Possible to let objects be 'born' and 'die' during
    #     iterative simulation

    dataset = lue.create_dataset("outlets2.lue")
    phenomenon = dataset.add_phenomenon("areas")

    # Assume we are simulating some temporal variable (discharge at
    # catchment outlets).
    # The existence of the objects is modelled using time cells,
    # which are discretized time boxes (daily time steps). Per cell we
    # can store which objects are active.
    # Property values are located in time at time cells.
    # Property values are located in space at stationary space points.

    # Time domain
    time_configuration = lue.TimeConfiguration(lue.TimeDomainItemType.cell)
    epoch = lue.Epoch(
        lue.Epoch.Kind.common_era, "2019-01-01", lue.Calendar.gregorian)
    clock = lue.Clock(epoch, lue.Unit.day, 1)
    time_coordinate_datatype = lue.dtype.TickPeriodCount

    # Space domain
    space_configuration = lue.SpaceConfiguration(
        lue.Mobility.stationary,
        lue.SpaceDomainItemType.point)
    space_coordinate_datatype = numpy.dtype(numpy.float32)
    rank = 2

    # Property set
    outlet_points = phenomenon.add_property_set(
        "outlets",
        time_configuration, clock,
        space_configuration, space_coordinate_datatype, rank)
    time_domain = outlet_points.time_domain
    space_domain = outlet_points.space_domain
    active_set_index = outlet_points.object_tracker.active_set_index
    active_object_id = outlet_points.object_tracker.active_object_id

    # Property
    discharge_datatype = numpy.dtype(numpy.float32)
    discharge = outlet_points.add_property(
        "discharge", dtype=discharge_datatype, shape=(1,),
        value_variability=lue.ValueVariability.variable)

    nr_time_boxes = 5
    max_nr_objects = 100

    # Iterate over the time boxes
    for t in range(nr_time_boxes):

        # Store additional time box and count
        time_box = numpy.array([t, t + 1], dtype=time_coordinate_datatype)
        time_domain.value.expand(1)[-1] = time_box
        count = int(10 * random.random())
        time_domain.value.count.expand(1)[-1] = count

        # Iterate over the time cells within each time box
        for c in range(count):

            # Store IDs of objects in the active set
            object_index = active_object_id.nr_ids
            active_set_index.expand(1)[-1] = object_index
            nr_objects = int(random.random() * max_nr_objects)
            object_id = numpy.empty(nr_objects, dtype=lue.dtype.ID)
            lue.test.select_random_ids(object_id, max_nr_objects)
            active_object_id.expand(nr_objects)[object_index:] = object_id

            # Store property values of active objects
            discharge_values = \
                numpy.arange(nr_objects, dtype=discharge_datatype)
            discharge.value.expand(nr_objects)[object_index:] = \
                discharge_values

    lue.assert_is_valid(dataset)

    del dataset

    dataset = lue.open_dataset("outlets2.lue")
    phenomenon = dataset.phenomena["areas"]
    outlet_points = phenomenon.property_sets["outlets"]
    time_domain = outlet_points.time_domain
    clock = time_domain.clock

    self.assertEqual(clock.epoch.kind, lue.Epoch.Kind.common_era)
    self.assertEqual(clock.epoch.origin, "2019-01-01")
    self.assertEqual(clock.epoch.calendar, lue.Calendar.gregorian)
    self.assertEqual(clock.unit, lue.Unit.day)
    self.assertEqual(clock.nr_units, 1)
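# Read-back sketch for the layout written above (function name is
# hypothetical, and it assumes the object tracker arrays support
# numpy-style slicing like the property values do): consecutive entries
# of active_set_index delimit, per time cell, the slice of active object
# IDs and of the property values stored for them.
def _example_active_sets(property_set):
    tracker = property_set.object_tracker
    set_idxs = tracker.active_set_index[:]
    ids = tracker.active_object_id[:]
    discharge = property_set.properties["discharge"].value[:]
    # Append the total number of IDs as a sentinel delimiting the last set
    bounds = list(set_idxs) + [len(ids)]
    return [
        (ids[b:e], discharge[b:e])
        for b, e in zip(bounds[:-1], bounds[1:])]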
def post_process_raw_results(lue_dataset_pathname, plot_pathname):
    """
    Create plots and tables from raw benchmark results
    """
    lue_dataset = lue.open_dataset(lue_dataset_pathname)
    lue_benchmark = lue_dataset.phenomena["benchmark"]
    lue_meta_information = \
        lue_benchmark.collection_property_sets["meta_information"]
    lue_measurement = lue_benchmark.property_sets["measurement"]

    meta_information = meta_information_dataframe(lue_meta_information)
    name = meta_information.name[0]
    system_name = meta_information.system_name[0]
    worker_type = meta_information.worker_type[0]
    nr_time_steps = meta_information.nr_time_steps[0]

    nr_arrays, rank = \
        lue_meta_information.properties["array_shape"].value.shape
    assert nr_arrays == 1

    nr_benchmarks, count = lue_measurement.properties["duration"].value.shape

    measurement = measurement_dataframe(lue_measurement)

    # The time point at which the experiment was performed is the epoch
    # of the time domain used to store the durations
    lue_clock = lue_measurement.time_domain.clock
    assert lue_clock.nr_units == 1
    time_point_units = lue_clock.unit

    lue_epoch = lue_clock.epoch
    assert lue_epoch.kind == lue.Epoch.Kind.common_era
    assert lue_epoch.calendar == lue.Calendar.gregorian
    time_point = dateutil.parser.isoparse(lue_epoch.origin)
    # String containing time point in local time zone and conventions
    # time_point = time_point.astimezone(tzlocal.get_localzone()).strftime("%c")
    time_point = time_point.strftime("%c")

    nr_workers = measurement["nr_workers"]

    duration_labels = ["duration_{}".format(i) for i in range(count)]

    # t1 = duration using one worker
    t1 = measurement.loc[nr_workers == 1].filter(items=duration_labels)
    # t1 = [t1["duration_{}".format(i)][0] for i in range(count)]
    t1 = [t1.iat[0, i] for i in range(count)]

    for i in range(count):
        # Best case: duration stays constant as the number of workers and
        # the amount of work increase (keeping the amount of work per
        # worker constant)
        # 100% parallel code, but without parallelization overhead
        measurement["linear_duration_{}".format(i)] = \
            [t1[i] for b in range(nr_benchmarks)]

        # Worst case: duration scales with the number of workers
        # 100% serial code, but without parallelization overhead
        measurement["serial_duration_{}".format(i)] = t1[i] * nr_workers

        ### # slow_down = tn / linear_duration
        ### measurement["relative_slow_down_{}".format(i)] = \
        ###     (measurement["duration_{}".format(i)] /
        ###     measurement["linear_duration_{}".format(i)]) - 1
        ### measurement["linear_relative_slow_down_{}".format(i)] = \
        ###     (measurement["linear_duration_{}".format(i)] /
        ###     measurement["linear_duration_{}".format(i)]) - 1
        ### measurement["serial_relative_slow_down_{}".format(i)] = \
        ###     (measurement["serial_duration_{}".format(i)] /
        ###     measurement["linear_duration_{}".format(i)]) - 1

        # efficiency = 100% * t1 / tn
        measurement["efficiency_{}".format(i)] = \
            100 * t1[i] / measurement["duration_{}".format(i)]
        measurement["linear_efficiency_{}".format(i)] = \
            100 * t1[i] / measurement["linear_duration_{}".format(i)]
        measurement["serial_efficiency_{}".format(i)] = \
            100 * t1[i] / measurement["serial_duration_{}".format(i)]

        # lups = nr_time_steps * nr_elements / duration
        # In the case of weak scaling, nr_elements increases with
        # nr_workers. Ideally, LUPS increases linearly with nr_workers.
        measurement["lups_{}".format(i)] = \
            nr_time_steps * measurement["nr_elements"] / \
            measurement["duration_{}".format(i)]
        measurement["linear_lups_{}".format(i)] = \
            nr_time_steps * measurement["nr_elements"] / \
            measurement["linear_duration_{}".format(i)]
        measurement["serial_lups_{}".format(i)] = \
            nr_time_steps * measurement["nr_elements"] / \
            measurement["serial_duration_{}".format(i)]

    # https://xkcd.com/color/rgb/
    serial_color = sns.xkcd_rgb["pale red"]
    linear_color = sns.xkcd_rgb["medium green"]
    actual_color = sns.xkcd_rgb["denim blue"]

    nr_plot_rows = 2
    nr_plot_cols = 2
    plot_width = 8  # inches
    plot_height = 6  # inches

    figure, axes = plt.subplots(
        nrows=nr_plot_rows, ncols=nr_plot_cols,
        figsize=(nr_plot_cols * plot_width, nr_plot_rows * plot_height),
        squeeze=False, sharex=False)

    plot_row, plot_col = 0, 0

    # duration by nr_workers
    linear_duration = select_data_for_plot(
        measurement, "linear_duration", count)
    serial_duration = select_data_for_plot(
        measurement, "serial_duration", count)
    duration = select_data_for_plot(measurement, "duration", count)

    sns.lineplot(
        data=linear_duration, x="nr_workers", y="linear_duration",
        ax=axes[plot_row, plot_col], color=linear_color)
    sns.lineplot(
        data=serial_duration, x="nr_workers", y="serial_duration",
        ax=axes[plot_row, plot_col], color=serial_color)
    sns.lineplot(
        data=duration, x="nr_workers", y="duration",
        ax=axes[plot_row, plot_col], color=actual_color)
    axes[plot_row, plot_col].set_ylabel(
        u"duration ({}) ± 95% ci (count={})".format(time_point_units, count))
    axes[plot_row, plot_col].yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda y, pos: format_duration(y)))

    plot_row, plot_col = 0, 1

    # efficiency by nr_workers
    linear_efficiency = select_data_for_plot(
        measurement, "linear_efficiency", count)
    serial_efficiency = select_data_for_plot(
        measurement, "serial_efficiency", count)
    efficiency = select_data_for_plot(measurement, "efficiency", count)

    sns.lineplot(
        data=linear_efficiency, x="nr_workers", y="linear_efficiency",
        ax=axes[plot_row, plot_col], color=linear_color)
    sns.lineplot(
        data=serial_efficiency, x="nr_workers", y="serial_efficiency",
        ax=axes[plot_row, plot_col], color=serial_color)
    sns.lineplot(
        data=efficiency, x="nr_workers", y="efficiency",
        ax=axes[plot_row, plot_col], color=actual_color)
    axes[plot_row, plot_col].set_ylim(0, 110)
    axes[plot_row, plot_col].set_ylabel("efficiency (%)")

    plot_row, plot_col = 1, 0

    # lups by nr_workers
    linear_lups = select_data_for_plot(measurement, "linear_lups", count)
    serial_lups = select_data_for_plot(measurement, "serial_lups", count)
    lups = select_data_for_plot(measurement, "lups", count)

    sns.lineplot(
        data=linear_lups, x="nr_workers", y="linear_lups",
        ax=axes[plot_row, plot_col], color=linear_color)
    sns.lineplot(
        data=serial_lups, x="nr_workers", y="serial_lups",
        ax=axes[plot_row, plot_col], color=serial_color)
    sns.lineplot(
        data=lups, x="nr_workers", y="lups",
        ax=axes[plot_row, plot_col], color=actual_color)
    axes[plot_row, plot_col].set_ylabel("LUPS")

    plot_row, plot_col = 1, 1
    axes[plot_row, plot_col].axis("off")

    for plot_row in range(nr_plot_rows):
        for plot_col in range(nr_plot_cols):
            axes[plot_row, plot_col].xaxis.set_major_formatter(
                ticker.FuncFormatter(lambda x, pos: format_nr_workers(x)))
            axes[plot_row, plot_col].set_xlabel(
                "workers ({})".format(worker_type))
            axes[plot_row, plot_col].grid()

    figure.legend(labels=["linear", "serial", "actual"])

    array_shape_per_worker = \
        lue_meta_information.properties["array_shape"].value[0]
    partition_shape = \
        lue_meta_information.properties["partition_shape"].value[0]

    figure.suptitle(
        "{}, {}, {}\n"
        "Weak scaling experiment on {} array per worker "
        "and {} partitions".format(
            name, system_name, time_point,
            "x".join([str(extent) for extent in array_shape_per_worker]),
            "x".join([str(extent) for extent in partition_shape])))

    # plt.tight_layout()
    plt.savefig(plot_pathname)
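# Worked example (made-up numbers) of the metrics derived above, for one
# weak scaling measurement with t1 = 10 s, 4 workers, tn = 12.5 s and
# nr_time_steps = 5 (helper name hypothetical):
def _example_scaling_metrics():
    t1, nr_workers, tn = 10.0, 4, 12.5
    nr_elements = nr_workers * 1000 ** 2  # work grows with the workers
    linear_duration = t1                  # best case: stays at t1
    serial_duration = t1 * nr_workers     # worst case: 40 s
    efficiency = 100 * t1 / tn            # 100 * 10 / 12.5 = 80 %
    lups = 5 * nr_elements / tn           # 5 * 4e6 / 12.5 = 1.6e6
    return linear_duration, serial_duration, efficiency, lups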