def test_run(self):
    # The paths work as follows: opus_matsim.__path__ is the path of the
    # opus_matsim python module, so we can use it as an anchor.
    config_location = os.path.join(opus_matsim.__path__[0], 'tests')
    print "location: ", config_location
    run_config = XMLConfiguration(
        os.path.join(config_location, "test_config.xml")).get_run_configuration("Test")
    run_config['creating_baseyear_cache_configuration'].cache_directory_root = self.temp_dir
    run_config['creating_baseyear_cache_configuration'].baseyear_cache.existing_cache_to_copy = \
        os.path.join(opus_matsim.__path__[0], 'tests', 'testdata', 'base_year_data')
    # insert_auto_generated_cache_directory... does things I don't understand.
    # Reset the cache directory first to obtain consistent behavior
    # independent of the file root:
    run_config['cache_directory'] = None
    insert_auto_generated_cache_directory_if_needed(run_config)
    run_manager = RunManager(ServicesDatabaseConfiguration())
    run_manager.setup_new_run(cache_directory=run_config['cache_directory'],
                              configuration=run_config)
    run_manager.run_run(run_config, run_as_multiprocess=True)
    self.assert_(True)
    self.cleanup_test_run()

def testName(self):
    print "entering test_run"
    logger.log_status('Preparing MATSim test run ...')
    # unzip MATSim files
    matsim_zip = ExtractZipFile(self.matsim_source, self.destination)
    matsim_zip.extract()
    # location of unzipped MATSim files
    matsim_extracted_files = os.path.join(self.destination, 'MATSimTestClasses')
    # unzip base_year_cache
    base_year_data_zip = ExtractZipFile(self.base_year_data_source, self.destination)
    base_year_data_zip.extract()
    # location of unzipped base_year_cache
    base_year_data_extracted_files = os.path.join(self.destination, 'base_year_data')
    # update location of base_year_data
    self.run_config['creating_baseyear_cache_configuration'].cache_directory_root = self.destination
    self.run_config['creating_baseyear_cache_configuration'].baseyear_cache.existing_cache_to_copy = base_year_data_extracted_files
    self.run_config['cache_directory'] = base_year_data_extracted_files
    self.run_config.add('matsim_files', matsim_extracted_files)
    self.run_config.add('matsim_config', self.matsim_config_full)
    self.run_config.add('root', self.destination)
    insert_auto_generated_cache_directory_if_needed(self.run_config)
    run_manager = RunManager(ServicesDatabaseConfiguration())
    run_manager.setup_new_run(cache_directory=self.run_config['cache_directory'],
                              configuration=self.run_config)
    logger.log_status('Starting UrbanSim run ...')
    run_manager.run_run(self.run_config, run_as_multiprocess=True)
    # after the UrbanSim run the travel data sets should be equal
    # self.assertTrue( self.compare_travel_data_sets() )
    logger.log_status('... UrbanSim run finished.')
    print "leaving test_run"

def _do_run_simple_test_run(caller, temp_dir, config, end_year=None):
    """Runs the model system with a single model (for speed).
    Sets the .resources property of the caller before starting the run.
    """
    runs_manager = RunManager(config)
    run_configuration = _get_run_config(temp_dir=temp_dir)
    run_configuration['creating_baseyear_cache_configuration'].cache_directory_root = temp_dir
    run_configuration['models'] = ['land_price_model']
    if end_year is not None:
        run_configuration['years'] = (run_configuration['years'][0], end_year)
    SessionConfiguration(new_instance=True,
                         package_order=run_configuration['dataset_pool_configuration'].package_order,
                         in_storage=AttributeCache())
    insert_auto_generated_cache_directory_if_needed(run_configuration)
    caller.resources = run_configuration
    runs_manager.setup_new_run(cache_directory=run_configuration['cache_directory'],
                               configuration=run_configuration)
    runs_manager.run_run(run_configuration)

def test_run(self):
    print "Entering test run"
    run_manager = RunManager(ServicesDatabaseConfiguration())
    run_manager.setup_new_run(cache_directory=self.config['cache_directory'],
                              configuration=self.config)
    run_manager.run_run(self.config, run_as_multiprocess=True)
    print "Leaving test run"

def prepare_run_manager(option_group=None):
    if option_group is None:
        option_group = StartRunOptionGroup()
    parser = option_group.parser
    options, args = option_group.parse()
    run_manager = RunManager(option_group.get_services_database_configuration(options))
    if options.pickled_resource_file is not None:
        f = file(options.pickled_resource_file, 'r')
        try:
            config = pickle.load(f)
        finally:
            f.close()
    elif options.configuration_path is not None:
        opus_path = options.configuration_path
        try:
            config = get_config_from_opus_path(opus_path)
        except ImportError:
            # TODO: Once all fully-specified configurations are stored as
            # classes, get rid of this use.
            import_stmt = 'from %s import run_configuration as config' % opus_path
            exec(import_stmt)
        insert_auto_generated_cache_directory_if_needed(config)
    elif options.xml_configuration is not None:
        if options.scenario_name is None:
            parser.print_help()
            sys.exit(1)
        config = XMLConfiguration(options.xml_configuration).get_run_configuration(options.scenario_name)
        insert_auto_generated_cache_directory_if_needed(config)
    else:
        parser.print_help()
        sys.exit(1)
    if options.existing_cache_to_copy is not None:
        config['creating_baseyear_cache_configuration'].cache_from_database = False
        config['creating_baseyear_cache_configuration'].baseyear_cache = BaseyearCacheConfiguration(
            existing_cache_to_copy=options.existing_cache_to_copy,
        )
        if options.years_to_cache is not None:
            config['creating_baseyear_cache_configuration'].baseyear_cache.years_to_cache = eval(options.years_to_cache)
    if options.profile_filename is not None:
        config["profile_filename"] = options.profile_filename
    run_manager.setup_new_run(cache_directory=config['cache_directory'],
                              configuration=config)
    return options, config, run_manager

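# A minimal sketch of driving prepare_run_manager above programmatically
# rather than from the command line, by pre-setting parser defaults (the same
# trick Calibration.init_run uses further below). The xml path and scenario
# name are hypothetical placeholders, not values from any real project.
option_group = StartRunOptionGroup()
option_group.parser.set_defaults(xml_configuration='project_configs/my_project.xml',
                                 scenario_name='My_baseline')
options, config, run_manager = prepare_run_manager(option_group)
run_manager.run_run(config, run_as_multiprocess=True)
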
def test_simulation(self):
    services_db = ServicesDatabaseConfiguration(
        database_name='services',
        database_configuration='services_database_server')
    run_manager = RunManager(services_db)
    run_as_multiprocess = True
    for scenario_name in ['psrc_baseline_test']:
        config = self.xml_config.get_run_configuration(scenario_name)
        insert_auto_generated_cache_directory_if_needed(config)
        run_manager.setup_new_run(cache_directory=config['cache_directory'],
                                  configuration=config)
        run_manager.run_run(config, run_as_multiprocess=run_as_multiprocess)

def run(self):
    logger.start_block()
    insert_auto_generated_cache_directory_if_needed(self.config)
    run_manager = RunManager(ServicesDatabaseConfiguration())
    run_manager.setup_new_run(cache_directory=self.config['cache_directory'],
                              configuration=self.config)
    run_manager.run_run(self.config, run_as_multiprocess=True)
    logger.end_block()

def run(self, config, executable):
    # --config=opus_matsim/sustain_city/configs/seattle_parcel.xml --executable=Seattle_baseline
    config = XMLConfiguration(config).get_run_configuration(executable)
    insert_auto_generated_cache_directory_if_needed(config)
    run_manager = RunManager(ServicesDatabaseConfiguration())
    run_manager.setup_new_run(cache_directory=config['cache_directory'],
                              configuration=config)
    run_manager.run_run(config, run_as_multiprocess=True)

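# The snippets above and below all repeat one four-step Opus/UrbanSim pattern:
# load a run configuration, let the framework pick a cache directory, register
# the run with the services database, then execute it. A minimal,
# self-contained sketch of that pattern; 'my_project.xml' and 'MyScenario' are
# hypothetical placeholders, and the import paths are the ones commonly used
# in Opus projects (verify against your checkout).
import os
from opus_core.configurations.xml_configuration import XMLConfiguration
from opus_core.services.run_server.run_manager import RunManager, \
    insert_auto_generated_cache_directory_if_needed
from opus_core.database_management.configurations.services_database_configuration \
    import ServicesDatabaseConfiguration

def run_scenario(xml_path, scenario_name):
    # 1. Turn the XML project definition into a plain run-configuration dict.
    config = XMLConfiguration(xml_path).get_run_configuration(scenario_name)
    # 2. Derive a timestamped cache directory if none is set explicitly.
    insert_auto_generated_cache_directory_if_needed(config)
    # 3. Register the run (obtains a run_id from the services database).
    run_manager = RunManager(ServicesDatabaseConfiguration())
    run_manager.setup_new_run(cache_directory=config['cache_directory'],
                              configuration=config)
    # 4. Execute the simulation.
    run_manager.run_run(config, run_as_multiprocess=True)

# run_scenario(os.path.join('project_configs', 'my_project.xml'), 'MyScenario')
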
def test_simulation(self):
    services_db = ServicesDatabaseConfiguration(
        database_name='services',
        database_configuration='services_database_server')
    run_manager = RunManager(services_db)
    run_as_multiprocess = True
    for scenario_name in ['san_antonio_baseline_test']:
        config = self.xml_config.get_run_configuration(scenario_name)
        insert_auto_generated_cache_directory_if_needed(config)
        run_manager.setup_new_run(cache_directory=config['cache_directory'],
                                  configuration=config)
        run_manager.run_run(config, run_as_multiprocess=run_as_multiprocess)

def test_simulation(self):
    eugene_dir = __import__('eugene').__path__[0]
    xml_config = XMLConfiguration(os.path.join(eugene_dir, 'configs', 'eugene_gridcell.xml'))
    option_group = StartRunOptionGroup()
    parser = option_group.parser
    # simulate 0 command line arguments by passing in []
    (options, _) = parser.parse_args([])
    run_manager = RunManager(option_group.get_services_database_configuration(options))
    run_section = xml_config.get_run_configuration('Eugene_baseline')
    insert_auto_generated_cache_directory_if_needed(run_section)
    run_manager.setup_new_run(cache_directory=run_section['cache_directory'],
                              configuration=run_section)
    run_manager.run_run(run_section)

def test_simulation(self):
    # check that the simulation proceeds without crashing
    # open the configuration for seattle_parcel.xml
    seattle_parcel_dir = __import__('seattle_parcel').__path__[0]
    xml_config = XMLConfiguration(os.path.join(seattle_parcel_dir, 'configs', 'seattle_parcel.xml'))
    option_group = StartRunOptionGroup()
    parser = option_group.parser
    # simulate 0 command line arguments by passing in []
    (options, _) = parser.parse_args([])
    run_manager = RunManager(option_group.get_services_database_configuration(options))
    run_section = xml_config.get_run_configuration('Seattle_baseline')
    insert_auto_generated_cache_directory_if_needed(run_section)
    run_manager.setup_new_run(cache_directory=run_section['cache_directory'],
                              configuration=run_section)
    run_manager.run_run(run_section)

def test_simulation(self):
    base_year_data_path = os.path.join(self.data_path, 'base_year_data')
    if not os.path.exists(base_year_data_path):
        os.makedirs(base_year_data_path)
    ftp_url = os.environ["FTP_URL"]
    file_name = os.path.split(ftp_url)[1]
    ftp_user = os.environ["FTP_USERNAME"]
    ftp_password = os.environ["FTP_PASSWORD"]
    try:
        # Note: the original command prefixed wget with 'echo', which only
        # printed the command instead of running it; dropped here so the
        # download actually happens.
        Popen("""
              cd %s;
              pwd;
              ls -la;
              wget --timestamping %s --ftp-user=%s --ftp-password=%s > /dev/null 2>&1;
              rm -rf 2008;
              unzip -o %s
              """ % (base_year_data_path, ftp_url, ftp_user, ftp_password, file_name),
              shell=True).communicate()
    except:
        print "Error when downloading and unzipping file from %s." % ftp_url
        raise
    services_db = ServicesDatabaseConfiguration(
        database_name='services',
        database_configuration='services_database_server')
    run_manager = RunManager(services_db)
    run_as_multiprocess = True
    xml_config = XMLConfiguration(os.path.join(self.opus_home, 'project_configs', 'washtenaw_parcel.xml'))
    for scenario_name in ['washtenaw_baseline_test']:
        config = xml_config.get_run_configuration(scenario_name)
        insert_auto_generated_cache_directory_if_needed(config)
        # base_year = config['base_year']
        # config['years_to_run'] = (base_year+1, base_year+2)
        run_manager.setup_new_run(cache_directory=config['cache_directory'],
                                  configuration=config)
        run_manager.run_run(config, run_as_multiprocess=run_as_multiprocess)

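# The shell pipeline above is Unix-only and inlines credentials into a shell
# string. A rough pure-Python equivalent using only the standard library --
# a sketch assuming FTP_URL points at a zip archive; this is not the tested
# code path above, just an illustration of the same download-and-unzip step.
import os
import shutil
import urllib
import zipfile

def fetch_base_year_data(base_year_data_path):
    ftp_url = os.environ["FTP_URL"]          # e.g. ftp://host/path/base_year_data.zip
    ftp_user = os.environ["FTP_USERNAME"]
    ftp_password = os.environ["FTP_PASSWORD"]
    # Embed the credentials in the URL: ftp://user:password@host/path
    url_with_auth = ftp_url.replace("ftp://", "ftp://%s:%s@" % (ftp_user, ftp_password))
    local_zip = os.path.join(base_year_data_path, os.path.split(ftp_url)[1])
    urllib.urlretrieve(url_with_auth, local_zip)
    # Mirror the 'rm -rf 2008; unzip -o' steps from the shell version.
    stale_dir = os.path.join(base_year_data_path, '2008')
    if os.path.exists(stale_dir):
        shutil.rmtree(stale_dir)
    zipfile.ZipFile(local_zip).extractall(base_year_data_path)
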
def _do_run_simple_test_run(caller, temp_dir, config, end_year=None):
    """Runs the model system with a single model (for speed).
    Sets the .resources property of the caller before starting the run.
    """
    runs_manager = RunManager(config)
    run_configuration = SubsetConfiguration()
    run_configuration['creating_baseyear_cache_configuration'].cache_directory_root = temp_dir
    run_configuration['models'] = ['land_price_model']
    if end_year is not None:
        run_configuration['years'] = (run_configuration['years'][0], end_year)
    SessionConfiguration(new_instance=True,
                         package_order=run_configuration['dataset_pool_configuration'].package_order,
                         in_storage=AttributeCache())
    insert_auto_generated_cache_directory_if_needed(run_configuration)
    caller.resources = run_configuration
    runs_manager.setup_new_run(cache_directory=run_configuration['cache_directory'],
                               configuration=run_configuration)
    runs_manager.run_run(run_configuration)

class Calibration(object):
    """Class to calibrate UrbanSim model coefficients."""

    def __init__(self, xml_config, scenario, calib_datasets, target_expression,
                 target_file, subset=None, subset_patterns=None,
                 skip_cache_cleanup=False, log_directory=None):
        """
        - xml_config: xml configuration file, e.g. '/home/atschirhar/opus/project_configs/paris_zone.xml'
        - scenario: name of the scenario to run for calibration, where
          models_to_run and simulation years are specified
        - calib_datasets: dictionary specifying dataset names and attributes to be calibrated,
          e.g. {'establishment_location_choice_model_coefficients': 'estimate'}
        - target_expression: opus expression computing values from the prediction
          to be compared with targets
        - target_file: name of the csv file providing targets
        - subset: dictionary specifying the subset of each dataset to be calibrated, e.g.
          {'establishment_location_choice_model_coefficients': ['coefficient_name', ['paris_celcm', 'biotech_celcm']]}
          subset and subset_patterns cannot both be specified for the same dataset
        - subset_patterns: dictionary specifying the subset through a regular-expression (re) pattern, e.g.
          {'establishment_location_choice_model_coefficients': ['coefficient_name', '.*_celcm']}
          subset and subset_patterns cannot both be specified for the same dataset
        """
        self.target_expression = target_expression
        self.target = self.read_target(target_file)
        self.run_manager = None
        self.xml_config = xml_config
        self.scenario = scenario
        # store the calibration spec; run() and target_func() below read these
        # (the original flattened code rebuilt calib_datasets from an empty
        # local dict, which could never populate)
        self.calib_datasets = calib_datasets
        self.subset = subset
        self.subset_patterns = subset_patterns
        self.skip_cache_cleanup = skip_cache_cleanup
        self.run_id, self.cache_directory = self.init_run()
        self.run_ids = [self.run_id]  # allow starting of multiple runs for parallel optimization
        self.log_directory = log_directory
        if self.log_directory is None:
            self.log_directory = self.cache_directory  # legacy
        log_file = os.path.join(self.log_directory, "calibration.log")
        logger.enable_file_logging(log_file)

        dict_config = XMLConfiguration(self.xml_config).get_run_configuration(self.scenario)
        ## get parameters from config
        self.base_year = dict_config['base_year']
        self.start_year, self.end_year = dict_config['years']
        self.project_name = dict_config['project_name']
        self.package_order = dict_config['dataset_pool_configuration'].package_order

    @log_block("Start Calibration")
    def run(self, optimizer='lbfgsb', results_pickle_prefix="calib", optimizer_kwargs={}):
        """Calls the specified optimizer to calibrate.

        Arguments:
            - optimizer: optimization method chosen (fmin_bfgs, simulated anneal etc.)
            - results_pickle_prefix: prefix of the pickle file name that will be saved
              after the simulation; if None, results are not saved

        Returns:
            - the results from the optimizer
            - a pickle dump of the results in the cache_directory, if
              results_pickle_prefix is specified
        """
        simulation_state = SimulationState()
        simulation_state.set_current_time(self.base_year)
        simulation_state.set_cache_directory(self.cache_directory)
        attribute_cache = AttributeCache()
        dataset_pool = SessionConfiguration(new_instance=True,
                                            package_order=self.package_order,
                                            in_storage=attribute_cache).get_dataset_pool()

        calib_datasets = {}
        for dataset_name, calib_attr in self.calib_datasets.iteritems():
            dataset = dataset_pool.get_dataset(dataset_name, dataset_arguments={'id_name': []})
            assert self.subset is None or self.subset.get(dataset_name, None) is None or \
                   self.subset_patterns is None or self.subset_patterns.get(dataset_name, None) is None
            if self.subset is not None and self.subset.get(dataset_name, None) is not None:
                subset_attr, subset_cond = self.subset.get(dataset_name)
                index = np.in1d(dataset[subset_attr], subset_cond)
            elif self.subset_patterns is not None and self.subset_patterns.get(dataset_name, None) is not None:
                subset_attr, subset_pattern = self.subset_patterns.get(dataset_name)
                index = array([True if re.search(subset_pattern, attr_v) else False
                               for attr_v in dataset[subset_attr]])
            else:
                index = arange(dataset.size(), dtype='i')
            calib_datasets[dataset_name] = [dataset, calib_attr, index]

        init_v = array([], dtype='f8')
        for dataset_name, calib in calib_datasets.iteritems():
            dataset, calib_attr, index = calib
            if type(calib_attr) == str:
                init_v = np.concatenate((init_v, dataset[calib_attr][index]))
            elif type(calib_attr) in (list, tuple):
                for attr in calib_attr:
                    init_v = np.concatenate((init_v, dataset[attr][index]))
            else:
                raise TypeError, "Unrecognized data type in calib_datasets"

        t0 = time.time()
        if is_parallelizable:
            set_parallel(True)
        print OKBLUE + "\noptimizer = {} (is_parallel = {})".format(optimizer, is_parallelizable) + ENDC
        print OKBLUE + "-------------------------------------------------------\n" + ENDC
        if optimizer == 'bfgs':
            default_kwargs = {'fprime': None, 'epsilon': 1e-08, 'maxiter': None,
                              'full_output': 1, 'disp': 1, 'retall': 0,
                              'callback': None}
            optimizer_func = fmin_bfgs
        elif optimizer == 'lbfgsb':
            default_kwargs = {'fprime': None, 'approx_grad': True, 'bounds': None,
                              'factr': 1e12, 'iprint': 1}
            optimizer_func = fmin_l_bfgs_b
        elif optimizer == 'anneal':
            default_kwargs = {'schedule': 'fast', 'full_output': 1, 'T0': None,
                              'Tf': 1e-12, 'maxeval': None, 'maxaccept': None,
                              'maxiter': 400, 'boltzmann': 1.0, 'learn_rate': 0.5,
                              'feps': 1e-06, 'quench': 1.0, 'm': 1.0, 'n': 1.0,
                              'lower': -1, 'upper': 1, 'dwell': 50, 'disp': True}
            optimizer_func = anneal
        elif optimizer == 'panneal':
            default_kwargs = {'schedule': 'fast', 'full_output': 1, 'T0': None,
                              'Tf': 1e-12, 'maxeval': None, 'maxaccept': None,
                              'maxiter': 400, 'boltzmann': 1.0, 'learn_rate': 0.5,
                              'feps': 1e-06, 'quench': 1.0, 'm': 1.0, 'n': 1.0,
                              'lower': -1, 'upper': 1, 'dwell': 50, 'disp': True,
                              'cores': 24, 'interv': 20}
            optimizer_func = panneal
        else:
            raise ValueError, "Unrecognized optimizer {}".format(optimizer)
        default_kwargs.update(optimizer_kwargs)
        results = optimizer_func(self.target_func, copy(init_v), **default_kwargs)
        duration = time.time() - t0

        if results_pickle_prefix is not None:
            pickle_file = "{}_{}.pickle".format(results_pickle_prefix, optimizer)
            pickle_file = os.path.join(self.log_directory, pickle_file)
            pickle.dump(results, open(pickle_file, "wb"))

        if is_parallelizable:
            set_parallel(False)

        logger.log_status('init target_func: {}'.format(self.target_func(init_v)))
        logger.log_status('end target_func: {}'.format(results[:]))  # which one?
        logger.log_status('outputs from optimizer: {}'.format(results))
        logger.log_status('Execution time: {}'.format(duration))

    def init_run(self, create_baseyear_cache=True):
        """init run, get run_id & cache_directory."""
        ## avoid invoking start_run from the command line
        option_group = StartRunOptionGroup()
        option_group.parser.set_defaults(xml_configuration=self.xml_config,
                                         scenario_name=self.scenario)
        # run_id, cache_directory = start_run(option_group)
        options, args = option_group.parse()
        self.run_manager = RunManager(option_group.get_services_database_configuration(options))
        resources = XMLConfiguration(self.xml_config).get_run_configuration(self.scenario)
        insert_auto_generated_cache_directory_if_needed(resources)
        cache_directory = resources['cache_directory']
        self.run_manager.setup_new_run(cache_directory, resources)
        run_id, cache_directory = self.run_manager.run_id, self.run_manager.get_current_cache_directory()
        self.run_manager.add_row_to_history(run_id, resources, "done")

        if create_baseyear_cache:
            self.run_manager.create_baseyear_cache(resources)

        ## good for testing
        # run_id = 275
        # cache_directory = '/home/lmwang/opus/data/paris_zone/runs/run_275.2012_05_26_00_20'
        assert run_id is not None
        assert cache_directory is not None
        return run_id, cache_directory

    def update_parameters(self, est_v, cache_directory, simulation_state,
                          dataset_pool, calib_datasets, *args, **kwargs):
        i_est_v = 0
        current_year = simulation_state.get_current_time()
        simulation_state.set_current_time(self.base_year)
        simulation_state.set_cache_directory(cache_directory)
        for dataset_name, calib in calib_datasets.iteritems():
            dataset, calib_attr, index = calib
            if type(calib_attr) == str:
                dtype = dataset[calib_attr].dtype
                dataset[calib_attr][index] = (est_v[i_est_v:i_est_v + index.size]).astype(dtype)
                i_est_v += index.size
            elif type(calib_attr) in (list, tuple):
                for attr in calib_attr:
                    dtype = dataset[attr].dtype
                    dataset[attr][index] = (est_v[i_est_v:i_est_v + index.size]).astype(dtype)
                    i_est_v += index.size
            else:
                raise TypeError, "Unrecognized data type in calib_datasets"
            # flush dataset
            dataset.flush_dataset()
        simulation_state.set_current_time(current_year)

    def update_prediction(self, est_v, simulation_state, dataset_pool,
                          calib_datasets, *args, **kwargs):
        option_group = RestartRunOptionGroup()
        option_group.parser.set_defaults(project_name=self.project_name,
                                         skip_cache_cleanup=self.skip_cache_cleanup)
        options, args = option_group.parse()
        if self.run_manager is None:
            self.run_manager = RunManager(option_group.get_services_database_configuration(options))

        # 'lock' is expected to be a module-level lock, used when optimizing
        # in parallel.
        if lock is not None:
            lock.acquire()
        ## query runs available for re-use
        runs_done = self.run_manager.get_run_info(run_ids=self.run_ids, status='done')
        create_baseyear_cache = False
        if len(runs_done) == 0:
            ## there is no re-usable run directory; init a new run
            run_id, cache_directory = self.init_run(create_baseyear_cache=False)
            self.run_ids.append(run_id)
            create_baseyear_cache = True
            logger.log_status('Initializing new run with id ' + str(run_id))
        else:
            run_id = runs_done[0].run_id  ## take the first 'done' run_id
            cache_directory = self.run_manager.get_cache_directory(run_id)
            logger.log_status('Using old run with id ' + str(run_id))
        resources = self.run_manager.get_resources_for_run_id_from_history(run_id, filter_by_status=False)
        self.run_manager.add_row_to_history(run_id, resources, "taken")
        if lock is not None:
            lock.release()

        if create_baseyear_cache:
            self.run_manager.create_baseyear_cache(resources)

        self.update_parameters(est_v, cache_directory, simulation_state,
                               dataset_pool, calib_datasets, *args, **kwargs)
        restart_run(option_group=option_group, args=[run_id, self.start_year])

        prediction = self.summarize_prediction(cache_directory, simulation_state,
                                               dataset_pool, calib_datasets)
        return prediction

    def summarize_prediction(self, cache_directory, simulation_state,
                             dataset_pool, calib_datasets):
        dataset_name = VariableName(self.target_expression).get_dataset_name()
        current_year = simulation_state.get_current_time()
        simulation_state.set_current_time(self.end_year)
        simulation_state.set_cache_directory(cache_directory)
        # force reload
        dataset_pool.remove_all_datasets()
        dataset = dataset_pool[dataset_name]
        ids = dataset.get_id_attribute()
        results = dataset.compute_variables(self.target_expression, dataset_pool=dataset_pool)
        simulation_state.set_current_time(current_year)
        return dict(zip(ids, results))

    def read_target(self, target_file):
        ## read (& process) target numbers into a dictionary: {id: value}
        ## csv file with header:
        ## id, target
        header = file(target_file, 'r').readline().strip().split(',')
        contents = np.genfromtxt(target_file, delimiter=",", comments='#', skip_header=1)
        target = dict(zip(contents[:, 0], contents[:, 1]))
        return target

    def target_func(self, est_v, func=lambda x, y: np.sum(np.abs(x - y)), **kwargs):
        """Target function."""
        simulation_state = SimulationState()
        simulation_state.set_current_time(self.base_year)
        simulation_state.set_cache_directory(self.cache_directory)
        attribute_cache = AttributeCache()
        dataset_pool = SessionConfiguration(new_instance=True,
                                            package_order=self.package_order,
                                            in_storage=attribute_cache).get_dataset_pool()

        calib_datasets = {}
        for dataset_name, calib_attr in self.calib_datasets.iteritems():
            dataset = dataset_pool.get_dataset(dataset_name, dataset_arguments={'id_name': []})
            assert self.subset is None or self.subset.get(dataset_name, None) is None or \
                   self.subset_patterns is None or self.subset_patterns.get(dataset_name, None) is None
            if self.subset is not None and self.subset.get(dataset_name, None) is not None:
                subset_attr, subset_cond = self.subset.get(dataset_name)
                index = np.in1d(dataset[subset_attr], subset_cond)
            elif self.subset_patterns is not None and self.subset_patterns.get(dataset_name, None) is not None:
                subset_attr, subset_pattern = self.subset_patterns.get(dataset_name)
                index = array([True if re.search(subset_pattern, attr_v) else False
                               for attr_v in dataset[subset_attr]])
            else:
                index = arange(dataset.size(), dtype='i')
            calib_datasets[dataset_name] = [dataset, calib_attr, index]

        prediction = self.update_prediction(est_v, simulation_state, dataset_pool,
                                            calib_datasets, **kwargs)
        ## allow keys in target not appearing in prediction,
        ## assuming their values to be 0
        # assert np.all( np.in1d(self.target.keys(), prediction.keys()) )
        target = np.array(self.target.values())
        predct = np.array([prediction[k] if prediction.has_key(k) else 0
                           for k in self.target.keys()])
        results = func(predct, target)
        return results

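# A hedged usage sketch for the Calibration class above; the xml path,
# scenario name, dataset/attribute names, target expression, and target file
# are hypothetical placeholders, not values from any real project. Note that
# constructing Calibration already registers a new run via init_run().
calib = Calibration(
    xml_config='project_configs/my_region.xml',          # hypothetical project file
    scenario='my_region_calibration',                    # hypothetical scenario
    calib_datasets={'establishment_location_choice_model_coefficients': 'estimate'},
    target_expression='establishment.disaggregate(zone.zone_id)',  # illustrative opus expression
    target_file='targets.csv',                           # csv with header 'id,target'
    subset={'establishment_location_choice_model_coefficients':
            ['coefficient_name', ['paris_celcm', 'biotech_celcm']]},
)
# Run the L-BFGS-B optimizer with a looser tolerance than the class default.
results = calib.run(optimizer='lbfgsb', optimizer_kwargs={'factr': 1e10})
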
number_of_runs = config.get("number_of_runs", 1)
number_of_runs_in_parallel = min(config.get("parallel_runs", 1), number_of_runs)
# generate seeds for multiple runs
root_seed = config.get("seed", None)
seed(root_seed)
# generate a different seed for each run (each seed contains 1 number)
seed_array = randint(1, 2**30, number_of_runs)
list_of_cache_directories = []
for irun in range(number_of_runs):
    config['seed'] = (seed_array[irun],)
    this_config = config.copy()
    if ((irun + 1) % number_of_runs_in_parallel) == 0:
        run_in_background = False
    else:
        run_in_background = True
    run_manager.setup_new_run(cache_directory=this_config['cache_directory'],
                              configuration=this_config)
    run_manager.run_run(this_config, run_as_multiprocess=False,
                        run_in_background=run_in_background)
    if irun == 0:
        # the log file for the multiple runs will be located in the first cache
        first_cache_directory = this_config['cache_directory']
        log_file = os.path.join(first_cache_directory, 'multiple_runs.log')
        logger.enable_file_logging(log_file)
        logger.log_status("Multiple runs: %s replications" % number_of_runs)
        logger.log_status("root random seed = %s" % str(root_seed))
    else:
        logger.enable_file_logging(log_file, verbose=False)
    logger.log_status("Run %s: %s" % (irun + 1, this_config['cache_directory']))
    logger.disable_file_logging(log_file)
    list_of_cache_directories.append(this_config['cache_directory'])

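# The run_in_background toggle above throttles the replications roughly in
# batches: runs launch in the background until every
# number_of_runs_in_parallel-th run, which executes in the foreground and
# blocks before the next batch is launched. A standalone sketch of that
# scheduling arithmetic (pure illustration, no Opus imports):
def launch_schedule(number_of_runs, number_of_runs_in_parallel):
    """Yields (run_index, run_in_background) in launch order."""
    for irun in range(number_of_runs):
        # Foreground exactly once per batch of number_of_runs_in_parallel runs.
        run_in_background = ((irun + 1) % number_of_runs_in_parallel) != 0
        yield irun, run_in_background

# With 5 runs and parallelism 2: runs 0, 2, 4 go to the background,
# runs 1 and 3 run in the foreground and pace the batches.
# print list(launch_schedule(5, 2))
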