class OptionGroup(GenericOptionGroup):
    """Command-line options for dumping the resources of a run to a pickle file.

    Extends GenericOptionGroup with a usage/description for this script and a
    ``-p/--project-name`` option (stored as ``options.project_name``,
    default ``''``).
    """

    def __init__(self):
        GenericOptionGroup.__init__(
            self,
            usage="python %prog [options] run_id [pickle_file]",
            description="dump resources.pickle from services db for the given run_id")
        self.parser.add_option("-p", "--project-name", dest="project_name",
                               default='', help="The project name")


if __name__ == "__main__":
    option_group = OptionGroup()
    parser = option_group.parser
    (options, args) = parser.parse_args()

    # NOTE(review): parser is assumed to be an optparse.OptionParser (it exposes
    # add_option/parse_args and a %prog usage string).  Its error() prints the
    # usage text and terminates the process, so no explicit print_help() or
    # sys.exit() is needed after it.
    try:
        run_id = int(args[0])
    except IndexError:
        parser.error("run_id must be provided.")
    except ValueError:
        # A non-numeric run_id used to escape as an uncaught traceback.
        parser.error("run_id must be an integer.")

    # The optional second positional argument names the output file.
    if len(args) == 2:
        pickle_file = args[1]
    else:
        pickle_file = "resources.pickle"

    run_manager = RunManager(option_group.get_services_database_configuration(options))
    if options.project_name:
        run_manager.update_environment_variables(
            run_resources={'project_name': options.project_name})

    resources = run_manager.get_resources_for_run_id_from_history(run_id=run_id)
    write_resources_to_file(pickle_file, resources)
class Calibration(object):
    '''
    Class to calibrate UrbanSim model coefficients.

    A calibration repeatedly writes candidate coefficient values into the
    base-year cache of a run, restarts the simulation, and compares the value
    of ``target_expression`` at the end year with the targets read from a csv
    file.  The comparison is minimized by one of several optimizers (see run()).
    '''

    def __init__(self, xml_config, scenario, calib_datasets, target_expression, target_file,
                 subset=None, subset_patterns=None, skip_cache_cleanup=False, log_directory=None):
        """
        - xml_config: xml configuration file, for ex '/home/atschirhar/opus/project_configs/paris_zone.xml'
        - scenario: name of scenario to run for calibration, where models_to_run and simulation years are specified
        - calib_datasets: dictionary specifying dataset names and attributes to be calibrated, e.g.
                  {'establishment_location_choice_model_coefficients': 'estimate'}
                  the value is either one attribute name or a list/tuple of attribute names
        - target_expression: opus expression computing values from prediction to be compared with targets
        - target_file: name of csv file providing targets
        - subset: dictionary specifying the dataset to be calibrated,
                  {'etablishment_location_choice_model_coefficients': ['coefficient_name', ['paris_celcm', 'biotech_celcm']]}
          subset and subset_patterns can not be both specified for the same dataset
        - subset_patterns: dictionary specifying the dataset to be calibrated through a regular expression (re) pattern
                  {'etablishment_location_choice_model_coefficients': ['coefficient_name', '.*_celcm']}
          the pattern is matched with re.search, so it must be a valid regular expression
          subset and subset_patterns can not be both specified for the same dataset
        - skip_cache_cleanup: forwarded as a default to the restart-run option parser
        - log_directory: directory receiving calibration.log and the results pickle;
          defaults to the cache directory of the initial run
        """
        self.target_expression = target_expression
        self.target = self.read_target(target_file)

        # Keep the calibration specification: run() and target_func() rebuild
        # the [dataset, attributes, index] triples from it on every call.
        self.calib_datasets = calib_datasets
        self.subset = subset
        self.subset_patterns = subset_patterns

        self.run_manager = None
        self.xml_config = xml_config
        self.scenario = scenario
        self.skip_cache_cleanup = skip_cache_cleanup
        self.run_id, self.cache_directory = self.init_run()
        self.run_ids = [self.run_id]  # allow starting of multiple runs for parallel optimization
        self.log_directory = log_directory
        if self.log_directory is None:
            self.log_directory = self.cache_directory  # legacy
        log_file = os.path.join(self.log_directory, "calibration.log")
        logger.enable_file_logging(log_file)

        dict_config = XMLConfiguration(self.xml_config).get_run_configuration(self.scenario)
        ## get parameters from config
        self.base_year = dict_config['base_year']
        self.start_year, self.end_year = dict_config['years']
        self.project_name = dict_config['project_name']
        self.package_order = dict_config['dataset_pool_configuration'].package_order

    def _open_base_year_pool(self):
        ''' Return (simulation_state, dataset_pool) positioned on the base year of the initial run cache. '''
        simulation_state = SimulationState()
        simulation_state.set_current_time(self.base_year)
        simulation_state.set_cache_directory(self.cache_directory)
        attribute_cache = AttributeCache()
        dataset_pool = SessionConfiguration(
            new_instance=True,
            package_order=self.package_order,
            in_storage=attribute_cache).get_dataset_pool()
        return simulation_state, dataset_pool

    def _build_calib_datasets(self, dataset_pool):
        '''
        Resolve the calibration specification given to the constructor into
        {dataset_name: [dataset, calib_attr, index]}.

        index always holds integer positions (never a boolean mask), so that
        index.size is the number of calibrated values per attribute; this is
        what update_parameters() relies on when slicing the parameter vector.

        Raises ValueError if a dataset appears in both subset and subset_patterns.
        '''
        subset = self.subset or {}
        subset_patterns = self.subset_patterns or {}
        calib_datasets = {}
        for dataset_name, calib_attr in self.calib_datasets.items():
            dataset = dataset_pool.get_dataset(dataset_name,
                                               dataset_arguments={'id_name': []})
            subset_spec = subset.get(dataset_name, None)
            pattern_spec = subset_patterns.get(dataset_name, None)
            if subset_spec is not None and pattern_spec is not None:
                raise ValueError(
                    "subset and subset_patterns can not be both specified "
                    "for dataset {}".format(dataset_name))

            if subset_spec is not None:
                subset_attr, subset_cond = subset_spec
                index = np.flatnonzero(np.in1d(dataset[subset_attr], subset_cond))
            elif pattern_spec is not None:
                subset_attr, subset_pattern = pattern_spec
                matches = array([bool(re.search(subset_pattern, attr_v))
                                 for attr_v in dataset[subset_attr]], dtype=bool)
                index = np.flatnonzero(matches)
            else:
                index = arange(dataset.size(), dtype='i')

            calib_datasets[dataset_name] = [dataset, calib_attr, index]
        return calib_datasets

    @staticmethod
    def _calib_attribute_names(calib_attr):
        ''' Normalize a calib_datasets value (one name or a list/tuple of names) to a list of names. '''
        if isinstance(calib_attr, str):
            return [calib_attr]
        if isinstance(calib_attr, (list, tuple)):
            return list(calib_attr)
        raise TypeError("Unrecongized data type in calib_datasets")

    @staticmethod
    def _select_optimizer(optimizer):
        '''
        Return (optimizer function, default keyword arguments) for an optimizer name.

        Only the function of the requested optimizer is looked up, so optimizers
        that are not importable in this environment do not break the others.
        Raises ValueError for an unknown name.
        '''
        anneal_defaults = {'schedule': 'fast',
                           'full_output': 1,
                           'T0': None,
                           'Tf': 1e-12,
                           'maxeval': None,
                           'maxaccept': None,
                           'maxiter': 400,
                           'boltzmann': 1.0,
                           'learn_rate': 0.5,
                           'feps': 1e-06,
                           'quench': 1.0,
                           'm': 1.0,
                           'n': 1.0,
                           'lower': -1,
                           'upper': 1,
                           'dwell': 50,
                           'disp': True}
        if optimizer == 'bfgs':
            default_kwargs = {'fprime': None,
                              'epsilon': 1e-08,
                              'maxiter': None,
                              'full_output': 1,
                              'disp': 1,
                              'retall': 0,
                              'callback': None}
            optimizer_func = fmin_bfgs
        elif optimizer == 'lbfgsb':
            default_kwargs = {'fprime': None,
                              'approx_grad': True,
                              'bounds': None,
                              'factr': 1e12,
                              'iprint': 1}
            optimizer_func = fmin_l_bfgs_b
        elif optimizer == 'anneal':
            default_kwargs = anneal_defaults
            optimizer_func = anneal
        elif optimizer == 'panneal':
            # parallel annealing: same schedule plus its two extra settings
            default_kwargs = dict(anneal_defaults, cores=24, interv=20)
            optimizer_func = panneal
        else:
            raise ValueError("Unrecognized optimizer {}".format(optimizer))
        return optimizer_func, default_kwargs

    @log_block("Start Calibration")
    def run(self, optimizer='lbfgsb', results_pickle_prefix="calib", optimizer_kwargs=None):
        ''' Call specified optimizer to calibrate

        Arguments:
            - optimizer: optimization method chosen, one of
                         'bfgs', 'lbfgsb', 'anneal', 'panneal'
            - results_pickle_prefix: prefix of the pickle file name that will be saved after the simulation;
                                     if None, results is not saved
            - optimizer_kwargs: dictionary of keyword arguments overriding the
                                defaults passed to the optimizer; None means no overrides

        Returns:
            - the results from the optimizer
            - as a side effect, a pickle dump of the results in the log directory,
              if results_pickle_prefix is specified

        Raises:
            - ValueError for an unknown optimizer
            - TypeError if a calib_datasets value is neither a string nor a list/tuple
        '''
        _, dataset_pool = self._open_base_year_pool()
        calib_datasets = self._build_calib_datasets(dataset_pool)

        # Starting point of the optimization: the current coefficient values,
        # concatenated in the same order update_parameters() consumes them.
        init_v = array([], dtype='f8')
        for dataset, calib_attr, index in calib_datasets.values():
            for attr in self._calib_attribute_names(calib_attr):
                init_v = np.concatenate((init_v, dataset[attr][index]))

        print(OKBLUE + "\noptimizer = {} (is_parallel = {})".format(optimizer, is_parallelizable) + ENDC)
        print(OKBLUE + "-------------------------------------------------------\n" + ENDC)

        # Validate the optimizer before switching to parallel mode.
        optimizer_func, call_kwargs = self._select_optimizer(optimizer)
        if optimizer_kwargs:
            call_kwargs.update(optimizer_kwargs)

        t0 = time.time()
        if is_parallelizable:
            set_parallel(True)
        try:
            results = optimizer_func(self.target_func, copy(init_v), **call_kwargs)
            duration = time.time() - t0

            if results_pickle_prefix is not None:
                pickle_file = "{}_{}.pickle".format(results_pickle_prefix, optimizer)
                pickle_file = os.path.join(self.log_directory, pickle_file)
                with open(pickle_file, "wb") as pickle_out:
                    pickle.dump(results, pickle_out)
        finally:
            # Always leave parallel mode, even if the optimizer fails.
            if is_parallelizable:
                set_parallel(False)

        # NOTE: evaluating target_func here triggers one more simulation run.
        logger.log_status('init target_func: {}'.format(self.target_func(init_v)))
        logger.log_status('end target_func: {}'.format(results[:]))  # which one?
        logger.log_status('outputs from optimizer: {}'.format(results))
        logger.log_status('Execution time: {}'.format(duration))
        return results

    def init_run(self, create_baseyear_cache=True):
        ''' init run, get run_id & cache_directory.

        Registers a new run with the run manager, records it in the run history
        with status "done" (i.e. available for use by update_prediction) and,
        if create_baseyear_cache is true, creates its base year cache.
        Replaces self.run_manager with a fresh RunManager.

        Returns (run_id, cache_directory).
        '''
        ##avoid invoking start_run from cmd line -
        option_group = StartRunOptionGroup()
        option_group.parser.set_defaults(xml_configuration=self.xml_config,
                                         scenario_name=self.scenario)
        #run_id, cache_directory = start_run(option_group)

        options, _ = option_group.parse()
        self.run_manager = RunManager(option_group.get_services_database_configuration(options))

        resources = XMLConfiguration(self.xml_config).get_run_configuration(self.scenario)
        insert_auto_generated_cache_directory_if_needed(resources)
        cache_directory = resources['cache_directory']
        self.run_manager.setup_new_run(cache_directory, resources)
        # the run manager has the final say on the id and directory of the run
        run_id = self.run_manager.run_id
        cache_directory = self.run_manager.get_current_cache_directory()
        self.run_manager.add_row_to_history(run_id, resources, "done")

        if create_baseyear_cache:
            self.run_manager.create_baseyear_cache(resources)

        ## good for testing
        #run_id = 275
        #cache_directory = '/home/lmwang/opus/data/paris_zone/runs/run_275.2012_05_26_00_20'
        assert run_id is not None
        assert cache_directory is not None
        return run_id, cache_directory

    def update_parameters(self, est_v, cache_directory, simulation_state,
                          dataset_pool, calib_datasets, *args, **kwargs):
        ''' Write the parameter vector est_v into the calibrated attributes.

        est_v is consumed sequentially: for every dataset in calib_datasets and
        every calibrated attribute, the next index.size values are cast to the
        dtype of the attribute and assigned at positions index.  Each dataset is
        flushed after it has been updated.  The simulation state is switched to
        the base year of cache_directory for the update and its current time is
        restored afterwards.

        Raises TypeError if a calib_attr is neither a string nor a list/tuple.
        '''
        i_est_v = 0
        current_year = simulation_state.get_current_time()
        simulation_state.set_current_time(self.base_year)
        simulation_state.set_cache_directory(cache_directory)

        for dataset_name, calib in calib_datasets.items():
            dataset, calib_attr, index = calib
            for attr in self._calib_attribute_names(calib_attr):
                dtype = dataset[attr].dtype
                dataset[attr][index] = (est_v[i_est_v:i_est_v + index.size]).astype(dtype)
                i_est_v += index.size
            #flush dataset
            dataset.flush_dataset()

        simulation_state.set_current_time(current_year)

    def update_prediction(self, est_v, simulation_state, dataset_pool,
                          calib_datasets, *args, **kwargs):
        ''' Run the simulation with parameters est_v and return its prediction.

        A run with status "done" among self.run_ids is re-used if there is one,
        otherwise a new run is initialized.  The chosen run is marked "taken" in
        the run history; the selection is protected by the module-level lock
        when one is set.  Returns the dictionary produced by
        summarize_prediction().
        '''
        option_group = RestartRunOptionGroup()
        option_group.parser.set_defaults(project_name=self.project_name,
                                         skip_cache_cleanup=self.skip_cache_cleanup)

        # do not rebind *args with the parsed command line arguments
        options, _ = option_group.parse()
        if self.run_manager is None:
            self.run_manager = RunManager(option_group.get_services_database_configuration(options))

        if lock is not None:
            lock.acquire()
        try:
            ## query runs available for re-use
            runs_done = self.run_manager.get_run_info(run_ids=self.run_ids, status='done')
            create_baseyear_cache = False
            if len(runs_done) == 0:
                ##there is no re-usable run directory, init a new run
                run_id, cache_directory = self.init_run(create_baseyear_cache=False)
                self.run_ids.append(run_id)
                create_baseyear_cache = True
                logger.log_status('Initializing new run with id ' + str(run_id))
            else:
                run_id = runs_done[0].run_id  ##take the first 'done' run_id
                cache_directory = self.run_manager.get_cache_directory(run_id)
                logger.log_status('Using old run with id ' + str(run_id))
            resources = self.run_manager.get_resources_for_run_id_from_history(
                run_id, filter_by_status=False)
            self.run_manager.add_row_to_history(run_id, resources, "taken")
        finally:
            # never keep the lock if selecting the run failed
            if lock is not None:
                lock.release()

        # the (slow) cache creation happens outside of the locked section
        if create_baseyear_cache:
            self.run_manager.create_baseyear_cache(resources)

        self.update_parameters(est_v, cache_directory, simulation_state,
                               dataset_pool, calib_datasets, *args, **kwargs)
        restart_run(option_group=option_group, args=[run_id, self.start_year])

        prediction = self.summarize_prediction(cache_directory, simulation_state,
                                               dataset_pool, calib_datasets)
        return prediction

    def summarize_prediction(self, cache_directory, simulation_state, dataset_pool, calib_datasets):
        ''' Compute target_expression at the end year of the run in cache_directory.

        Returns a dictionary {id: value} over the dataset of the expression.
        The current time of simulation_state is restored afterwards.
        '''
        dataset_name = VariableName(self.target_expression).get_dataset_name()
        current_year = simulation_state.get_current_time()
        simulation_state.set_current_time(self.end_year)
        simulation_state.set_cache_directory(cache_directory)
        #force reload
        dataset_pool.remove_all_datasets()
        dataset = dataset_pool[dataset_name]
        ids = dataset.get_id_attribute()
        results = dataset.compute_variables(self.target_expression, dataset_pool=dataset_pool)
        simulation_state.set_current_time(current_year)
        return dict(zip(ids, results))

    def read_target(self, target_file):
        ''' Read target numbers into a dictionary {id: value}.

        target_file is a csv file with one header line (skipped) and two
        columns: id, target.  Lines starting with '#' are comments.
        '''
        contents = np.genfromtxt(target_file, delimiter=",", comments='#', skip_header=1)
        # a file with a single data row is parsed as a 1-d array
        contents = np.atleast_2d(contents)
        target = dict(zip(contents[:, 0], contents[:, 1]))
        return target

    def target_func(self, est_v, func=lambda x, y: np.sum(np.abs(x - y)), **kwargs):
        ''' Target function.

        Runs the simulation with parameters est_v and returns func(prediction, target),
        by default the sum of absolute differences.  kwargs are forwarded to
        update_prediction().
        '''
        simulation_state, dataset_pool = self._open_base_year_pool()
        calib_datasets = self._build_calib_datasets(dataset_pool)

        prediction = self.update_prediction(est_v, simulation_state, dataset_pool,
                                            calib_datasets, **kwargs)
        ## allow keys in target not appearing in prediction
        ## assuming their values to be 0
        ### every key in target should appear in prediction
        #assert np.all( np.in1d(self.target.keys(), prediction.keys()) )
        target_ids = list(self.target.keys())
        target = np.array([self.target[k] for k in target_ids])
        predct = np.array([prediction.get(k, 0) for k in target_ids])
        results = func(predct, target)

        return results
dest="project_name", default='', help="The project name") if __name__ == "__main__": option_group = OptionGroup() parser = option_group.parser (options, args) = parser.parse_args() try: run_id = int(args[0]) except IndexError: parser.error("run_id must be provided.") parser.print_help() sys.exit(1) if len(args) == 2: pickle_file = args[1] else: pickle_file = "resources.pickle" run_manager = RunManager( option_group.get_services_database_configuration(options)) if options.project_name: run_manager.update_environment_variables( run_resources={'project_name': options.project_name}) resources = run_manager.get_resources_for_run_id_from_history( run_id=run_id) write_resources_to_file(pickle_file, resources)
class Calibration(object):
    """
    Class to calibrate UrbanSim model coefficients.

    A calibration repeatedly writes candidate coefficient values into the
    base-year cache of a run, restarts the simulation, and compares the value
    of ``target_expression`` at the end year with the targets read from a csv
    file.  The comparison is minimized by one of several optimizers (see run()).
    """

    def __init__(
        self,
        xml_config,
        scenario,
        calib_datasets,
        target_expression,
        target_file,
        subset=None,
        subset_patterns=None,
        skip_cache_cleanup=False,
        log_directory=None,
    ):
        """
        - xml_config: xml configuration file, for ex '/home/atschirhar/opus/project_configs/paris_zone.xml'
        - scenario: name of scenario to run for calibration, where models_to_run and simulation years are specified
        - calib_datasets: dictionary specifying dataset names and attributes to be calibrated, e.g.
                  {'establishment_location_choice_model_coefficients': 'estimate'}
                  the value is either one attribute name or a list/tuple of attribute names
        - target_expression: opus expression computing values from prediction to be compared with targets
        - target_file: name of csv file providing targets
        - subset: dictionary specifying the dataset to be calibrated,
                  {'etablishment_location_choice_model_coefficients': ['coefficient_name', ['paris_celcm', 'biotech_celcm']]}
          subset and subset_patterns can not be both specified for the same dataset
        - subset_patterns: dictionary specifying the dataset to be calibrated through a regular expression (re) pattern
                  {'etablishment_location_choice_model_coefficients': ['coefficient_name', '.*_celcm']}
          the pattern is matched with re.search, so it must be a valid regular expression
          subset and subset_patterns can not be both specified for the same dataset
        - skip_cache_cleanup: forwarded as a default to the restart-run option parser
        - log_directory: directory receiving calibration.log and the results pickle;
          defaults to the cache directory of the initial run
        """
        self.target_expression = target_expression
        self.target = self.read_target(target_file)

        # Keep the calibration specification: run() and target_func() rebuild
        # the [dataset, attributes, index] triples from it on every call.
        self.calib_datasets = calib_datasets
        self.subset = subset
        self.subset_patterns = subset_patterns

        self.run_manager = None
        self.xml_config = xml_config
        self.scenario = scenario
        self.skip_cache_cleanup = skip_cache_cleanup
        self.run_id, self.cache_directory = self.init_run()
        self.run_ids = [self.run_id]  # allow starting of multiple runs for parallel optimization
        self.log_directory = log_directory
        if self.log_directory is None:
            self.log_directory = self.cache_directory  # legacy
        log_file = os.path.join(self.log_directory, "calibration.log")
        logger.enable_file_logging(log_file)

        dict_config = XMLConfiguration(self.xml_config).get_run_configuration(self.scenario)
        ## get parameters from config
        self.base_year = dict_config["base_year"]
        self.start_year, self.end_year = dict_config["years"]
        self.project_name = dict_config["project_name"]
        self.package_order = dict_config["dataset_pool_configuration"].package_order

    def _open_base_year_pool(self):
        """ Return (simulation_state, dataset_pool) positioned on the base year of the initial run cache. """
        simulation_state = SimulationState()
        simulation_state.set_current_time(self.base_year)
        simulation_state.set_cache_directory(self.cache_directory)
        attribute_cache = AttributeCache()
        dataset_pool = SessionConfiguration(
            new_instance=True, package_order=self.package_order, in_storage=attribute_cache
        ).get_dataset_pool()
        return simulation_state, dataset_pool

    def _build_calib_datasets(self, dataset_pool):
        """
        Resolve the calibration specification given to the constructor into
        {dataset_name: [dataset, calib_attr, index]}.

        index always holds integer positions (never a boolean mask), so that
        index.size is the number of calibrated values per attribute; this is
        what update_parameters() relies on when slicing the parameter vector.

        Raises ValueError if a dataset appears in both subset and subset_patterns.
        """
        subset = self.subset or {}
        subset_patterns = self.subset_patterns or {}
        calib_datasets = {}
        for dataset_name, calib_attr in self.calib_datasets.items():
            dataset = dataset_pool.get_dataset(dataset_name, dataset_arguments={"id_name": []})
            subset_spec = subset.get(dataset_name, None)
            pattern_spec = subset_patterns.get(dataset_name, None)
            if subset_spec is not None and pattern_spec is not None:
                raise ValueError(
                    "subset and subset_patterns can not be both specified "
                    "for dataset {}".format(dataset_name)
                )

            if subset_spec is not None:
                subset_attr, subset_cond = subset_spec
                index = np.flatnonzero(np.in1d(dataset[subset_attr], subset_cond))
            elif pattern_spec is not None:
                subset_attr, subset_pattern = pattern_spec
                matches = array(
                    [bool(re.search(subset_pattern, attr_v)) for attr_v in dataset[subset_attr]],
                    dtype=bool,
                )
                index = np.flatnonzero(matches)
            else:
                index = arange(dataset.size(), dtype="i")

            calib_datasets[dataset_name] = [dataset, calib_attr, index]
        return calib_datasets

    @staticmethod
    def _calib_attribute_names(calib_attr):
        """ Normalize a calib_datasets value (one name or a list/tuple of names) to a list of names. """
        if isinstance(calib_attr, str):
            return [calib_attr]
        if isinstance(calib_attr, (list, tuple)):
            return list(calib_attr)
        raise TypeError("Unrecongized data type in calib_datasets")

    @staticmethod
    def _select_optimizer(optimizer):
        """
        Return (optimizer function, default keyword arguments) for an optimizer name.

        Only the function of the requested optimizer is looked up, so optimizers
        that are not importable in this environment do not break the others.
        Raises ValueError for an unknown name.
        """
        anneal_defaults = {
            "schedule": "fast",
            "full_output": 1,
            "T0": None,
            "Tf": 1e-12,
            "maxeval": None,
            "maxaccept": None,
            "maxiter": 400,
            "boltzmann": 1.0,
            "learn_rate": 0.5,
            "feps": 1e-06,
            "quench": 1.0,
            "m": 1.0,
            "n": 1.0,
            "lower": -1,
            "upper": 1,
            "dwell": 50,
            "disp": True,
        }
        if optimizer == "bfgs":
            default_kwargs = {
                "fprime": None,
                "epsilon": 1e-08,
                "maxiter": None,
                "full_output": 1,
                "disp": 1,
                "retall": 0,
                "callback": None,
            }
            optimizer_func = fmin_bfgs
        elif optimizer == "lbfgsb":
            default_kwargs = {"fprime": None, "approx_grad": True, "bounds": None, "factr": 1e12, "iprint": 1}
            optimizer_func = fmin_l_bfgs_b
        elif optimizer == "anneal":
            default_kwargs = anneal_defaults
            optimizer_func = anneal
        elif optimizer == "panneal":
            # parallel annealing: same schedule plus its two extra settings
            default_kwargs = dict(anneal_defaults, cores=24, interv=20)
            optimizer_func = panneal
        else:
            raise ValueError("Unrecognized optimizer {}".format(optimizer))
        return optimizer_func, default_kwargs

    @log_block("Start Calibration")
    def run(self, optimizer="lbfgsb", results_pickle_prefix="calib", optimizer_kwargs=None):
        """ Call specified optimizer to calibrate

        Arguments:
            - optimizer: optimization method chosen, one of
                         'bfgs', 'lbfgsb', 'anneal', 'panneal'
            - results_pickle_prefix: prefix of the pickle file name that will be saved after the simulation;
                                     if None, results is not saved
            - optimizer_kwargs: dictionary of keyword arguments overriding the
                                defaults passed to the optimizer; None means no overrides

        Returns:
            - the results from the optimizer
            - as a side effect, a pickle dump of the results in the log directory,
              if results_pickle_prefix is specified

        Raises:
            - ValueError for an unknown optimizer
            - TypeError if a calib_datasets value is neither a string nor a list/tuple
        """
        _, dataset_pool = self._open_base_year_pool()
        calib_datasets = self._build_calib_datasets(dataset_pool)

        # Starting point of the optimization: the current coefficient values,
        # concatenated in the same order update_parameters() consumes them.
        init_v = array([], dtype="f8")
        for dataset, calib_attr, index in calib_datasets.values():
            for attr in self._calib_attribute_names(calib_attr):
                init_v = np.concatenate((init_v, dataset[attr][index]))

        print(OKBLUE + "\noptimizer = {} (is_parallel = {})".format(optimizer, is_parallelizable) + ENDC)
        print(OKBLUE + "-------------------------------------------------------\n" + ENDC)

        # Validate the optimizer before switching to parallel mode.
        optimizer_func, call_kwargs = self._select_optimizer(optimizer)
        if optimizer_kwargs:
            call_kwargs.update(optimizer_kwargs)

        t0 = time.time()
        if is_parallelizable:
            set_parallel(True)
        try:
            results = optimizer_func(self.target_func, copy(init_v), **call_kwargs)
            duration = time.time() - t0

            if results_pickle_prefix is not None:
                pickle_file = "{}_{}.pickle".format(results_pickle_prefix, optimizer)
                pickle_file = os.path.join(self.log_directory, pickle_file)
                with open(pickle_file, "wb") as pickle_out:
                    pickle.dump(results, pickle_out)
        finally:
            # Always leave parallel mode, even if the optimizer fails.
            if is_parallelizable:
                set_parallel(False)

        # NOTE: evaluating target_func here triggers one more simulation run.
        logger.log_status("init target_func: {}".format(self.target_func(init_v)))
        logger.log_status("end target_func: {}".format(results[:]))  # which one?
        logger.log_status("outputs from optimizer: {}".format(results))
        logger.log_status("Execution time: {}".format(duration))
        return results

    def init_run(self, create_baseyear_cache=True):
        """ init run, get run_id & cache_directory.

        Registers a new run with the run manager, records it in the run history
        with status "done" (i.e. available for use by update_prediction) and,
        if create_baseyear_cache is true, creates its base year cache.
        Replaces self.run_manager with a fresh RunManager.

        Returns (run_id, cache_directory).
        """
        ##avoid invoking start_run from cmd line -
        option_group = StartRunOptionGroup()
        option_group.parser.set_defaults(xml_configuration=self.xml_config, scenario_name=self.scenario)
        # run_id, cache_directory = start_run(option_group)

        options, _ = option_group.parse()
        self.run_manager = RunManager(option_group.get_services_database_configuration(options))

        resources = XMLConfiguration(self.xml_config).get_run_configuration(self.scenario)
        insert_auto_generated_cache_directory_if_needed(resources)
        cache_directory = resources["cache_directory"]
        self.run_manager.setup_new_run(cache_directory, resources)
        # the run manager has the final say on the id and directory of the run
        run_id = self.run_manager.run_id
        cache_directory = self.run_manager.get_current_cache_directory()
        self.run_manager.add_row_to_history(run_id, resources, "done")

        if create_baseyear_cache:
            self.run_manager.create_baseyear_cache(resources)

        ## good for testing
        # run_id = 275
        # cache_directory = '/home/lmwang/opus/data/paris_zone/runs/run_275.2012_05_26_00_20'
        assert run_id is not None
        assert cache_directory is not None
        return run_id, cache_directory

    def update_parameters(
        self, est_v, cache_directory, simulation_state, dataset_pool, calib_datasets, *args, **kwargs
    ):
        """ Write the parameter vector est_v into the calibrated attributes.

        est_v is consumed sequentially: for every dataset in calib_datasets and
        every calibrated attribute, the next index.size values are cast to the
        dtype of the attribute and assigned at positions index.  Each dataset is
        flushed after it has been updated.  The simulation state is switched to
        the base year of cache_directory for the update and its current time is
        restored afterwards.

        Raises TypeError if a calib_attr is neither a string nor a list/tuple.
        """
        i_est_v = 0
        current_year = simulation_state.get_current_time()
        simulation_state.set_current_time(self.base_year)
        simulation_state.set_cache_directory(cache_directory)

        for dataset_name, calib in calib_datasets.items():
            dataset, calib_attr, index = calib
            for attr in self._calib_attribute_names(calib_attr):
                dtype = dataset[attr].dtype
                dataset[attr][index] = (est_v[i_est_v : i_est_v + index.size]).astype(dtype)
                i_est_v += index.size
            # flush dataset
            dataset.flush_dataset()

        simulation_state.set_current_time(current_year)

    def update_prediction(self, est_v, simulation_state, dataset_pool, calib_datasets, *args, **kwargs):
        """ Run the simulation with parameters est_v and return its prediction.

        A run with status "done" among self.run_ids is re-used if there is one,
        otherwise a new run is initialized.  The chosen run is marked "taken" in
        the run history; the selection is protected by the module-level lock
        when one is set.  Returns the dictionary produced by
        summarize_prediction().
        """
        option_group = RestartRunOptionGroup()
        option_group.parser.set_defaults(project_name=self.project_name, skip_cache_cleanup=self.skip_cache_cleanup)

        # do not rebind *args with the parsed command line arguments
        options, _ = option_group.parse()
        if self.run_manager is None:
            self.run_manager = RunManager(option_group.get_services_database_configuration(options))

        if lock is not None:
            lock.acquire()
        try:
            ## query runs available for re-use
            runs_done = self.run_manager.get_run_info(run_ids=self.run_ids, status="done")
            create_baseyear_cache = False
            if len(runs_done) == 0:
                ##there is no re-usable run directory, init a new run
                run_id, cache_directory = self.init_run(create_baseyear_cache=False)
                self.run_ids.append(run_id)
                create_baseyear_cache = True
                logger.log_status("Initializing new run with id " + str(run_id))
            else:
                run_id = runs_done[0].run_id  ##take the first 'done' run_id
                cache_directory = self.run_manager.get_cache_directory(run_id)
                logger.log_status("Using old run with id " + str(run_id))
            resources = self.run_manager.get_resources_for_run_id_from_history(run_id, filter_by_status=False)
            self.run_manager.add_row_to_history(run_id, resources, "taken")
        finally:
            # never keep the lock if selecting the run failed
            if lock is not None:
                lock.release()

        # the (slow) cache creation happens outside of the locked section
        if create_baseyear_cache:
            self.run_manager.create_baseyear_cache(resources)

        self.update_parameters(est_v, cache_directory, simulation_state, dataset_pool, calib_datasets, *args, **kwargs)
        restart_run(option_group=option_group, args=[run_id, self.start_year])

        prediction = self.summarize_prediction(cache_directory, simulation_state, dataset_pool, calib_datasets)
        return prediction

    def summarize_prediction(self, cache_directory, simulation_state, dataset_pool, calib_datasets):
        """ Compute target_expression at the end year of the run in cache_directory.

        Returns a dictionary {id: value} over the dataset of the expression.
        The current time of simulation_state is restored afterwards.
        """
        dataset_name = VariableName(self.target_expression).get_dataset_name()
        current_year = simulation_state.get_current_time()
        simulation_state.set_current_time(self.end_year)
        simulation_state.set_cache_directory(cache_directory)
        # force reload
        dataset_pool.remove_all_datasets()
        dataset = dataset_pool[dataset_name]
        ids = dataset.get_id_attribute()
        results = dataset.compute_variables(self.target_expression, dataset_pool=dataset_pool)
        simulation_state.set_current_time(current_year)
        return dict(zip(ids, results))

    def read_target(self, target_file):
        """ Read target numbers into a dictionary {id: value}.

        target_file is a csv file with one header line (skipped) and two
        columns: id, target.  Lines starting with '#' are comments.
        """
        contents = np.genfromtxt(target_file, delimiter=",", comments="#", skip_header=1)
        # a file with a single data row is parsed as a 1-d array
        contents = np.atleast_2d(contents)
        target = dict(zip(contents[:, 0], contents[:, 1]))
        return target

    def target_func(self, est_v, func=lambda x, y: np.sum(np.abs(x - y)), **kwargs):
        """ Target function.

        Runs the simulation with parameters est_v and returns func(prediction, target),
        by default the sum of absolute differences.  kwargs are forwarded to
        update_prediction().
        """
        simulation_state, dataset_pool = self._open_base_year_pool()
        calib_datasets = self._build_calib_datasets(dataset_pool)

        prediction = self.update_prediction(est_v, simulation_state, dataset_pool, calib_datasets, **kwargs)
        ## allow keys in target not appearing in prediction
        ## assuming their values to be 0
        ### every key in target should appear in prediction
        # assert np.all( np.in1d(self.target.keys(), prediction.keys()) )
        target_ids = list(self.target.keys())
        target = np.array([self.target[k] for k in target_ids])
        predct = np.array([prediction.get(k, 0) for k in target_ids])
        results = func(predct, target)

        return results