# Exemplo n.º 1
class Calibration(object):
    ''' Class to calibrate UrbanSim model coefficients.

    '''
    def __init__(self,
                 xml_config,
                 scenario,
                 calib_datasets,
                 target_expression,
                 target_file,
                 subset=None,
                 subset_patterns=None,
                 skip_cache_cleanup=False,
                 log_directory=None):
        """
        - xml_config: xml configuration file, for ex '/home/atschirhar/opus/project_configs/paris_zone.xml'
        - scenario: name of scenario to run for calibration, where models_to_run and simulation years are specified
        - calib_datasets: dictionary specifying dataset names and attributes to be calibrated, e.g.
                  {'establishment_location_choice_model_coefficients': 'estimate'}
        - target_expression: opus expression computing values from prediction to be compared with targets 
        - target_file: name of csv file providing targets 
        - subset: dictionary specifying the dataset to be calibrated,
                  {'etablishment_location_choice_model_coefficients': ['coefficient_name', ['paris_celcm, 'biotech_celcm']]}
          subset and subset_patterns can not be both specified for the same dataset
        - subset_patterns: dictionary specifying the dataset to be calibrated through a regular expression (re) pattern
                  {'etablishment_location_choice_model_coefficients': ['coefficient_name', '*_celcm']} 
          subset and subset_patterns can not be both specified for the same dataset

        """
        self.target_expression = target_expression
        self.target = self.read_target(target_file)

        # BUG FIX: these three arguments were previously dropped on the floor;
        # run() and target_func() then referenced them as (undefined) globals.
        self.calib_datasets = calib_datasets
        self.subset = subset
        self.subset_patterns = subset_patterns

        self.run_manager = None
        self.xml_config = xml_config
        self.scenario = scenario
        self.skip_cache_cleanup = skip_cache_cleanup
        self.run_id, self.cache_directory = self.init_run()
        self.run_ids = [
            self.run_id
        ]  #allow starting of multiple runs for parallel optimization
        self.log_directory = log_directory
        if self.log_directory is None:
            self.log_directory = self.cache_directory  #legacy

        log_file = os.path.join(self.log_directory, "calibration.log")
        logger.enable_file_logging(log_file)

        dict_config = XMLConfiguration(self.xml_config).get_run_configuration(
            self.scenario)
        ## get parameters from config
        self.base_year = dict_config['base_year']
        self.start_year, self.end_year = dict_config['years']
        self.project_name = dict_config['project_name']
        self.package_order = dict_config[
            'dataset_pool_configuration'].package_order

    def _get_dataset_pool(self):
        ''' Point the simulation state at the base year / cache directory and
        return (simulation_state, dataset_pool) reading from the attribute
        cache. '''
        simulation_state = SimulationState()
        simulation_state.set_current_time(self.base_year)
        simulation_state.set_cache_directory(self.cache_directory)
        attribute_cache = AttributeCache()
        dataset_pool = SessionConfiguration(
            new_instance=True,
            package_order=self.package_order,
            in_storage=attribute_cache).get_dataset_pool()
        return simulation_state, dataset_pool

    def _load_calib_datasets(self, dataset_pool):
        ''' Build {dataset_name: [dataset, calib_attr, index]} for the datasets
        to calibrate; index selects the coefficients named by self.subset /
        self.subset_patterns (all coefficients when neither is given). '''
        subset = self.subset
        subset_patterns = self.subset_patterns
        calib_datasets = {}
        # BUG FIX: iterate the datasets given to __init__; previously this
        # iterated a freshly created empty dict, so nothing was calibrated.
        for dataset_name, calib_attr in self.calib_datasets.items():
            dataset = dataset_pool.get_dataset(
                dataset_name, dataset_arguments={'id_name': []})
            # subset and subset_patterns may not both be given for one dataset
            assert subset is None or subset.get(dataset_name, None) is None or \
                   subset_patterns is None or subset_patterns.get(dataset_name, None) is None
            if subset is not None and subset.get(dataset_name,
                                                 None) is not None:
                subset_attr, subset_cond = subset.get(dataset_name)
                # BUG FIX: convert the boolean mask to positional indices so
                # that index.size equals the number of selected coefficients
                # (update_parameters slices est_v by index.size)
                index = np.where(np.in1d(dataset[subset_attr],
                                         subset_cond))[0]
            elif subset_patterns is not None and subset_patterns.get(
                    dataset_name, None) is not None:
                subset_attr, subset_pattern = subset_patterns.get(dataset_name)
                # BUG FIX: ditto for the regular-expression selection
                index = np.where(array([
                    bool(re.search(subset_pattern, attr_v))
                    for attr_v in dataset[subset_attr]
                ]))[0]
            else:
                index = arange(dataset.size(), dtype='i')

            calib_datasets[dataset_name] = [dataset, calib_attr, index]
        return calib_datasets

    @log_block("Start Calibration")
    def run(self,
            optimizer='lbfgsb',
            results_pickle_prefix="calib",
            optimizer_kwargs=None):
        ''' Call specified optimizer to calibrate

        Arguments:
            - optimizer: optimization method chosen (fmin_bfgs, simulated anneal etc.)
            - results_pickle_prefix: prefix of the pickle file name that will be saved after the simulation; if None, results is not saved
            - optimizer_kwargs: extra keyword arguments forwarded to the optimizer

        Returns:
            - the results from the optimizer
            - a pickle dump of the results in the cache_directory, if results_pickle_prefix is specified

        '''
        # avoid the shared-mutable-default-argument pitfall
        if optimizer_kwargs is None:
            optimizer_kwargs = {}

        simulation_state, dataset_pool = self._get_dataset_pool()
        calib_datasets = self._load_calib_datasets(dataset_pool)

        # initial parameter vector = current values of calibrated attributes,
        # concatenated in dict-iteration order (update_parameters relies on
        # the same order when writing est_v back)
        init_v = array([], dtype='f8')
        for dataset_name, calib in calib_datasets.items():
            dataset, calib_attr, index = calib
            if isinstance(calib_attr, str):
                init_v = np.concatenate((init_v, dataset[calib_attr][index]))
            elif isinstance(calib_attr, (list, tuple)):
                for attr in calib_attr:
                    init_v = np.concatenate((init_v, dataset[attr][index]))
            else:
                raise TypeError("Unrecognized data type in calib_datasets")

        t0 = time.time()

        if is_parallelizable:
            set_parallel(True)

        print(OKBLUE + "\noptimizer = {} (is_parallel = {})".format(
            optimizer, is_parallelizable) + ENDC)
        print(OKBLUE + "-------------------------------------------------------\n" + ENDC)
        if optimizer == 'bfgs':
            default_kwargs = {
                'fprime': None,
                'epsilon': 1e-08,
                'maxiter': None,
                'full_output': 1,
                'disp': 1,
                'retall': 0,
                'callback': None
            }
            optimizer_func = fmin_bfgs
        elif optimizer == 'lbfgsb':
            default_kwargs = {
                'fprime': None,
                'approx_grad': True,
                'bounds': None,
                'factr': 1e12,
                'iprint': 1
            }
            optimizer_func = fmin_l_bfgs_b
        elif optimizer == 'anneal':
            default_kwargs = {
                'schedule': 'fast',
                'full_output': 1,
                'T0': None,
                'Tf': 1e-12,
                'maxeval': None,
                'maxaccept': None,
                'maxiter': 400,
                'boltzmann': 1.0,
                'learn_rate': 0.5,
                'feps': 1e-06,
                'quench': 1.0,
                'm': 1.0,
                'n': 1.0,
                'lower': -1,
                'upper': 1,
                'dwell': 50,
                'disp': True
            }
            optimizer_func = anneal
        elif optimizer == 'panneal':
            default_kwargs = {
                'schedule': 'fast',
                'full_output': 1,
                'T0': None,
                'Tf': 1e-12,
                'maxeval': None,
                'maxaccept': None,
                'maxiter': 400,
                'boltzmann': 1.0,
                'learn_rate': 0.5,
                'feps': 1e-06,
                'quench': 1.0,
                'm': 1.0,
                'n': 1.0,
                'lower': -1,
                'upper': 1,
                'dwell': 50,
                'disp': True,
                'cores': 24,
                'interv': 20
            }
            optimizer_func = panneal
        else:
            raise ValueError("Unrecognized optimizer {}".format(optimizer))

        # caller-supplied options override the per-optimizer defaults
        default_kwargs.update(optimizer_kwargs)
        results = optimizer_func(self.target_func, copy(init_v),
                                 **default_kwargs)

        duration = time.time() - t0
        if results_pickle_prefix is not None:
            pickle_file = "{}_{}.pickle".format(results_pickle_prefix,
                                                optimizer)
            pickle_file = os.path.join(self.log_directory, pickle_file)
            # BUG FIX: close the pickle file handle (it was leaked before)
            pickle_fh = open(pickle_file, "wb")
            try:
                pickle.dump(results, pickle_fh)
            finally:
                pickle_fh.close()

        if is_parallelizable:
            set_parallel(False)

        logger.log_status('init target_func: {}'.format(
            self.target_func(init_v)))
        logger.log_status('end target_func: {}'.format(
            results[:]))  #which one?
        logger.log_status('outputs from optimizer: {}'.format(results))
        logger.log_status('Execution time: {}'.format(duration))
        # BUG FIX: the docstring promises the optimizer results are returned
        return results

    def init_run(self, create_baseyear_cache=True):
        ''' Set up a new run (without invoking start_run from the command
        line) and return (run_id, cache_directory). '''
        option_group = StartRunOptionGroup()
        option_group.parser.set_defaults(xml_configuration=self.xml_config,
                                         scenario_name=self.scenario)

        options, args = option_group.parse()
        self.run_manager = RunManager(
            option_group.get_services_database_configuration(options))

        resources = XMLConfiguration(self.xml_config).get_run_configuration(
            self.scenario)
        insert_auto_generated_cache_directory_if_needed(resources)
        cache_directory = resources['cache_directory']
        self.run_manager.setup_new_run(cache_directory, resources)
        run_id, cache_directory = self.run_manager.run_id, self.run_manager.get_current_cache_directory(
        )
        # mark the run as 'done' so update_prediction can re-use it
        self.run_manager.add_row_to_history(run_id, resources, "done")

        if create_baseyear_cache:
            self.run_manager.create_baseyear_cache(resources)

        assert run_id is not None
        assert cache_directory is not None
        return run_id, cache_directory

    def update_parameters(self, est_v, cache_directory, simulation_state,
                          dataset_pool, calib_datasets, *args, **kwargs):
        ''' Write the estimated parameter vector est_v back into the
        calibrated dataset attributes (in the same order used to build the
        initial vector) and flush the datasets to the base-year cache. '''
        i_est_v = 0
        current_year = simulation_state.get_current_time()
        simulation_state.set_current_time(self.base_year)
        simulation_state.set_cache_directory(cache_directory)

        for dataset_name, calib in calib_datasets.items():
            dataset, calib_attr, index = calib
            if isinstance(calib_attr, str):
                dtype = dataset[calib_attr].dtype
                dataset[calib_attr][index] = (est_v[i_est_v:i_est_v +
                                                    index.size]).astype(dtype)
                i_est_v += index.size
            elif isinstance(calib_attr, (list, tuple)):
                for attr in calib_attr:
                    dtype = dataset[attr].dtype
                    dataset[attr][index] = (est_v[i_est_v:i_est_v +
                                                  index.size]).astype(dtype)
                    i_est_v += index.size
            else:
                raise TypeError("Unrecognized data type in calib_datasets")

            # persist the updated coefficients so the simulation picks them up
            dataset.flush_dataset()
        simulation_state.set_current_time(current_year)

    def update_prediction(self, est_v, simulation_state, dataset_pool,
                          calib_datasets, *args, **kwargs):
        ''' Write est_v into the calibrated datasets, re-run the simulation
        (re-using a finished run directory when one is available) and return
        the resulting prediction dictionary {id: value}. '''
        option_group = RestartRunOptionGroup()
        option_group.parser.set_defaults(
            project_name=self.project_name,
            skip_cache_cleanup=self.skip_cache_cleanup)

        options, args = option_group.parse()
        if self.run_manager is None:
            self.run_manager = RunManager(
                option_group.get_services_database_configuration(options))

        # lock guards the run-history query/update when optimizing in parallel
        if lock is not None: lock.acquire()
        ## query runs available for re-use
        runs_done = self.run_manager.get_run_info(run_ids=self.run_ids,
                                                  status='done')
        create_baseyear_cache = False
        # BUG FIX: removed a leftover "import pdb; pdb.set_trace()" breakpoint
        if len(runs_done) == 0:
            ##there is no re-usable run directory, init a new run
            run_id, cache_directory = self.init_run(
                create_baseyear_cache=False)
            self.run_ids.append(run_id)
            create_baseyear_cache = True
            logger.log_status('Initializing new run with id ' + str(run_id))
        else:
            run_id = runs_done[0].run_id  ##take the first 'done' run_id
            cache_directory = self.run_manager.get_cache_directory(run_id)
            logger.log_status('Using old run with id ' + str(run_id))
        resources = self.run_manager.get_resources_for_run_id_from_history(
            run_id, filter_by_status=False)
        self.run_manager.add_row_to_history(run_id, resources, "taken")
        if lock is not None: lock.release()

        if create_baseyear_cache:
            self.run_manager.create_baseyear_cache(resources)

        self.update_parameters(est_v, cache_directory, simulation_state,
                               dataset_pool, calib_datasets, *args, **kwargs)
        restart_run(option_group=option_group, args=[run_id, self.start_year])

        prediction = self.summarize_prediction(cache_directory,
                                               simulation_state, dataset_pool,
                                               calib_datasets)
        return prediction

    def summarize_prediction(self, cache_directory, simulation_state,
                             dataset_pool, calib_datasets):
        ''' Compute self.target_expression on the end-year simulation results
        and return a dictionary {id: value}. '''
        dataset_name = VariableName(self.target_expression).get_dataset_name()
        current_year = simulation_state.get_current_time()
        simulation_state.set_current_time(self.end_year)
        simulation_state.set_cache_directory(cache_directory)
        # force a reload from the just-written simulation cache
        dataset_pool.remove_all_datasets()
        dataset = dataset_pool[dataset_name]
        ids = dataset.get_id_attribute()
        results = dataset.compute_variables(self.target_expression,
                                            dataset_pool=dataset_pool)
        simulation_state.set_current_time(current_year)
        return dict(zip(ids, results))

    def read_target(self, target_file):
        ''' Read target numbers from a csv file with a header line
        ("id,target") into a dictionary {id: value}. '''
        # BUG FIX: dropped the unused header read, which used the deprecated
        # file() builtin and leaked the file handle; genfromtxt's skip_header
        # already skips the header line.
        contents = np.genfromtxt(target_file,
                                 delimiter=",",
                                 comments='#',
                                 skip_header=1)
        target = dict(zip(contents[:, 0], contents[:, 1]))
        return target

    def target_func(self,
                    est_v,
                    func=lambda x, y: np.sum(np.abs(x - y)),
                    **kwargs):
        ''' Target function: run a simulation with the parameter vector est_v
        and return func(prediction, target) (default: sum of absolute
        errors). '''
        simulation_state, dataset_pool = self._get_dataset_pool()
        # BUG FIX: previously iterated a freshly created empty dict and
        # referenced undefined globals; use the datasets given to __init__.
        calib_datasets = self._load_calib_datasets(dataset_pool)

        prediction = self.update_prediction(est_v, simulation_state,
                                            dataset_pool, calib_datasets,
                                            **kwargs)
        ## keys in target missing from prediction are assumed to be 0
        target = np.array(self.target.values())
        predct = np.array([prediction[k] if k in prediction else 0
                           for k in self.target.keys()])
        results = func(predct, target)

        return results
class Calibration(object):
    """Class to calibrate UrbanSim model coefficients.

    NOTE(review): this class duplicates the Calibration class defined earlier
    in this file and shadows it on import -- confirm which copy should be kept.
    """

    def __init__(
        self,
        xml_config,
        scenario,
        calib_datasets,
        target_expression,
        target_file,
        subset=None,
        subset_patterns=None,
        skip_cache_cleanup=False,
        log_directory=None,
    ):
        """
        - xml_config: xml configuration file, for ex '/home/atschirhar/opus/project_configs/paris_zone.xml'
        - scenario: name of scenario to run for calibration, where models_to_run and simulation years are specified
        - calib_datasets: dictionary specifying dataset names and attributes to be calibrated, e.g.
                  {'establishment_location_choice_model_coefficients': 'estimate'}
        - target_expression: opus expression computing values from prediction to be compared with targets 
        - target_file: name of csv file providing targets 
        - subset: dictionary specifying the dataset to be calibrated,
                  {'etablishment_location_choice_model_coefficients': ['coefficient_name', ['paris_celcm, 'biotech_celcm']]}
          subset and subset_patterns can not be both specified for the same dataset
        - subset_patterns: dictionary specifying the dataset to be calibrated through a regular expression (re) pattern
                  {'etablishment_location_choice_model_coefficients': ['coefficient_name', '*_celcm']} 
          subset and subset_patterns can not be both specified for the same dataset

        """
        self.target_expression = target_expression
        self.target = self.read_target(target_file)

        # BUG FIX: these three arguments were previously dropped on the floor;
        # run() and target_func() then referenced them as (undefined) globals.
        self.calib_datasets = calib_datasets
        self.subset = subset
        self.subset_patterns = subset_patterns

        self.run_manager = None
        self.xml_config = xml_config
        self.scenario = scenario
        self.skip_cache_cleanup = skip_cache_cleanup
        self.run_id, self.cache_directory = self.init_run()
        self.run_ids = [self.run_id]  # allow starting of multiple runs for parallel optimization
        self.log_directory = log_directory
        if self.log_directory is None:
            self.log_directory = self.cache_directory  # legacy

        log_file = os.path.join(self.log_directory, "calibration.log")
        logger.enable_file_logging(log_file)

        dict_config = XMLConfiguration(self.xml_config).get_run_configuration(self.scenario)
        ## get parameters from config
        self.base_year = dict_config["base_year"]
        self.start_year, self.end_year = dict_config["years"]
        self.project_name = dict_config["project_name"]
        self.package_order = dict_config["dataset_pool_configuration"].package_order

    def _get_dataset_pool(self):
        """Point the simulation state at the base year / cache directory and
        return (simulation_state, dataset_pool) reading from the attribute cache."""
        simulation_state = SimulationState()
        simulation_state.set_current_time(self.base_year)
        simulation_state.set_cache_directory(self.cache_directory)
        attribute_cache = AttributeCache()
        dataset_pool = SessionConfiguration(
            new_instance=True, package_order=self.package_order, in_storage=attribute_cache
        ).get_dataset_pool()
        return simulation_state, dataset_pool

    def _load_calib_datasets(self, dataset_pool):
        """Build {dataset_name: [dataset, calib_attr, index]} for the datasets
        to calibrate; index selects the coefficients named by self.subset /
        self.subset_patterns (all coefficients when neither is given)."""
        subset = self.subset
        subset_patterns = self.subset_patterns
        calib_datasets = {}
        # BUG FIX: iterate the datasets given to __init__; previously this
        # iterated a freshly created empty dict, so nothing was calibrated.
        for dataset_name, calib_attr in self.calib_datasets.items():
            dataset = dataset_pool.get_dataset(dataset_name, dataset_arguments={"id_name": []})
            # subset and subset_patterns may not both be given for one dataset
            assert (
                subset is None
                or subset.get(dataset_name, None) is None
                or subset_patterns is None
                or subset_patterns.get(dataset_name, None) is None
            )
            if subset is not None and subset.get(dataset_name, None) is not None:
                subset_attr, subset_cond = subset.get(dataset_name)
                # BUG FIX: convert the boolean mask to positional indices so
                # that index.size equals the number of selected coefficients
                # (update_parameters slices est_v by index.size)
                index = np.where(np.in1d(dataset[subset_attr], subset_cond))[0]
            elif subset_patterns is not None and subset_patterns.get(dataset_name, None) is not None:
                subset_attr, subset_pattern = subset_patterns.get(dataset_name)
                # BUG FIX: ditto for the regular-expression selection
                index = np.where(
                    array([bool(re.search(subset_pattern, attr_v)) for attr_v in dataset[subset_attr]])
                )[0]
            else:
                index = arange(dataset.size(), dtype="i")

            calib_datasets[dataset_name] = [dataset, calib_attr, index]
        return calib_datasets

    @log_block("Start Calibration")
    def run(self, optimizer="lbfgsb", results_pickle_prefix="calib", optimizer_kwargs=None):
        """ Call specified optimizer to calibrate
        
        Arguments:
            - optimizer: optimization method chosen (fmin_bfgs, simulated anneal etc.)
            - results_pickle_prefix: prefix of the pickle file name that will be saved after the simulation; if None, results is not saved
            - optimizer_kwargs: extra keyword arguments forwarded to the optimizer

        Returns:
            - the results from the optimizer
            - a pickle dump of the results in the cache_directory, if results_pickle_prefix is specified
        
        """
        # avoid the shared-mutable-default-argument pitfall
        if optimizer_kwargs is None:
            optimizer_kwargs = {}

        simulation_state, dataset_pool = self._get_dataset_pool()
        calib_datasets = self._load_calib_datasets(dataset_pool)

        # initial parameter vector = current values of calibrated attributes,
        # concatenated in dict-iteration order (update_parameters relies on
        # the same order when writing est_v back)
        init_v = array([], dtype="f8")
        for dataset_name, calib in calib_datasets.items():
            dataset, calib_attr, index = calib
            if isinstance(calib_attr, str):
                init_v = np.concatenate((init_v, dataset[calib_attr][index]))
            elif isinstance(calib_attr, (list, tuple)):
                for attr in calib_attr:
                    init_v = np.concatenate((init_v, dataset[attr][index]))
            else:
                raise TypeError("Unrecognized data type in calib_datasets")

        t0 = time.time()

        if is_parallelizable:
            set_parallel(True)

        print(OKBLUE + "\noptimizer = {} (is_parallel = {})".format(optimizer, is_parallelizable) + ENDC)
        print(OKBLUE + "-------------------------------------------------------\n" + ENDC)
        if optimizer == "bfgs":
            default_kwargs = {
                "fprime": None,
                "epsilon": 1e-08,
                "maxiter": None,
                "full_output": 1,
                "disp": 1,
                "retall": 0,
                "callback": None,
            }
            optimizer_func = fmin_bfgs
        elif optimizer == "lbfgsb":
            default_kwargs = {"fprime": None, "approx_grad": True, "bounds": None, "factr": 1e12, "iprint": 1}
            optimizer_func = fmin_l_bfgs_b
        elif optimizer == "anneal":
            default_kwargs = {
                "schedule": "fast",
                "full_output": 1,
                "T0": None,
                "Tf": 1e-12,
                "maxeval": None,
                "maxaccept": None,
                "maxiter": 400,
                "boltzmann": 1.0,
                "learn_rate": 0.5,
                "feps": 1e-06,
                "quench": 1.0,
                "m": 1.0,
                "n": 1.0,
                "lower": -1,
                "upper": 1,
                "dwell": 50,
                "disp": True,
            }
            optimizer_func = anneal
        elif optimizer == "panneal":
            default_kwargs = {
                "schedule": "fast",
                "full_output": 1,
                "T0": None,
                "Tf": 1e-12,
                "maxeval": None,
                "maxaccept": None,
                "maxiter": 400,
                "boltzmann": 1.0,
                "learn_rate": 0.5,
                "feps": 1e-06,
                "quench": 1.0,
                "m": 1.0,
                "n": 1.0,
                "lower": -1,
                "upper": 1,
                "dwell": 50,
                "disp": True,
                "cores": 24,
                "interv": 20,
            }
            optimizer_func = panneal
        else:
            raise ValueError("Unrecognized optimizer {}".format(optimizer))

        # caller-supplied options override the per-optimizer defaults
        default_kwargs.update(optimizer_kwargs)
        results = optimizer_func(self.target_func, copy(init_v), **default_kwargs)

        duration = time.time() - t0
        if results_pickle_prefix is not None:
            pickle_file = "{}_{}.pickle".format(results_pickle_prefix, optimizer)
            pickle_file = os.path.join(self.log_directory, pickle_file)
            # BUG FIX: close the pickle file handle (it was leaked before)
            pickle_fh = open(pickle_file, "wb")
            try:
                pickle.dump(results, pickle_fh)
            finally:
                pickle_fh.close()

        if is_parallelizable:
            set_parallel(False)

        logger.log_status("init target_func: {}".format(self.target_func(init_v)))
        logger.log_status("end target_func: {}".format(results[:]))  # which one?
        logger.log_status("outputs from optimizer: {}".format(results))
        logger.log_status("Execution time: {}".format(duration))
        # BUG FIX: the docstring promises the optimizer results are returned
        return results

    def init_run(self, create_baseyear_cache=True):
        """Set up a new run (without invoking start_run from the command line)
        and return (run_id, cache_directory)."""
        option_group = StartRunOptionGroup()
        option_group.parser.set_defaults(xml_configuration=self.xml_config, scenario_name=self.scenario)

        options, args = option_group.parse()
        self.run_manager = RunManager(option_group.get_services_database_configuration(options))

        resources = XMLConfiguration(self.xml_config).get_run_configuration(self.scenario)
        insert_auto_generated_cache_directory_if_needed(resources)
        cache_directory = resources["cache_directory"]
        self.run_manager.setup_new_run(cache_directory, resources)
        run_id, cache_directory = self.run_manager.run_id, self.run_manager.get_current_cache_directory()
        # mark the run as 'done' so update_prediction can re-use it
        self.run_manager.add_row_to_history(run_id, resources, "done")

        if create_baseyear_cache:
            self.run_manager.create_baseyear_cache(resources)

        assert run_id is not None
        assert cache_directory is not None
        return run_id, cache_directory

    def update_parameters(
        self, est_v, cache_directory, simulation_state, dataset_pool, calib_datasets, *args, **kwargs
    ):
        """Write the estimated parameter vector est_v back into the calibrated
        dataset attributes (in the same order used to build the initial
        vector) and flush the datasets to the base-year cache."""
        i_est_v = 0
        current_year = simulation_state.get_current_time()
        simulation_state.set_current_time(self.base_year)
        simulation_state.set_cache_directory(cache_directory)

        for dataset_name, calib in calib_datasets.items():
            dataset, calib_attr, index = calib
            if isinstance(calib_attr, str):
                dtype = dataset[calib_attr].dtype
                dataset[calib_attr][index] = (est_v[i_est_v : i_est_v + index.size]).astype(dtype)
                i_est_v += index.size
            elif isinstance(calib_attr, (list, tuple)):
                for attr in calib_attr:
                    dtype = dataset[attr].dtype
                    dataset[attr][index] = (est_v[i_est_v : i_est_v + index.size]).astype(dtype)
                    i_est_v += index.size
            else:
                raise TypeError("Unrecognized data type in calib_datasets")

            # persist the updated coefficients so the simulation picks them up
            dataset.flush_dataset()
        simulation_state.set_current_time(current_year)

    def update_prediction(self, est_v, simulation_state, dataset_pool, calib_datasets, *args, **kwargs):
        """Write est_v into the calibrated datasets, re-run the simulation
        (re-using a finished run directory when one is available) and return
        the resulting prediction dictionary {id: value}."""
        option_group = RestartRunOptionGroup()
        option_group.parser.set_defaults(project_name=self.project_name, skip_cache_cleanup=self.skip_cache_cleanup)

        options, args = option_group.parse()
        if self.run_manager is None:
            self.run_manager = RunManager(option_group.get_services_database_configuration(options))

        # lock guards the run-history query/update when optimizing in parallel
        if lock is not None:
            lock.acquire()
        ## query runs available for re-use
        runs_done = self.run_manager.get_run_info(run_ids=self.run_ids, status="done")
        create_baseyear_cache = False
        # BUG FIX: removed a leftover "import pdb; pdb.set_trace()" breakpoint
        if len(runs_done) == 0:  ##there is no re-usable run directory, init a new run
            run_id, cache_directory = self.init_run(create_baseyear_cache=False)
            self.run_ids.append(run_id)
            create_baseyear_cache = True
            logger.log_status("Initializing new run with id " + str(run_id))
        else:
            run_id = runs_done[0].run_id  ##take the first 'done' run_id
            cache_directory = self.run_manager.get_cache_directory(run_id)
            logger.log_status("Using old run with id " + str(run_id))
        resources = self.run_manager.get_resources_for_run_id_from_history(run_id, filter_by_status=False)
        self.run_manager.add_row_to_history(run_id, resources, "taken")
        if lock is not None:
            lock.release()

        if create_baseyear_cache:
            self.run_manager.create_baseyear_cache(resources)

        self.update_parameters(est_v, cache_directory, simulation_state, dataset_pool, calib_datasets, *args, **kwargs)
        restart_run(option_group=option_group, args=[run_id, self.start_year])

        prediction = self.summarize_prediction(cache_directory, simulation_state, dataset_pool, calib_datasets)
        return prediction

    def summarize_prediction(self, cache_directory, simulation_state, dataset_pool, calib_datasets):
        """Compute self.target_expression on the end-year simulation results
        and return a dictionary {id: value}."""
        dataset_name = VariableName(self.target_expression).get_dataset_name()
        current_year = simulation_state.get_current_time()
        simulation_state.set_current_time(self.end_year)
        simulation_state.set_cache_directory(cache_directory)
        # force a reload from the just-written simulation cache
        dataset_pool.remove_all_datasets()
        dataset = dataset_pool[dataset_name]
        ids = dataset.get_id_attribute()
        results = dataset.compute_variables(self.target_expression, dataset_pool=dataset_pool)
        simulation_state.set_current_time(current_year)
        return dict(zip(ids, results))

    def read_target(self, target_file):
        """Read target numbers from a csv file with a header line
        ("id,target") into a dictionary {id: value}."""
        # BUG FIX: dropped the unused header read, which used the deprecated
        # file() builtin and leaked the file handle; genfromtxt's skip_header
        # already skips the header line.
        contents = np.genfromtxt(target_file, delimiter=",", comments="#", skip_header=1)
        target = dict(zip(contents[:, 0], contents[:, 1]))
        return target

    def target_func(self, est_v, func=lambda x, y: np.sum(np.abs(x - y)), **kwargs):
        """Target function: run a simulation with the parameter vector est_v
        and return func(prediction, target) (default: sum of absolute errors)."""
        simulation_state, dataset_pool = self._get_dataset_pool()
        # BUG FIX: previously iterated a freshly created empty dict and
        # referenced undefined globals; use the datasets given to __init__.
        calib_datasets = self._load_calib_datasets(dataset_pool)

        prediction = self.update_prediction(est_v, simulation_state, dataset_pool, calib_datasets, **kwargs)
        ## keys in target missing from prediction are assumed to be 0
        target = np.array(self.target.values())
        predct = np.array([prediction[k] if k in prediction else 0 for k in self.target.keys()])
        results = func(predct, target)

        return results