def getLatinHypercubeDraws(sampleSize,
                           numberOfDraws,
                           symmetric=False,
                           uniformNumbers=None):
    """Implementation of the Modified Latin Hypercube Sampling
    proposed by Hess et al, 2006.

    The interval [0, 1] is cut into ``sampleSize * numberOfDraws``
    strata of equal width, one number is placed in each stratum, and
    the resulting series is shuffled before being reshaped.

    :param sampleSize: number of observations for which draws must be
                       generated. Must be strictly positive.
    :type sampleSize: int

    :param numberOfDraws: number of draws to generate for each
                          observation. Must be strictly positive.
    :type numberOfDraws: int

    :param symmetric: if True, draws from [-1: 1] are generated.
                      If False, draws from [0: 1] are generated.
                      Default: False
    :type symmetric: bool

    :param uniformNumbers: numpy array with
        ``sampleSize * numberOfDraws`` uniformly distributed numbers.
        If None, the numpy uniform number generator is used. The array
        provided by the caller is not modified.
    :type uniformNumbers: numpy.array

    :return: numpy array of shape (sampleSize, numberOfDraws)
    :rtype: numpy.array

    :raises biogemeError: if the number of draws or the sample size is
        not strictly positive, or if ``uniformNumbers`` does not contain
        the right number of values.

    Example::

        latinHypercube = dr.getLatinHypercubeDraws(sampleSize=3,
                                                   numberOfDraws=10)
        array([[0.43362897, 0.5275741 , 0.09215663, 0.94056236, 0.34376868,
                0.87195551, 0.41495219, 0.71736691, 0.23198736, 0.145561  ],
               [0.30520544, 0.78082964, 0.83591146, 0.2733167 , 0.53890906,
                0.61607469, 0.00699715, 0.17179441, 0.7557228 , 0.39733102],
               [0.49676864, 0.67073483, 0.9788854 , 0.5726069 , 0.11894558,
                0.05515471, 0.2640275 , 0.82093696, 0.92034628, 0.64866597]])

    """
    if numberOfDraws <= 0:
        raise excep.biogemeError(f'Invalid number of draws: {numberOfDraws}.')

    if sampleSize <= 0:
        raise excep.biogemeError(
            f'Invalid sample size: {sampleSize} when generating draws.')

    totalSize = numberOfDraws * sampleSize

    if uniformNumbers is None:
        uniformNumbers = np.random.uniform(size=totalSize)
    else:
        uniformNumbers = np.asarray(uniformNumbers)
        if uniformNumbers.size != totalSize:
            errorMsg = (f'A total of {totalSize} uniform draws '
                        f'must be provided, and not {uniformNumbers.size}.')
            raise excep.biogemeError(errorMsg)
        # reshape() yields a new view (or copy): the shape of the array
        # owned by the caller is left untouched.
        uniformNumbers = uniformNumbers.reshape(totalSize)

    # Element i falls in the i-th stratum [i / totalSize, (i+1) / totalSize).
    numbers = (np.arange(totalSize) + uniformNumbers) / float(totalSize)
    if symmetric:
        numbers = 2.0 * numbers - 1.0

    # 'numbers' is a freshly allocated array, so shuffling in place is safe.
    np.random.shuffle(numbers)
    return numbers.reshape(sampleSize, numberOfDraws)
def sampleWithoutReplacement(self,
                             samplingRate,
                             columnWithSamplingWeights=None):
    """Replace the data set by a sample for stochastic algorithms.

    The first time the function is called, the current data (or the
    current individual map for panel data) is stored as the full data
    set. At subsequent calls, the columns of the current data are
    compared with the columns of the stored full data set.

    :param samplingRate: the proportion of data to include in the sample.
    :type samplingRate: float

    :param columnWithSamplingWeights: name of the column with the
          sampling weights. If None, each row has equal probability.
    :type columnWithSamplingWeights: string

    :return: None

    :raises biogemeError: if the set of columns has changed since the
          full data set was stored.
    """

    def checkStructure(reference, current):
        """Raise an exception if the two frames have different columns."""
        referenceColumns = set(reference.columns)
        currentColumns = set(current.columns)
        if referenceColumns == currentColumns:
            return
        message = 'The structure of the database has been modified since last sample. '
        left = referenceColumns.difference(currentColumns)
        if left:
            message += f' Columns that disappeared: {left}'
        right = currentColumns.difference(referenceColumns)
        if right:
            message += f' Columns that were added: {right}'
        raise excep.biogemeError(message)

    if self.isPanel():
        if self.fullIndividualMap is None:
            self.fullIndividualMap = self.individualMap
        else:
            # Check if the structure has not been modified since last sample
            checkStructure(self.fullIndividualMap, self.individualMap)
        self.individualMap = self.fullIndividualMap.sample(
            frac=samplingRate, weights=columnWithSamplingWeights)
        self.logger.debug(f'Full data: {self.fullIndividualMap.shape} '
                          f'Sampled data: {self.individualMap.shape}')
        return

    # Cross sectional data
    if self.fullData is None:
        self.fullData = self.data
    else:
        # Check if the structure has not been modified since last sample
        checkStructure(self.fullData, self.data)
    self.data = self.fullData.sample(frac=samplingRate,
                                     weights=columnWithSamplingWeights)
    self.logger.debug(
        f'Full data: {self.fullData.shape} Sampled data: {self.data.shape}')
def simulate(self, theBetaValues=None):
    """Apply each formula to each row of the database.

    :param theBetaValues: values of the parameters to be used in
            the calculations, indexed by the name of the parameter.
            Parameters that are not in the dictionary keep their
            initial value. If None, the initial values are used for
            all parameters. Default: None.
    :type theBetaValues: dict(str, float)

    :return: a pandas data frame with the simulated value. Each
          row corresponds to a row in the database, and each
          column to a formula.

    :rtype: Pandas data frame

    Example::

          # Read the estimation results from a file
          results = res.bioResults(pickleFile = 'myModel.pickle')
          # Simulate the formulas using the nominal values
          simulatedValues = biogeme.simulate(betaValues)

    :raises biogemeError: if the database is organized as panel data,
          or if theBetaValues is neither None nor a dictionary.
    """
    if self.database.isPanel():
        error_msg = ('Simulation for panel data is not yet'
                     ' implemented. Remove the "panel" '
                     'statement to simulate each observation.')
        raise excep.biogemeError(error_msg)

    if theBetaValues is None:
        betaValues = self.betaInitValues
    elif isinstance(theBetaValues, dict):
        # Values provided by the user take precedence over initial values.
        betaValues = [
            theBetaValues[name]
            if name in theBetaValues else self.betaInitValues[position]
            for position, name in enumerate(self.freeBetaNames)
        ]
    else:
        err = (f'Deprecated. A dictionary must be provided. '
               f'It can be obtained from results.getBetaValues()')
        raise excep.biogemeError(err)

    output = pd.DataFrame(index=self.database.data.index)
    for label, formula in self.formulas.items():
        signature = formula.getSignature()
        output[label] = self.theC.simulateFormula(signature,
                                                  betaValues,
                                                  self.fixedBetaValues,
                                                  self.database.data)
    return output
def useFullSample(self):
    """Re-establish the full sample for calculation of the likelihood.

    For panel data, the stored full individual map becomes the current
    individual map again. Otherwise, the stored full data set becomes
    the current data again.

    :raises biogemeError: if no full data set (or full individual map)
        has been stored.
    """
    if self.isPanel():
        fullMap = self.fullIndividualMap
        if fullMap is None:
            raise excep.biogemeError('Full panel data set has not been saved.')
        self.individualMap = fullMap
        return

    fullData = self.fullData
    if fullData is None:
        raise excep.biogemeError('Full data set has not been saved.')
    self.data = fullData
def getUniform(sampleSize, numberOfDraws, symmetric=False):
    """Uniform [0, 1] or [-1, 1] numbers

    :param sampleSize: number of observations for which draws must be
                       generated. Must be strictly positive.
    :type sampleSize: int

    :param numberOfDraws: number of draws to generate for each
                          observation. Must be strictly positive.
    :type numberOfDraws: int

    :param symmetric: if True, draws from [-1: 1] are generated.
                      If False, draws from [0: 1] are generated.
                      Default: False
    :type symmetric: bool

    :return: numpy array of shape (sampleSize, numberOfDraws)
    :rtype: numpy.array

    :raises biogemeError: if the number of draws or the sample size is
        not strictly positive.

    Example::

        draws = dr.getUniform(sampleSize=3,
                              numberOfDraws=10,
                              symmetric=False)
        array([[0.13053817, 0.63892308, 0.55031567, 0.26347854, 0.16730932,
                0.77745367, 0.48283887, 0.84247501, 0.20550219, 0.02373537],
               [0.68935846, 0.03363595, 0.36006669, 0.26709364, 0.54907706,
                0.22492104, 0.2494399 , 0.17323209, 0.52370401, 0.54091257],
               [0.40310204, 0.89916711, 0.86065005, 0.94277699, 0.09077065,
                0.40107731, 0.22554722, 0.47693135, 0.14058265, 0.17397031]])

        draws = dr.getUniform(sampleSize=3,
                              numberOfDraws=10,
                              symmetric=True)
        array([[ 0.74403237, -0.27995692,  0.33997421, -0.89405035, -0.129761  ,
                 0.86593325,  0.30657422,  0.82435619,  0.498482  ,  0.24561616],
               [-0.48239607, -0.29257815, -0.98342034,  0.68392813, -0.25379429,
                 0.49359859, -0.26459883,  0.14569724, -0.68860467, -0.40903446],
               [ 0.93251627, -0.85166912,  0.58096917,  0.39289882, -0.65088635,
                 0.40114744, -0.61327161,  0.08900539, -0.20985417,  0.67542226]])

    """
    if numberOfDraws <= 0:
        raise excep.biogemeError(f'Invalid number of draws: {numberOfDraws}.')

    if sampleSize <= 0:
        raise excep.biogemeError(
            f'Invalid sample size: {sampleSize} when generating draws.')

    # One flat series is drawn, then organized by observation.
    draws = np.random.uniform(size=numberOfDraws * sampleSize)
    if symmetric:
        # Map [0, 1] onto [-1, 1]
        draws = 2.0 * draws - 1.0
    draws.shape = (sampleSize, numberOfDraws)
    return draws
def f(self, batch=None):
    """Evaluate the function at the current point ``self.x``.

    The value is the sum, over consecutive pairs of coordinates, of
    ``100 (x[i+1] - x[i]^2)^2 + (1 - x[i])^2``.

    :param batch: must be None, as the function is not data driven.
    :type batch: float

    :return: value of the function
    :rtype: float

    :raises biogemeError: if batch is not None.
    """
    if batch is not None:
        raise excep.biogemeError('This function is not data driven.')

    # Same starting value as the builtin sum()
    total = 0
    for k in range(len(self.x) - 1):
        current = self.x[k]
        following = self.x[k + 1]
        total = total + (100.0 * (following - current**2)**2 +
                         (1.0 - current)**2)
    return total
def add(self, f, g, h, batch, discount=0.95):
    """Store a new evaluation and return the result of ``self.f_g_h``.

    :param f: value of the function
    :param g: gradient, or None. Its length defines the dimension
        ``self.n`` the first time a gradient is provided.
    :param h: second derivatives matrix, or None. Its shape must be
        (``self.n``, ``self.n``).
    :param batch: batch size, in the interval ]0, 1].
    :type batch: float
    :param discount: value transferred to ``self.f_g_h``. Default: 0.95
    :type discount: float

    :return: the value returned by ``self.f_g_h(discount)``

    :raises biogemeError: if the dimensions of g or h are incompatible
        with the stored dimension, or if batch is out of range.
    """
    if g is not None:
        dimension = len(g)
        if self.n is None:
            self.n = dimension
        elif dimension != self.n:
            raise excep.biogemeError(
                f'Incompatible dimensions {dimension} and {self.n}')

    if h is not None and h.shape != (self.n, self.n):
        raise excep.biogemeError(
            f'Incompatible dimensions {h.shape} and ({self.n},{self.n})')

    if batch <= 0.0 or batch > 1.0:
        raise excep.biogemeError(
            f'Batch size must be between 0 and 1: {batch}')

    # All checks passed: record the four pieces of information together.
    self.f += [f]
    self.g += [g]
    self.h += [h]
    self.batch += [batch]
    return self.f_g_h(discount)
def validate(self, estimationResults, slices=5):
    """Perform out-of-sample validation.

    The function performs the following tasks:

      - it asks the database to split the data set into slices,
      - each slice defines a validation set (the slice itself)
        and an estimation set (the rest of the data),
      - the model is re-estimated on the estimation set,
      - the estimated model is applied on the validation set,
      - the value of the log likelihood for each observation is reported.

    The database of the object is temporarily replaced during the
    process. It is restored at the end, even if the estimation or the
    simulation raises an exception.

    :param estimationResults: results of the model estimation based on the
        full data. Its beta values are used as starting values.
    :type estimationResults: biogeme.results.bioResults

    :param slices: number of slices.
    :type slices: int

    :return: a list containing as many items as slices. Each item
             is the result of the simulation on the validation set.
    :rtype: list(pandas.DataFrame)

    :raises biogemeError: if the database is organized as panel data.
    """
    if self.database.isPanel():
        raise excep.biogemeError(
            'Validation for panel data is not yet implemented')

    # Split the database
    validationData = self.database.split(slices)

    keepDatabase = self.database
    allSimulationResults = []
    try:
        for v in validationData:
            # v[0] is the estimation data set, v[1] the validation data set
            self.database = db.Database('Estimation data', v[0])
            self.loglike.changeInitValues(estimationResults.getBetaValues())
            results = self.estimate()
            simulate = {'Loglikelihood': self.loglike}
            simBiogeme = BIOGEME(db.Database('Validation data', v[1]),
                                 simulate)
            simResult = simBiogeme.simulate(results.getBetaValues())
            allSimulationResults.append(simResult)
    finally:
        # Never leave the object attached to a partial data set.
        self.database = keepDatabase

    if self.generatePickle:
        fname = f'{self.modelName}_validation'
        pickleFileName = bf.getNewFileName(fname, 'pickle')
        with open(pickleFileName, 'wb') as f:
            pickle.dump(allSimulationResults, f)
        self.logger.general(
            f'Simulation results saved in file {pickleFileName}')

    return allSimulationResults
def getBoundsOnBeta(self, betaName):
    """Return the bounds stored for a free parameter.

    :param betaName: name of the parameter
    :type betaName: string

    :return: the entry of ``self.bounds`` at the position of the
        parameter in ``self.freeBetaNames`` (lower bound, upper bound)
    :rtype: tuple

    :raises biogemeError: if the name of the parameter is not found.
    """
    # Single pass: the first matching name determines the position.
    for position, name in enumerate(self.freeBetaNames):
        if name == betaName:
            return self.bounds[position]
    raise excep.biogemeError(f'Unknown parameter {betaName}')
def f(self, batch=None):
    """Return the opposite of the value of ``self.like`` at ``self.x``.

    The value is recalculated if a batch is involved (either the one
    provided or the one stored), if no value is stored, or if
    ``self.recalculate`` is set. In that case, the stored derivatives
    are discarded.

    :param batch: batch transferred to ``self.like``. Default: None
    :type batch: float

    :return: the opposite of the stored function value
    :rtype: float

    :raises biogemeError: if the variables have not been set.
    """
    if self.x is None:
        raise excep.biogemeError('The variables must be set first.')

    batchInvolved = batch is not None or self.batch is not None
    if batchInvolved:
        self.batch = batch
    if batchInvolved or self.fv is None:
        self.recalculate = True

    if not self.recalculate:
        return -self.fv

    self.fv = self.like(self.x, self.scaled, self.batch)
    # The derivatives stored so far are not valid anymore.
    self.gv = None
    self.hv = None
    self.bhhhv = None
    return -self.fv
def f_g_bhhh(self, batch=None):
    """Return the opposite of the function, its gradient and the BHHH matrix.

    The quantities are obtained from ``self.like_deriv`` if a batch is
    involved (either the one provided or the one stored), if one of
    them is not stored, or if ``self.recalculate`` is set. In that
    case, the stored second derivatives matrix is discarded.

    :param batch: batch transferred to ``self.like_deriv``. Default: None
    :type batch: float

    :return: opposite of the function value, of the gradient and of
        the BHHH matrix
    :rtype: tuple

    :raises biogemeError: if the variables have not been set.
    """
    if not (batch is None and self.batch is None):
        self.batch = batch
        self.recalculate = True

    if self.x is None:
        raise excep.biogemeError('The variables must be set first.')

    somethingMissing = (self.fv is None or self.gv is None
                        or self.bhhhv is None)
    if somethingMissing:
        self.recalculate = True

    if self.recalculate:
        derivatives = self.like_deriv(self.x,
                                      self.scaled,
                                      hessian=False,
                                      bhhh=True,
                                      batch=batch)
        # The third entry (hessian) is not requested, and is ignored.
        self.fv, self.gv, _, self.bhhhv = derivatives
        self.hv = None

    return (-self.fv, -self.gv, -self.bhhhv)
def _audit(self): """Each expression provides an audit function, that verifies its validity. Each formula is audited, and the list of errors and warnings reported. :raise biogemeError: if the formula has issues, an error is detected and an exception is raised. """ listOfErrors = [] listOfWarnings = [] for k, v in self.formulas.items(): err, war = v.audit(self.database) listOfErrors += err listOfWarnings += war if listOfWarnings: self.logger.warning('\n'.join(listOfWarnings)) if listOfErrors: self.logger.warning('\n'.join(listOfErrors)) raise excep.biogemeError('\n'.join(listOfErrors))
def sampleIndividualMapWithReplacement(self, size=None):
    """Extract a random sample of the individual map
    from a panel data database, with replacement.

    Useful for bootstrapping.

    :param size: size of the sample. If None, a sample of
               the same size as the individual map will be
               generated. Default: None.
    :type size: int

    :return: pandas dataframe with the sample.
    :rtype: pandas.DataFrame

    :raises biogemeError: if the database is not organized as panel data.
    """
    if not self.isPanel():
        errorMsg = ('Function sampleIndividualMapWithReplacement'
                    ' is available only on panel data.')
        raise excep.biogemeError(errorMsg)

    numberOfIndividuals = len(self.individualMap)
    if size is None:
        size = numberOfIndividuals
    # Positions are drawn independently: the same row may appear several times.
    positions = np.random.randint(0, numberOfIndividuals, size=size)
    return self.individualMap.iloc[positions]
def panel(self, columnName):
    """Define the data as panel data.

    The number of groups of consecutive rows sharing the same value of
    the column is compared with the same quantity calculated on the
    data sorted by that column. If they differ, the rows of at least
    one individual are not consecutive, and an exception is raised.
    Otherwise, the panel map is built.

    :param columnName: name of the column that identifies individuals.
    :type columnName: string

    :raises biogemeError: if the data for the same individual are not
        consecutive.
    """
    self.panelColumn = columnName

    # Number of groups of data, as the data is currently organized
    nGroups = tools.countNumberOfGroups(self.data, self.panelColumn)
    # Number of groups once sorted, that is, the number of individuals
    dataSortedByIndividual = self.data.sort_values(by=[self.panelColumn])
    nIndividuals = tools.countNumberOfGroups(dataSortedByIndividual,
                                             self.panelColumn)

    if nGroups == nIndividuals:
        self.buildPanelMap()
        return

    theError = (f'The data must be sorted so that the data'
                f' for the same individual are consecutive.'
                f' There are {nIndividuals} individuals '
                f'in the sample, and {nGroups} groups of '
                f'data for column {self.panelColumn}.')
    raise excep.biogemeError(theError)
def getHaltonDraws(sampleSize,
                   numberOfDraws,
                   symmetric=False,
                   base=2,
                   skip=0,
                   shuffled=False):
    """Generate Halton draws

    Element number i (i >= 1) of the Halton sequence is obtained by
    writing i in the given base and mirroring its digits around the
    decimal point. Elements ``skip + 1`` to
    ``skip + sampleSize * numberOfDraws`` are generated.

    :param sampleSize: number of observations for which draws must be
                       generated. Must be strictly positive.
    :type sampleSize: int

    :param numberOfDraws: number of draws to generate for each
                          observation. Must be strictly positive.
    :type numberOfDraws: int

    :param symmetric: if True, draws from [-1: 1] are generated.
                      If False, draws from [0: 1] are generated.
                      Default: False
    :type symmetric: bool

    :param base: generate Halton draws for a given basis.
                 Ideally, it should be a prime number. It must be
                 at least 2. Default: 2.
    :type base: int

    :param skip: the number of elements of the sequence to be
                 discarded. It cannot be negative. Default: 0.
    :type skip: int

    :param shuffled: if True, the whole series is shuffled before
                     being organized by observation. Default: False
    :type shuffled: bool

    :return: numpy array of shape (sampleSize, numberOfDraws)
    :rtype: numpy.array

    :raises biogemeError: if the number of draws or the sample size is
        not strictly positive, if the base is lower than 2, or if skip
        is negative.

    Example::

        halton = dr.getHaltonDraws(sampleSize=2, numberOfDraws=10, base=3)
        array([[0.33333333, 0.66666667, 0.11111111, 0.44444444, 0.77777778,
                0.22222222, 0.55555556, 0.88888889, 0.03703704, 0.37037037],
               [0.7037037 , 0.14814815, 0.48148148, 0.81481481, 0.25925926,
                0.59259259, 0.92592593, 0.07407407, 0.40740741, 0.74074074]])

    """
    if numberOfDraws <= 0:
        raise excep.biogemeError(f'Invalid number of draws: {numberOfDraws}.')

    if sampleSize <= 0:
        raise excep.biogemeError(
            f'Invalid sample size: {sampleSize} when generating draws.')

    if base < 2:
        # With base 1 the digit extraction below would never terminate,
        # and base 0 would trigger a division by zero.
        raise excep.biogemeError(
            f'Invalid base for Halton draws: {base}. It must be at least 2.')

    if skip < 0:
        raise excep.biogemeError(
            f'Invalid number of elements to skip: {skip}.')

    totalSize = numberOfDraws * sampleSize
    numbers = np.empty(totalSize)
    # Element 0 of the sequence (which is 0.0) is never used.
    for position, element in enumerate(range(skip + 1, skip + totalSize + 1)):
        value, denom = 0., 1.
        while element > 0:
            element, remainder = divmod(element, base)
            denom *= base
            value += remainder / denom
        numbers[position] = value

    if shuffled:
        np.random.shuffle(numbers)
    if symmetric:
        numbers = 2.0 * numbers - 1.0
    numbers.shape = (sampleSize, numberOfDraws)
    return numbers
def f_g_bhhh(self, batch=None):
    """Not available: the function is not data driven.

    :param batch: ignored.

    :raises biogemeError: always.
    """
    message = 'This function is not data driven.'
    raise excep.biogemeError(message)
def f_g_h(self, batch=None):
    """Return the values of ``self.f()``, ``self.g()`` and ``self.h()``.

    :param batch: must be None, as the function is not data driven.

    :return: the three values, in that order
    :rtype: tuple

    :raises biogemeError: if batch is not None.
    """
    if batch is not None:
        raise excep.biogemeError('This function is not data driven.')

    value = self.f()
    gradient = self.g()
    hessian = self.h()
    return value, gradient, hessian
def __init__(self,
             database,
             formulas,
             userNotes=None,
             numberOfThreads=None,
             numberOfDraws=1000,
             seed=None,
             skipAudit=False,
             removeUnusedVariables=True,
             suggestScales=True,
             missingData=99999):
    """Constructor

    The constructor audits the database and the formulas (unless
    skipAudit is True), seeds the numpy random number generator,
    optionally removes the unused columns from the database, creates
    the underlying calculation object, generates the draws, and
    transfers the expressions to the calculation object.

    :param database: choice data. Note that its ``data`` attribute may
          be modified: boolean values are replaced by 1/0 when the
          audit is performed, and unused columns are dropped when
          removeUnusedVariables is True.
    :type database: biogeme.database

    :param formulas: expression or dictionary of expressions that
         define the model specification.  The concept is that each
         expression is applied to each entry of the database. The
         keys of the dictionary allow to provide a name to each
         formula.  In the estimation mode, two formulas are
         needed, with the keys 'loglike' and 'weight'. If only one
         formula is provided, it is associated with the label
         'loglike'. If no formula is labeled 'weight', the weight
         of each piece of data is supposed to be 1.0. In the
         simulation mode, the labels of each formula are used as
         labels of the resulting database.
    :type formulas: biogeme.expressions.Expression, or
                    dict(biogeme.expressions.Expression)

    :param userNotes: these notes will be included in the report file.
    :type userNotes: str

    :param numberOfThreads: multi-threading can be used for
        estimation. This parameter defines the number of threads
        to be used. If the parameter is set to None, the number of
        available threads is calculated using cpu_count(). Ignored
        in simulation mode. Defaults: None.
    :type numberOfThreads: int

    :param numberOfDraws: number of draws used for Monte-Carlo
        integration. Default: 1000.
    :type numberOfDraws: int

    :param seed: seed used for the pseudo-random number
        generation. It is useful only when each run should
        generate the exact same result. If None, a new seed is
        used at each run. Default: None.
    :type seed: int

    :param skipAudit: if True, does not check the validity of the
        formulas. It may save significant amount of time for large
        models and large data sets. Default: False.
    :type skipAudit: bool

    :param removeUnusedVariables: if True, all variables not used
       in the expression are removed from the database. Default:
       True.
    :type removeUnusedVariables: bool

    :param suggestScales: if True, Biogeme suggests the scaling of
       the variables in the database. Default: True.
       See also :func:`biogeme.database.Database.suggestScaling`
    :type suggestScales: bool

    :param missingData: if one variable has this value, it is
       assumed that a data is missing and an exception will be
       triggered. Default: 99999.
    :type missingData: float

    :raises biogemeError: if the audit of the database reports errors.
    """

    ## Logger that controls the output of messages to the screen and log file.
    self.logger = logger

    if not skipAudit:
        # Boolean entries are converted into 1/0 before the audit.
        database.data = database.data.replace({True: 1, False: 0})
        listOfErrors, listOfWarnings = database._audit()
        if listOfWarnings:
            self.logger.warning('\n'.join(listOfWarnings))
        if listOfErrors:
            # Errors are logged first, then reported with an exception.
            self.logger.warning('\n'.join(listOfErrors))
            raise excep.biogemeError('\n'.join(listOfErrors))

    ## Keyword used for the name of the loglikelihood formula. Default: 'loglike'
    self.loglikeName = 'loglike'

    ## Keyword used for the name of the weight formula. Default: 'weight'
    self.weightName = 'weight'

    ## Name of the model. Default: 'biogemeModelDefaultName'
    self.modelName = 'biogemeModelDefaultName'

    ## monteCarlo is True if one of the expression involves a
    # Monte-Carlo integration.
    self.monteCarlo = False

    # This seeds the global numpy random number generator.
    np.random.seed(seed)

    ## If True, the values are saved on a file each time the likelihood function is calculated
    self.saveIterations = False

    if not isinstance(formulas, dict):
        ## Object of type biogeme.expressions.Expression
        ## calculating the formula for the loglikelihood
        self.loglike = formulas

        ## Object of type biogeme.expressions.Expression
        ## calculating the weight of each observation in the
        ## sample
        self.weight = None

        ## Dictionary containing Biogeme formulas of type
        ## biogeme.expressions.Expression.
        # The keys are the names of the formulas.
        self.formulas = dict({self.loglikeName: formulas})
    else:
        # Both entries are None if the corresponding key is absent.
        self.loglike = formulas.get(self.loglikeName)
        self.weight = formulas.get(self.weightName)
        self.formulas = formulas

    ## biogeme.database object
    self.database = database

    ## User notes
    self.userNotes = userNotes

    ## Missing data
    self.missingData = missingData

    ## keep track of the sample of data used to calculate the
    ## stochastic gradient / hessian
    self.lastSample = None

    ## Init value of the likelihood function
    self.initLogLike = None

    # Union of the variables reported by each formula.
    self.usedVariables = set()
    for k, f in self.formulas.items():
        self.usedVariables = self.usedVariables.union(f.setOfVariables())
    if self.database.isPanel():
        # The column identifying the individuals must be kept as well.
        self.usedVariables.add(self.database.panelColumn)
    if removeUnusedVariables:
        unusedVariables = set(
            self.database.data.columns) - self.usedVariables
        # Despite its name, this is an information message, not an error.
        error_msg = (f'Remove {len(unusedVariables)} '
                     'unused variables from the database '
                     f'as only {len(self.usedVariables)} are used.')
        self.logger.general(error_msg)
        self.database.data = \
            self.database.data.drop(columns=list(unusedVariables))

    if suggestScales:
        suggestedScales = self.database.suggestScaling(
            columns=self.usedVariables)
        if not suggestedScales.empty:
            logger.detailed(
                'It is suggested to scale the following variables.')
            for index, row in suggestedScales.iterrows():
                error_msg = (
                    f'Multiply {row["Column"]} by\t{row["Scale"]} '
                    'because the largest (abs) value is\t'
                    f'{row["Largest"]}')
                logger.detailed(error_msg)
            error_msg = ('To remove this feature, set the parameter '
                         'suggestScales to False when creating the '
                         'BIOGEME object.')
            logger.detailed(error_msg)

    if not skipAudit:
        # Audit of the formulas (the database has been audited above).
        self._audit()

    # Object performing the calculations.
    self.theC = cb.pyBiogeme()

    self._prepareDatabaseForFormula()

    self._prepareLiterals()

    ## Boolean variable, True if the HTML file with the results must be generated.
    self.generateHtml = True

    ## Boolean variable, True if the pickle file with the results must be generated.
    self.generatePickle = True

    ## Name of the column defining weights for batch sampling in
    ## stochastic optimization.
    self.columnForBatchSamplingWeights = None

    ## Number of threads used for parallel computing. Default: the number of CPU available.
    self.numberOfThreads = mp.cpu_count(
    ) if numberOfThreads is None else numberOfThreads

    start_time = datetime.now()
    self._generateDraws(numberOfDraws)
    if self.monteCarlo:
        # The draws are transferred only if they are needed.
        self.theC.setDraws(self.database.theDraws)
    ## Time needed to generate the draws.
    self.drawsProcessingTime = datetime.now() - start_time

    if self.loglike is not None:

        ## Internal signature of the formula for the loglikelihood
        self.loglikeSignatures = self.loglike.getSignature()
        if self.weight is None:
            self.theC.setExpressions(self.loglikeSignatures,
                                     self.numberOfThreads)
        else:
            ## Internal signature of the formula for the weight
            self.weightSignatures = self.weight.getSignature()
            self.theC.setExpressions(self.loglikeSignatures,
                                     self.numberOfThreads,
                                     self.weightSignatures)

    ## Time needed to calculate the bootstrap standard errors
    self.bootstrap_time = None

    ## Results of the bootstrap calculation.
    self.bootstrap_results = None

    ## Information provided by the optimization algorithm after completion.
    self.optimizationMessages = None

    ## Name of the file where intermediate iterations are stored
    self.file_iterations = None

    ## Default bounds, replacing None, for the CFSQP algorithm
    self.cfsqp_default_bounds = 1000

    ## Parameters to be transferred to the optimization algorithm
    self.algoParameters = None

    ## Optimization algorithm
    self.algorithm = None

    ## Store the best iteration found so far.
    self.bestIteration = None
def generateDraws(self, types, names, numberOfDraws):
    """Generate draws for each variable.

    :param types: A dict indexed by the names of the variables,
                  describing the types of draws. Each of them can
                  be a native type or any type defined by the
                  function database.setRandomNumberGenerators
    :type types: dict

    :param names: the list of names of the variables that require draws
        to be generated.
    :type names: list of strings

    :param numberOfDraws: number of draws to generate.
    :type numberOfDraws: int

    :return: a 3-dimensional table with draws. The 3 dimensions are

          1. number of individuals
          2. number of draws
          3. number of variables

    :rtype: numpy.array

    :raises biogemeError: if a type of draws is unknown, or if a
          generator does not return an array of the expected shape.

    Example::

          types = {'randomDraws1': 'NORMAL_MLHS_ANTI',
                   'randomDraws2': 'UNIFORM_MLHS_ANTI',
                   'randomDraws3': 'UNIFORMSYM_MLHS_ANTI'}
          theDrawsTable = myData.generateDraws(types,
              ['randomDraws1', 'randomDraws2', 'randomDraws3'], 10)

    """
    self.numberOfDraws = numberOfDraws

    # One table (individuals x draws) per variable, in the order of 'names'.
    drawsPerVariable = []
    for name in names:
        drawType = types[name]
        self.typesOfDraws[name] = drawType

        # Native generators have priority over user defined generators.
        theGenerator = self.nativeRandomNumberGenerators.get(drawType)
        if theGenerator is None:
            theGenerator = self.userRandomNumberGenerators.get(drawType)
        if theGenerator is None:
            native = self.nativeRandomNumberGenerators
            user = self.userRandomNumberGenerators
            errorMsg = (f'Unknown type of draws for '
                        f'variable {name}: {drawType}. '
                        f'Native types: {native}. '
                        f'User defined: {user}')
            raise excep.biogemeError(errorMsg)

        # The first entry of the generator description is the function.
        draws = theGenerator[0](self.getSampleSize(), numberOfDraws)
        if draws.shape != (self.getSampleSize(), numberOfDraws):
            errorMsg = (f'The draw generator for {name} must'
                        f' generate a numpy array of dimensions'
                        f' ({self.getSampleSize()}, {numberOfDraws})'
                        f' instead of {draws.shape}')
            raise excep.biogemeError(errorMsg)
        drawsPerVariable.append(draws)

    ## Draws as a three-dimensional numpy series. The dimensions
    # are organized to be more suited for calculation.
    # 1. number of individuals
    # 2. number of draws
    # 3. number of variables
    self.theDraws = np.moveaxis(np.array(drawsPerVariable), 0, -1)
    return self.theDraws
def calculateLikelihoodAndDerivatives(self,
                                      x,
                                      scaled,
                                      hessian=False,
                                      bhhh=False,
                                      batch=None):
    """Calculate the value of the log likelihood function
    and its derivatives.

    :param x: vector of values for the parameters.
    :type x: list(float)

    :param scaled: if True, the results are divided by the number of
                   observations reported by the database.
    :type scaled: bool

    :param hessian: if True, the hessian is calculated. Default: False.
    :type hessian: bool

    :param bhhh: if True, the BHHH matrix is calculated. Default: False.
    :type bhhh: bool

    :param batch: if not None, calculates the likelihood on a
                   random sample of the data. The value of the
                   parameter must be strictly between 0 and 1, and
                   represents the share of the data that will be
                   used. Default: None
    :type batch: float

    :return: f, g, h, bh where

            - f is the value of the function (float)
            - g is the gradient (numpy.array)
            - h is the hessian (numpy.array)
            - bh is the BHHH matrix (numpy.array)

    :rtype: tuple  float, numpy.array, numpy.array, numpy.array

    :raises ValueError: if the length of the list x is incorrect
    :raises biogemeError: if scaled is True and the sample is empty
    """
    if len(x) != len(self.betaInitValues):
        error_msg = (f'Input vector must be of length '
                     f'{len(self.betaInitValues)} and not {len(x)}')
        raise ValueError(error_msg)

    self._prepareDatabaseForFormula(batch)
    f, g, h, bh = self.theC.calculateLikelihoodAndDerivatives(
        x, self.fixedBetaValues, self.betaIds, hessian, bhhh)

    # The values of the parameters are reported for small models only.
    if len(self.freeBetaNames) <= 30:
        for name, value in zip(self.freeBetaNames, x):
            self.logger.debug(f'{name}: {value:10.7g}')

    hmsg = f'Hessian norm:  {np.linalg.norm(h):10.1g}' if hessian else ''
    bhhhmsg = f'BHHH norm:  {np.linalg.norm(bh):10.1g}' if bhhh else ''
    self.logger.general(
        f'Log likelihood (N = {self.database.getSampleSize()}): {f:10.7g}'
        f' Gradient norm: {np.linalg.norm(g):10.1g}'
        f' {hmsg} {bhhhmsg}')

    if self.saveIterations:
        if self.bestIteration is None or f >= self.bestIteration:
            # Keep track of the best value, so that the file always
            # contains the iterate with the largest log likelihood so far.
            self.bestIteration = f
            with open(self.file_iterations, 'w') as pf:
                for i, v in enumerate(x):
                    print(f'{self.freeBetaNames[i]} = {v}', file=pf)

    if scaled:
        N = float(self.database.getSampleSize())
        if N == 0:
            raise excep.biogemeError(f'Sample size is {N}')

        return (f / N,
                np.asarray(g) / N,
                np.asarray(h) / N,
                np.asarray(bh) / N)
    return f, np.asarray(g), np.asarray(h), np.asarray(bh)
def estimate(self,
             bootstrap=0,
             algorithm=opt.simpleBoundsNewtonAlgorithmForBiogeme,
             algoParameters=None,
             cfsqp_default_bounds=1000.0,
             saveIterations=False,
             file_iterations='__savedIterations.txt'):
    """Estimate the parameters of the model.

    :param bootstrap: number of bootstrap resampling used to
           calculate the variance-covariance matrix using
           bootstrapping. If the number is 0, bootstrapping is not
           applied. Default: 0.
    :type bootstrap: int

    :param algorithm: optimization algorithm to use for the
           maximum likelihood estimation. Default: Biogeme's
           Newton's algorithm with simple bounds.
    :type algorithm: function

    :param algoParameters: parameters to transfer to the optimization
        algorithm
    :type algoParameters: dict

    :param cfsqp_default_bounds: if the user does not provide bounds
          on the parameters, CFSQP assumes that the bounds are
          [-cfsqp_default_bounds, cfsqp_default_bounds]
    :type cfsqp_default_bounds: float

    :param saveIterations: if True, the values of the parameters
           corresponding to the largest value of the likelihood
           function are saved in the file file_iterations at each
           iteration of the algorithm. Default: False.
    :type saveIterations: bool

    :param file_iterations: name of the file where to save the
        values of the parameters. Default: '__savedIterations.txt'
    :type file_iterations: str

    :return: object containing the estimation results.
    :rtype: biogeme.bioResults

    Example::

        # Create an instance of biogeme
        biogeme  = bio.BIOGEME(database, logprob)

        # Gives a name to the model
        biogeme.modelName = 'mymodel'

        # Estimate the parameters
        results = biogeme.estimate()

    :raises biogemeError: if no expression has been provided for the
        likelihood, or if there is no parameter to estimate.
    """
    if self.loglike is None:
        raise excep.biogemeError(
            'No log likelihood function has been specificed')
    if len(self.freeBetaNames) == 0:
        raise excep.biogemeError(f'There is no parameter to estimate'
                                 f' in the formula: {self.loglike}.')

    self.algorithm = algorithm
    self.algoParameters = algoParameters
    self.cfsqp_default_bounds = cfsqp_default_bounds

    self.calculateInitLikelihood()
    self.saveIterations = saveIterations
    self.file_iterations = f'{file_iterations}'
    self.bestIteration = None

    start_time = datetime.now()
    output = self.optimize(self.betaInitValues)
    xstar, optimizationMessages = output
    ## Running time of the optimization algorithm
    optimizationMessages['Optimization time'] = datetime.now() - start_time
    ## Information provided by the optimization algorithm after completion.
    self.optimizationMessages = optimizationMessages

    fgHb = self.calculateLikelihoodAndDerivatives(xstar,
                                                  scaled=False,
                                                  hessian=True,
                                                  bhhh=True)
    if not np.isfinite(fgHb[2]).all():
        warning_msg = ('Numerical problems in calculating '
                       'the analytical hessian. Finite differences'
                       ' is tried instead.')
        self.logger.warning(warning_msg)
        finDiffHessian = self.likelihoodFiniteDifferenceHessian(xstar)
        # It is the finite difference hessian that must be checked
        # here: the analytical one is already known to be invalid.
        if not np.isfinite(finDiffHessian).all():
            self.logger.warning(
                'Numerical problems with finite difference hessian as well.'
            )
        else:
            fgHb = fgHb[0], fgHb[1], finDiffHessian, fgHb[3]

    ## numpy array, of size B x K,
    # where
    #        - B is the number of bootstrap iterations
    #        - K is the number of parameters to estimate
    self.bootstrap_results = None
    if bootstrap > 0:
        start_time = datetime.now()

        self.logger.general(
            f'Re-estimate the model {bootstrap} times for bootstrapping')
        self.bootstrap_results = np.empty(shape=[bootstrap, len(xstar)])
        self.logger.temporarySilence()
        try:
            for b in range(bootstrap):
                if self.database.isPanel():
                    sample = self.database.sampleIndividualMapWithReplacement()
                    self.theC.setDataMap(sample)
                else:
                    sample = self.database.sampleWithReplacement()
                    self.theC.setData(sample)
                x_br, _ = self.optimize(xstar)
                self.bootstrap_results[b] = x_br

            ## Time needed to generate the bootstrap results
            self.bootstrap_time = datetime.now() - start_time
        finally:
            # Messages must be displayed again even if a run failed.
            self.logger.resume()

    rawResults = res.rawResults(self,
                                xstar,
                                fgHb,
                                bootstrap=self.bootstrap_results)
    r = res.bioResults(rawResults)
    if self.generateHtml:
        r.writeHtml()
    if self.generatePickle:
        r.writePickle()
    return r
def quickEstimate(self,
                  algorithm=opt.simpleBoundsNewtonAlgorithmForBiogeme,
                  algoParameters=None):
    """Estimate the parameters of the model, skipping extra calculations.

    Same as estimate, where any extra calculation is skipped (init
    loglikelihood, t-statistics, etc.): only the final value of the
    log likelihood is calculated, without derivatives.

    :param algorithm: optimization algorithm to use for the
           maximum likelihood estimation. Default: Biogeme's
           Newton's algorithm with simple bounds.
    :type algorithm: function

    :param algoParameters: parameters to transfer to the optimization
        algorithm
    :type algoParameters: dict

    :return: object containing the estimation results.
    :rtype: biogeme.results.bioResults

    Example::

        # Create an instance of biogeme
        biogeme  = bio.BIOGEME(database, logprob)

        # Gives a name to the model
        biogeme.modelName = 'mymodel'

        # Estimate the parameters
        results = biogeme.quickEstimate()

    :raises biogemeError: if no expression has been provided for the
        likelihood, or if there is no parameter to estimate.
    """
    if self.loglike is None:
        raise excep.biogemeError(
            'No log likelihood function has been specificed')
    if len(self.freeBetaNames) == 0:
        raise excep.biogemeError(f'There is no parameter to estimate'
                                 f' in the formula: {self.loglike}.')

    self.algorithm = algorithm
    self.algoParameters = algoParameters

    tic = datetime.now()
    xstar, messages = self.optimize(self.betaInitValues)
    ## Running time of the optimization algorithm
    messages['Optimization time'] = datetime.now() - tic
    ## Information provided by the optimization algorithm after completion.
    self.optimizationMessages = messages

    # Only the function value is available: no gradient, hessian or BHHH.
    finalLoglike = self.calculateLikelihood(xstar, scaled=False)
    rawResults = res.rawResults(self,
                                xstar,
                                (finalLoglike, None, None, None),
                                bootstrap=self.bootstrap_results)
    return res.bioResults(rawResults)
def getNormalWichuraDraws(sampleSize,
                          numberOfDraws,
                          uniformNumbers=None,
                          antithetic=False):
    """Generate pseudo-random numbers from a normal distribution N(0, 1)

    It uses the Algorithm AS241 Appl. Statist. (1988) Vol. 37, No. 3,
    which produces the normal deviate z corresponding to a given lower
    tail area of p; z is accurate to about 1 part in
    :math:`10^{16}`.

    The algorithm uses three rational approximations, selected from
    :math:`q = p - 0.5`: a central one when :math:`|q| \\leq 0.425`,
    and two tail ones, selected from
    :math:`r = \\sqrt{-\\ln(\\min(p, 1-p))}` (:math:`r \\leq 5` or
    :math:`r > 5`).

    :param sampleSize: number of observations for which draws must be
                       generated. Must be strictly positive: k series
                       of draws are generated for a value k.
    :type sampleSize: int

    :param numberOfDraws: number of draws to generate. Must be
                          strictly positive, and even if antithetic
                          is True.
    :type numberOfDraws: int

    :param uniformNumbers: numpy with uniformly distributed numbers.
                  If None, the numpy uniform number generator is
                  used. It must contain exactly as many values as the
                  number of draws actually generated (half of the
                  total if antithetic is True). The array is not
                  modified. A value for which min(p, 1-p) is not
                  positive generates the draw 0.
    :type uniformNumbers: numpy.array

    :param antithetic: if True, only half of the draws are
                       actually generated, and the series are
                       completed with their antithetic version.
    :type antithetic: bool

    :return: numpy array with the draws, of shape
             (sampleSize, numberOfDraws)
    :rtype: numpy.array

    :raises biogemeError: if the number of draws or the sample size
        is not positive, if an odd number of antithetic draws is
        requested, or if the number of uniform draws provided is
        incorrect.

    Example::

        draws = dr.getNormalWichuraDraws(sampleSize=3, numberOfDraws=10)
        array([[ 0.52418458, -1.04344204, -2.11642482,  0.48257162,
                -2.67188279, -1.89993283,  0.28251041, -0.38424425,
                 1.53182226,  0.30651874],
               [-0.7937038 , -0.07884121, -0.91005616, -0.98855175,
                 1.09405753, -0.5997651 , -1.70785113,  1.57571384,
                -0.33208723, -1.03510102],
               [-0.13853654,  0.92595498, -0.80136586,  1.68454196,
                 0.9955927 , -0.28615154,  2.10635541,  0.0436191 ,
                -0.25417774,  0.01026933]])

    """

    def horner(coefficients, x):
        """Evaluate a polynomial at x with Horner's scheme.

        The coefficients are ordered from the highest degree down to
        the constant term.
        """
        value = coefficients[0]
        for coefficient in coefficients[1:]:
            value = value * x + coefficient
        return value

    if numberOfDraws <= 0:
        raise excep.biogemeError(f'Invalid number of draws: {numberOfDraws}.')

    if antithetic:
        if numberOfDraws % 2 != 0:
            errorMsg = (f'Please specify an even number of draws for '
                        f'antithetic draws. Requested number of '
                        f'{numberOfDraws}.')
            raise excep.biogemeError(errorMsg)
        # Only half of the draws are generated. The other half is
        # obtained by changing their sign.
        numberOfDraws = int(numberOfDraws // 2)

    if sampleSize <= 0:
        raise excep.biogemeError(
            f'Invalid sample size: {sampleSize} when generating draws.')

    totalSize = numberOfDraws * sampleSize

    # Constants of AS241 (routine PPND16). Note that
    # const1 = split1 * split1.
    split1 = 0.425e+00
    split2 = 5.e+00
    const1 = 0.180625e+00
    const2 = 1.6e+00
    a0 = 3.3871328727963666080e+00
    a1 = 1.3314166789178437745e+02
    a2 = 1.9715909503065514427e+03
    a3 = 1.3731693765509461125e+04
    a4 = 4.5921953931549871457e+04
    a5 = 6.7265770927008700853e+04
    a6 = 3.3430575583588128105e+04
    a7 = 2.5090809287301226727e+03
    b1 = 4.2313330701600911252e+01
    b2 = 6.8718700749205790830e+02
    b3 = 5.3941960214247511077e+03
    b4 = 2.1213794301586595867e+04
    b5 = 3.9307895800092710610e+04
    b6 = 2.8729085735721942674e+04
    b7 = 5.2264952788528545610e+03
    c0 = 1.42343711074968357734e+00
    c1 = 4.63033784615654529590e+00
    c2 = 5.76949722146069140550e+00
    c3 = 3.64784832476320460504e+00
    c4 = 1.27045825245236838258e+00
    c5 = 2.41780725177450611770e-01
    c6 = 2.27238449892691845833e-02
    c7 = 7.74545014278341407640e-04
    d1 = 2.05319162663775882187e+00
    d2 = 1.67638483018380384940e+00
    d3 = 6.89767334985100004550e-01
    d4 = 1.48103976427480074590e-01
    d5 = 1.51986665636164571966e-02
    d6 = 5.47593808499534494600e-04
    d7 = 1.05075007164441684324e-09
    e0 = 6.65790464350110377720e+00
    e1 = 5.46378491116411436990e+00
    e2 = 1.78482653991729133580e+00
    e3 = 2.96560571828504891230e-01
    e4 = 2.65321895265761230930e-02
    e5 = 1.24266094738807843860e-03
    e6 = 2.71155556874348757815e-05
    e7 = 2.01033439929228813265e-07
    f1 = 5.99832206555887937690e-01
    f2 = 1.36929880922735805310e-01
    f3 = 1.48753612908506148525e-02
    f4 = 7.86869131145613259100e-04
    f5 = 1.84631831751005468180e-05
    f6 = 1.42151175831644588870e-07
    f7 = 2.04426310338993978564e-15

    # Numerators and denominators of the three rational
    # approximations, highest degree first.
    centralNum = (a7, a6, a5, a4, a3, a2, a1, a0)
    centralDen = (b7, b6, b5, b4, b3, b2, b1, 1.0)
    nearTailNum = (c7, c6, c5, c4, c3, c2, c1, c0)
    nearTailDen = (d7, d6, d5, d4, d3, d2, d1, 1.0)
    farTailNum = (e7, e6, e5, e4, e3, e2, e1, e0)
    farTailDen = (f7, f6, f5, f4, f3, f2, f1, 1.0)

    if uniformNumbers is None:
        uniformNumbers = np.random.uniform(size=totalSize)
    else:
        uniformNumbers = np.asarray(uniformNumbers)
        if uniformNumbers.size != totalSize:
            errorMsg = (f'A total of {totalSize} uniform draws must be '
                        f'provided, and not {uniformNumbers.size}.')
            raise excep.biogemeError(errorMsg)

    # Flatten without touching the shape of the array of the caller.
    p = uniformNumbers.reshape(totalSize)

    q = p - 0.5
    draws = np.zeros(p.shape)
    r = np.zeros(p.shape)

    # Central region: |p - 0.5| <= 0.425
    central = np.abs(q) <= split1
    r[central] = const1 - q[central] * q[central]
    draws[central] = (q[central] * horner(centralNum, r[central]) /
                      horner(centralDen, r[central]))

    # Tails: r = min(p, 1 - p)
    tail = np.abs(q) > split1
    lowerTail = np.logical_and(tail, q < 0.0)
    upperTail = np.logical_and(tail, q >= 0.0)
    r[lowerTail] = p[lowerTail]
    r[upperTail] = 1 - p[upperTail]

    # As in AS241, the value 0 is returned when r is not positive.
    degenerate = np.logical_and(tail, r <= 0)
    regular = np.logical_and(tail, r > 0)
    draws[degenerate] = 0.0

    r[regular] = np.sqrt(-np.log(r[regular]))
    # Both masks must be calculated before r is shifted below.
    nearTail = np.logical_and(regular, r <= split2)
    farTail = np.logical_and(regular, r > split2)

    r[nearTail] = r[nearTail] - const2
    draws[nearTail] = (horner(nearTailNum, r[nearTail]) /
                       horner(nearTailDen, r[nearTail]))

    r[farTail] = r[farTail] - split2
    draws[farTail] = (horner(farTailNum, r[farTail]) /
                      horner(farTailDen, r[farTail]))

    # The tail approximations provide the absolute value of the draw.
    draws[lowerTail] = -draws[lowerTail]

    draws = draws.reshape(sampleSize, numberOfDraws)
    if antithetic:
        draws = np.concatenate((draws, -draws), axis=1)
    return draws
def __init__(self, name, pandasDatabase):
    """Constructor

    Stores the data frame, resets the state related to panel data
    and to the draws for Monte-Carlo integration, and audits the
    database.

    :param name: name of the database.
    :type name: string

    :param pandasDatabase: data stored in a pandas data frame.
    :type pandasDatabase: pandas.DataFrame

    :raises biogemeError: if the audit of the database reports errors.
    """
    self.logger = msg.bioMessage()
    # NOTE(review): this time stamp is not used anywhere else in the
    # constructor.
    start_time = datetime.now()

    ## Name of the database. Used mainly for the file name when
    ## dumping data.
    self.name = name

    ## Pandas data frame containing the data. The same object is
    ## also referenced by fullData.
    self.data = pandasDatabase
    self.fullData = pandasDatabase

    ## List of variables. It is initialized by _generateHeaders()
    self.variables = None
    self._generateHeaders()

    ## Number of observations removed by the function Database.remove
    self.excludedData = 0

    ## Panel data: name of the column identifying the individuals.
    ## None if data is not panel.
    self.panelColumn = None

    ## Panel data: map identifying the range of observations for
    ## each individual. None if data is not panel.
    self.individualMap = None
    self.fullIndividualMap = None

    ## Series of native random number generators, stored in a
    ## dictionary.
    self._initNativeRandomNumberGenerators()

    ## User defined random number generators, stored in a
    ## dictionary. Defined by the function
    ## Database.setRandomNumberGenerators that checks that
    ## reserved keywords are not used. Each element of the
    ## dictionary is a tuple with two elements: (0) the function
    ## generating the draws, and (1) a string describing the type
    ## of draws
    self.userRandomNumberGenerators = {}

    ## Number of draws generated by the function
    ## Database.generateDraws. Value 0 if this function is not
    ## called.
    self.numberOfDraws = 0

    ## Types of draws for Monte Carlo integration
    self.typesOfDraws = {}

    self._auditDone = False

    ## Draws for Monte-Carlo integration
    self.theDraws = None

    ## Availability expression to check
    self._avail = None

    ## Choice expression to check
    self._choice = None

    ## Expression to check
    self._expression = None

    # Warnings are only reported. Errors are reported, and then
    # interrupt the construction.
    auditErrors, auditWarnings = self._audit()
    if auditWarnings:
        self.logger.warning('\n'.join(auditWarnings))
    if not auditErrors:
        return
    errorReport = '\n'.join(auditErrors)
    self.logger.warning(errorReport)
    raise excep.biogemeError(errorReport)
def hamabs(fct, initBetas, fixedBetas, betaIds, bounds, parameters=None):
    """
    Algorithm inspired by `Lederrey et al. (2019)`

    .. _`Lederrey et al. (2019)`: https://transp-or.epfl.ch/documents/technicalReports/LedLurHilBie19.pdf

    The objective function is evaluated on a batch of the data. Each
    time that no candidate can be generated, the size of the batch is
    doubled (up to the full data set) and the trust region radius is
    reset. Convergence is tested only when the full data set is used.

    :param fct: object to calculate the objective function and its
                derivatives.
    :type fct: optimization.functionToMinimize

    :param initBetas: initial value of the parameters.
    :type initBetas: numpy.array

    :param fixedBetas: betas that stay fixed during the
                       optimization. Not used by this implementation.
    :type fixedBetas: numpy.array

    :param betaIds: internal identifiers of the non fixed betas. Not
                    used by this implementation.
    :type betaIds: numpy.array

    :param bounds: list of tuples (ell,u) containing the lower and
                   upper bounds for each free parameter. Note that
                   this algorithm does not support bound
                   constraints. Therefore, all the bounds must be
                   None.
    :type bounds: list(tuples)

    :param parameters: dict of parameters to be transmitted to the
                       optimization routine:

         - tolerance: when the relative gradient is below that
           threshold, the algorithm has reached convergence
           (default: :math:`\\varepsilon^{\\frac{1}{3}}`);
         - maxiter: the maximum number of iterations (default: 1000);
         - firstBatch: size of the first batch, as a fraction of the
           data (default: 1/16);
         - firstRadius: initial radius of the trust region. It is
           also the value used each time the size of the batch is
           increased (default: 1.0);
         - hybrid: candidates are generated by
           generateCandidateSecondOrder as long as the size of the
           batch does not exceed this value, and by
           generateCandidateFirstOrder otherwise (default: 1/4);
         - maxFailure, dogleg, eta1, eta2: transmitted as such to the
           functions generating the candidates (default: 2, False,
           0.01 and 0.9);
         - scaleEps: accepted for compatibility, but ignored.

    :type parameters: dict(string:float or int)

    :return: tuple x, messages, where

            - x is the solution found,
            - messages is a dictionary reporting various aspects
              related to the run of the algorithm. The entry
              'Relative gradient' is None if the algorithm has
              been interrupted before the relative gradient could be
              calculated on the full data set.

    :rtype: numpy.array, dict(str:object)

    :raises biogemeError: if at least one bound is not None.
    """

    def buildMessages(relativeGradient, cause, iterations):
        """Report about the run, in the format expected by the caller."""
        return {
            'Algorithm': 'HAMABS prototype',
            'Relative gradient': relativeGradient,
            'Cause of termination': cause,
            'Number of iterations': iterations
        }

    for lower, upper in bounds:
        if lower is not None or upper is not None:
            raise excep.biogemeError(
                'This algorithm does not handle bound constraints. Remove the bounds, or select another algorithm.'
            )

    options = {} if parameters is None else parameters
    tol = options.get('tolerance', np.finfo(np.float64).eps**0.3333)
    maxiter = options.get('maxiter', 1000)
    # Default value of the size of the first batch: 1/16 of the data
    firstBatch = options.get('firstBatch', 1.0 / 2.0**4)
    firstRadius = options.get('firstRadius', 1.0)
    # Largest size of the batch for which the second order
    # candidates are generated
    hybrid = options.get('hybrid', 1.0 / 2.0**2)
    # Maximum number of iterations before updating the batch size
    maxFailure = options.get('maxFailure', 2)
    dogleg = options.get('dogleg', False)
    eta1 = options.get('eta1', 0.01)
    eta2 = options.get('eta2', 0.9)

    logger.detailed("** Optimization: HAMABS")
    avging = smoothing()
    k = 0
    xk = initBetas
    batch = firstBatch
    fct.setVariables(xk)
    f, g, h = fct.f_g_h(batch=batch)
    avgf, avgg, avgh = avging.add(f, g, h, batch)
    typx = np.ones(np.asarray(xk).shape)
    typf = max(np.abs(f), 1.0)

    # The relative gradient is calculated only when the full data
    # set is used. It stays None if this never happens, so that the
    # final report can always be generated.
    relgrad = None
    if batch == 1.0:
        relgrad = opt.relativeGradient(xk, f, g, typx, typf)
        if relgrad <= tol:
            message = f"Relative gradient = {relgrad} <= {tol}"
            # Same format as the regular termination: x, messages
            return xk, buildMessages(relgrad, message, 0)

    delta = firstRadius
    cont = True
    message = None

    maxDelta = np.finfo(float).max
    minDelta = np.finfo(float).eps

    while cont:
        logger.debug(f'***************** Iteration {k} **************')
        logger.debug(
            f'N={avging.numberOfValues()} xk={xk} avgf={avgf} delta={delta}')
        k += 1
        if batch <= hybrid:
            success, xc, fc, gc, hc, delta = generateCandidateSecondOrder(
                fct, xk, avgf, avgg, avgh, batch, delta, dogleg, maxFailure,
                maxDelta, eta1, eta2)
        else:
            success, xc, fc, gc, hc, delta = generateCandidateFirstOrder(
                fct, xk, avgf, avgg, avgh, batch, delta, dogleg, maxFailure,
                maxDelta, eta1, eta2)

        if success:
            xk = xc
            avgf, avgg, avgh = avging.add(fc, gc, hc, batch)
            if batch == 1.0:
                relgrad = opt.relativeGradient(xk, avgf, avgg, typx, typf)
                if relgrad <= tol:
                    message = f"Relative gradient = {relgrad} <= {tol}"
                    cont = False
        elif batch < 1.0:
            # No candidate with the current batch: its size is
            # doubled, and the trust region is reset.
            batch = min(2.0 * batch, 1.0)
            delta = firstRadius
            fct.setVariables(xk)
            if batch <= hybrid:
                f, g, h = fct.f_g_h(batch=batch)
                avgf, avgg, avgh = avging.add(f, g, h, batch)
            else:
                f, g = fct.f_g(batch=batch)
                avgf, avgg, _ = avging.add(f, g, None, batch)

        if delta <= minDelta and batch == 1.0:
            message = f"Trust region is too small: {delta}"
            cont = False

        # Do not overwrite the cause of termination if the algorithm
        # has just been stopped for another reason.
        if cont and k >= maxiter:
            message = f"Maximum number of iterations reached: {maxiter}"
            cont = False
        logger.detailed(
            f"{k} f={avgf:10.7g} delta={delta:6.2g} batch={100*batch:6.2g}%")

    logger.detailed(message)
    return xk, buildMessages(relgrad, message, k)