def pylogit_mxlogit_estimate(data, rhs_columns, random_varnames, num_draws=100,
                             seed=None, print_result=False):
    spec = OrderedDict()
    variable_names = OrderedDict()
    for var in rhs_columns:
        spec[var] = [[1, 2]]
        variable_names[var] = [var]
    mixed_model = pl.create_choice_model(data=data,
                                         alt_id_col="alt",
                                         obs_id_col="group",
                                         choice_col="choice",
                                         specification=spec,
                                         model_type="Mixed Logit",
                                         names=variable_names,
                                         mixing_id_col='user_id',
                                         mixing_vars=random_varnames)
    numCoef = sum([len(spec[s]) for s in spec]) + len(random_varnames)
    if seed:
        mixed_model.fit_mle(np.zeros(numCoef), num_draws=num_draws, seed=seed)
    else:
        mixed_model.fit_mle(np.zeros(numCoef), num_draws=num_draws)
    if print_result:
        print(mixed_model.get_statsmodels_summary())
    return mixed_model
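# A minimal usage sketch for pylogit_mxlogit_estimate above. `trips_long_df` is a
# hypothetical long-format dataframe (one row per alternative per choice situation) with
# the column names the helper hard-codes: "alt", "group", "choice" and "user_id", plus one
# column per explanatory variable. The variable names and draw settings are illustrative.
mixed = pylogit_mxlogit_estimate(trips_long_df,
                                 rhs_columns=["cost", "time"],  # generic coefficients on alternatives 1 and 2
                                 random_varnames=["cost"],      # random (mixed) coefficient for cost
                                 num_draws=200,
                                 seed=42)
print(mixed.params)  # coefficient means plus the standard deviation of the mixed coefficient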
def create_model(dataframe, n_feats, specs, spec_names):
    # Fit a multinomial logit model (MNL)
    choice_model = pl.create_choice_model(data=dataframe,
                                          alt_id_col='alt_ids',
                                          obs_id_col='obs_ids',
                                          choice_col='choices',
                                          specification=specs,
                                          model_type="MNL",
                                          names=spec_names)
    # Specify the initial values and method for the optimization.
    choice_model.fit_mle(np.zeros(n_feats), print_res=False)

    fit_summary_print_output = choice_model.fit_summary
    summary_print_output = choice_model.summary
    summary = summary_print_output.to_dict(orient="index")
    fit_summary = fit_summary_print_output.to_dict()
    choice_model.get_statsmodels_summary()

    print(fit_summary_print_output)
    print(summary_print_output)
    return fit_summary, summary
def fit(self): """ Fit the model using maximum likelihood estimation. Uses either the ChoiceModels or PyLogit estimation engine as appropriate. [TO DO: should we add pass-through parameters here, or take them all in the constructor?] Parameters - NOT YET IMPLEMENTED ---------- GPU : bool, optional GPU acceleration. coefrange : tuple of floats, optional Limits to which coefficients are held, in format (min, max). initial_values : 1D array, optional Initial values for the coefficients. Returns ------- MultinomialLogitResults() object. """ if (self._estimation_engine == 'PyLogit'): m = pylogit.create_choice_model( data=self._data, obs_id_col=self._observation_id_col, alt_id_col=self._alternative_id_col, choice_col=self._choice_col, specification=self._model_expression, names=self._model_labels, model_type='MNL') m.fit_mle(init_vals=self._initial_coefs) results = MultinomialLogitResults(self._estimation_engine, results=m) elif (self._estimation_engine == 'ChoiceModels'): model_design = dmatrix(self._model_expression, data=self._data, return_type='dataframe') # generate 2D array from choice column, for mnl_estimate() chosen = np.reshape(self._data[[self._choice_col]].as_matrix(), (self._numobs, self._numalts)) log_lik, fit = mnl_estimate(model_design.as_matrix(), chosen, self._numalts) result_params = dict(log_likelihood=log_lik, fit_parameters=fit, x_names=model_design.design_info.column_names) results = MultinomialLogitResults(self._estimation_engine, results=result_params) return results
def _create_model(self):
    """ Create the pylogit model class """
    self.pylogit_model = pl.create_choice_model(data=self.long_data,
                                                alt_id_col=_CHOICE_ID_COL,
                                                obs_id_col=_OBSERVATION_COL,
                                                choice_col=_CHOICE_COL,
                                                specification=self.specification,
                                                names=self.names,
                                                model_type='MNL')
def estimation_asym(self, model_mnl):
    # Read the data
    long_testing_data = pd.read_csv(self.output_file)

    # Set up the asym specification and names dictionaries
    asym_specification = OrderedDict()
    asym_names = OrderedDict()
    for col in basic_specification:
        if col != "intercept":
            asym_specification[col] = basic_specification[col]
            asym_names[col] = basic_names[col]

    asym_intercept_names = basic_names["intercept"]
    # The "index" of the alternative whose constant has been constrained
    asym_intercept_ref_pos = 4

    # "shape_TAS" is omitted; it corresponds to the constrained reference alternative
    asym_shape_names = ["shape_NSW", "shape_VIC", "shape_QLD", "shape_SA"]
    number_of_initial_values = len(asym_shape_names)
    # The "index" of the alternative whose shape parameter is constrained
    asym_ref = 4

    print("################################################## Asymmetry Model #########################################")
    model_asym = pl.create_choice_model(
        data=long_testing_data,
        alt_id_col=self.custom_alt_id,
        obs_id_col=self.obs_id_column,
        choice_col=self.choice_column,
        specification=asym_specification,
        model_type="Asym",
        names=asym_names,
        shape_names=asym_shape_names,
        intercept_names=asym_intercept_names,
        shape_ref_pos=asym_ref,
        intercept_ref_pos=asym_intercept_ref_pos
    )
    model_asym.fit_mle(
        None,
        init_shapes=np.zeros(number_of_initial_values),
        init_intercepts=model_mnl.params.values[:number_of_initial_values],
        init_coefs=model_mnl.params.values[number_of_initial_values:] / np.log(number_of_initial_values + 1),
        method="bfgs"
    )
    model_asym.get_statsmodels_summary()
    return model_asym
def getModel(alt, obs, choice, spec=specification, names=names, type_="MNL", data=raw_data):
    model = pl.create_choice_model(data=data,
                                   alt_id_col=alt,
                                   obs_id_col=obs,
                                   choice_col=choice,
                                   specification=spec,
                                   model_type=type_,
                                   names=names)
    return model
def create_model(dataframe: pd.DataFrame, feature_list: list):
    """Fit a multinomial logit model to the choices in the dataframe.

    Args:
        dataframe (pd.DataFrame): Dataframe representation of the routes and
            the choices taken in our data.
        feature_list (list[str]): List of (id#, featurename) tuples present in
            the G.edges output.

    Locals built here:
        specs (OrderedDict): Each feature is given an 'all_same' value so the
            column ID does not change for the same attribute.
        spec_names (OrderedDict): Customizable names for each feature, useful
            for nesting cases.

    Returns:
        (fit_summary, summary): fit_summary is a dictionary describing the
            overall model fit; summary is a dictionary describing each input's
            fit in the model.
    """
    # This just means the columns will be consistent across choices
    spec_names = OrderedDict()
    specs = OrderedDict()
    for spec in feature_list:
        spec_names[spec] = spec
        specs[spec] = 'all_same'
    assert len(specs) == len(spec_names)

    # Fit a multinomial logit model (MNL)
    choice_model = pl.create_choice_model(data=dataframe,
                                          alt_id_col='alt_ids',
                                          obs_id_col='obs_ids',
                                          choice_col='choices',
                                          specification=specs,
                                          model_type="MNL",
                                          names=spec_names)
    choice_model.fit_mle(np.zeros(len(feature_list)), print_res=False)

    fit_summary_print_output = choice_model.fit_summary
    summary_print_output = choice_model.summary
    summary = summary_print_output.to_dict(orient="index")
    fit_summary = fit_summary_print_output.to_dict()
    choice_model.get_statsmodels_summary()
    return fit_summary, summary
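# A hypothetical usage sketch for create_model above (not from the original source).
# `routes_long_df` stands in for a long-format dataframe that already has the hard-coded
# 'obs_ids', 'alt_ids' and 'choices' columns plus one column per entry in feature_list.
fit_summary, summary = create_model(routes_long_df, ["length", "slope"])
print(fit_summary)   # overall fit: log-likelihood, rho-squared, etc.
print(summary)       # per-feature coefficients, standard errors and p-values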
def fit(self): """ Fit the model using maximum likelihood estimation. Uses either the ChoiceModels or PyLogit estimation engine as appropriate. Returns ------- MultinomialLogitResults() object. """ if (self._estimation_engine == 'PyLogit'): m = pylogit.create_choice_model( data=self._df, obs_id_col=self._observation_id_col, alt_id_col=self._alternative_id_col, choice_col=self._choice_col, specification=self._model_expression, names=self._model_labels, model_type='MNL') m.fit_mle(init_vals=self._initial_coefs) results = MultinomialLogitResults( estimation_engine=self._estimation_engine, model_expression=self._model_expression, results=m) elif (self._estimation_engine == 'ChoiceModels'): dm = dmatrix(self._model_expression, data=self._df) chosen = np.reshape(self._df[[self._choice_col]].values, (self._numobs, self._numalts)) log_lik, fit = mnl_estimate(np.array(dm), chosen, self._numalts) result_params = dict(log_likelihood=log_lik, fit_parameters=fit, x_names=dm.design_info.column_names) results = MultinomialLogitResults( estimation_engine=self._estimation_engine, model_expression=self._model_expression, results=result_params) return results
def pylogit_logit_estimate(df, rhs_columns):
    spec = OrderedDict()
    variable_names = OrderedDict()
    for var in rhs_columns:
        spec[var] = [[1, 2]]
        variable_names[var] = [var]
    model = pl.create_choice_model(data=df,
                                   alt_id_col="alt",
                                   obs_id_col="group",
                                   choice_col="choice",
                                   specification=spec,
                                   model_type="MNL",
                                   names=variable_names)
    numCoef = sum([len(spec[s]) for s in spec])
    model.fit_mle(np.zeros(numCoef))
    print(model.get_statsmodels_summary())
    return model
def logit_spec(long_data_df, alt_attr_vars, generic_attrs=[], constant=True,
               alts={0: 'drive', 1: 'cycle', 2: 'walk', 3: 'PT'}, ref_alt_ind=0):
    """
    Generate the specification and variable names for a pylogit MNL model.

    Arguments:
    ------------------------------
    long_data_df: pandas DataFrame in long format, generated by long_form_data
    alt_attr_vars: list of alternative-specific variables
    generic_attrs: list of case-specific variables, generally demographic variables
    constant: whether or not to include alternative-specific constants (ASCs)
    alts: a dict or list defining the indices and names of the alternatives
    ref_alt_ind: index of the reference alternative for the ASC specification

    Returns:
    --------------------------------
    model: pylogit MNL model object
    numCoef: the number of coefficients to be estimated
    """
    specifications = OrderedDict()
    names = OrderedDict()
    nalt = len(alts)
    if isinstance(alts, list):
        alts = {i: i for i in alts}
    for var in alt_attr_vars:
        specifications[var] = [list(range(nalt))]
        names[var] = [var]
    for var in generic_attrs:
        specifications[var] = [i for i in range(nalt) if i != ref_alt_ind]
        names[var] = [var + ' for ' + alts[i] for i in alts if i != ref_alt_ind]
    if constant:
        specifications['intercept'] = [i for i in range(nalt) if i != ref_alt_ind]
        names['intercept'] = ['ASC for ' + alts[i] for i in alts if i != ref_alt_ind]
    model = pl.create_choice_model(data=long_data_df.copy(),
                                   alt_id_col="alt",
                                   obs_id_col="group",
                                   choice_col="choice",
                                   specification=specifications,
                                   model_type="MNL",
                                   names=names)
    numCoef = sum([len(specifications[s]) for s in specifications])
    return model, numCoef
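# A hedged usage sketch for logit_spec above. `mode_long_df` is a hypothetical long-format
# dataframe with the hard-coded "alt", "group" and "choice" columns and alternative
# indices 0-3; the variable names are illustrative. The returned coefficient count feeds
# the zero starting values passed to fit_mle.
import numpy as np

model, num_coef = logit_spec(mode_long_df,
                             alt_attr_vars=["time", "cost"],   # alternative-specific attributes
                             generic_attrs=["income"],         # case-specific (demographic) variable
                             constant=True,
                             ref_alt_ind=0)                    # 'drive' is the reference alternative
model.fit_mle(np.zeros(num_coef))
print(model.get_statsmodels_summary())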
def fit(self): """ Fit the model using maximum likelihood estimation. Uses either the ChoiceModels or PyLogit estimation engine as appropriate. Returns ------- MultinomialLogitResults() object. """ if (self._estimation_engine == 'PyLogit'): m = pylogit.create_choice_model(data = self._df, obs_id_col = self._observation_id_col, alt_id_col = self._alternative_id_col, choice_col = self._choice_col, specification = self._model_expression, names = self._model_labels, model_type = 'MNL') m.fit_mle(init_vals = self._initial_coefs) results = MultinomialLogitResults(estimation_engine = self._estimation_engine, model_expression = self._model_expression, results = m) elif (self._estimation_engine == 'ChoiceModels'): dm = dmatrix(self._model_expression, data=self._df) chosen = np.reshape(self._df[[self._choice_col]].values, (self._numobs, self._numalts)) log_lik, fit = mnl_estimate(np.array(dm), chosen, self._numalts) result_params = dict(log_likelihood = log_lik, fit_parameters = fit, x_names = dm.design_info.column_names) results = MultinomialLogitResults(estimation_engine = self._estimation_engine, model_expression = self._model_expression, results = result_params) return results
def pylogitModel(data):
    basic_specification = OrderedDict()
    basic_names = OrderedDict()

    basic_specification['Affordable'] = [[1, 2, 3, 4, 5]]
    basic_names['Affordable'] = ['Affordable']
    basic_specification['Ease'] = [[1, 2, 3, 4, 5]]
    basic_names['Ease'] = ['Ease']
    basic_specification['Power'] = [[1, 2, 3, 4, 5]]
    basic_names['Power'] = ['Power']
    basic_specification['Learning'] = [[1, 2, 3, 4, 5]]
    basic_names['Learning'] = ['Learning']
    basic_specification['Supplements'] = [[1, 2, 3, 4, 5]]
    basic_names['Supplements'] = ['Supplements']
    basic_specification['Support'] = [[1, 2, 3, 4, 5]]
    basic_names['Support'] = ['Support']
    basic_specification['Needs'] = [[1, 2, 3, 4, 5]]
    basic_names['Needs'] = ['Needs']
    basic_specification['IT'] = [[1, 2, 3, 4, 5]]
    basic_names['IT'] = ['IT']
    basic_specification["intercept"] = [1, 2, 3, 4, 5]
    basic_names["intercept"] = ['Matlab', 'R', 'SAS', 'SPSS', 'Stata']

    print(basic_names)
    print(basic_specification)

    mnl_model_r = pl.create_choice_model(data=data,
                                         alt_id_col='Alternative',
                                         obs_id_col='OrgID',
                                         choice_col='Choice',
                                         specification=basic_specification,
                                         model_type="MNL",
                                         names=basic_names)
    # Specify the initial values and method for the optimization.
    mnl_model_r.fit_mle(np.zeros(13))  # 8 attribute coefficients + 5 intercepts

    # Look at the estimation results
    # print('1')
    # mnl_model_r.get_statsmodels_summary()
    print('2')
    mnl_model_r.print_summaries()
    return mnl_model_r
def estimation_mnl(self):
    long_testing_data = pd.read_csv(self.output_file)

    print("################################################ MNL Model ######################################")
    model_mnl = pl.create_choice_model(
        data=long_testing_data,
        alt_id_col=self.custom_alt_id,
        obs_id_col=self.obs_id_column,
        choice_col=self.choice_column,
        specification=basic_specification,
        model_type="MNL",
        names=basic_names
    )
    model_mnl.fit_mle(np.zeros(total_num_parameters))
    results = model_mnl.get_statsmodels_summary()
    print("########################", results)

    # all_situation_ids = np.sort(long_testing_data["choice_situation"].unique())
    # prediction_ids = all_situation_ids[:2000]
    return model_mnl
range = list(range(6))

basic_specification["sp"] = "all_same"
basic_names["sp"] = "sp"
basic_specification["xp"] = "all_same"
basic_names["xp"] = "xp"
basic_specification["tim"] = "all_same"
basic_names["tim"] = "tim"

custom_alt_id = "r_box"
obs_id_column = "uri"

x_model = pl.create_choice_model(data=df,
                                 alt_id_col=custom_alt_id,
                                 obs_id_col=obs_id_column,
                                 choice_col="win",
                                 specification=basic_specification,
                                 model_type="MNL",
                                 names=basic_names)
x_model.fit_mle(np.zeros(3))
summary = x_model.get_statsmodels_summary()
print(summary)
def __init__(self, *args, **kwargs):
    self.wrapped_model = pylogit.create_choice_model(*args, model_type="MNL", **kwargs)
def test_constructor(self):
    """
    Construct the various choice models and make sure the constructed object
    has the necessary attributes.
    """
    # Record the model types of all the models to be created
    all_model_types = model_type_to_display_name.keys()

    # Record the attribute / value pairs that are common to all models.
    common_attr_value_dict = {"data": self.fake_df,
                              "name_spec": self.fake_names,
                              "design": self.fake_design,
                              "ind_var_names": self.fake_names["x"],
                              "alt_id_col": self.alt_id_col,
                              "obs_id_col": self.obs_id_col,
                              "choice_col": self.choice_col,
                              "specification": self.fake_specification,
                              "alt_IDs": self.fake_df["alt_id"].values,
                              "choices": self.fake_df["choice"].values}

    # Create a shape name dictionary to relate the various models to the
    # names of their shape parameters.
    shape_name_dict = {"MNL": None,
                       "Asym": self.fake_shape_names[:2],
                       "Cloglog": None,
                       "Scobit": self.fake_shape_names,
                       "Uneven": self.fake_shape_names,
                       "Nested Logit": None,
                       "Mixed Logit": None}

    # Create a shape reference position dictionary to relate the various
    # models to their shape reference positions.
    shape_ref_dict = {}
    for key in shape_name_dict:
        shape_ref_dict[key] = (None if key != "Asym"
                               else self.fake_shape_ref_pos)

    # Create an intercept_names and intercept_ref_position dictionary to
    # relate the various models to their respective kwargs.
    intercept_names_dict = {}
    intercept_ref_dict = {}
    for key in shape_name_dict:
        if key in ["MNL", "Nested Logit", "Mixed Logit"]:
            intercept_names_dict[key] = None
            intercept_ref_dict[key] = None
        else:
            intercept_names_dict[key] = self.fake_intercept_names
            intercept_ref_dict[key] = self.fake_intercept_ref_pos

    # Create a nest_names dictionary to relate the various models to their
    # nest_name attributes
    nest_name_dict = {}
    nest_spec_dict = {}
    for key in shape_name_dict:
        if key != "Nested Logit":
            nest_name_dict[key] = None
            nest_spec_dict[key] = None
        else:
            nest_name_dict[key] = list(self.fake_nest_spec.keys())
            nest_spec_dict[key] = self.fake_nest_spec

    # Create dictionaries for the mixing_id_col, mixing_vars, and
    # mixing_pos attributes
    mixing_id_col_dict = {}
    mixing_vars_dict = {}
    mixing_pos_dict = {}
    for key in shape_name_dict:
        if key != "Mixed Logit":
            mixing_id_col_dict[key] = None
            mixing_vars_dict[key] = None
            mixing_pos_dict[key] = None
        else:
            mixing_id_col_dict[key] = self.obs_id_col
            mixing_vars_dict[key] = self.fake_names["x"]
            mixing_pos_dict[key] = [0]

    # Record the attribute / value pairs that vary across models
    varying_attr_value_dict = {"model_type": model_type_to_display_name,
                               "intercept_names": intercept_names_dict,
                               "intercept_ref_position": intercept_ref_dict,
                               "shape_names": shape_name_dict,
                               "shape_ref_position": shape_ref_dict,
                               "nest_names": nest_name_dict,
                               "nest_spec": nest_spec_dict,
                               "mixing_id_col": mixing_id_col_dict,
                               "mixing_vars": mixing_vars_dict,
                               "mixing_pos": mixing_pos_dict}

    # Set up the keyword arguments that are needed for each of the model
    # types
    variable_kwargs = {}
    for model_name in all_model_types:
        variable_kwargs[model_name] = {}
        variable_kwargs[model_name]["intercept_names"] =\
            intercept_names_dict[model_name]
        variable_kwargs[model_name]["intercept_ref_pos"] =\
            intercept_ref_dict[model_name]
        variable_kwargs[model_name]["shape_ref_pos"] =\
            shape_ref_dict[model_name]
        variable_kwargs[model_name]["shape_names"] =\
            shape_name_dict[model_name]
        variable_kwargs[model_name]["nest_spec"] =\
            nest_spec_dict[model_name]
        variable_kwargs[model_name]["mixing_id_col"] =\
            mixing_id_col_dict[model_name]
        variable_kwargs[model_name]["mixing_vars"] =\
            mixing_vars_dict[model_name]

    # Execute the test for each model type
    for model_name in all_model_types:
        # Update the model type in the list of constructor args
        self.constructor_args[-1] = model_name

        # Use this specific model's keyword arguments
        self.constructor_kwargs.update(variable_kwargs[model_name])

        # Construct the model object
        model_obj = pylogit.create_choice_model(*self.constructor_args,
                                                **self.constructor_kwargs)

        # Make sure that the constructor has all of the required attributes
        for attr in common_attr_value_dict:
            value = common_attr_value_dict[attr]
            if isinstance(value, pd.DataFrame):
                self.assertTrue(value.equals(model_obj.data))
            elif isinstance(value, np.ndarray):
                npt.assert_allclose(value, model_obj.__getattribute__(attr))
            else:
                self.assertEqual(value, model_obj.__getattribute__(attr))

        for attr in varying_attr_value_dict:
            value = varying_attr_value_dict[attr][model_name]
            self.assertEqual(value, model_obj.__getattribute__(attr))

    return None
long_lpmc = gld.generate_data(train=True)  # train=False for generating the test dataset
y = long_lpmc.copy()

# Standardize what has to be standardized; custom_id, mode_id, etc. are ignored
y.iloc[:, 3::1] = helpers.standardize(long_lpmc.iloc[:, 3::1])

choice_column = "travel_mode"
obs_id_column = "custom_id"
custom_alt_id = "mode_id"
basic_specification = helpers.create_specification()

lpmc_mnltrain = pl.create_choice_model(data=y,
                                       alt_id_col=custom_alt_id,
                                       obs_id_col=obs_id_column,
                                       choice_col=choice_column,
                                       specification=basic_specification,
                                       model_type="MNL",
                                       names=None)

#%%
"""
Defines the relevant parameters for the simulations.

Variables:
    num_simul = number of simulations for the grid search over the hyperparameter
        lambda_lasso, lambda_ridge, or both.
    num_points = for each regularisation value of the grid search, defines the number
        of simulations realised by adding parameters.
    maxiter = number of iterations realised by SciPy's 'minimize()' to optimize the
        parameters.
    num_param_keep = vector of length num_points which indicates how many
long_swiss_metro_train, long_swiss_metro_test = train_test_split(
    long_swiss_metro, train_size=SPLIT_EMBEDDINGS_DCM, shuffle=False)

# In[39]:

testsetsize = len(long_swiss_metro_test) / 3
trainsetsize = len(long_swiss_metro_train) / 3
print(testsetsize, trainsetsize)

# In[40]:

# Estimate the multinomial logit model (MNL)
swissmetro_mnl = pl.create_choice_model(data=long_swiss_metro_train,
                                        alt_id_col=custom_alt_id,
                                        obs_id_col=obs_id_column,
                                        choice_col=choice_column,
                                        specification=basic_specification,
                                        model_type="MNL",
                                        names=basic_names)

deg_freedom = sum([len(b) for b in basic_specification.values()])

# Specify the initial values and method for the optimization.
swissmetro_mnl.fit_mle(np.zeros(deg_freedom))

# Look at the estimation results
swissmetro_mnl.get_statsmodels_summary()

# In[41]:

long_probs = swissmetro_mnl.predict(long_swiss_metro_test)
SCORE = sum([
data.loc[data['mode_id'] == i, col_name] = confounder_vectors[int(i - 1)][2]
data[col_name] = data[col_name].fillna(0)

data['confounder_all'] = data[['confounder_for_mode_1', 'confounder_for_mode_2',
                               'confounder_for_mode_3', 'confounder_for_mode_4',
                               'confounder_for_mode_5', 'confounder_for_mode_6',
                               'confounder_for_mode_7', 'confounder_for_mode_8']].sum(axis=1)
# -

# ## Estimate non-causal MNL

# +
# Estimate the basic MNL model, using the hessian and newton-conjugate gradient
mnl_model = cm.create_choice_model(data=data,
                                   alt_id_col="mode_id",
                                   obs_id_col="observation_id",
                                   choice_col="choice",
                                   specification=mnl_specification,
                                   model_type="MNL",
                                   names=mnl_names)

num_vars = len(reduce(lambda x, y: x + y, mnl_names.values()))
# Note newton-cg used to ensure convergence to a point where gradient
# is essentially zero for all dimensions.
mnl_model.fit_mle(np.zeros(num_vars), method="BFGS")

# Look at the estimation results
mnl_model.get_statsmodels_summary()
# -

# ## Estimate Causal MNL
mode_shares.index = [ALT_ID_TO_MODE_NAME[x] for x in mode_shares.index.values]
mode_shares.name = "Mode Shares"
mode_shares
# -

# # Choice Model Estimation
# For purposes of this task, we use the MNL specification from Brathwaite and
# Walker (2016) and estimate the model resulting from such a specification. We
# assume that the estimated model parameters represent the "true" model
# parameters.

# +
# Estimate the basic MNL model, using the hessian and newton-conjugate gradient
mnl_model = pl.create_choice_model(
    data=bike_data_long,
    alt_id_col=ALT_ID_COL,
    obs_id_col=OBS_ID_COL,
    choice_col="choice",
    specification=MNL_SPECIFICATION,
    model_type="MNL",
    names=MNL_NAMES,
)

num_vars = len(reduce(lambda x, y: x + y, MNL_NAMES.values()))
# Note newton-cg used to ensure convergence to a point where gradient
# is essentially zero for all dimensions.
mnl_model.fit_mle(np.zeros(num_vars), method="BFGS")

# Look at the estimation results
mnl_model.get_statsmodels_summary()
# -
for k in range(len(indices) - 1):
    # For each fold k, create the holdout set and the bagged set
    holdout = train.iloc[indices[k]:indices[k + 1], :].sort_values(
        by=["Year", "Country"])
    bagged = train[~train.index.isin(holdout.index)].sort_values(
        by=["Year", "Country"])

    # Create mixed logit model with year fixed effects and random coefficients over countries
    model = pylogit.create_choice_model(
        data=bagged,
        alt_id_col="Status",
        obs_id_col="Year",
        choice_col="default_RR",      # =1 for default, =0 for no default
        specification=basic_specification,
        model_type="Mixed Logit",     # mixed panel logit model
        names=basic_names,
        mixing_id_col="Country",      # implies coefficients are randomized over countries
        mixing_vars=index_var_names)

    # Estimate the mixed logit model using the Nelder-Mead algorithm
    # (cross-validated to choose the optimal lambda) on K-1 folds
    model.fit_mle(
        init_vals=np.zeros(46),
        num_draws=1000,               # 1000 draws from independent normal distributions for each parameter,
                                      # as functions of their means and standard deviations
        # seed=2,
        method="Nelder-Mead",         # using the Nelder-Mead algorithm
        maxiter=10,                   # number of Nelder-Mead iterations
# basic_specification["regular_class"] = [1] # basic_names["regular_class"] = ["First Class == False, (Swissmetro)"] # # basic_specification["single_luggage_piece"] = [3] # basic_names["single_luggage_piece"] = ["Number of Luggage Pieces == 1, (Car)"] # # basic_specification["multiple_luggage_pieces"] = [3] # basic_names["multiple_luggage_pieces"] = ["Number of Luggage Pieces > 1, (Car)"] # print basic_names # print basic_specification destination_mnl = pl.create_choice_model(data=long_testing_data, alt_id_col=custom_alt_id, obs_id_col=obs_id_column, choice_col=choice_column, specification=basic_specification, model_type="MNL", names=basic_names) destination_mnl.fit_mle(np.zeros(10)) print(destination_mnl.get_statsmodels_summary()) all_situation_ids = np.sort(long_testing_data["choice_situation"].unique()) prediction_ids = all_situation_ids[:2000] prediction_df = long_testing_data.loc[ long_testing_data["choice_situation"].isin(prediction_ids)].copy() # print(prediction_df) # This is the array of the predicted choice prediction_array = destination_mnl.predict(prediction_df) print(prediction_array)
def fit(self, df_comb, target, df_i=None, df_j=None, merge_columns=None):
    """
    Compute the maximum-likelihood estimate of the model parameters given
    pairwise-comparison data, using optimizers provided by the
    ``scipy.optimize`` module.

    Parameters
    ------------
    df_comb : DataFrame
        DataFrame with a multi-index where each index level is an entity in
        the comparison made. This table should contain the target value and
        any other predictive features that the user would like to use and
        store on the observational level, for example the weather for a
        particular match.
    target : str
        Name of the target variable column.
    df_i : DataFrame, default: None
        DataFrame where the index is an entity and the values are features
        for predictions that the user would like to store on the entity
        level, for example the budget of a team. This information may be
        merged onto the df_comb table; it is provided for ease of use if the
        user has this stored in the way most relational-database users would
        store data. An error is raised if column names are repeated in df_i
        and df_comb.
    df_j : DataFrame, default: None
        Same as df_i, but can be used in case the second entity compared is
        always of a different nature and isn't stored in the same DataFrame.
        For example, movies might be compared with songs.
    merge_columns : list of str, default: None
        Any columns that exist in the df_comb, df_i and df_j DataFrames that
        the user would also like to merge on, such as year; note that the
        entities in the indices will be considered automatically.

    Returns
    ---------
    Self
    """
    check_indexing_of_entities(df_comb)
    self.x_comb_entnames = df_comb.index.names.copy()
    # Remember the results column so that it can be removed later
    self.target_col_name = target
    self.rplc_lkp, self.lkp = generate_entity_lookup(
        get_distinct_entities(df_comb))
    self.hyperparameters = {
        'alpha': self.alpha,
        'method': self.method,
        'initial_params': self.initial_params,
        'max_iter': self.max_iter,
        'tol': self.tol
    }

    # Training with choix
    if df_i is None and df_j is None and \
            (list(df_comb.columns) == [self.target_col_name]):
        training_data, n_ents = self.unpack_data_for_choix(
            df_comb, self.x_comb_entnames)
        # Fit Bradley-Terry
        self._params = choix.opt_pairwise(n_ents,
                                          training_data["winner"],
                                          **self.hyperparameters)
        self.params_ = pd.DataFrame.from_dict(self.lkp,
                                              orient='index',
                                              columns=['entity'])
        self.params_['learned_strength'] = self._params.copy()
        self.is_fitted_ = True
        self.pylogit_fit = False

    # Training with pylogit
    else:
        if self.hyperparameters['method'] == "Newton-CG":
            warn("Note that the method specified for pylogit descent is"
                 " Newton-CG; at the point we last checked there was an"
                 " open issue regarding the Hessian not being correct, which"
                 " is used for this type of optimization. If this issue has"
                 " been resolved in pylogit please contact us to remove this"
                 " warning")
        self.hyperparameters['ridge'] = self.alpha

        if df_i is not None:
            self.df_i = df_i.copy()
        else:
            self.df_i = None
        if df_j is not None:
            self.df_j = df_j.copy()
        else:
            self.df_j = None
        if merge_columns is not None:
            self.merge_columns = merge_columns.copy()
        else:
            self.merge_columns = None

        long_format = self.unpack_data_for_pylogit(df_comb,
                                                   self.x_comb_entnames)
        x_comb = self.join_up_dataframes(long_format, df_i, df_j,
                                         merge_columns)

        basic_specification = OrderedDict()
        basic_names = OrderedDict()
        columns = 0
        for i in x_comb.columns:
            if i not in ['observation', 'entity', 'CHOICE']:
                basic_specification[i] = [list(self.lkp.keys())]
                basic_names[i] = [i]
                columns += 1
        basic_specification['intercept'] = list(self.lkp.keys())
        basic_names['intercept'] = [str(i) for i in self.rplc_lkp.keys()]

        self.bt_with_feats = pl.create_choice_model(
            data=x_comb,
            alt_id_col='entity',
            obs_id_col='observation',
            choice_col='CHOICE',
            specification=basic_specification,
            model_type="MNL",
            names=basic_names)
        self.x_comb = x_comb.copy()

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self.bt_with_feats.fit_mle(np.zeros(columns + len(self.lkp)),
                                       **self.hyperparameters,
                                       print_res=False)
        self.is_fitted_ = True
        self._feat_params = self.bt_with_feats.params.reset_index()
        self.params_ = self._feat_params[~self._feat_params['index'].
                                         isin(basic_names.keys())]
        self.params_.columns = ['entity', 'learned_strength']
        self.pylogit_fit = True
"Headway, units:hrs, (Train)", "Headway, units:hrs, (Metro)" ] ########## # Determine the columns for: alternative ids, the observation ids and the choice ########## # The 'alternative_id' variable will identify the alternative associated with each row. alternative_id = "alt_id" # The 'obs_id' variable will identify the observation id associated with each row. observation_id = "obs_id" # Create a 'choice' variable which identifies the choice associated with each row. choice = "CHOICE" # Estimate the multinomial logit model (MNL) model_01_mnl = pl.create_choice_model(data=data_01, alt_id_col=alternative_id, obs_id_col=observation_id, choice_col=choice, specification=basic_specification, model_type="MNL", names=basic_names) # Specify the initial values and method for the optimization. model_01_mnl.fit_mle( np.zeros(8)) # 8 is the total number of parameters to be esimtated # Look at the estimation results model_01_mnl.get_statsmodels_summary()
basic_names["built_since_jan2010"] = ['built_since_jan2010'] basic_specification["work_dist"] = [list(set(long_data['choice_id']))] basic_names["work_dist"] = ['work_dist'] for poiField in poiFields: basic_specification[poiField] = [list(set(long_data['choice_id']))] basic_names[poiField] = [poiField] # interation of work distance and vehcile ownership: insignificant in the model # basic_specification["work_dist_veh"] = [list(set(long_data['choice_id']))] # basic_names["work_dist_veh"] = ['work_dist_veh'] home_loc_mnl = pl.create_choice_model(data=long_data, alt_id_col='choice_id', obs_id_col='custom_id', choice_col='choice', specification=basic_specification, model_type="MNL", names=basic_names) print('Fitting Model') numCoef = sum([len(basic_specification[s]) for s in basic_specification]) home_loc_mnl.fit_mle(np.zeros(numCoef)) # Look at the estimation results print(home_loc_mnl.get_statsmodels_summary()) pickle.dump(home_loc_mnl, open(FITTED_HOME_LOC_MODEL_PATH, 'wb')) json.dump(rent_normalisation, open(RENT_NORM_PATH, 'w'))
    index=False)

choiceModelPUMA_spec = OrderedDict()
choiceModelPUMA_names = OrderedDict()
choiceModelPUMAsRegressors = [
    'puma_pop_per_sqm', 'income_disparity', 'work_dist', 'media_norm_rent',
    'num_houses'
] + [x for x in list(long_data_PUMA.columns) if x.endswith('_den')]

for var in choiceModelPUMAsRegressors:
    choiceModelPUMA_spec[var] = [list(set(long_data_PUMA['choice_id']))]
    choiceModelPUMA_names[var] = [var]

home_loc_mnl_PUMAs = pl.create_choice_model(data=long_data_PUMA,
                                            alt_id_col='choice_id',
                                            obs_id_col='custom_id',
                                            choice_col='choice',
                                            specification=choiceModelPUMA_spec,
                                            model_type="MNL",
                                            names=choiceModelPUMA_names)

print('\n[info] Fitting Upper Level Model')
numCoef = sum([len(choiceModelPUMA_spec[s]) for s in choiceModelPUMA_spec])

# pylogit may hit memory errors when calculating the Hessian matrix for standard errors
# in this model; if so, fall back to the no-Hessian approach and only do point estimation.
try:
    home_loc_mnl_PUMAs.fit_mle(np.zeros(numCoef))
    print(home_loc_mnl_PUMAs.get_statsmodels_summary())
    home_loc_mnl['home_loc_mnl_PUMAs'] = {
        'just_point': False,
        'model': home_loc_mnl_PUMAs
    }
except:
fig.savefig(
    str(pyprojroot.here("article/images/qq-plot-method-3.pdf")),
    dpi=500,
    bbox_inches="tight",
)
# -

# ## True Model

# +
# Estimate the basic MNL model, using the hessian and newton-conjugate gradient
mnl_model = cm.create_choice_model(
    data=data,
    alt_id_col="mode_id",
    obs_id_col="observation_id",
    choice_col="sim_choice",
    specification=mnl_specification,
    model_type="MNL",
    names=mnl_names,
)

num_vars = len(reduce(lambda x, y: x + y, mnl_names.values()))
mnl_model.fit_mle(np.zeros(num_vars), method="BFGS")
mnl_model.get_statsmodels_summary()
# -

# ## Model 1

# +
# Create my specification and variable names for the basic MNL model
# NOTE: - Keys should be variables within the long format dataframe.
spec = OrderedDict()
variable_names = OrderedDict()

Vars = ["TTME", "INVC", "INVT"]

spec["intercept"] = [1, 2, 3]
variable_names["intercept"] = ["ASC Air", "ASC Train", "ASC Bus"]

for var in Vars:
    spec[var] = [[1, 2, 3, 4]]
    variable_names[var] = [var]

spec["HINC"] = [4]
variable_names["HINC"] = ["HINC for Car"]

spec["TTME"] = [[1], [2, 3, 4]]
variable_names["TTME"] = ["TTME for Air", "TTME for Train/Bus/Car"]

model = pl.create_choice_model(data=data,
                               alt_id_col="ALT",
                               obs_id_col="Group",
                               choice_col="MODE",
                               specification=spec,
                               model_type="MNL",
                               names=variable_names)
model.fit_mle(np.zeros(8))
model.print_summaries()

# # Retrieve model parameters
# print("\n\nFollowing are attributes of the model object")
# print(dir(model))
# print("\nFollowing are the coefficients")
# print(model.params.values)
# print("\nFollowing are the p-values")
# print(model.pvalues.values)
df[col] = df[col].astype(float)
spec[col] = [[1, 2, 3]]
spec_names[col] = [col]

if profile:
    ini_ram = curr_ram()
    profiler = Profiler().start()

np.random.seed(0)

# Prints are temporarily disabled as pylogit is excessively verbose
sys.stdout, sys.stderr = io.StringIO(), io.StringIO()  # Disable print
model = pl.create_choice_model(data=df,
                               alt_id_col=alt_id_col,
                               obs_id_col=obs_id_col,
                               choice_col=choice_col,
                               specification=spec,
                               mixing_vars=mixing_vars,
                               model_type="Mixed Logit",
                               names=spec_names,
                               mixing_id_col=mixing_id_col)
model.fit_mle(init_vals=np.zeros(len(varnames) + len(mixing_vars)),
              num_draws=n_draws, seed=123)
sys.stdout, sys.stderr = sys.__stdout__, sys.__stderr__  # Enable print

if profile:
    elapsed, max_ram, max_gpu = profiler.stop()
    print("{:6} {:7.2f} {:11.2f} {:7.3f} {:7.3f} {}".format(
        n_draws, elapsed, model.log_likelihood, max_ram - ini_ram, max_gpu,
        model.estimation_success))
    profiler.export('pylogit', dataset, n_draws, elapsed,
# Set up the model specification.
# pylogit is configured through ordered dictionaries.
# Each key is a variable to include in the model and must be a column name in the data;
# the value for each key is a list, and every element of that list corresponds to one
# model coefficient.
# We can also define a variable-names dictionary that uses the same keys as the
# specification dictionary; its values are also lists, giving the name of each model
# coefficient as a string.
# Importantly, when a coefficient is generic (held constant across alternatives), use an
# inner list to indicate which alternatives' utilities include that coefficient.
spec = OrderedDict()
var_names = OrderedDict()
vars = ['L', 'A', 'B']
for var in vars:
    spec[var] = [[1, 2, 3, 4, 5]]
    var_names[var] = ['beta of ' + var]

# Build the model object.
# data is the dataset; alt_id_col is the column identifying the alternative name or ID;
# obs_id_col is the column identifying the choice-situation (observation) ID; choice_col
# is the 0/1 column identifying the chosen alternative.
# specification is the ordered dictionary set up above; model_type 'MNL' gives a standard
# logit model; names holds the variable names set up above.
model = pl.create_choice_model(data=data,
                               alt_id_col='ALT',
                               obs_id_col='GROUP',
                               choice_col='MODE',
                               specification=spec,
                               model_type='MNL',
                               names=var_names)

# Note that estimation requires initial values for the parameters; they can be set to zero.
model.fit_mle(np.zeros(3))
model.print_summaries()
# The results include the log-likelihood, R-squared, the three parameter estimates,
# t-tests, p-values, robust statistics, etc.

# Use dir() to see more of what the model object exposes
print(dir(model))
# print(model.chi_square)
print(model.coefs)
# print(model.cov)
# print(model.params.values)
# print(model.pvalues)

# To predict with the model, pass in data organized in the same way
# print(model.predict(data.iloc[0:5]))