def test__bfgs_optimization():
    """
    Ensure that the bfgs optimization properly processes the input for one
    iteration. The value of 0.4044 was computed by hand for comparison
    purposes.

    The module-level fixtures ``X`` and ``y`` are reshaped to
    ``(N, J, K)`` and ``(N, J)`` before being handed to the optimizer.
    """
    X_, y_ = X.reshape(N, J, K), y.reshape(N, J)
    betas = np.array([.1, .1])
    model = MultinomialLogit()
    res = model._bfgs_optimization(betas, X_, y_, None, None, 0)

    # Compare the value against approx(expected). Calling
    # ``pytest.approx(actual, expected)`` would pass ``expected`` as the
    # relative tolerance and the bare ``assert`` would always succeed.
    assert res['fun'] == pytest.approx(0.40443136)
def test_fit():
    """
    Ensures the log-likelihood works for a single iteration with the
    default initial coefficients. The value of 0.4044 was computed by hand
    for comparison purposes.

    ``fit`` is called with ``maxiter=0`` and ``verbose=0`` on the
    module-level fixtures.
    """
    model = MultinomialLogit()
    model.fit(X, y, varnames=varnames, alts=alts, ids=ids, maxiter=0,
              verbose=0)

    # The reported log-likelihood must be compared to approx(expected);
    # asserting the approx object itself is always truthy.
    assert model.loglikelihood == pytest.approx(-0.40443136)
def test_log_likelihood():
    """
    Computes the log-likelihood "by hand" for a simple example and ensures
    that the one returned by xlogit is the same.
    """
    X_, y_ = X.reshape(N, J, K), y.reshape(N, J)
    betas = np.array([.1, .1])

    # Compute log likelihood using xlogit
    model = MultinomialLogit()
    obtained_loglik, _, _ = model._loglik_and_gradient(betas, X_, y_,
                                                       None, None)

    # Compute expected log likelihood "by hand": softmax over the
    # alternatives axis, keep the probability of the chosen alternative
    # (selected by multiplying with y_), then sum the logs over samples.
    eXB = np.exp(X_.dot(betas))
    probabilities = eXB / np.sum(eXB, axis=1, keepdims=True)
    expected_loglik = np.sum(np.log(np.sum(probabilities * y_, axis=1)))

    # Compare obtained vs. approx(expected); ``assert pytest.approx(a, b)``
    # never fails because it only checks the truthiness of the approx
    # object.
    assert obtained_loglik == pytest.approx(expected_loglik)
def example9_run():
    """
    Fit a MultinomialLogit with an intercept on the electricity (long
    format) example data and print the model summary.
    """
    data_url = (
        "https://raw.githubusercontent.com/arteagac/xlogit/master/"
        "examples/data/electricity_long.csv"
    )
    frame = pd.read_csv(data_url)

    attribute_names = ["pf", "cl", "loc", "wk", "tod", "seas"]

    # These two lookups are not consumed by the fit below; they are kept so
    # the same columns are still accessed as before.
    _design_values = frame[attribute_names].values
    _choice_situation = frame['chid']

    chosen = frame['choice'].values
    alternatives = [1, 2, 3, 4]

    np.random.seed(123)

    estimator = MultinomialLogit()
    estimator.fit(
        X=frame[attribute_names],
        y=chosen,
        varnames=attribute_names,
        isvars=[],
        alts=alternatives,
        fit_intercept=True,
    )
    estimator.summary()
def example10_run():
    """
    Fit a MultinomialLogit on ``Final_HBW_WC_Long.csv`` and print the
    summary.

    Several attributes are interacted with alternative masks before
    fitting, so each one only enters the utility of the alternatives it is
    zeroed-in on. The model is fitted with an intercept, ``hess=False``
    and a loose gradient tolerance (``gtol=1e-1``).
    """
    df = pd.read_csv("xlogitprit/examples_prit/Final_HBW_WC_Long.csv")

    # Mask shared by every attribute that only applies to the
    # 'w2pt', 'pr' and 'kr' alternatives (computed once instead of three
    # times).
    is_pt = df['alt'].isin(['w2pt', 'pr', 'kr'])

    # Accessibility time, only for the masked alternatives
    df['ACT_PT'] = df['act'] * is_pt
    # Waiting time, only for the masked alternatives
    df['WT_PT'] = df['wt'] * is_pt
    df['EMP_DENS'] = df['emp_dens'] * is_pt
    df['ADUL_VEH'] = df['adul_veh'] * df['alt'].isin(['cad', 'pr'])

    # To be provided by the user
    varnames = ['tt', 'tc', 'ACT_PT', 'WT_PT', 'EMP_DENS', 'ADUL_VEH']

    model = MultinomialLogit()
    model.fit(
        X=df[varnames],
        y=df['Chosen_Mode'],
        varnames=varnames,
        isvars=[],
        alts=df['alt'],
        ids=df['TRIPID'],
        fit_intercept=True,
        hess=False,
        gtol=1e-1,
    )
    model.summary()
def example6_run():
    """
    Fit a MultinomialLogit on 80% of the ids in the fishing (long format)
    data, print the summary, and evaluate the validation log-likelihood on
    the remaining 20%.

    The split is done by the ``id`` column so that all rows sharing an id
    land on the same side of the split. No random seed is set, so the
    split differs between runs.
    """
    df = pd.read_csv(
        "https://raw.githubusercontent.com/arteagac/xlogit/master/"
        "examples/data/fishing_long.csv"
    )

    varnames = ['price', 'catch', 'income']
    X = df[varnames].values
    y = df['choice'].values
    isvarnames = ['income', 'catch']
    alts = [1, 2, 3, 4]

    model = MultinomialLogit()

    # Sample the training ids from the ids that actually occur in the data.
    # Sampling from range(N) is only right when ids are exactly 0..N-1; for
    # 1-based ids it would never select the last id and would waste a draw
    # on the non-existent id 0.
    row_ids = df['id'].values
    unique_ids = np.unique(row_ids)
    training_size = int(0.8 * len(unique_ids))
    train_ids = np.random.choice(unique_ids, training_size, replace=False)

    # Vectorised membership test; rows keep their original order.
    in_train = np.isin(row_ids, train_ids)
    X_train, y_train = X[in_train], y[in_train]
    X_test, y_test = X[~in_train], y[~in_train]

    model.fit(
        X_train,
        y_train,
        varnames=varnames,
        alts=alts,
        fit_intercept=True,
        isvars=isvarnames,
    )
    model.summary()
    model.validation_loglik(X_test, y_test)
def example4_run():
    """
    Fit a MultinomialLogit with an intercept and a Box-Cox transformation
    on the electricity (long format) example data, then print the summary.
    """
    source = ("https://raw.githubusercontent.com/arteagac/xlogit/master/"
              "examples/data/electricity_long.csv")
    data = pd.read_csv(source)

    columns = ["pf", "cl", "loc", "wk", "tod", "seas"]
    design = data[columns].values
    chosen = data['choice'].values

    # Not passed to the fit; the lookup is kept so the column is still
    # accessed as before.
    _choice_situation = data['chid']

    np.random.seed(123)

    estimator = MultinomialLogit()
    estimator.fit(
        X=design,
        y=chosen,
        varnames=columns,
        isvars=[],
        alts=[1, 2, 3, 4],
        fit_intercept=True,
        transformation="boxcox",
        maxiter=1000,
        gtol=1e-3,
    )
    estimator.summary()
def example5_run():
    """
    Print the covariance matrix of the electricity attributes, then fit a
    MultinomialLogit with an intercept on them and print the summary.
    """
    source = ("https://raw.githubusercontent.com/arteagac/xlogit/master/"
              "examples/data/electricity_long.csv")
    data = pd.read_csv(source)

    columns = ["pf", "cl", "loc", "wk", "tod", "seas"]
    design = data[columns].values
    chosen = data['choice'].values

    # Not passed to the fit; the lookup is kept so the column is still
    # accessed as before.
    _choice_situation = data['chid']

    alternatives = [1, 2, 3, 4]
    np.random.seed(123)

    # np.cov treats rows as variables, hence the transpose of the
    # (rows x attributes) matrix.
    attribute_cov = np.cov(design.T)
    print('covariance', attribute_cov)

    estimator = MultinomialLogit()
    # The variable names are passed positionally, as the third argument.
    estimator.fit(
        design,
        chosen,
        columns,
        alts=alternatives,
        fit_intercept=True,
        isvars=[],
    )
    estimator.summary()
def example13_run():
    """
    Fit a MultinomialLogit on ``Final_HBW_WC_Long.csv`` using a randomly
    generated alternative-specific / generic coefficient structure, then
    print the summary.

    Each base variable is split into one column per randomly drawn group
    of alternatives (see ``df_coeff_col``); the resulting columns are the
    variables passed to the model.
    """
    df = pd.read_csv("xlogitprit/examples_prit/Final_HBW_WC_Long.csv")

    # Mask shared by every attribute that only applies to the
    # 'w2pt', 'pr' and 'kr' alternatives.
    is_pt = df['alt'].isin(['w2pt', 'pr', 'kr'])

    # Accessibility time, only for the masked alternatives
    df['ACT_PT'] = df['act'] * is_pt
    # Waiting time, only for the masked alternatives
    df['WT_PT'] = df['wt'] * is_pt
    df['EMP_DENS'] = df['emp_dens'] * is_pt
    df['ADUL_VEH'] = df['adul_veh'] * df['alt'].isin(['cad', 'pr'])

    # To be provided by the user
    varnames = ['tt', 'tc', 'ACT_PT', 'WT_PT', 'EMP_DENS', 'ADUL_VEH']
    choice_set = ['cad', 'cap', 'w2pt', 'pr', 'kr', 'cycle', 'walk']
    alt_var = df['alt']

    def df_coeff_col(seed, dataframe, names_asvars, choiceset, var_alt):
        """
        Randomly assign a coefficient structure to each variable and add
        the matching interaction columns to ``dataframe`` (in place).

        For every variable, each alternative in ``choiceset`` receives a
        random integer label; alternatives sharing a label form a group
        that shares one coefficient. A group smaller than the whole choice
        set gets a new column ``<var>_<alt1>_<alt2>...`` equal to the
        variable on rows whose ``var_alt`` is in the group and 0
        elsewhere. A group covering every alternative keeps the original
        (generic) variable.

        Returns the list of column names to use as model variables.
        """
        np.random.seed(seed)
        n_alts = len(choiceset)
        # One column of random group labels per variable.
        random_matrix = np.random.randint(
            1, n_alts + 1, (n_alts, len(names_asvars)))

        asvars_new = []
        for name, labels in zip(names_asvars, random_matrix.T):
            # Positions of the alternatives grouped by label, in order of
            # first appearance (sort=False).
            grouped_positions = pd.Series(range(n_alts)).groupby(
                labels, sort=False).apply(list).tolist()

            for positions in grouped_positions:
                group_alts = [choiceset[pos] for pos in positions]
                if len(group_alts) < n_alts:
                    # Alternative-specific: dummy-interacted column.
                    column = name + '_' + '_'.join(group_alts)
                    dataframe[column] = dataframe[name] * np.isin(
                        var_alt, group_alts)
                    asvars_new.append(column)
                else:
                    # Generic coefficient: reuse the variable as is.
                    asvars_new.append(name)
        return asvars_new

    varnames = df_coeff_col(1, df, varnames, choice_set, alt_var)

    model = MultinomialLogit()
    model.fit(
        X=df[varnames],
        y=df['Chosen_Mode'],
        varnames=varnames,
        isvars=[],
        alts=alt_var,
        ids=df['TRIPID'],
        fit_intercept=True,
    )
    model.summary()
#To be provided by the user choice_id = df['TRIPID'] ind_id = df['TRIPID'] varnames = ['tt', 'tc', 'ACT_PT', 'WT_PT', 'EMP_DENS', 'ADUL_VEH'] # asvarnames = ['tt','tc','ACT_PT', 'WT_PT', 'EMP_DENS','ADUL_VEH'] isvarnames = [] X = df[varnames].values y = df['Chosen_Mode'].values choice_set = ['cad', 'cap', 'w2pt', 'pr', 'kr', 'cycle', 'walk'] choice_var = df['Chosen_Mode'] alt_var = df['alt'] randvars = {'EMP_DENS': 'n', 'WT_PT': 'u'} R = 200 Tol = 1e-6 model = MultinomialLogit() # init_coeff = [-2, -4, -3, -2, -1, -1, 0, 0, -0, -0.01, 0.0001, -1.15] model.fit( X=df[varnames], y=choice_var, varnames=varnames, # init_coeff=np.repeat(.1, 11), isvars=isvarnames, alts=alt_var, ids=choice_id, # gtol=1e-1, # randvars=randvars, fit_intercept=True, hess=False # gtol=1e-1, # weights=[1, 1, 10, 10, 10, 100, 1] ) #, init_coeff=init_coeff, tol=1e-2) #hess=False, grad=False) model.summary()
(df['alt'] == 'sm')) # Coefficient Age for train df['age_train'] = df['AGE'] * (df['alt'] == 'train') # Coefficient Luggage for car df['luggage_car'] = df['LUGGAGE'] * (df['alt'] == 'car') # Coefficient seatsconfig for car df['seats'] = df['seatconf'] * (df['alt'] == 'sm') varnames = [ 'asc_train', 'asc_car', 'cost', 'time', 'luggage_car', 'he_sm_train', 'seats', 'ga_sm_train', 'age_train' ] model = MultinomialLogit() model.fit( X=df[varnames], y=df['CHOICE'], varnames=varnames, alts=df['alt'], ids=df['custom_id'], avail=df['av'], weights=np.ones(2) # randvars={'cost': 'n'}, # transvars=['luggage_car'] # init_coeff=np.random.normal(0, 1, 9) # scipy_optimisation=False # method="L-BFGS-B" # tol=1e-3 )
# df['tod'] = -df['tod'] # df['seas'] = -df['seas'] # print('sum', sum(np.where(df))) # print("df['seas']", df['seas']) # print(1/0) X = df[varnames].values y = df['choice'].values choice_id = df['chid'] alt = [1, 2, 3, 4] np.random.seed(123) print('covariance', np.cov(np.transpose(X))) # print(1/0) model = MultinomialLogit() model.fit( X, y, varnames, alts=alt, # randvars={'seas': 'ln', 'wk': 'n', 'pf': 'n', 'loc': 'n'}, fit_intercept=True, # transformation="boxcox", # transvars=['wk', 'seas'], # correlation=True, # ids=choice_id, # panels=df.id.values, # tol=1e-4, # grad=False, # hess=False,
def example11_run():
    """
    Reshape the swissmetro training data from wide to long format, build
    the alternative-specific variables, fit a MultinomialLogit with
    availability information, and print the summary.
    """
    df_wide = pd.read_csv("xlogitprit/examples/data/swissmetro_training.csv")
    df_wide['custom_id'] = np.arange(len(df_wide))  # Add unique identifier

    # Rename some columns to the "<stub>_<alt>" pattern expected by
    # pandas.wide_to_long.
    df_wide.rename(columns={
        "TRAIN_TT": "time_train",
        "SM_TT": "time_sm",
        "CAR_TT": "time_car",
        "TRAIN_CO": "cost_train",
        "SM_CO": "cost_sm",
        "CAR_CO": "cost_car",
        "TRAIN_HE": "headway_train",
        "SM_HE": "headway_sm",
        "SM_SEATS": "seatconf_sm",
        "TRAIN_AV": "av_train",
        "SM_AV": "av_sm",
        "CAR_AV": "av_car",
    }, inplace=True)

    # Convert from wide to long format using pandas. The suffix pattern is
    # a raw string so that "\w" is not treated as an (invalid) string
    # escape sequence.
    df = pd.wide_to_long(
        df_wide,
        ["time", "cost", "headway", "seatconf", "av"],
        i="custom_id",
        j="alt",
        sep="_",
        suffix=r'\w+',
    ).sort_values(by=['custom_id', 'alt']).reset_index()

    # Fill values that do not exist for some alternatives
    df = df.fillna(0)

    # Format the outcome variable appropriately
    df["CHOICE"] = df["CHOICE"].map({1: 'train', 2: 'sm', 3: 'car'})
    # Convert CHOICE to True if alternative was selected; False otherwise
    df["CHOICE"] = df["CHOICE"] == df["alt"]

    # Create model specification
    is_train = df['alt'] == 'train'
    is_sm = df['alt'] == 'sm'
    is_car = df['alt'] == 'car'

    # Alternative Specific Constants
    df['asc_train'] = np.ones(len(df)) * is_train
    df['asc_car'] = np.ones(len(df)) * is_car
    # Coefficient GA for swissmetro and train
    df['ga_sm_train'] = df['GA'] * (is_train | is_sm)
    # Coefficient headway for swissmetro and train
    df['he_sm_train'] = df['headway'] * (is_train | is_sm)
    # Coefficient Age for train
    df['age_train'] = df['AGE'] * is_train
    # Coefficient Luggage for car
    df['luggage_car'] = df['LUGGAGE'] * is_car
    # Coefficient seats configuration for swissmetro
    df['seats'] = df['seatconf'] * is_sm

    varnames = [
        'asc_train', 'asc_car', 'cost', 'time', 'luggage_car',
        'he_sm_train', 'seats', 'ga_sm_train', 'age_train',
    ]

    model = MultinomialLogit()
    model.fit(
        X=df[varnames],
        y=df['CHOICE'],
        varnames=varnames,
        alts=df['alt'],
        ids=df['custom_id'],
        avail=df['av'],
    )
    model.summary()
import pandas as pd
import numpy as np
from xlogitprit import MultinomialLogit

# Example script: MultinomialLogit on the fishing (long format) data with
# 'price' and 'catch' passed as transformed variables and no intercept.
df = pd.read_csv(
    "https://raw.githubusercontent.com/arteagac/xlogit/master/"
    "examples/data/fishing_long.csv"
)

# Model inputs
varnames = ['price', 'catch']
X = df[varnames]
y = df['choice']
alts = [1, 2, 3, 4]

# Variable-role definitions; only isvarnames is passed to the fit below.
asvarnames = ['price', 'catch']
isvarnames = []
rand_vars = {'price': 'n'}

model = MultinomialLogit()
model.fit(
    X,
    y,
    varnames=varnames,
    alts=alts,
    isvars=isvarnames,
    transvars=['price', 'catch'],
    fit_intercept=False,
)
model.summary()
sep="_", suffix='\w+').sort_values(by=['custom_id', 'alt']).reset_index() # Fill unexisting values for some alternatives df = df.fillna(0) # Format the outcome variable approapriatly df["CHOICE"] = df["CHOICE"].map({1: 'train', 2: 'sm', 3: 'car'}) # Convert CHOICE to True if alternative was selected; False otherwise df["CHOICE"] = df["CHOICE"] == df["alt"] # Scale variables df['time'] = df['time'] / 100 train_pass = ((df["GA"] == 1) & (df["alt"].isin(['train', 'sm']))).astype(int) df['cost'] = df['cost'] * (train_pass == 0) / 100 # Create alternative specific constants df['asc_train'] = np.ones(len(df)) * (df['alt'] == 'train') df['asc_car'] = np.ones(len(df)) * (df['alt'] == 'car') varnames = [ 'asc_car', 'asc_train', 'cost', 'time', 'luggage_car', 'he_sm_train', 'seats', 'ga_sm_train', 'age_train' ] model = MultinomialLogit() model.fit(X=df[varnames], y=df['CHOICE'], varnames=varnames, alts=df['alt']) # model = MixedLogit() # model.fit(X=df[varnames], y=df['CHOICE'], varnames=varnames, alts=df['alt'], # # transvars=['cost'], # ids=df['custom_id'], avail=df['av'], randvars={'time': 'n'}, n_draws=2000, # # tol=1e-10 # ) model.summary()
import pandas as pd
import time
import matplotlib.pyplot as plt

# Example script: MultinomialLogit on the electricity (long format) data
# with a Box-Cox transformation applied to 'cl' and no intercept.

# To be provided by the user
df = pd.read_csv(
    "https://raw.githubusercontent.com/timothyb0912/pylogit/master/"
    "examples/data/electricity_r_data_long.csv"
)

# Identifier, choice and alternative columns
choice_id = df['chid']
ind_id = df['id']
choice_var = df['choice']
alt_var = df['alt']

varnames = ['cl', 'loc', 'wk', 'tod', 'seas']
alternatives = [1, 2, 3, 4]

# Settings that are defined here but not passed to the fit below.
R = 200
dist = ['n', 'ln', 'tn', 'u', 't', 'f']

model = MultinomialLogit()
model.fit(
    X=df[varnames],
    y=choice_var,
    varnames=varnames,
    alts=alt_var,
    ids=choice_id,
    transformation="boxcox",
    transvars=['cl'],
    fit_intercept=False,
)
model.summary()