# Imports these estimation routines rely on; the urbanchoice/utils module
# paths follow the old synthicity layout and may differ in your tree.
import time

import numpy as np
import pandas as pd

from synthicity.urbanchoice import interaction, mnl
from synthicity.utils import misc

SAMPLE_SIZE = 100  # module default; overridden by config['alt_sample_size']


def estimate(dset, config, year, show=True, variables=None):
    choosers = fetch_table(dset, config)
    if 'est_sample_size' in config:
        # estimate on a random subsample of the choosers
        choosers = choosers.ix[np.random.choice(
            choosers.index, config['est_sample_size'], replace=False)]

    output_csv, output_title, coeff_name, output_varname = config["output_names"]

    assert 'alternatives' in config
    alternatives = eval(config['alternatives'])
    alternatives = merge(dset, alternatives, config)

    t1 = time.time()

    # default to a single unsegmented estimation over all choosers
    segments = [(None, choosers)]
    if 'segment' in config:
        for varname in config['segment']:
            if varname not in choosers.columns:
                choosers[varname] = calcvar(choosers, config, dset, varname)
        segments = choosers.groupby(config['segment'])

    for name, segment in segments:
        if name is not None:
            # interpolate the segment name into the output names
            name = str(name)
            tmp_outcsv, tmp_outtitle, tmp_coeffname = (
                output_csv % name, output_title % name, coeff_name % name)
        else:
            tmp_outcsv, tmp_outtitle, tmp_coeffname = (
                output_csv, output_title, coeff_name)

        assert "dep_var" in config
        depvar = config["dep_var"]

        global SAMPLE_SIZE
        SAMPLE_SIZE = config.get("alt_sample_size", SAMPLE_SIZE)

        # sample alternatives and build the chooser-alternative interaction set
        sample, alternative_sample, est_params = \
            interaction.mnl_interaction_dataset(
                segment, alternatives, SAMPLE_SIZE, chosenalts=segment[depvar])

        print "Estimating parameters for segment = %s, size = %d" % \
            (name, len(segment.index))

        data = spec(alternative_sample, config, submodel=name)
        if show:
            print data.describe()
        data = data.as_matrix()

        fnames = config.get('ind_var_names', config['ind_vars'])

        fit, results = interaction.estimate(data, est_params, SAMPLE_SIZE)

        fnames = interaction.add_fnames(fnames, est_params)
        if show:
            print misc.resultstotable(fnames, results)
        misc.resultstocsv(fit, fnames, results, tmp_outcsv, tblname=tmp_outtitle)
        dset.store_coeff(tmp_coeffname, zip(*results)[0], fnames)

    print "Finished executing in %f seconds" % (time.time() - t1)
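# Hedged usage sketch for the config-driven estimate() above. Every key
# mirrors one the function reads; the table names, variable names, and the
# dset object are hypothetical stand-ins, not part of the original module.
def example_config_estimation(dset):
    config = {
        "output_names": ("hlcm-coeff-%s.csv",     # per-segment results CSV
                         "HLCM Coefficients %s",  # report table title
                         "hh_location_%s",        # key for dset.store_coeff
                         "household_building_ids"),
        "alternatives": "dset.buildings",         # eval'd by estimate()
        "dep_var": "building_id",
        "segment": ["income_3_tenure"],           # optional chooser segmentation
        "est_sample_size": 10000,                 # optional chooser subsample
        "alt_sample_size": 100,                   # alternatives sampled per chooser
        "ind_vars": ["residential_units", "unit_price", "dist_to_cbd"],
    }
    estimate(dset, config, 2010, show=True)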
def estimate(dset, indvars, depvar='building_id', alternatives=None,
             SAMPLE_SIZE=100, max_segment_size=1200,
             estimation_table='households_for_estimation',
             output_names=None, agents_groupby=['income_3_tenure']):
    # HLCM estimation
    output_csv, output_title, coeff_name, output_varname = output_names
    choosers = dset.fetch(estimation_table)
    segments = choosers.groupby(agents_groupby)
    num_segments = len(segments.size().index)
    if num_segments != len(indvars):
        print "ERROR: number of segments does not match number of sets of independent variables"
    # map each segment name to the index of its variable list in indvars
    indvar_dict = dict(zip(segments.size().index.values, range(num_segments)))
    alts = alternatives
    for name, segment in segments:
        ind_vars = indvars[indvar_dict[name]]
        name = str(name)
        tmp_outcsv, tmp_outtitle, tmp_coeffname = (
            output_csv % name, output_title % name, coeff_name % name)
        if len(segment[depvar]) > max_segment_size:
            # reduce the segment size if too big so estimation doesn't bog down
            segment = segment.ix[np.random.choice(
                segment.index, max_segment_size, replace=False)]
        sample, alternative_sample, est_params = \
            interaction.mnl_interaction_dataset(
                segment, alts, SAMPLE_SIZE, chosenalts=segment[depvar])
        # build interaction variables named 'a_x_b' as the product a * b
        interaction_vars = [(var, var.split('_x_'))
                            for var in ind_vars if '_x_' in var]
        for ivar in interaction_vars:
            alternative_sample[ivar[0]] = (alternative_sample[ivar[1][0]] *
                                           alternative_sample[ivar[1][1]])
        print "Estimating parameters for segment = %s, size = %d" % \
            (name, len(segment.index))
        if len(segment.index) > 50:
            est_data = pd.DataFrame(index=alternative_sample.index)
            for varname in ind_vars:
                est_data[varname] = alternative_sample[varname]
            est_data = est_data.fillna(0)
            data = est_data.as_matrix()
            try:
                fit, results = interaction.estimate(data, est_params,
                                                    SAMPLE_SIZE)
                fnames = interaction.add_fnames(ind_vars, est_params)
                print misc.resultstotable(fnames, results)
                misc.resultstocsv(fit, fnames, results, tmp_outcsv,
                                  tblname=tmp_outtitle)
                # append small placeholder coefficients for the county
                # dummies (not estimated here) so the simulation-side
                # variable list lines up
                county_names = ['county8001', 'county8005', 'county8013',
                                'county8014', 'county8019', 'county8031',
                                'county8035', 'county8039', 'county8047',
                                'county8059', 'county8123']
                coefficients = zip(*results)[0] + (.0001,) * len(county_names)
                varnames = fnames + county_names
                dset.store_coeff(tmp_coeffname, coefficients, varnames)
            except:
                print 'SINGULAR MATRIX OR OTHER DATA/ESTIMATION PROBLEM'
        else:
            print 'SAMPLE SIZE TOO SMALL'
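# Hedged example call for the HLCM estimate() above. It assumes
# income_3_tenure takes exactly three values, so indvars supplies one
# variable list per segment in groupby order; all names are illustrative.
def example_hlcm_estimation(dset):
    indvars = [['residential_units', 'unit_price'],
               ['residential_units', 'unit_price_x_residential_units'],
               ['unit_price', 'dist_to_cbd']]
    estimate(dset, indvars,
             depvar='building_id',
             alternatives=dset.fetch('buildings'),
             SAMPLE_SIZE=100,
             max_segment_size=1200,
             estimation_table='households_for_estimation',
             output_names=("hlcm-coeff-%s.csv", "HLCM Coefficients %s",
                           "hh_location_%s", "household_building_ids"),
             agents_groupby=['income_3_tenure'])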
# Module-level settings assumed by the defaults below; the values here are
# placeholders, adjust to taste.
GPU = 0
EMTOL = 1e-02
MAXITER = 10000


def lcmnl_estimate(cmdata, numclasses, csdata, numalts, chosen,
                   maxiter=MAXITER, emtol=EMTOL, skipprep=False,
                   csbeta=None, cmbeta=None, csfnames=None, cmfnames=None):
    loglik = -999999
    l_0 = None
    if csbeta is None:
        csbeta = [np.random.rand(csdata.shape[1]) for i in range(numclasses)]
    if csfnames is None:
        csfnames = ['cs%d' % i for i in range(csdata.shape[1])]
    if cmfnames is None:
        cmfnames = ['cm%d' % i for i in range(cmdata.shape[1])]
    if not skipprep:
        cmdata, cmfnames = prep_cm_data(cmdata, numclasses, cmfnames)
    if cmbeta is None:
        # random starting values in [-5, 5)
        cmbeta = np.random.rand(cmdata.shape[1]) * 10.0 - 5.0

    # EXPECTATION step: given the current betas, compute each observation's
    # class weights and the mixed log-likelihood
    def expectation(cmbeta, csbeta):
        print "Running class membership model"
        cmprobs = mnl.mnl_simulate(cmdata, cmbeta, numclasses,
                                   GPU=GPU, returnprobs=1)
        csprobs = []
        for cno in range(numclasses):
            tmp = mnl.mnl_simulate(csdata, csbeta[cno], numalts,
                                   GPU=GPU, returnprobs=1)
            tmp = np.sum(tmp * chosen, axis=1)  # keep only chosen probs
            csprobs.append(np.reshape(tmp, (-1, 1)))
        csprobs = np.concatenate(csprobs, axis=1)

        h = csprobs * cmprobs
        loglik = np.sum(np.log(np.sum(h, axis=1)))
        wts = h / np.reshape(np.sum(h, axis=1), (-1, 1))
        return loglik, wts

    results_d = {}

    for i in range(maxiter):
        print "Running iteration %d" % (i + 1)
        print time.ctime()

        oldloglik = loglik
        loglik, wts = expectation(cmbeta, csbeta)
        if l_0 is None:
            l_0 = loglik
        print "current cmbeta", cmbeta
        print "current csbeta", csbeta
        print "current loglik", loglik, i + 1, "\n\n"
        if abs(loglik - oldloglik) < emtol:
            break

        # MAXIMIZATION step: re-estimate each class-specific model with the
        # expectation weights, then the class membership model
        for cno in range(numclasses):
            print "Estimating class specific model for class %d" % (cno + 1)
            t1 = time.time()
            weights = np.reshape(wts[:, cno], (-1, 1))
            fit, results = mnl.mnl_estimate(csdata, chosen, numalts, GPU=GPU,
                                            weights=weights, beta=csbeta[cno])
            print "Finished in %fs" % (time.time() - t1)
            csbeta[cno] = zip(*results)[0]
            results_d['cs%d' % cno] = results

        print "Estimating class membership model"
        t1 = time.time()
        fit, results = mnl.mnl_estimate(cmdata, None, numclasses, GPU=GPU,
                                        weights=wts, lcgrad=True,
                                        beta=cmbeta, coeffrange=(-1000, 1000))
        print "Finished in %fs" % (time.time() - t1)
        cmbeta = zip(*results)[0]
        results_d['cm'] = results

    l_1 = loglik
    # null log-likelihood: all betas set to zero
    l_0, _ = expectation(np.zeros(len(cmbeta)),
                         [np.zeros(len(b)) for b in csbeta])
    ll_ratio = 1 - (l_1 / l_0)
    print "Null Log-likelihood: %f" % l_0
    print "Log-likelihood at convergence: %f" % l_1
    print "Log-likelihood ratio: %f" % ll_ratio

    a = []
    fnames = []
    fnames += cmfnames
    a += results_d['cm']
    for i in range(numclasses):
        fnames += ['%s cls%d' % (s, i) for s in csfnames]
        a += results_d['cs%d' % i]
    print misc.resultstotable(fnames, a)
    fit = (l_0, l_1, ll_ratio)
    misc.resultstocsv(fit, fnames, a, "lc-coeff.csv",
                      tblname="Latent Class Model Coefficients")

    return (l_0, l_1, ll_ratio), results_d
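# Synthetic smoke test for lcmnl_estimate() above. Random covariates and a
# one-hot chosen matrix; shapes follow how the EM loop consumes them
# (csdata has one row per observation-alternative pair). The GPU/EMTOL/
# MAXITER module settings are the placeholders sketched above.
def example_lcmnl_run():
    numobs, numalts, numclasses = 200, 10, 2
    cmdata = np.random.rand(numobs, 3)            # class membership covariates
    csdata = np.random.rand(numobs * numalts, 4)  # class-specific covariates
    chosen = np.zeros((numobs, numalts))
    chosen[np.arange(numobs), np.random.randint(numalts, size=numobs)] = 1
    (l_0, l_1, ll_ratio), results_d = lcmnl_estimate(
        cmdata, numclasses, csdata, numalts, chosen)
    print "null loglik %f, converged loglik %f, ratio %f" % (l_0, l_1, ll_ratio)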
# Fragment of the ELCM estimation loop (same per-segment pattern as the
# HLCM routine above), followed by debug output of the stored coefficients.
        print segment.reset_index().building_id.describe()
        alts.index = alts.index.astype('int32')
        # optionally pass weight_var='non_residential_sqft' to weight sampling
        sample, alternative_sample, est_params = \
            interaction.mnl_interaction_dataset(
                segment, alts, SAMPLE_SIZE, chosenalts=segment[depvar])
        print "Estimating parameters for segment = %s, size = %d" % \
            (name, len(segment.index))
        if len(segment.index) > 50:
            est_data = pd.DataFrame(index=alternative_sample.index)
            for varname in ind_vars:
                est_data[varname] = alternative_sample[varname]
            est_data = est_data.fillna(0)
            data = est_data.as_matrix()
            try:
                fit, results = interaction.estimate(data, est_params,
                                                    SAMPLE_SIZE)
                fnames = interaction.add_fnames(ind_vars, est_params)
                print misc.resultstotable(fnames, results)
                misc.resultstocsv(fit, fnames, results, tmp_outcsv,
                                  tblname=tmp_outtitle)
                dset.store_coeff(tmp_coeffname, zip(*results)[0], fnames)
            except:
                print 'SINGULAR MATRIX OR OTHER DATA/ESTIMATION PROBLEM'
        else:
            print 'SAMPLE SIZE TOO SMALL'

# debug output: inspect the first few stored employment location coefficients
print dset.coeffs[('emp_location_6', 'coeffs')]
for i in range(6):
    print dset.coeffs[('emp_location_6', 'coeffs')][i]
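# Hedged sketch of reading stored coefficients back out for simulation,
# generalizing the debug prints above. Only the ('<name>', 'coeffs') key is
# confirmed by those prints; the ('<name>', 'fnames') key is an assumption
# about how store_coeff() lays out the store, and the helper is hypothetical.
def example_read_coeffs(dset, model_name='emp_location_6'):
    coeffs = dset.coeffs[(model_name, 'coeffs')]
    fnames = dset.coeffs[(model_name, 'fnames')]  # assumed companion key
    for fname, coeff in zip(fnames, coeffs):
        print "%s: %f" % (fname, coeff)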