# --- Example 1 ---
def estimate(dset,config,year,show=True,variables=None):

  choosers = fetch_table(dset,config)
  if 'est_sample_size' in config: 
    choosers = choosers.ix[np.random.choice(choosers.index, config['est_sample_size'],replace=False)]
  output_csv, output_title, coeff_name, output_varname = config["output_names"]
 
  assert 'alternatives' in config
  alternatives = eval(config['alternatives'])
  alternatives = merge(dset,alternatives,config)

  t1 = time.time()

  segments = [(None,choosers)]
  if 'segment' in config:
    for varname in config['segment']:
      if varname not in choosers.columns:
        choosers[varname] = calcvar(choosers,config,dset,varname)
    segments = choosers.groupby(config['segment'])
  for name, segment in segments:

    name = str(name)
    if name is not None: tmp_outcsv, tmp_outtitle, tmp_coeffname = output_csv%name, output_title%name, coeff_name%name
    else: tmp_outcsv, tmp_outtitle, tmp_coeffname = output_csv, output_title, coeff_name

    assert "dep_var" in config
    depvar = config["dep_var"]
    global SAMPLE_SIZE
    SAMPLE_SIZE = config["alt_sample_size"] if "alt_sample_size" in config else SAMPLE_SIZE 
    sample, alternative_sample, est_params = interaction.mnl_interaction_dataset(
                                        segment,alternatives,SAMPLE_SIZE,chosenalts=segment[depvar])

    print "Estimating parameters for segment = %s, size = %d" % (name, len(segment.index)) 

    data = spec(alternative_sample,config,submodel=name)
    if show: print data.describe()
    data = data.as_matrix()
    
    fnames = config['ind_vars']
    fnames = config['ind_var_names'] if 'ind_var_names' in config else fnames

    fit, results = interaction.estimate(data,est_params,SAMPLE_SIZE)
    
    fnames = interaction.add_fnames(fnames,est_params)
    if show: print misc.resultstotable(fnames,results)
    misc.resultstocsv(fit,fnames,results,tmp_outcsv,tblname=tmp_outtitle)
    dset.store_coeff(tmp_coeffname,zip(*results)[0],fnames)

  print "Finished executing in %f seconds" % (time.time()-t1)
# --- Example 2 ---
def estimate(dset,config,year,show=True,variables=None):

  choosers = fetch_table(dset,config)
  if 'est_sample_size' in config: 
    choosers = choosers.ix[np.random.choice(choosers.index, config['est_sample_size'],replace=False)]
  output_csv, output_title, coeff_name, output_varname = config["output_names"]
 
  assert 'alternatives' in config
  alternatives = eval(config['alternatives'])
  alternatives = merge(dset,alternatives,config)

  t1 = time.time()

  segments = [(None,choosers)]
  if 'segment' in config:
    for varname in config['segment']:
      if varname not in choosers.columns:
        choosers[varname] = calcvar(choosers,config,dset,varname)
    segments = choosers.groupby(config['segment'])
  for name, segment in segments:
    name = str(name)
    if name is not None: tmp_outcsv, tmp_outtitle, tmp_coeffname = output_csv%name, output_title%name, coeff_name%name
    else: tmp_outcsv, tmp_outtitle, tmp_coeffname = output_csv, output_title, coeff_name

    assert "dep_var" in config
    depvar = config["dep_var"]
    global SAMPLE_SIZE
    SAMPLE_SIZE = config["alt_sample_size"] if "alt_sample_size" in config else SAMPLE_SIZE 
    sample, alternative_sample, est_params = interaction.mnl_interaction_dataset(
                                        segment,alternatives,SAMPLE_SIZE,chosenalts=segment[depvar])

    print "Estimating parameters for segment = %s, size = %d" % (name, len(segment.index)) 

    data = spec(alternative_sample,config,submodel=name)
    if show: print data.describe()
    data = data.as_matrix()
    
    fnames = config['ind_vars']
    fnames = config['ind_var_names'] if 'ind_var_names' in config else fnames

    fit, results = interaction.estimate(data,est_params,SAMPLE_SIZE)
    
    fnames = interaction.add_fnames(fnames,est_params)
    if show: print misc.resultstotable(fnames,results)
    misc.resultstocsv(fit,fnames,results,tmp_outcsv,tblname=tmp_outtitle)
    dset.store_coeff(tmp_coeffname,zip(*results)[0],fnames)

  print "Finished executing in %f seconds" % (time.time()-t1)
# --- Example 3 ---
def estimate (dset,indvars,depvar = 'building_id',alternatives=None,SAMPLE_SIZE=100,max_segment_size = 1200,estimation_table = 'households_for_estimation',
              output_names=None,agents_groupby = None):
    """Estimate household location-choice (HLCM) coefficients by segment.

    Groups choosers by ``agents_groupby``, estimates one MNL per segment with
    its own set of independent variables, and stores the coefficients
    (padded with placeholder county-dummy values) on ``dset``.

    Args:
        dset: project dataset object (provides fetch/store_coeff).
        indvars: list with one list of variable names per segment.
        depvar: chosen-alternative id column in the estimation table.
        alternatives: DataFrame of alternatives to sample from.
        SAMPLE_SIZE: number of sampled alternatives per chooser.
        max_segment_size: cap on choosers per segment.
        estimation_table: name of the chooser table on ``dset``.
        output_names: (csv template, title template, coeff-name template,
            output varname) -- the first three take the segment name via %.
        agents_groupby: grouping column(s); defaults to ['income_3_tenure'].
    """
    ##HCLM ESTIMATION
    # FIX: default was a mutable list literal in the signature; use the
    # standard None sentinel instead (same effective default).
    if agents_groupby is None:
        agents_groupby = ['income_3_tenure',]
    output_csv, output_title, coeff_name, output_varname = output_names
    choosers = dset.fetch(estimation_table)

    segments = choosers.groupby(agents_groupby)
    num_segments = len(segments.size().index)
    if num_segments != len(indvars):
        print "ERROR: number of segments does not match number of sets of independent variable"
    indvar_dict = dict(zip(segments.size().index.values,range(num_segments)))
    alts = alternatives
    for name, segment in segments:
        ind_vars = indvars[indvar_dict[name]]
        name = str(name)
        tmp_outcsv, tmp_outtitle, tmp_coeffname = output_csv%name, output_title%name, coeff_name%name
        if len(segment[depvar]) > max_segment_size: #reduce size of segment if too big so things don't bog down
            segment = segment.ix[np.random.choice(segment.index, max_segment_size,replace=False)]
        #,weight_var='residential_units')
        sample, alternative_sample, est_params = interaction.mnl_interaction_dataset(segment,alts,SAMPLE_SIZE,chosenalts=segment[depvar])
        ##Interaction variables: an 'a_x_b' name is the product of columns a and b
        interaction_vars = [(var, var.split('_x_')) for var in ind_vars if '_x_' in var]
        for ivar in interaction_vars:
            alternative_sample[ivar[0]] = ((alternative_sample[ivar[1][0]])*alternative_sample[ivar[1][1]])

        print "Estimating parameters for segment = %s, size = %d" % (name, len(segment.index)) 
        if len(segment.index) > 50:
            est_data = pd.DataFrame(index=alternative_sample.index)
            for varname in ind_vars:
                est_data[varname] = alternative_sample[varname]
            est_data = est_data.fillna(0)
            data = est_data.as_matrix()
            try:
                fit, results = interaction.estimate(data, est_params, SAMPLE_SIZE)
                fnames = interaction.add_fnames(ind_vars,est_params)
                print misc.resultstotable(fnames,results)
                misc.resultstocsv(fit,fnames,results,tmp_outcsv,tblname=tmp_outtitle)
                # pad with one near-zero placeholder per county dummy below
                coefficients = zip(*results)[0]+(.0001,)*11
                varnames = fnames+['county8001','county8005','county8013','county8014','county8019','county8031','county8035','county8039','county8047','county8059','county8123']
                dset.store_coeff(tmp_coeffname,coefficients,varnames)
            # FIX: catch Exception, not bare except -- a bare except also
            # swallows KeyboardInterrupt/SystemExit.
            except Exception:
                print 'SINGULAR MATRIX OR OTHER DATA/ESTIMATION PROBLEM'
        else:
            print 'SAMPLE SIZE TOO SMALL'
# --- Example 4 ---
def lcmnl_estimate(cmdata,numclasses,csdata,numalts,chosen,maxiter=MAXITER,emtol=EMTOL,\
                     skipprep=False,csbeta=None,cmbeta=None,csfnames=None,cmfnames=None):

    loglik = -999999
    l_0 = None
    if csbeta is None:
        csbeta = [np.random.rand(csdata.shape[1]) for i in range(numclasses)]
    if csfnames is None:
        csfnames = ['cs%d' % i for i in range(csdata.shape[1])]
    if cmfnames is None:
        cmfnames = ['cm%d' % i for i in range(cmdata.shape[1])]
    if not skipprep:
        cmdata, cmfnames = prep_cm_data(cmdata, numclasses, cmfnames)
    if cmbeta is None: cmbeta = np.random.rand(cmdata.shape[1]) * 10.0 - 5.0
    results_d = {}

    for i in range(maxiter):
        print "Running iteration %d" % (i + 1)
        print time.ctime()

        # EXPECTATION
        def expectation(cmbeta, csbeta):
            print "Running class membership model"
            cmprobs = mnl.mnl_simulate(cmdata,
                                       cmbeta,
                                       numclasses,
                                       GPU=GPU,
                                       returnprobs=1)

            csprobs = []
            for cno in range(numclasses):
                tmp = mnl.mnl_simulate(csdata,
                                       csbeta[cno],
                                       numalts,
                                       GPU=GPU,
                                       returnprobs=1)
                tmp = np.sum(tmp * chosen, axis=1)  # keep only chosen probs
                csprobs.append(np.reshape(tmp, (-1, 1)))
            csprobs = np.concatenate(csprobs, axis=1)

            h = csprobs * cmprobs
            loglik = np.sum(np.log(np.sum(h, axis=1)))
            wts = h / np.reshape(np.sum(h, axis=1), (-1, 1))
            return loglik, wts

        oldloglik = loglik
        loglik, wts = expectation(cmbeta, csbeta)
        if l_0 is None: l_0 = loglik
        print "current cmbeta", cmbeta
        print "current csbeta", csbeta
        print "current loglik", loglik, i + 1, "\n\n"
        if abs(loglik - oldloglik) < emtol: break

        # MAXIMIZATION

        for cno in range(numclasses):
            print "Estimating class specific model for class %d" % (cno + 1)
            t1 = time.time()
            weights = np.reshape(wts[:, cno], (-1, 1))
            fit, results = mnl.mnl_estimate(csdata,
                                            chosen,
                                            numalts,
                                            GPU=GPU,
                                            weights=weights,
                                            beta=csbeta[cno])
            print "Finished in %fs" % (time.time() - t1)
            csbeta[cno] = zip(*results)[0]
            results_d['cs%d' % cno] = results

        print "Estimating class membership model"
        t1 = time.time()
        fit, results = mnl.mnl_estimate(cmdata,None,numclasses,GPU=GPU,weights=wts,lcgrad=True, \
                                                 beta=cmbeta,coeffrange=(-1000,1000))
        print "Finished in %fs" % (time.time() - t1)
        cmbeta = zip(*results)[0]
        results_d['cm'] = results

    l_1 = loglik
    l_0, foo = expectation(np.zeros(len(cmbeta)),
                           [np.zeros(len(a)) for a in csbeta])
    ll_ratio = 1 - (l_1 / l_0)

    print "Null Log-liklihood: %f" % l_0
    print "Log-liklihood at convergence: %f" % l_1
    print "Log-liklihood ratio: %f" % ll_ratio

    a = []
    fnames = []
    fnames += cmfnames
    a += results_d['cm']
    for i in range(numclasses):
        fnames += ['%s cls%d' % (s, i) for s in csfnames]
        a += results_d['cs%d' % i]

    print misc.resultstotable(fnames, a)
    fit = (l_0, l_1, ll_ratio)
    misc.resultstocsv(fit,
                      fnames,
                      a,
                      "lc-coeff.csv",
                      tblname="Latent Class Model Coefficients")

    return (l_0, l_1, ll_ratio), results_d
# --- Example 5 ---
def lcmnl_estimate(cmdata,numclasses,csdata,numalts,chosen,maxiter=MAXITER,emtol=EMTOL,\
                     skipprep=False,csbeta=None,cmbeta=None,csfnames=None,cmfnames=None):

  loglik = -999999
  l_0 = None
  if csbeta is None: csbeta = [np.random.rand(csdata.shape[1]) for i in range(numclasses)]
  if csfnames is None: csfnames = ['cs%d'%i for i in range(csdata.shape[1])]
  if cmfnames is None: cmfnames = ['cm%d'%i for i in range(cmdata.shape[1])]
  if not skipprep: cmdata,cmfnames = prep_cm_data(cmdata,numclasses,cmfnames)
  if cmbeta is None: cmbeta = np.random.rand(cmdata.shape[1])*10.0-5.0
  results_d = {}
  
  for i in range(maxiter):
    print "Running iteration %d" % (i+1)
    print time.ctime()

    # EXPECTATION
    def expectation(cmbeta,csbeta):
      print "Running class membership model"
      cmprobs = mnl.mnl_simulate(cmdata,cmbeta,numclasses,GPU=GPU,returnprobs=1)

      csprobs = []
      for cno in range(numclasses):
        tmp = mnl.mnl_simulate(csdata,csbeta[cno],numalts,GPU=GPU,returnprobs=1)
        tmp = np.sum(tmp*chosen,axis=1) # keep only chosen probs
        csprobs.append(np.reshape(tmp,(-1,1)))
      csprobs = np.concatenate(csprobs,axis=1)

      h = csprobs * cmprobs
      loglik = np.sum(np.log(np.sum(h,axis=1)))
      wts = h / np.reshape(np.sum(h,axis=1),(-1,1))
      return loglik, wts

    oldloglik = loglik
    loglik, wts = expectation(cmbeta,csbeta)
    if l_0 is None: l_0 = loglik
    print "current cmbeta", cmbeta
    print "current csbeta", csbeta
    print "current loglik", loglik, i+1, "\n\n"
    if abs(loglik-oldloglik) < emtol: break
   
    # MAXIMIZATION

    for cno in range(numclasses):
      print "Estimating class specific model for class %d" % (cno+1)
      t1 =  time.time()
      weights=np.reshape(wts[:,cno],(-1,1))
      fit, results  = mnl.mnl_estimate(csdata,chosen,numalts,GPU=GPU,weights=weights,beta=csbeta[cno])
      print "Finished in %fs" % (time.time()-t1)
      csbeta[cno] = zip(*results)[0]
      results_d['cs%d'%cno] = results
    
    print "Estimating class membership model"
    t1 =  time.time()
    fit, results = mnl.mnl_estimate(cmdata,None,numclasses,GPU=GPU,weights=wts,lcgrad=True, \
                                             beta=cmbeta,coeffrange=(-1000,1000))
    print "Finished in %fs" % (time.time()-t1)
    cmbeta = zip(*results)[0]
    results_d['cm'] = results 
 
  l_1 = loglik 
  l_0, foo = expectation(np.zeros(len(cmbeta)),[np.zeros(len(a)) for a in csbeta])
  ll_ratio = 1-(l_1/l_0)
  
  print "Null Log-liklihood: %f" % l_0
  print "Log-liklihood at convergence: %f" % l_1
  print "Log-liklihood ratio: %f" % ll_ratio

  a = []
  fnames = []
  fnames += cmfnames
  a += results_d['cm']
  for i in range(numclasses):
    fnames += ['%s cls%d'%(s,i) for s in csfnames]
    a += results_d['cs%d'%i]

  print misc.resultstotable(fnames,a)
  fit = (l_0,l_1,ll_ratio)
  misc.resultstocsv(fit,fnames,a,"lc-coeff.csv",tblname="Latent Class Model Coefficients")

  return (l_0,l_1,ll_ratio), results_d
# --- Example 6 ---
def estimate(
    dset,
    indvars,
    depvar="building_id",
    alternatives=None,
    SAMPLE_SIZE=100,
    max_segment_size=1200,
    estimation_table="households_for_estimation",
    output_names=None,
    agents_groupby=None,
):
    """Estimate household location-choice (HLCM) coefficients by segment.

    Groups choosers by ``agents_groupby``, estimates one MNL per segment with
    its own set of independent variables, and stores the coefficients
    (padded with placeholder county-dummy values) on ``dset``.

    Args:
        dset: project dataset object (provides fetch/store_coeff).
        indvars: list with one list of variable names per segment.
        depvar: chosen-alternative id column in the estimation table.
        alternatives: DataFrame of alternatives to sample from.
        SAMPLE_SIZE: number of sampled alternatives per chooser.
        max_segment_size: cap on choosers per segment.
        estimation_table: name of the chooser table on ``dset``.
        output_names: (csv template, title template, coeff-name template,
            output varname) -- the first three take the segment name via %.
        agents_groupby: grouping column(s); defaults to ["income_3_tenure"].
    """
    ##HCLM ESTIMATION
    # FIX: default was a mutable list literal in the signature; use the
    # standard None sentinel instead (same effective default).
    if agents_groupby is None:
        agents_groupby = ["income_3_tenure"]
    output_csv, output_title, coeff_name, output_varname = output_names
    choosers = dset.fetch(estimation_table)

    segments = choosers.groupby(agents_groupby)
    num_segments = len(segments.size().index)
    if num_segments != len(indvars):
        print "ERROR: number of segments does not match number of sets of independent variable"
    indvar_dict = dict(zip(segments.size().index.values, range(num_segments)))
    alts = alternatives
    for name, segment in segments:
        ind_vars = indvars[indvar_dict[name]]
        name = str(name)
        tmp_outcsv, tmp_outtitle, tmp_coeffname = output_csv % name, output_title % name, coeff_name % name
        if len(segment[depvar]) > max_segment_size:  # reduce size of segment if too big so things don't bog down
            segment = segment.ix[np.random.choice(segment.index, max_segment_size, replace=False)]
        # ,weight_var='residential_units')
        sample, alternative_sample, est_params = interaction.mnl_interaction_dataset(
            segment, alts, SAMPLE_SIZE, chosenalts=segment[depvar]
        )
        ##Interaction variables: an 'a_x_b' name is the product of columns a and b
        interaction_vars = [(var, var.split("_x_")) for var in ind_vars if "_x_" in var]
        for ivar in interaction_vars:
            alternative_sample[ivar[0]] = (alternative_sample[ivar[1][0]]) * alternative_sample[ivar[1][1]]

        print "Estimating parameters for segment = %s, size = %d" % (name, len(segment.index))
        if len(segment.index) > 50:
            est_data = pd.DataFrame(index=alternative_sample.index)
            for varname in ind_vars:
                est_data[varname] = alternative_sample[varname]
            est_data = est_data.fillna(0)
            data = est_data.as_matrix()
            try:
                fit, results = interaction.estimate(data, est_params, SAMPLE_SIZE)
                fnames = interaction.add_fnames(ind_vars, est_params)
                print misc.resultstotable(fnames, results)
                misc.resultstocsv(fit, fnames, results, tmp_outcsv, tblname=tmp_outtitle)
                # pad with one near-zero placeholder per county dummy below
                coefficients = zip(*results)[0] + (0.0001,) * 11
                varnames = fnames + [
                    "county8001",
                    "county8005",
                    "county8013",
                    "county8014",
                    "county8019",
                    "county8031",
                    "county8035",
                    "county8039",
                    "county8047",
                    "county8059",
                    "county8123",
                ]
                dset.store_coeff(tmp_coeffname, coefficients, varnames)
            # FIX: catch Exception, not bare except -- a bare except also
            # swallows KeyboardInterrupt/SystemExit.
            except Exception:
                print "SINGULAR MATRIX OR OTHER DATA/ESTIMATION PROBLEM"
        else:
            print "SAMPLE SIZE TOO SMALL"
# --- Example 7 ---
    # NOTE(review): fragment -- the enclosing function's header (and the
    # definitions of segment/alts/name/depvar/SAMPLE_SIZE/ind_vars/tmp_out*)
    # lies outside this view, so the code below is left byte-identical.
    print segment.reset_index().building_id.describe()
    # cast the alternatives index to int32, presumably so it matches the
    # chosen-building ids -- TODO confirm against caller
    alts.index = alts.index.astype('int32')
    #sample, alternative_sample, est_params = interaction.mnl_interaction_dataset(segment,alts,SAMPLE_SIZE,chosenalts=segment[depvar],weight_var='non_residential_sqft')
    sample, alternative_sample, est_params = interaction.mnl_interaction_dataset(segment,alts,SAMPLE_SIZE,chosenalts=segment[depvar])
    #alternative_sample['paris_x_employees'] = (alternative_sample.in_paris*alternative_sample.employees)
    print "Estimating parameters for segment = %s, size = %d" % (name, len(segment.index)) 
    # only estimate when the segment is large enough to be meaningful
    if len(segment.index) > 50:
        # assemble the estimation matrix from the sampled alternatives
        est_data = pd.DataFrame(index=alternative_sample.index)
        for varname in ind_vars:
            est_data[varname] = alternative_sample[varname]
        est_data = est_data.fillna(0)
        data = est_data.as_matrix()
        try:
            fit, results = interaction.estimate(data, est_params, SAMPLE_SIZE)
            #print fit
            #print results
            fnames = interaction.add_fnames(ind_vars,est_params)
            print misc.resultstotable(fnames,results)
            misc.resultstocsv(fit,fnames,results,tmp_outcsv,tblname=tmp_outtitle)
            # first element of each results row is the coefficient value
            dset.store_coeff(tmp_coeffname,zip(*results)[0],fnames)
        except:
            print 'SINGULAR MATRIX OR OTHER DATA/ESTIMATION PROBLEM'
    else:
        print 'SAMPLE SIZE TOO SMALL'
# NOTE(review): top-level debug prints; dset.coeffs appears to be indexed by
# (model name, 'coeffs') tuples -- verify against the dataset class.
print dset.coeffs[('emp_location_6','coeffs')]
print dset.coeffs[('emp_location_6','coeffs')][0]
print dset.coeffs[('emp_location_6','coeffs')][1]
print dset.coeffs[('emp_location_6','coeffs')][2]
print dset.coeffs[('emp_location_6','coeffs')][3]
print dset.coeffs[('emp_location_6','coeffs')][4]
print dset.coeffs[('emp_location_6','coeffs')][5]