Example #1
0
 def interaction_columns_used(self):
     """
     Names of the interaction-dataset columns used for filtering or in the
     model expression. These columns may originate from either the choosers
     table or the alternatives table.
     """
     filter_cols = util.columns_in_filters(self.interaction_predict_filters)
     formula_cols = util.columns_in_formula(self.model_expression)
     return list(tz.unique(tz.concatv(filter_cols, formula_cols)))
Example #2
0
def get_data(tables,
             fallback_tables=None,
             filters=None,
             model_expression=None,
             extra_columns=None):
    """
    Generate a ``pd.DataFrame`` for model estimation or simulation. Tables are 
    loaded from Orca and merged automatically, and any column not referenced in 
    the model expression or data filters is dropped. Additional columns can be 
    requested explicitly.
    
    If filters are provided, only rows matching the filter criteria are 
    included in the output.
    
    See ``urbansim_templates.utils.merge_tables()`` for a detailed description 
    of how the merges are performed.
    
    Parameters
    ----------
    tables : str or list of str
        Orca table(s) to draw data from.
    
    fallback_tables : str or list of str, optional
        Table(s) to use if first parameter evaluates to `None`. (This option 
        will be removed shortly when estimation and simulation settings are 
        separated.)
    
    filters : str or list of str, optional
        Filter(s) to apply to the merged data, using `pd.DataFrame.query()`.
    
    model_expression : str, optional
        Model expression that will be evaluated using the output data. Only 
        used to drop non-relevant columns. PyLogit format is not yet supported.
    
    extra_columns : str or list of str, optional
        Columns to include, in addition to any in the model expression and 
        filters. (If this and the model_expression are both None, all columns 
        will be included.)

    Returns
    -------
    pd.DataFrame
    
    """
    if tables is None:
        tables = fallback_tables

    # A value of None means "load every column"
    colnames = None
    if (model_expression is not None) or (extra_columns is not None):
        requested = columns_in_formula(model_expression) \
            + columns_in_filters(filters) \
            + to_list(extra_columns)
        colnames = list(set(requested))

    if isinstance(tables, list):
        df = merge_tables(tables, colnames)
    else:
        df = get_df(tables, colnames)

    return apply_filter_query(df, filters)
Example #3
0
    def _get_data(self, task='fit'):
        """
        DEPRECATED - this should be replaced by the more general utils.get_data()
        
        Generate a data table for estimation or prediction, using Orca and 
        UrbanSim.models.util functionality. Run this immediately before 
        estimation or prediction so that it reflects the current data state.
        
        The output includes only the necessary columns: those mentioned in the 
        model expression or filters, plus (it appears) the index of each merged 
        table. Relevant filter queries are applied.
        
        Parameters
        ----------
        task : 'fit' or 'predict'
        
        Returns
        -------
        DataFrame
        
        """
        # TO DO - verify input data

        if isinstance(self.model_expression, str):
            expr_cols = util.columns_in_formula(self.model_expression)

        if task == 'fit':
            tables = self.tables
            filters = self.filters
            columns = expr_cols + util.columns_in_filters(filters)

        elif task == 'predict':
            # Fall back to the estimation tables if no simulation tables are set
            tables = self.out_tables if self.out_tables is not None \
                else self.tables

            filters = self.out_filters
            columns = expr_cols + util.columns_in_filters(filters)
            if self.out_column is not None:
                columns.append(self.out_column)

        if not isinstance(tables, list):
            df = orca.get_table(tables).to_frame(columns)
        else:
            df = orca.merge_tables(target=tables[0],
                                   tables=tables,
                                   columns=columns)

        return util.apply_filter_query(df, filters)
    def run(self, chooser_batch_size=None, interaction_terms=None):
        """
        Run the model step: simulate choices and use them to update an Orca column.

        The simulated choices are saved to the class object for diagnostics. If choices 
        are unconstrained, the choice table and the probabilities of sampled alternatives 
        are saved as well.

        Parameters
        ----------
        chooser_batch_size : int
            This parameter gets passed to 
            choicemodels.tools.simulation.iterative_lottery_choices and is a temporary
            workaround for dealing with memory issues that arise from generating massive
            merged choice tables for simulations that involve large numbers of choosers,
            large numbers of alternatives, and large numbers of predictors. It allows the
            user to specify a batch size for simulating choices one chunk at a time. 

        interaction_terms : pandas.Series, pandas.DataFrame, or list of either, optional
            Additional column(s) of interaction terms whose values depend on the 
            combination of observation and alternative, to be merged onto the final data 
            table. If passed as a Series or DataFrame, it should include a two-level 
            MultiIndex. One level's name and values should match an index or column from 
            the observations table, and the other should match an index or column from the 
            alternatives table. 

        Returns
        -------
        None

        """
        check_choicemodels_version()
        from choicemodels import MultinomialLogit
        from choicemodels.tools import (MergedChoiceTable, monte_carlo_choices,
                                        iterative_lottery_choices)

        # Clear simulation attributes from the class object
        self.mergedchoicetable = None
        self.probabilities = None
        self.choices = None

        if interaction_terms is not None:
            # Collect the MultiIndex level names from every interaction term so
            # the matching columns are retained when loading the observation and
            # alternative tables below
            uniq_intx_idx_names = set([
                idx for intx in interaction_terms for idx in intx.index.names
            ])
            obs_extra_cols = to_list(self.chooser_size) + \
                list(uniq_intx_idx_names)
            alts_extra_cols = to_list(
                self.alt_capacity) + list(uniq_intx_idx_names)

        else:
            obs_extra_cols = to_list(self.chooser_size)
            alts_extra_cols = to_list(self.alt_capacity)

        # get any necessary extra columns from the mct intx operations spec
        if self.mct_intx_ops:
            intx_extra_obs_cols = self.mct_intx_ops.get('extra_obs_cols', [])
            intx_extra_obs_cols = to_list(intx_extra_obs_cols)
            obs_extra_cols += intx_extra_obs_cols
            intx_extra_alts_cols = self.mct_intx_ops.get('extra_alts_cols', [])
            intx_extra_alts_cols = to_list(intx_extra_alts_cols)
            alts_extra_cols += intx_extra_alts_cols

        # Load choosers, preferring the simulation ('out_') settings and falling
        # back to the estimation ones
        observations = get_data(tables=self.out_choosers,
                                fallback_tables=self.choosers,
                                filters=self.out_chooser_filters,
                                model_expression=self.model_expression,
                                extra_columns=obs_extra_cols)

        if len(observations) == 0:
            print("No valid choosers")
            return

        alternatives = get_data(tables=self.out_alternatives,
                                fallback_tables=self.alternatives,
                                filters=self.out_alt_filters,
                                model_expression=self.model_expression,
                                extra_columns=alts_extra_cols)

        if len(alternatives) == 0:
            print("No valid alternatives")
            return

        # Remove filter columns before merging, in case column names overlap
        expr_cols = columns_in_formula(self.model_expression)

        obs_cols = set(
            observations.columns) & set(expr_cols + to_list(obs_extra_cols))
        observations = observations[list(obs_cols)]

        alt_cols = set(
            alternatives.columns) & set(expr_cols + to_list(alts_extra_cols))
        alternatives = alternatives[list(alt_cols)]

        # Callables for iterative choices
        def mct(obs, alts, intx_ops=None):
            # Sample alternatives and merge them with the observations (and any
            # user-supplied interaction terms)
            this_mct = MergedChoiceTable(obs,
                                         alts,
                                         sample_size=self.alt_sample_size,
                                         interaction_terms=interaction_terms)

            if intx_ops:
                # Custom interaction ops may rebuild the table, so restore the
                # sample size attribute afterwards
                this_mct = self.perform_mct_intx_ops(this_mct)
                this_mct.sample_size = self.alt_sample_size

            return this_mct

        def probs(mct):
            return self.model.probabilities(mct)

        if self.constrained_choices is True:
            # Capacity-constrained simulation: alternatives are allocated by
            # lottery over multiple iterations
            choices = iterative_lottery_choices(
                observations,
                alternatives,
                mct_callable=mct,
                probs_callable=probs,
                alt_capacity=self.alt_capacity,
                chooser_size=self.chooser_size,
                max_iter=self.max_iter,
                chooser_batch_size=chooser_batch_size,
                mct_intx_ops=self.mct_intx_ops)

        else:
            # Unconstrained simulation: one round of Monte Carlo draws
            choicetable = mct(observations,
                              alternatives,
                              intx_ops=self.mct_intx_ops)
            probabilities = probs(choicetable)
            choices = monte_carlo_choices(probabilities)

            # Save data to class object if available
            self.mergedchoicetable = choicetable
            self.probabilities = probabilities

        # Save choices to class object for diagnostics
        self.choices = choices

        # Update Orca
        update_column(table=self.out_choosers,
                      fallback_table=self.choosers,
                      column=self.out_column,
                      fallback_column=self.choice_column,
                      data=choices)
Example #5
0
def get_data(tables,
             fallback_tables=None,
             filters=None,
             model_expression=None,
             extra_columns=None):
    """
    Generate a pd.DataFrame from one or more tables registered with Orca. 
    Templates should call this function immediately before the data is needed, 
    so that it's as up-to-date as possible.
    
    If filters are provided, the output will include only rows that match the 
    filter criteria.
    
    Default behavior is for the output to include all columns. If a 
    model_expression and/or extra_columns is provided, non-relevant columns 
    will be dropped from the output. Relevant columns include any mentioned in 
    the model expression, filters, or list of extras. Join keys will *not* be 
    included in the final output even if the data is drawn from multiple 
    tables, unless they appear in the model expression or filters as well.
    
    If a named column is not found in the source tables, it will just be 
    skipped. This is to support use cases where data is assembled separately 
    for choosers and alternatives and then merged together -- the model 
    expression would include terms from both sets of tables.
    
    Duplicate column names are not recommended -- columns are expected to be 
    unique within the set of tables they're being drawn from, with the 
    exception of join keys. If column names are repeated, current behavior is 
    to follow the Orca default and keep the left-most copy of the column. This 
    may change later and should not be relied on.
    
    Parameters
    ----------
    tables : str or list of str
        Orca table(s) to draw data from.
    
    fallback_tables : str or list of str, optional
        Table(s) to use if first parameter evaluates to `None`. (This option 
        will be removed shortly when estimation and simulation settings are 
        separated.)
    
    filters : str or list of str, optional
        Filter(s) to apply to the merged data, using `pd.DataFrame.query()`.
    
    model_expression : str, optional
        Model expression that will be evaluated using the output data. Only 
        used to drop non-relevant columns. PyLogit format is not yet supported.
    
    extra_columns : str or list of str, optional
        Columns to include, in addition to any in the model expression and 
        filters. (If this and the model_expression are both None, all columns 
        will be included.)

    Returns
    -------
    pd.DataFrame
    
    """
    if tables is None:
        tables = fallback_tables

    tables = to_list(tables)

    # None tells the Orca utilities to load every column
    colnames = None

    if (model_expression is not None) or (extra_columns is not None):
        colnames = set(columns_in_formula(model_expression)
                       + columns_in_filters(filters)
                       + to_list(extra_columns))

        # The Orca utilities raise an error if we request a column name that
        # doesn't exist, so explicitly drop any requested name that's missing
        # from all of the source tables
        all_cols = []
        for name in tables:
            wrapper = orca.get_table(name)
            all_cols += list(wrapper.index.names) + list(wrapper.columns)

        colnames = [c for c in colnames if c in all_cols]

    if len(tables) == 1:
        df = orca.get_table(table_name=tables[0]).to_frame(columns=colnames)
    else:
        df = orca.merge_tables(target=tables[0],
                               tables=tables,
                               columns=colnames)

    # merge_tables() can include join keys we didn't ask for; drop them
    if (colnames is not None) and (len(df.columns) > len(colnames)):
        df = df[colnames]

    return apply_filter_query(df, filters)
Example #6
0
    def _get_data(self, task='fit'):
        """
        DEPRECATED - this should be replaced by the more general _get_df()
        
        Generate a data table for estimation or prediction, relying on functionality from
        Orca and UrbanSim.models.util. This should be performed immediately before 
        estimation or prediction so that it reflects the current data state.
        
        The output includes only the necessary columns: those mentioned in the model
        expression or filters, plus (it appears) the index of each merged table. Relevant 
        filter queries are applied.
        
        Parameters
        ----------
        task : 'fit' or 'predict'
        
        Returns
        -------
        DataFrame
        
        Raises
        ------
        ValueError
            If `task` is not 'fit' or 'predict', or if the model expression is 
            neither a str (Patsy) nor an OrderedDict (PyLogit).
        
        """
        # TO DO - verify input data

        if isinstance(self.model_expression, str):
            expr_cols = util.columns_in_formula(self.model_expression)

        # This is for PyLogit model expressions
        elif isinstance(self.model_expression, OrderedDict):
            # TO DO - check that this works in Python 2.7
            # BUG FIX: this previously used "is not 'intercept'", which tests
            # object identity rather than string equality and only worked by
            # accident of CPython string interning (SyntaxWarning in Py3.8+)
            expr_cols = [t[0] for t in self.model_expression.items()
                         if t[0] != 'intercept']
            # TO DO - not very general, maybe we should just override the method
            # TO DO - and this only applies to the fit condition
            if self.choice_column is not None:
                expr_cols += [self.choice_column]

        else:
            # Previously this fell through and raised an opaque NameError below
            raise ValueError("Unrecognized model expression format: %s"
                             % type(self.model_expression))

        if (task == 'fit'):
            tables = self.tables
            columns = expr_cols + util.columns_in_filters(self.filters)
            filters = self.filters

        elif (task == 'predict'):
            # Fall back to the estimation tables if no simulation tables are set
            if self.out_tables is not None:
                tables = self.out_tables
            else:
                tables = self.tables

            columns = expr_cols + util.columns_in_filters(self.out_filters)
            if self.out_column is not None:
                columns += [self.out_column]

            filters = self.out_filters

        else:
            # Previously this fell through and raised UnboundLocalError below
            raise ValueError("task must be 'fit' or 'predict', not '%s'" % task)

        if isinstance(tables, list):
            df = orca.merge_tables(target=tables[0],
                                   tables=tables,
                                   columns=columns)
        else:
            df = orca.get_table(tables).to_frame(columns)

        df = util.apply_filter_query(df, filters)
        return df