예제 #1
0
    def setUp(self):
        """Create the rules, sample observation and model used by the tests.

        Sets the following attributes on the test case:

        * ``rules_dict`` -- maps each non-intercept feature name to a
          two-argument callable ``(data, model)`` returning that feature's
          value for an observation.
        * ``obs`` -- a single raw observation with keys ``x``, ``y``, ``z``.
        * ``model_data`` -- the coefficient table as a ``pandas.DataFrame``
          with columns ``feature``, ``coefficient``, ``lower``, ``upper``.
        * ``model`` -- a ``GLMModel`` built from ``model_data``.
        """
        self.rules_dict = {
            # The two banded ``x`` features default to 0 here.
            'x_0_10': lambda data, model: 0,
            'x_10_20': lambda data, model: 0,
            # One indicator per level of ``y``.
            'y_A': lambda data, model: 1 if data['y'] == 'A' else 0,
            'y_B': lambda data, model: 1 if data['y'] == 'B' else 0,
            'y_C': lambda data, model: 1 if data['y'] == 'C' else 0,
            # log(z) for z in (0, 100]; any other z falls back to log(100).
            'log_z': lambda data, model: (
                math.log(data['z'])
                if 0 < data['z'] <= 100 else math.log(100)),
        }

        self.obs = {'x': 8, 'y': 'B', 'z': 16.83}

        # ``numpy.nan`` is used instead of the ``numpy.NaN`` alias, which was
        # removed in NumPy 2.0.
        model_dict = {
            'feature': [
                '(Intercept)', 'x_0_10', 'x_10_20', 'y_A', 'y_B', 'y_C',
                'log_z',
            ],
            'coefficient': [9.034, 0.12, 0.342, -1.343, 3.56, 0.92, -0.45],
            'lower': [
                numpy.nan, 0, 10, numpy.nan, numpy.nan, numpy.nan, 1,
            ],
            'upper': [
                numpy.nan, 10, 20, numpy.nan, numpy.nan, numpy.nan, 100,
            ],
        }
        self.model_data = pandas.DataFrame(model_dict)
        self.model = GLMModel(self.model_data)
예제 #2
0
 def test_duplicate_feature(self):
     """A coefficient table that lists a feature twice must be rejected."""
     # Append a second 'x_0_10' row at the next free integer label.
     # ``numpy.nan`` replaces the ``numpy.NaN`` alias removed in NumPy 2.0.
     self.model_data.loc[len(self.model_data)] = {
         'feature': 'x_0_10',
         'coefficient': 0.435,
         'lower': numpy.nan,
         'upper': numpy.nan,
     }
     with self.assertRaises(ValueError):
         GLMModel(self.model_data)
예제 #3
0
 def test_init_missing_coefficient(self):
     """Building a model without a ``coefficient`` column must fail."""
     with self.assertRaises(AttributeError):
         without_coefficient = self.model_data.drop(columns='coefficient')
         GLMModel(without_coefficient)
예제 #4
0
 def test_init_missing_features(self):
     """Building a model without a ``feature`` column must fail."""
     with self.assertRaises(AttributeError):
         without_feature = self.model_data.drop(columns='feature')
         GLMModel(without_feature)
예제 #5
0
 def test_init(self):
     """A well-formed coefficient table produces a model object."""
     built_model = GLMModel(self.model_data)
     self.assertIsNotNone(built_model)
예제 #6
0
class test_glm_model(unittest.TestCase):
    """Unit tests for ``GLMModel`` construction, rule handling and scoring."""

    rules_dict = None
    data = None
    coeff = None

    def setUp(self):
        """Create the rules, sample observation and model used by each test."""
        self.rules_dict = {
            # The two banded ``x`` features default to 0; individual tests
            # override them through ``create_rule``.
            'x_0_10': lambda data, model: 0,
            'x_10_20': lambda data, model: 0,
            # One indicator per level of ``y``.
            'y_A': lambda data, model: 1 if data['y'] == 'A' else 0,
            'y_B': lambda data, model: 1 if data['y'] == 'B' else 0,
            'y_C': lambda data, model: 1 if data['y'] == 'C' else 0,
            # log(z) for z in (0, 100]; any other z falls back to log(100).
            'log_z': lambda data, model: (
                math.log(data['z'])
                if 0 < data['z'] <= 100 else math.log(100)),
        }

        self.obs = {'x': 8, 'y': 'B', 'z': 16.83}

        # ``numpy.nan`` is used instead of the ``numpy.NaN`` alias, which was
        # removed in NumPy 2.0.
        model_dict = {
            'feature': [
                '(Intercept)', 'x_0_10', 'x_10_20', 'y_A', 'y_B', 'y_C',
                'log_z',
            ],
            'coefficient': [9.034, 0.12, 0.342, -1.343, 3.56, 0.92, -0.45],
            'lower': [
                numpy.nan, 0, 10, numpy.nan, numpy.nan, numpy.nan, 1,
            ],
            'upper': [
                numpy.nan, 10, 20, numpy.nan, numpy.nan, numpy.nan, 100,
            ],
        }
        self.model_data = pandas.DataFrame(model_dict)
        self.model = GLMModel(self.model_data)

    def test_init(self):
        """A well-formed coefficient table produces a model object."""
        self.assertIsNotNone(GLMModel(self.model_data))

    def test_init_missing_features(self):
        """Building a model without a ``feature`` column must fail."""
        with self.assertRaises(AttributeError):
            GLMModel(self.model_data.drop('feature', axis=1))

    def test_init_missing_coefficient(self):
        """Building a model without a ``coefficient`` column must fail."""
        with self.assertRaises(AttributeError):
            GLMModel(self.model_data.drop('coefficient', axis=1))

    def test_duplicate_feature(self):
        """A coefficient table that lists a feature twice must be rejected."""
        # Append a second 'x_0_10' row at the next free integer label.
        self.model_data.loc[len(self.model_data)] = {
            'feature': 'x_0_10',
            'coefficient': 0.435,
            'lower': numpy.nan,
            'upper': numpy.nan,
        }
        with self.assertRaises(ValueError):
            GLMModel(self.model_data)

    def test_load_rules(self):
        """Loading a valid rules dictionary leaves the model with rules."""
        self.model.load_rules(self.rules_dict)
        # Reads the name-mangled private ``__rules`` attribute of GLMModel.
        self.assertGreater(len(self.model._GLMModel__rules), 0)

    def test_load_rules_bad_key(self):
        """A rule keyed by a name absent from the feature table is rejected."""
        self.rules_dict['log_x'] = None
        with self.assertRaises(ValueError):
            self.model.load_rules(self.rules_dict)

    def test_load_rules_bad_value(self):
        """A rule whose value is ``None`` is rejected."""
        self.rules_dict['log_z'] = None
        with self.assertRaises(ValueError):
            self.model.load_rules(self.rules_dict)

    def test_load_rules_bad_value_2(self):
        """A rule callable taking a single argument is rejected."""
        self.rules_dict['log_z'] = lambda data: data['z'] + 1
        with self.assertRaises(ValueError):
            self.model.load_rules(self.rules_dict)

    def test_create_rule(self):
        """``create_rule`` stores the exact callable it was given."""
        rule = lambda data, model: (
            math.log(data['x'])
            if model['lower'] < data['x'] <= model['upper'] else 0)
        self.model.create_rule('x_0_10', rule)
        self.assertIs(self.model._GLMModel__rules['x_0_10'], rule)

    def test_create_rule_bad_feature(self):
        """Creating a rule for a name absent from the feature table fails."""
        rule = lambda data, model: 0
        with self.assertRaises(ValueError):
            self.model.create_rule('a', rule)

    def test_create_rule_bad_rule(self):
        """Creating a rule from a single-argument callable fails."""
        rule = lambda data: 0
        with self.assertRaises(ValueError):
            self.model.create_rule('x_0_10', rule)

    def test_prep_data_and_score(self):
        """Score a raw observation after installing banded ``x`` rules."""
        # The rule yields ``x`` only when it lies in (lower, upper] of the
        # ``model`` argument it receives, otherwise 0.
        rule = lambda data, model: (
            data['x']
            if model['lower'] < data['x'] <= model['upper'] else 0)
        self.model.create_rule('x_0_10', rule)
        self.model.create_rule('x_10_20', rule)
        score, scoring_data = self.model.prep_data_and_score(self.obs)
        # ``assertAlmostEquals`` / ``assertEquals`` were deprecated aliases
        # removed in Python 3.12; the canonical names are used instead.
        self.assertAlmostEqual(score, 12.283576646308779)
        self.assertEqual(
            len(scoring_data[scoring_data['contribution'] != 0]), 4)

    def test_prep_data_and_score_no_rules(self):
        """Scoring with an empty rule set raises ``ValueError``."""
        self.model._GLMModel__rules = {}
        with self.assertRaises(ValueError):
            self.model.prep_data_and_score(self.obs)

    def test_score_data(self):
        """Score a pre-built feature frame with columns ``feature``/``xi``."""
        obs = pandas.DataFrame({
            'feature': ['x_0_10', 'x_10_20', 'y_A', 'y_B', 'y_C', 'log_z'],
            'xi': [0, 13, 1, 0, 0, math.log(100)],
        })
        score, scoring_data = self.model.score_data(obs)
        self.assertAlmostEqual(score, 10.064673416305359)
        self.assertEqual(
            len(scoring_data[scoring_data['contribution'] != 0]), 4)

    def test_score_data_duplicate_features(self):
        """A feature frame that repeats a feature is rejected."""
        obs = pandas.DataFrame({
            'feature': ['x_0_10', 'x_0_10', 'y_A', 'y_B', 'y_C', 'log_z'],
            'xi': [0, 13, 1, 0, 0, math.log(100)],
        })
        with self.assertRaises(ValueError):
            self.model.score_data(obs)

    def test_score_data_missing_features(self):
        """A feature frame lacking some model features is rejected."""
        # NOTE(review): this frame omits 'x_10_20' and 'y_C' but also repeats
        # 'x_0_10', so the ValueError may come from the duplicate check
        # rather than from the missing features -- confirm against
        # GLMModel.score_data and consider using distinct features here.
        obs = pandas.DataFrame({
            'feature': ['x_0_10', 'x_0_10', 'y_A', 'y_B', 'log_z'],
            'xi': [0, 13, 1, 0, math.log(100)],
        })
        with self.assertRaises(ValueError):
            self.model.score_data(obs)
예제 #7
0
파일: al_gc.py 프로젝트: imperialguy/algc
def run_model(
        model_inputs,
        model_covariances_dataframe,
        model_coefficients_dataframe,
        eazi_dataframe,
        density_constant,
        rules_dict,
        custom_rule_features,
        calculate_sigma_squared=False):
    """ Score the ALGC GLM model and optionally compute sigma squared

    Every argument is type-checked first. A ``GLMModel`` is then built from
    the coefficients, the rules are loaded, each feature listed in
    ``custom_rule_features`` is passed through the module-level
    ``create_rule`` helper, and finally the inputs are scored.

    :param model_inputs: model inputs
    :param model_covariances_dataframe: model covariances loaded into pandas
    :param model_coefficients_dataframe: model coefficients loaded into pandas
    :param eazi_dataframe: eazi csv loaded into pandas
    :param density_constant: density constant handed to ``create_rule``
    :param rules_dict: rules configuration loaded into the model
    :param custom_rule_features: features whose rules are replaced through
        ``create_rule``
    :param calculate_sigma_squared: flag to enable sigma squared calculation
    :type model_inputs: dict
    :type model_covariances_dataframe: pandas.DataFrame
    :type model_coefficients_dataframe: pandas.DataFrame
    :type eazi_dataframe: pandas.DataFrame
    :type density_constant: int
    :type rules_dict: dict
    :type custom_rule_features: list
    :type calculate_sigma_squared: bool
    :returns: ``(fitted_result, sigma_squared_result)``; the second item is
        ``None`` unless ``calculate_sigma_squared`` is true
    :rtype: tuple
    :raises TypeError: if any argument is not of the type listed above

    """
    # (value, required type, error message), checked in signature order so
    # the first offending argument determines the error that is raised.
    type_requirements = (
        (model_inputs, dict,
         'model_inputs should be a dictionary'),
        (model_covariances_dataframe, pandas.DataFrame,
         'model_covariances_dataframe should be a Pandas DataFrame'),
        (model_coefficients_dataframe, pandas.DataFrame,
         'model_coefficients_dataframe should be a Pandas DataFrame'),
        (eazi_dataframe, pandas.DataFrame,
         'eazi_dataframe should be a Pandas DataFrame'),
        (density_constant, int,
         'density_constant should be an integer'),
        (rules_dict, dict,
         'rules_dict should be a dictionary'),
        (custom_rule_features, list,
         'custom_rule_features should be a list'),
        (calculate_sigma_squared, bool,
         'calculate_sigma_squared should be a boolean'),
    )
    for value, required_type, message in type_requirements:
        if not isinstance(value, required_type):
            raise TypeError(message)

    glm = GLMModel(model_coefficients_dataframe)
    glm.load_rules(rules_dict)

    # ``create_rule`` returns the model, so it is rebound on every pass.
    for custom_feature in custom_rule_features:
        glm = create_rule(glm, custom_feature, eazi_dataframe,
                          density_constant)

    # Only the scoring frame is needed; the returned score is discarded.
    _, scoring_data = glm.prep_data_and_score(model_inputs)

    # exp(coefficient . xi)
    linear_predictor = scoring_data.coefficient.dot(scoring_data.xi)
    fitted_result = numpy.exp(linear_predictor)

    if calculate_sigma_squared:
        # xi . covariances . xi
        sigma_squared_result = scoring_data.xi.dot(
            model_covariances_dataframe).dot(scoring_data.xi)
    else:
        sigma_squared_result = None

    return (fitted_result, sigma_squared_result)
예제 #8
0
def run_model(model_inputs, model_coefficients_filename, rules_dict):
    """Runs the model based on the provided inputs

    Builds a GLMModel object from the external coefficients file, loads the
    supplied rules, installs the rules that are derived through
    ``transform_variable`` and then scores the provided inputs.

    The exposure size feeds exactly one of the two exposure features: the
    payroll feature when ``model_inputs['exposure_type']`` equals
    ``'Payroll'``, otherwise the sales feature. The other feature's rule
    always returns 0.

    Args:
        **model_inputs**: Mapping containing the variables required by the
        model as keys; must include ``'exposure_type'``\n
        **model_coefficients_filename**: Path to the CSV file containing the
        model coefficients\n
        **rules_dict**: Dictionary with lambda functions to derive the features
        used by the model from the input variables

    Return:
        Whatever ``GLMModel.prep_data_and_score`` returns for the inputs
        (NOTE(review): presumably a ``(score, scoring_data)`` pair -- confirm
        against GLMModel)
    """
    coefficients = pandas.read_csv(model_coefficients_filename)
    glm = GLMModel(coefficients)
    glm.load_rules(rules_dict)

    glm.create_rule('log_l_mean_clm_cnt_123',
                    transform_variable('avg_claim_count'))
    glm.create_rule('log_density', transform_variable('zip_density'))

    uses_payroll = model_inputs['exposure_type'] == 'Payroll'

    # The payroll rule is always installed before the sales rule.
    glm.create_rule(
        'payroll_ind_log_payroll_m',
        transform_variable('exposure_size') if uses_payroll
        else (lambda data, model: 0))
    glm.create_rule(
        'sales_ind_log_sales_m',
        (lambda data, model: 0) if uses_payroll
        else transform_variable('exposure_size'))

    return glm.prep_data_and_score(model_inputs)