def setUp(self):
    """Build the shared fixtures: feature rules, one observation, and a model.

    NOTE(review): this setUp appears detached from its TestCase class in this
    chunk and duplicates test_glm_model.setUp below — confirm which copy is
    actually collected by the test runner.
    """
    self.rules_dict = {
        # x-band features default to 0; individual tests override them.
        'x_0_10': lambda data, model: 0,
        'x_10_20': lambda data, model: 0,
        # One-hot indicators for the categorical y variable.
        'y_A': lambda data, model: 1 if data['y'] == 'A' else 0,
        'y_B': lambda data, model: 1 if data['y'] == 'B' else 0,
        'y_C': lambda data, model: 1 if data['y'] == 'C' else 0,
        # z is log-transformed, capped at log(100) outside (0, 100].
        'log_z': lambda data, model: math.log(data['z'])
        if 0 < data['z'] <= 100 else math.log(100)
    }
    self.obs = {'x': 8, 'y': 'B', 'z': 16.83}
    model_dict = {
        'feature': ['(Intercept)', 'x_0_10', 'x_10_20',
                    'y_A', 'y_B', 'y_C', 'log_z'],
        'coefficient': [9.034, 0.12, 0.342, -1.343, 3.56, 0.92, -0.45],
        # numpy.nan: the camel-case alias numpy.NaN was removed in NumPy 2.0.
        'lower': [numpy.nan, 0, 10, numpy.nan, numpy.nan, numpy.nan, 1],
        'upper': [numpy.nan, 10, 20, numpy.nan, numpy.nan, numpy.nan, 100]
    }
    self.model_data = pandas.DataFrame(model_dict)
    self.model = GLMModel(self.model_data)
def test_duplicate_feature(self):
    """A repeated feature name in the coefficient frame must raise ValueError."""
    # Append a second 'x_0_10' row; numpy.nan replaces the NumPy-2.0-removed
    # numpy.NaN alias.
    self.model_data.loc[len(self.model_data)] = {
        'feature': 'x_0_10',
        'coefficient': 0.435,
        'lower': numpy.nan,
        'upper': numpy.nan
    }
    with self.assertRaises(ValueError):
        GLMModel(self.model_data)
def test_init_missing_coefficient(self):
    """Building a model without a 'coefficient' column must fail."""
    incomplete_frame = self.model_data.drop('coefficient', axis=1)
    with self.assertRaises(AttributeError):
        GLMModel(incomplete_frame)
def test_init_missing_features(self):
    """Building a model without a 'feature' column must fail."""
    incomplete_frame = self.model_data.drop('feature', axis=1)
    with self.assertRaises(AttributeError):
        GLMModel(incomplete_frame)
def test_init(self):
    """A well-formed coefficient frame yields a usable model object."""
    model = GLMModel(self.model_data)
    self.assertIsNotNone(model)
class test_glm_model(unittest.TestCase):
    """Unit tests for GLMModel: construction, rule loading, and scoring."""

    # Class-level placeholders; setUp rebinds the real fixtures per test.
    # NOTE(review): `data` and `coeff` are never read in this class — candidates
    # for removal if nothing external references them.
    rules_dict = None
    data = None
    coeff = None

    def setUp(self):
        """Build the shared fixtures: feature rules, one observation, a model."""
        self.rules_dict = {
            # x-band features default to 0; individual tests override them.
            'x_0_10': lambda data, model: 0,
            'x_10_20': lambda data, model: 0,
            # One-hot indicators for the categorical y variable.
            'y_A': lambda data, model: 1 if data['y'] == 'A' else 0,
            'y_B': lambda data, model: 1 if data['y'] == 'B' else 0,
            'y_C': lambda data, model: 1 if data['y'] == 'C' else 0,
            # z is log-transformed, capped at log(100) outside (0, 100].
            'log_z': lambda data, model: math.log(data['z'])
            if 0 < data['z'] <= 100 else math.log(100)
        }
        self.obs = {'x': 8, 'y': 'B', 'z': 16.83}
        model_dict = {
            'feature': ['(Intercept)', 'x_0_10', 'x_10_20',
                        'y_A', 'y_B', 'y_C', 'log_z'],
            'coefficient': [9.034, 0.12, 0.342, -1.343, 3.56, 0.92, -0.45],
            # numpy.nan: the alias numpy.NaN was removed in NumPy 2.0.
            'lower': [numpy.nan, 0, 10, numpy.nan, numpy.nan, numpy.nan, 1],
            'upper': [numpy.nan, 10, 20, numpy.nan, numpy.nan, numpy.nan, 100]
        }
        self.model_data = pandas.DataFrame(model_dict)
        self.model = GLMModel(self.model_data)

    def test_init(self):
        """A well-formed coefficient frame yields a usable model object."""
        self.assertIsNotNone(GLMModel(self.model_data))

    def test_init_missing_features(self):
        """Building a model without a 'feature' column must fail."""
        with self.assertRaises(AttributeError):
            GLMModel(self.model_data.drop('feature', axis=1))

    def test_init_missing_coefficient(self):
        """Building a model without a 'coefficient' column must fail."""
        with self.assertRaises(AttributeError):
            GLMModel(self.model_data.drop('coefficient', axis=1))

    def test_duplicate_feature(self):
        """A repeated feature name in the coefficient frame raises ValueError."""
        self.model_data.loc[len(self.model_data)] = {
            'feature': 'x_0_10',
            'coefficient': 0.435,
            'lower': numpy.nan,
            'upper': numpy.nan
        }
        with self.assertRaises(ValueError):
            GLMModel(self.model_data)

    def test_load_rules(self):
        """Loading a valid rules dict populates the private rules mapping."""
        self.model.load_rules(self.rules_dict)
        self.assertGreater(len(self.model._GLMModel__rules), 0)

    def test_load_rules_bad_key(self):
        """A rule keyed to an unknown feature raises ValueError."""
        self.rules_dict['log_x'] = None
        with self.assertRaises(ValueError):
            self.model.load_rules(self.rules_dict)

    def test_load_rules_bad_value(self):
        """A non-callable rule value raises ValueError."""
        self.rules_dict['log_z'] = None
        with self.assertRaises(ValueError):
            self.model.load_rules(self.rules_dict)

    def test_load_rules_bad_value_2(self):
        """A rule with the wrong arity (missing `model` arg) raises ValueError."""
        self.rules_dict['log_z'] = lambda data: data['z'] + 1
        with self.assertRaises(ValueError):
            self.model.load_rules(self.rules_dict)

    def test_create_rule(self):
        """create_rule installs the exact callable for an existing feature."""
        rule = lambda data, model: math.log(data['x']) if model[
            'lower'] < data['x'] <= model['upper'] else 0
        self.model.create_rule('x_0_10', rule)
        self.assertIs(self.model._GLMModel__rules['x_0_10'], rule)

    def test_create_rule_bad_feature(self):
        """create_rule on a feature absent from the model raises ValueError."""
        rule = lambda data, model: 0
        with self.assertRaises(ValueError):
            self.model.create_rule('a', rule)

    def test_create_rule_bad_rule(self):
        """create_rule with a wrong-arity callable raises ValueError."""
        rule = lambda data: 0
        with self.assertRaises(ValueError):
            self.model.create_rule('x_0_10', rule)

    def test_prep_data_and_score(self):
        """End-to-end scoring of the fixture observation.

        Note: assertAlmostEqual/assertEqual replace the deprecated camel-case
        aliases (removed in Python 3.12).
        """
        rule = lambda data, model: data['x'] if model['lower'] < data[
            'x'] <= model['upper'] else 0
        self.model.create_rule('x_0_10', rule)
        self.model.create_rule('x_10_20', rule)
        score, scoring_data = self.model.prep_data_and_score(self.obs)
        self.assertAlmostEqual(score, 12.283576646308779)
        self.assertEqual(
            len(scoring_data[scoring_data['contribution'] != 0]), 4)

    def test_prep_data_and_score_no_rules(self):
        """Scoring without any loaded rules raises ValueError."""
        self.model._GLMModel__rules = dict()
        with self.assertRaises(ValueError):
            self.model.prep_data_and_score(self.obs)

    def test_score_data(self):
        """score_data on a pre-built xi frame returns the expected score."""
        obs = pandas.DataFrame({
            'feature': ['x_0_10', 'x_10_20', 'y_A', 'y_B', 'y_C', 'log_z'],
            'xi': [0, 13, 1, 0, 0, math.log(100)]
        })
        score, scoring_data = self.model.score_data(obs)
        self.assertAlmostEqual(score, 10.064673416305359)
        self.assertEqual(
            len(scoring_data[scoring_data['contribution'] != 0]), 4)

    def test_score_data_duplicate_features(self):
        """Duplicate feature rows in the scoring frame raise ValueError."""
        obs = pandas.DataFrame({
            'feature': ['x_0_10', 'x_0_10', 'y_A', 'y_B', 'y_C', 'log_z'],
            'xi': [0, 13, 1, 0, 0, math.log(100)]
        })
        with self.assertRaises(ValueError):
            self.model.score_data(obs)

    def test_score_data_missing_features(self):
        """A scoring frame missing a model feature raises ValueError."""
        obs = pandas.DataFrame({
            'feature': ['x_0_10', 'x_0_10', 'y_A', 'y_B', 'log_z'],
            'xi': [0, 13, 1, 0, math.log(100)]
        })
        with self.assertRaises(ValueError):
            self.model.score_data(obs)
def _validate_run_model_args(
        model_inputs,
        model_covariances_dataframe,
        model_coefficients_dataframe,
        eazi_dataframe,
        density_constant,
        rules_dict,
        custom_rule_features,
        calculate_sigma_squared):
    """Raise TypeError when any run_model argument has the wrong type.

    Error messages are kept identical to the original inline checks so
    callers matching on message text are unaffected.
    """
    checks = (
        (model_inputs, dict,
         'model_inputs should be a dictionary'),
        (model_covariances_dataframe, pandas.DataFrame,
         'model_covariances_dataframe should be a Pandas DataFrame'),
        (model_coefficients_dataframe, pandas.DataFrame,
         'model_coefficients_dataframe should be a Pandas DataFrame'),
        (eazi_dataframe, pandas.DataFrame,
         'eazi_dataframe should be a Pandas DataFrame'),
        (density_constant, int,
         'density_constant should be an integer'),
        (rules_dict, dict,
         'rules_dict should be a dictionary'),
        (custom_rule_features, list,
         'custom_rule_features should be a list'),
        (calculate_sigma_squared, bool,
         'calculate_sigma_squared should be a boolean'),
    )
    for value, expected_type, message in checks:
        if not isinstance(value, expected_type):
            raise TypeError(message)


def run_model(
        model_inputs,
        model_covariances_dataframe,
        model_coefficients_dataframe,
        eazi_dataframe,
        density_constant,
        rules_dict,
        custom_rule_features,
        calculate_sigma_squared=False):
    """
    Runs the ALGC GLM model to return the fitted and sigma squared results

    NOTE(review): a second module-level ``run_model`` with a different
    signature appears later in this file; if both live in the same module
    the later definition shadows this one — confirm and rename one of them.

    :param model_inputs: model inputs
    :param model_covariances_dataframe: model covariances loaded into pandas
    :param model_coefficients_dataframe: model coefficients loaded into pandas
    :param eazi_dataframe: eazi csv loaded into pandas
    :param density_constant: density constant used to calculate log density
    :param rules_dict: rules configuration for parameters/frequency models
    :param custom_rule_features: rules that are overriden by custom functions
    :param calculate_sigma_squared: flag to enable sigma squared calculation
    :type model_inputs: dict
    :type model_covariances_dataframe: pandas.DataFrame
    :type model_coefficients_dataframe: pandas.DataFrame
    :type eazi_dataframe: pandas.DataFrame
    :type density_constant: int
    :type rules_dict: dict
    :type custom_rule_features: list
    :type calculate_sigma_squared: bool
    :returns: fitted and sigma squared results for frequency/parameters models
    :rtype: tuple
    """
    _validate_run_model_args(
        model_inputs,
        model_covariances_dataframe,
        model_coefficients_dataframe,
        eazi_dataframe,
        density_constant,
        rules_dict,
        custom_rule_features,
        calculate_sigma_squared)
    algc_glm_model = GLMModel(model_coefficients_dataframe)
    algc_glm_model.load_rules(rules_dict)
    # Custom features replace the standard rule via the module-level
    # create_rule helper, which returns the updated model.
    for feature in custom_rule_features:
        algc_glm_model = create_rule(
            algc_glm_model, feature, eazi_dataframe, density_constant)
    # predicted_loss is discarded; prep_data_and_score is still called for
    # scoring_data (and any side effects) — preserved from the original.
    predicted_loss, scoring_data = algc_glm_model.prep_data_and_score(
        model_inputs)
    fitted_result = numpy.exp(
        scoring_data.coefficient.dot(scoring_data.xi))
    # Quadratic form xi' * Cov * xi, only when explicitly requested.
    sigma_squared_result = (
        scoring_data.xi.dot(model_covariances_dataframe).dot(scoring_data.xi)
        if calculate_sigma_squared else None)
    return (fitted_result, sigma_squared_result)
def run_model(model_inputs, model_coefficients_filename, rules_dict):
    """Score the GL GC model against the supplied inputs.

    Builds a GLMModel from the external coefficient CSV, loads the rules that
    derive model features from the raw inputs, installs the transform-based
    rules (claim count, zip density, and the exposure feature matching the
    account's exposure type), and scores the inputs.

    Args:
        **model_inputs**: A dictionary or DataFrame containing the variables
        required by the model as keys\n
        **model_coefficients_filename**: Path to file containing the model
        coefficients for the Worker's Comp GC model\n
        **rules_dict**: Dictionary with lambda functions to derive the
        features used by the model from the input variables

    Return:
        The predicted loss ratio for the account
    """
    gc_model = GLMModel(pandas.read_csv(model_coefficients_filename))
    gc_model.load_rules(rules_dict)
    gc_model.create_rule('log_l_mean_clm_cnt_123',
                         transform_variable('avg_claim_count'))
    gc_model.create_rule('log_density', transform_variable('zip_density'))
    # Exactly one exposure feature is active; the other contributes zero.
    uses_payroll = model_inputs['exposure_type'] == 'Payroll'
    if uses_payroll:
        gc_model.create_rule('payroll_ind_log_payroll_m',
                             transform_variable('exposure_size'))
        gc_model.create_rule('sales_ind_log_sales_m',
                             lambda data, model: 0)
    else:
        gc_model.create_rule('payroll_ind_log_payroll_m',
                             lambda data, model: 0)
        gc_model.create_rule('sales_ind_log_sales_m',
                             transform_variable('exposure_size'))
    return gc_model.prep_data_and_score(model_inputs)