def test_unsupervised(self):
    """End-to-end check of an Automater built without an output_var."""
    observations = lib.load_lending_club()

    # Train / test split
    train_df, test_df = train_test_split(observations)
    train_df = train_df.copy()
    test_df = test_df.copy()

    # Variable type declarations for an unsupervised Automater
    data_type_dict = {
        'numerical': ['loan_amnt', 'annual_inc', 'open_acc', 'dti',
                      'delinq_2yrs', 'inq_last_6mths', 'mths_since_last_delinq',
                      'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
                      'pub_rec_bankruptcies'],
        'categorical': ['term', 'grade', 'emp_length', 'home_ownership',
                        'loan_status', 'addr_state', 'application_type'],
        'text': ['desc', 'purpose', 'title'],
    }

    auto = Automater(data_type_dict=data_type_dict)
    self.assertFalse(auto.supervised)

    # With no response variable, every declared variable is an input
    expected_input_vars = [var for var_list in data_type_dict.values()
                           for var in var_list]
    self.assertCountEqual(expected_input_vars, auto.input_vars)
    self.assertIsNone(auto.output_var)
    self.assertIsInstance(auto.input_mapper, DataFrameMapper)
    self.assertIsNone(auto.output_mapper)
    self.assertFalse(auto.fitted)
    self.assertRaises(AssertionError, auto._check_has_response_var)

    # Test fit
    auto.fit(train_df)
    self.assertTrue(auto.fitted)
    self.assertIsNotNone(auto.input_mapper.built_features)
    self.assertIsInstance(auto.input_layers, list)
    self.assertEqual(len(expected_input_vars), len(auto.input_layers))
    self.assertIsNotNone(auto.input_nub)
    self.assertIsNone(auto.output_nub)
    self.assertIsNone(auto.output_mapper)

    # Test transform, df_out=False
    X, y = auto.transform(test_df)
    self.assertIsInstance(X, list)
    self.assertIsNone(y)
    self.assertEqual(test_df.shape[0], X[0].shape[0])  # Correct number of rows back

    # Test transform, df_out=True
    transformed_df = auto.transform(test_df, df_out=True)
    self.assertIsInstance(transformed_df, pandas.DataFrame)
    self.assertEqual(test_df.shape[0], transformed_df.shape[0])  # Correct number of rows back
def main():
    """End-to-end supervised example: build, train, and persist a model.

    Components supplied by the Automater: X and y, auto.input_nub,
    auto.input_layers, auto.output_nub, and auto.suggest_loss.

    Fixes over the original: the Automater was fitted twice
    (``auto.fit(...)`` immediately followed by ``auto.fit_transform(...)``,
    which refits), and the pickle files were opened without ever being
    closed, leaking file handles.
    """
    save_results = True

    # Load data
    observations = lib.load_lending_club()
    print('Observation columns: {}'.format(list(observations.columns)))
    print('Class balance:\n {}'.format(observations['loan_status'].value_counts()))

    # Train /test split
    train_observations, test_observations = train_test_split(observations)
    train_observations = train_observations.copy()
    test_observations = test_observations.copy()

    # List out variable types
    data_type_dict = {'numerical': ['loan_amnt', 'annual_inc', 'open_acc', 'dti', 'delinq_2yrs',
                                    'inq_last_6mths', 'mths_since_last_delinq', 'pub_rec', 'revol_bal',
                                    'revol_util', 'total_acc', 'pub_rec_bankruptcies'],
                      'categorical': ['term', 'grade', 'emp_length', 'home_ownership', 'loan_status',
                                      'addr_state', 'application_type', 'disbursement_method'],
                      'text': ['desc', 'purpose', 'title']}
    output_var = 'loan_status'

    # Create the Automater. fit_transform below both fits it and transforms the
    # training data, so a separate fit() call is unnecessary work.
    auto = Automater(data_type_dict=data_type_dict, output_var=output_var)

    # Transform data
    train_X, train_y = auto.fit_transform(train_observations)
    test_X, test_y = auto.transform(test_observations)

    # Create and fit keras (deep learning) model
    x = auto.input_nub
    x = Dense(32)(x)
    x = Dense(32)(x)
    x = auto.output_nub(x)

    model = Model(inputs=auto.input_layers, outputs=x)
    model.compile(optimizer='adam', loss=auto.suggest_loss())
    model.fit(train_X, train_y)

    # Make model predictions and inverse transform model predictions, to get usable results
    pred_test_y = model.predict(test_X)
    auto.inverse_transform_output(pred_test_y)

    # Save all results
    if save_results:
        temp_dir = lib.get_temp_dir()
        model.save(os.path.join(temp_dir, 'model.h5py'))
        # Close each pickle file deterministically via a context manager
        for name, payload in (('train_X', train_X), ('train_y', train_y),
                              ('test_X', test_X), ('test_y', test_y),
                              ('pred_test_y', pred_test_y)):
            with open(os.path.join(temp_dir, name + '.pkl'), 'wb') as file_pointer:
                pickle.dump(payload, file_pointer)
def main():
    """Train a small supervised model on the lending club data set
    using the legacy keras-pandas Automater API."""
    logging.getLogger().setLevel(logging.INFO)

    # Reference variables
    test_run = True

    observations = load_lending_club()
    if test_run:
        observations = observations.sample(n=100)

    # Transform the data set, using keras_pandas
    cat_cols = ['term', 'grade', 'sub_grade', 'emp_length', 'home_ownership',
                'verification_status', 'issue_d', 'pymnt_plan', 'purpose',
                'addr_state', 'initial_list_status', 'application_type',
                'disbursement_method', 'loan_status']
    num_cols = ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'annual_inc',
                'installment', 'dti', 'inq_last_6mths', 'open_acc', 'pub_rec',
                'revol_bal', 'total_acc', 'pub_rec_bankruptcies', 'int_rate',
                'revol_util']
    text_cols = ['desc', 'title']

    # Categoricals must be NaN-free strings before fitting
    for col in cat_cols:
        observations[col] = observations[col].fillna('None').apply(str)

    auto = Automater(categorical_vars=cat_cols, numerical_vars=num_cols,
                     text_vars=text_cols, response_var='loan_status')
    X, y = auto.fit_transform(observations)

    # Start model with provided input nub
    x = auto.input_nub

    # Fill in your own hidden layers
    x = Dense(8)(x)
    x = Dense(16, activation='relu')(x)
    x = Dense(8)(x)

    # End model with provided output nub
    x = auto.output_nub(x)

    model = Model(inputs=auto.input_layers, outputs=x)
    model.compile(optimizer='Adam', loss=auto.loss, metrics=['accuracy'])

    # Train model
    logging.warning(
        'Settle in! This training normally takes about 5-20 minutes on CPU')
    model.fit(X, y, epochs=1, validation_split=.2)
def test_lending(self):
    """Smoke-test the lending club loader: it should return a pandas
    DataFrame with exactly the expected columns (order-insensitive)."""
    observations = load_lending_club()

    # Check datatypes
    self.assertIsInstance(observations, pandas.DataFrame)

    # Check columns. assertCountEqual compares as unordered multisets, so
    # column order does not matter, but missing or extra columns fail.
    self.assertCountEqual([
        'id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
        'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
        'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
        'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose',
        'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs',
        'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
        'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
        'revol_util', 'total_acc', 'initial_list_status', 'out_prncp',
        'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
        'total_rec_int', 'total_rec_late_fee', 'recoveries',
        'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt',
        'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med',
        'mths_since_last_major_derog', 'policy_code', 'application_type',
        'annual_inc_joint', 'dti_joint', 'verification_status_joint',
        'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
        'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
        'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
        'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl',
        'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal',
        'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths',
        'delinq_amnt', 'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op',
        'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc',
        'mths_since_recent_bc', 'mths_since_recent_bc_dlq',
        'mths_since_recent_inq', 'mths_since_recent_revol_delinq',
        'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl',
        'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',
        'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats',
        'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m',
        'num_tl_op_past_12m', 'pct_tl_nvr_dlq', 'percent_bc_gt_75',
        'pub_rec_bankruptcies', 'tax_liens',
        'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit',
        'total_il_high_credit_limit', 'revol_bal_joint',
        'sec_app_earliest_cr_line', 'sec_app_inq_last_6mths',
        'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_revol_util',
        'sec_app_open_act_il', 'sec_app_num_rev_accts',
        'sec_app_chargeoff_within_12_mths',
        'sec_app_collections_12_mths_ex_med',
        'sec_app_mths_since_last_major_derog', 'hardship_flag',
        'hardship_type', 'hardship_reason', 'hardship_status',
        'deferral_term', 'hardship_amount', 'hardship_start_date',
        'hardship_end_date', 'payment_plan_start_date', 'hardship_length',
        'hardship_dpd', 'hardship_loan_status',
        'orig_projected_additional_accrued_interest',
        'hardship_payoff_balance_amount', 'hardship_last_payment_amount',
        'disbursement_method', 'debt_settlement_flag',
        'debt_settlement_flag_date', 'settlement_status', 'settlement_date',
        'settlement_amount', 'settlement_percentage', 'settlement_term'
    ], observations.columns)
def test_inverse_transform_numerical_response(self):
    """Predictions on a numerical response can be inverse-transformed back
    to the response's original (unscaled) units.

    Fixes over the original: the mangled ``# :oad data`` comment, and the
    deprecated ``assertNotAlmostEquals`` alias (removed in Python 3.12) is
    replaced with ``assertNotAlmostEqual``.
    """
    # Load data
    observations = lib.load_lending_club()

    # Set to test run
    observations = observations.sample(n=100)

    # Declare variable types
    categorical_vars = ['term', 'grade', 'sub_grade', 'emp_length', 'home_ownership',
                        'verification_status', 'issue_d', 'pymnt_plan', 'purpose', 'addr_state',
                        'initial_list_status', 'application_type', 'disbursement_method',
                        'loan_status']
    numerical_vars = ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'annual_inc', 'installment',
                      'dti', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'total_acc',
                      'pub_rec_bankruptcies', 'int_rate', 'revol_util']
    text_vars = ['desc', 'title']

    # Manual null filling: categoricals must be NaN-free strings
    for categorical_var in categorical_vars:
        observations[categorical_var] = observations[categorical_var].fillna('None')
        observations[categorical_var] = observations[categorical_var].apply(str)

    auto = Automater(categorical_vars=categorical_vars, numerical_vars=numerical_vars,
                     text_vars=text_vars, response_var='funded_amnt')
    X, y = auto.fit_transform(observations)

    # Start model with provided input nub
    x = auto.input_nub

    # Fill in your own hidden layers
    x = Dense(8)(x)
    x = Dense(16, activation='relu')(x)
    x = Dense(8)(x)

    # End model with provided output nub
    x = auto.output_nub(x)

    model = Model(inputs=auto.input_layers, outputs=x)
    model.compile(optimizer='Adam', loss=auto.loss, metrics=['accuracy'])

    # Train model
    logging.warning('Settle in! This training normally takes about 5-20 minutes on CPU')
    model.fit(X, y, epochs=1, validation_split=.2)

    unscaled_preds = model.predict(X)
    logging.debug('unscaled_preds: {}'.format(list(unscaled_preds)))
    scaled_preds = auto.inverse_transform_output(unscaled_preds)
    logging.debug('scaled_preds: {}'.format(list(scaled_preds)))

    # Inverse-transformed predictions should no longer look standard-scaled
    # (mean ~0, std ~1): they are back in the raw units of 'funded_amnt'
    self.assertNotAlmostEqual(0, numpy.mean(scaled_preds))
    self.assertNotAlmostEqual(1, numpy.std(scaled_preds))
def main():
    """Supervised regression example: predict loan_amnt on lending club data."""
    # Load data
    observations = lib.load_lending_club()
    print('Observation columns: {}'.format(list(observations.columns)))
    print('Class balance:\n {}'.format(
        observations['loan_status'].value_counts()))

    # Heuristic data transformations: strip percent signs, then coerce to numeric
    for col in ('int_rate', 'revol_util'):
        observations[col] = observations[col].apply(
            lambda value: str(value).replace('%', ''))
        observations[col] = pandas.to_numeric(observations[col], errors='coerce')

    # Heuristic null filling for some variables
    for col in ('mths_since_last_delinq', 'annual_inc_joint'):
        observations[col] = observations[col].fillna(0)

    # List out variable types
    numerical_vars = ['loan_amnt', 'annual_inc', 'open_acc', 'dti',
                      'delinq_2yrs', 'inq_last_6mths', 'mths_since_last_delinq',
                      'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
                      'pub_rec_bankruptcies']
    categorical_vars = ['term', 'grade', 'emp_length', 'home_ownership',
                        'addr_state', 'application_type', 'disbursement_method']
    text_vars = ['desc', 'purpose', 'title']

    # Train /test split
    train_observations, test_observations = train_test_split(observations)
    train_observations = train_observations.copy()
    test_observations = test_observations.copy()

    # Create and fit Automater
    auto = Automater(numerical_vars=numerical_vars,
                     categorical_vars=categorical_vars,
                     text_vars=text_vars,
                     response_var='loan_amnt')
    auto.fit(train_observations)

    # Create and fit keras (deep learning) model.
    # auto.transform, auto.input_nub, auto.input_layers, and auto.loss are
    # provided by keras-pandas; everything else is core Keras.
    train_X, train_y = auto.transform(train_observations)
    test_X, test_y = auto.transform(test_observations)

    x = auto.input_nub
    x = Dense(32)(x)
    x = Dense(32, activation='relu')(x)
    x = Dense(32)(x)
    x = auto.output_nub(x)

    model = Model(inputs=auto.input_layers, outputs=x)
    model.compile(optimizer='Adam', loss=auto.loss, metrics=['accuracy'])
    model.fit(train_X, train_y)

    test_y_pred = model.predict(test_X)

    # Inverse transform model output, to get usable results and save all results
    pred_col = auto.response_var + '_pred'
    test_observations[pred_col] = auto.inverse_transform_output(test_y_pred)
    print('Predictions: {}'.format(test_observations[pred_col]))
def test_supervised(self):
    """End-to-end check of an Automater built with an output_var."""
    observations = lib.load_lending_club()

    # Train / test split
    train_df, test_df = train_test_split(observations)
    train_df = train_df.copy()
    test_df = test_df.copy()

    # Supervised variable type declarations
    data_type_dict = {
        'numerical': ['loan_amnt', 'annual_inc', 'open_acc', 'dti',
                      'delinq_2yrs', 'inq_last_6mths', 'mths_since_last_delinq',
                      'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
                      'pub_rec_bankruptcies'],
        'categorical': ['term', 'grade', 'emp_length', 'home_ownership',
                        'loan_status', 'addr_state', 'application_type'],
        'text': ['desc', 'purpose', 'title'],
    }
    output_var = 'loan_status'

    auto = Automater(data_type_dict=data_type_dict, output_var=output_var)
    self.assertTrue(auto.supervised)

    # Every declared variable except the response is an input
    expected_input_vars = [var for var_list in data_type_dict.values()
                           for var in var_list]
    expected_input_vars.remove(output_var)
    self.assertCountEqual(expected_input_vars, auto.input_vars)
    self.assertEqual(output_var, auto.output_var)
    self.assertIsInstance(auto.input_mapper, DataFrameMapper)
    self.assertIsInstance(auto.output_mapper, DataFrameMapper)
    self.assertFalse(auto.fitted)
    self.assertRaises(AssertionError, auto._check_fitted)

    # Test fit
    auto.fit(train_df)
    self.assertTrue(auto.fitted)
    self.assertIsNotNone(auto.input_mapper.built_features)
    self.assertIsInstance(auto.input_layers, list)
    self.assertEqual(len(expected_input_vars), len(auto.input_layers))
    self.assertIsNotNone(auto.input_nub)
    self.assertIsNotNone(auto.output_nub)
    self.assertIsNotNone(auto.output_mapper.built_features)

    # Test transform, df_out=False
    train_X, train_y = auto.transform(train_df)
    test_X, test_y = auto.transform(test_df)
    self.assertIsInstance(test_X, list)
    self.assertIsInstance(test_y, numpy.ndarray)
    self.assertEqual(test_df.shape[0], test_X[0].shape[0])  # Correct number of rows back
    self.assertEqual(test_df.shape[0], test_y.shape[0])  # Correct number of rows back

    # Test transform, df_out=True
    transformed_df = auto.transform(test_df, df_out=True)
    self.assertIsInstance(transformed_df, pandas.DataFrame)
    self.assertEqual(test_df.shape[0], transformed_df.shape[0])  # Correct number of rows back

    # Test suggest_loss
    suggested_loss = auto.suggest_loss()
    self.assertTrue(callable(suggested_loss))

    # Test model building
    x = auto.input_nub
    x = Dense(32)(x)
    x = auto.output_nub(x)
    model = Model(inputs=auto.input_layers, outputs=x)
    model.compile(optimizer='Adam', loss=auto.suggest_loss())
    model.fit(train_X, train_y)
    pred_y = model.predict(test_X)

    # Test inverse_transform_output
    inv_transformed_pred_y = auto.inverse_transform_output(pred_y)
    self.assertEqual(test_df.shape[0], inv_transformed_pred_y.shape[0])