/
LC_latest_predictions.py
202 lines (166 loc) · 8.72 KB
/
LC_latest_predictions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 3 15:13:14 2016
@author: James McFarland
"""
import sys
import os
import re
import numpy as np
import pandas as pd
import dill
import requests
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.feature_extraction import DictVectorizer
from collections import defaultdict, namedtuple
# Absolute directory containing this file (symlinks resolved by realpath).
base_dir = os.path.dirname(os.path.realpath(__file__))
# base_dir = '/Users/james/Data_Incubator/loan-picker'
# Put that directory on the import path; presumably so the sibling LC_*
# modules imported just below resolve regardless of the working directory.
sys.path.append(base_dir)
# Directory of the data files under base_dir. The loaders visible in this
# file build their 'static/data/...' paths from base_dir directly.
data_dir = os.path.join(base_dir,'static/data/')
import LC_helpers as LCH
import LC_loading as LCL
import LC_modeling as LCM
#%% Load in model and transformer
def load_pickled_models():
    """Load the pickled models and transformation objects from disk.

    Three dill pickles are read from ``<base_dir>/static/data``:

    * ``LC_model.pkl``     -> stored under ``'RF_est'``
    * ``LC_def_model.pkl`` -> stored under ``'RF_defClass'``
    * ``trans_tuple.pkl``  -> an indexable sequence whose first four items are
      stored under ``'dict_vect'``, ``'col_dict'``, ``'tran_dict'`` and
      ``'predictors'``, in that order.

    Returns:
        dict: the six entries listed above.

    Raises:
        IOError / OSError: if one of the files cannot be opened.
        IndexError: if the sequence in ``trans_tuple.pkl`` has fewer than four
            items (assuming it is a tuple or list).
    """
    # NOTE(security): unpickling executes code embedded in the file. Only load
    # pickles that were produced by this project.
    def _load(rel_path):
        # Unpickle one file located relative to base_dir.
        with open(os.path.join(base_dir, rel_path), 'rb') as in_strm:
            return dill.load(in_strm)

    model_data = {}
    model_data['RF_est'] = _load('static/data/LC_model.pkl')
    model_data['RF_defClass'] = _load('static/data/LC_def_model.pkl')

    variables = _load('static/data/trans_tuple.pkl')
    var_names = ['dict_vect', 'col_dict', 'tran_dict', 'predictors']
    # enumerate replaces xrange(len(...)), which does not exist on Python 3.
    # Indexing (rather than zip) keeps the IndexError on a too-short sequence.
    for idx, var_name in enumerate(var_names):
        model_data[var_name] = variables[idx]
    return model_data
#%% pull in latest loan data
def make_record(loan, record_map):
    """Build a flat record dict from one raw loan entry.

    ``record_map`` is an iterable of ``(name, extractor)`` pairs. Each
    extractor is called with ``loan`` and its result is stored under ``name``;
    if a name repeats, the last pair wins.
    """
    return {field_name: extract(loan) for field_name, extract in record_map}
def get_latest_records(auth_key, zip3_loc_data, use_grades=('A', 'B', 'C', 'D', 'E', 'F')):
    """Fetch the current LendingClub loan listing and convert it to records.

    Args:
        auth_key: value sent verbatim as the ``Authorization`` request header.
        zip3_loc_data: lookup object handed to ``LCL.get_zip_loc`` to turn a
            loan's ``addrZip`` into a latitude / longitude.
        use_grades: container of loan grades to keep; records whose ``grade``
            is not in it are dropped. The default is an immutable tuple so it
            cannot be altered between calls.

    Returns:
        list of dict: one record per retained loan, keyed by the names in
        ``record_map`` below.

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout,
            or a 4xx/5xx response (via ``raise_for_status``).
        KeyError: if the response JSON has no ``'loans'`` entry, or a loan
            lacks one of the fields read below.
    """
    request_timeout = 30  # seconds; without a timeout requests can block forever

    header = {'Authorization': auth_key,
              'Content-Type': 'application/json'}
    apiVersion = 'v1'
    loanListURL = ('https://api.lendingclub.com/api/investor/' + apiVersion +
                   '/loans/listing')
    payload = {'showAll': 'true'}
    resp = requests.get(loanListURL, headers=header, params=payload,
                        timeout=request_timeout)
    # Fail with a clear HTTP error rather than an obscure KeyError / JSON
    # decoding error when the API rejects the request.
    resp.raise_for_status()
    loans = resp.json()['loans']

    # Tuple of (column name, extractor) pairs: the name of each data column to
    # pull and a function that grabs that piece of data from the raw loan.
    record_map = (
        ('acc_now_delinq', lambda x: x['accNowDelinq']),
        ('annual_inc', lambda x: x['annualInc']),
        ('collections_12_mths_ex_med', lambda x: x['collections12MthsExMed']),
        ('cr_line_dur', lambda x: LCL.get_cr_line_dur(x['earliestCrLine'])),
        ('delinq_2yrs', lambda x: x['delinq2Yrs']),
        ('desc_length', lambda x: LCL.get_desc_length(x['desc'])),
        ('dti', lambda x: x['dti']),
        ('emp_length', lambda x: LCL.get_emp_length(x['empLength'])),  # convert to years from months
        ('id', lambda x: x['id']),
        ('loan_amnt', lambda x: x['loanAmount']),  # use amount requested rather than funded amnt!
        ('inq_last_6mths', lambda x: x['inqLast6Mths']),
        ('int_rate', lambda x: x['intRate']),
        ('mths_since_last_delinq', lambda x: LCL.get_mnths_since(x['mthsSinceLastDelinq'])),
        ('mths_since_last_major_derog', lambda x: LCL.get_mnths_since(x['mthsSinceLastMajorDerog'])),
        ('mths_since_last_record', lambda x: LCL.get_mnths_since(x['mthsSinceLastRecord'])),
        ('num_add_desc', lambda x: LCL.get_num_descs(x['desc'])),
        ('open_acc', lambda x: x['openAcc']),
        ('pub_rec', lambda x: x['pubRec']),
        ('revol_bal', lambda x: x['revolBal']),
        ('revol_util', lambda x: x['revolUtil']),
        ('term', lambda x: x['term']),
        ('total_acc', lambda x: x['totalAcc']),
        ('tot_cur_bal', lambda x: x['totCurBal']),
        ('addr_state', lambda x: x['addrState']),
        ('home_ownership', lambda x: x['homeOwnership']),
        ('grade', lambda x: x['grade']),
        ('purpose', lambda x: x['purpose']),
        ('latitude', lambda x: LCL.get_zip_loc(x['addrZip'], zip3_loc_data, 'latitude')),
        ('longitude', lambda x: LCL.get_zip_loc(x['addrZip'], zip3_loc_data, 'longitude')),
    )

    # Convert every loan first, then keep only the requested grades.
    records = [make_record(loan, record_map) for loan in loans]
    return [record for record in records if record['grade'] in use_grades]
#%% Build X mat for new loan data
def transform_records(records, model_data):
    """Vectorize loan records and apply the stored per-feature transformers.

    Args:
        records: list of record dicts to vectorize.
        model_data: dict with at least ``'dict_vect'`` (a fitted vectorizer
            exposing ``transform`` plus ``get_feature_names_out`` or the older
            ``get_feature_names``) and ``'tran_dict'`` (maps a base feature
            name to an object with a ``transform`` method).

    Returns:
        The matrix produced by the vectorizer, where every column whose base
        feature name appears in ``tran_dict`` has been replaced by its
        transformed values.
    """
    dict_vect = model_data['dict_vect']
    tran_dict = model_data['tran_dict']
    X_rec = dict_vect.transform(records)  # use the preloaded dict vectorizer

    # scikit-learn 1.0 added get_feature_names_out and 1.2 removed
    # get_feature_names; use whichever the loaded vectorizer provides.
    if hasattr(dict_vect, 'get_feature_names_out'):
        feature_names = dict_vect.get_feature_names_out()
    else:
        feature_names = dict_vect.get_feature_names()

    # Apply any additional transformers to the columns that have one.
    for idx, feature_name in enumerate(feature_names):
        # Keep the part before the equals sign, if there is one.
        short_name = feature_name.partition('=')[0]
        if short_name in tran_dict:
            column = X_rec[:, idx].reshape(-1, 1)
            X_rec[:, idx] = tran_dict[short_name].transform(column).squeeze()
    return X_rec
#%% Generate model-predicted returns
def get_model_predictions(X, model_data, loan_grades, loan_ids):
    """Score the feature matrix and return a table of per-loan predictions.

    ``model_data['RF_est'].predict`` supplies the returns (multiplied by 100
    here) and column 1 of ``model_data['RF_defClass'].predict_proba`` supplies
    ``dp``. The result is a DataFrame with columns ``returns``, ``dp``,
    ``grades`` and ``ids``, ordered by ``returns`` from highest to lowest.
    """
    expected_returns = model_data['RF_est'].predict(X)
    class_probs = model_data['RF_defClass'].predict_proba(X)
    table = pd.DataFrame({
        'returns': expected_returns * 100,
        'dp': class_probs[:, 1],
        'grades': loan_grades,
        'ids': loan_ids,
    })
    return table.sort_values(by='returns', ascending=False)
#%%
def make_dp_ret_figure(predictions, pick_K, allowed_loans):
    """Scatter predicted returns against default probability, marking picks.

    All rows of ``predictions`` are drawn, coloured by grade. The first
    ``pick_K`` rows of ``allowed_loans`` (fewer if it is shorter) are overlaid
    as red dots labelled 'picked'. Returns the current matplotlib figure.
    """
    grades = sorted(predictions.grades.unique())
    # NOTE(review): seaborn 0.9 renamed lmplot's `size` argument to `height`;
    # confirm which seaborn version this runs against before changing it.
    sns.lmplot(x='dp', y='returns', data=predictions, hue='grades',
               hue_order=grades, fit_reg=False,
               palette=sns.cubehelix_palette(n_colors=len(grades)),
               legend=False, size=4.0, aspect=1.2)

    # Never try to highlight more loans than are available.
    n_picked = min(pick_K, len(allowed_loans))
    if n_picked != 0:
        picked = allowed_loans.iloc[:n_picked]
        plt.plot(picked['dp'], picked['returns'], 'r.', ms=10, label='picked')

    plt.title('Selected Loans', fontsize=18)
    plt.legend(loc='lower left', fontsize=12)
    plt.xlabel('Predicted default probability', fontsize=14)
    plt.ylabel('Predicted annual returns (%)', fontsize=14)
    plt.margins(.01, .01)
    return plt.gcf()
#%% Load and unpack validation lookup-table data
def get_validation_data():
    """Load the pickled validation results as a lookup table.

    Reads ``static/data/LC_test_res.pkl`` under ``base_dir``, transposes its
    rows into four arrays, and returns them as a DataFrame with columns
    ``pred``, ``ROI``, ``net_ret`` and ``weights``, sorted by ``pred`` from
    highest to lowest.
    """
    pkl_path = os.path.join(base_dir, 'static/data/LC_test_res.pkl')
    with open(pkl_path, 'rb') as in_strm:
        val_set = dill.load(in_strm)

    # zip(*rows) turns the sequence of 4-item rows into four columns.
    columns = [np.array(column) for column in zip(*val_set)]
    test_pred, test_ROI, test_net_returns, test_weights = columns

    lookup = pd.DataFrame({
        'pred': test_pred,
        'ROI': test_ROI,
        'net_ret': test_net_returns,
        'weights': test_weights,
    })
    return lookup.sort_values(by='pred', ascending=False)
#%%
def make_return_dist_fig(sim_lookup, predictions, pick_K=100, n_bins=200, n_boots=5000):
    """Plot a bootstrapped distribution of annual returns for the top picks.

    Each of the first ``pick_K`` predictions is mapped to a percentile bin of
    the validation predictions. For every bootstrap draw, one validation row
    is sampled from each pick's bin and the draw's return is
    ``sum(net_ret) / sum(weights)`` over those rows, then annualized with
    ``LCM.annualize_returns``.

    Args:
        sim_lookup: DataFrame with ``pred``, ``net_ret`` and ``weights``
            columns. Rows are addressed by position, so it is assumed to be
            sorted by ``pred`` descending, as ``get_validation_data`` returns
            it.
        predictions: DataFrame with a ``returns`` column; its first ``pick_K``
            rows are taken as the picked loans.
        pick_K: number of loans in the portfolio; capped at
            ``len(predictions)``.
        n_bins: number of percentile edges used to bin the predictions.
        n_boots: number of bootstrap draws.

    Returns:
        The matplotlib figure containing the density plot.
    """
    sim_net = sim_lookup['net_ret'].values
    sim_weights = sim_lookup['weights'].values

    # Percentile edges of the validation predictions, from highest to lowest.
    bin_locs = np.linspace(0, 100, n_bins)[::-1]
    bins = np.percentile(sim_lookup['pred'].values, bin_locs)
    sim_samps_per_bin = len(sim_lookup) / float(n_bins)

    # Cannot pick more loans than there are predictions; without this cap the
    # per-pick bin offsets fail to broadcast against the (n_boots, pick_K)
    # random draws below.
    pick_K = min(pick_K, len(predictions))

    # Dividing by 100 undoes the x100 scaling applied in get_model_predictions.
    pred_bins = np.digitize(predictions['returns'] / 100., bins)
    # digitize yields n_bins for values below the lowest edge, which would
    # index past the end of the lookup arrays; fold those into the last bin.
    pred_bins = np.minimum(pred_bins, n_bins - 1)[:pick_K]

    # Start of each pick's bin plus a random offset inside the bin.
    boot_samps = sim_samps_per_bin * pred_bins + np.random.randint(
        0, sim_samps_per_bin, size=(n_boots, pick_K))
    boot_samps = boot_samps.astype(int)

    sim_returns = np.sum(sim_net[boot_samps], axis=1) / np.sum(sim_weights[boot_samps], axis=1)
    sim_returns = LCM.annualize_returns(sim_returns)

    fig, ax = plt.subplots(figsize=(5.0, 4.0))
    sns.distplot(sim_returns, bins=100, hist=False, rug=False,
                 ax=ax, kde_kws={'color': 'k', 'lw': 3})
    plt.xlabel('Annual returns (%)', fontsize=14)
    plt.ylabel('Probability', fontsize=14)
    plt.title('Estimated portfolio returns', fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=10)
    plt.margins(.01, .01)
    plt.tight_layout()
    return fig
def get_LC_loans(auth_key, model_data, zip3_loc_data, use_grades):
    """Fetch the latest loans and return model predictions for them.

    Chains ``get_latest_records`` -> ``transform_records`` ->
    ``get_model_predictions`` and returns the resulting predictions table.
    """
    latest = get_latest_records(auth_key, zip3_loc_data, use_grades)
    grades = [record['grade'] for record in latest]
    ids = [record['id'] for record in latest]
    features = transform_records(latest, model_data)
    return get_model_predictions(features, model_data, grades, ids)