# smart.py
import pandas as pd
from scipy.stats import pointbiserialr, spearmanr
from datetime import datetime
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Column names of the raw training CSV (presumably in file order -- confirm
# against the dataset). "Business_Sourced" is the target that load_train()
# separates from the features.
columns = [
    "ID",
    "Office_PIN",
    "Application_Receipt_Date",
    "Applicant_City_PIN",
    "Applicant_Gender",
    "Applicant_BirthDate",
    "Applicant_Marital_Status",
    "Applicant_Occupation",
    "Applicant_Qualification",
    "Manager_DOJ",
    "Manager_Joining_Designation",
    "Manager_Current_Designation",
    "Manager_Grade",
    "Manager_Status",
    "Manager_Gender",
    "Manager_DoB",
    "Manager_Num_Application",
    "Manager_Num_Coded",
    "Manager_Business",
    "Manager_Num_Products",
    "Manager_Business2",
    "Manager_Num_Products2",
    "Business_Sourced",
]
def load_train(filename, dest):
    """Load a raw CSV, engineer features and rank them with RFE.

    Steps, in order:
      1. Read ``filename`` and print a summary of ``Applicant_Marital_Status``.
      2. Replace every categorical value by an integer code (its position in
         the column's ``unique()`` array); missing values stay missing.
      3. Derive ``Manager_exp``, ``App_age`` and ``Manager_age`` (in years)
         from the date columns via ret_days / ret_app_age / ret_man_age.
      4. Fill missing values (medians for numeric columns, fixed codes for
         the categorical ones).
      5. Drop identifier, date and unused columns, then fit recursive feature
         elimination around a logistic regression and print its outcome.

    Args:
        filename: Path of the CSV file to read.
        dest: Intended output path. NOTE: currently unused -- the export step
            (``df.to_csv(dest, index=False)``) is disabled.

    Returns:
        None. All results are printed.
    """
    df = pd.read_csv(filename)

    print(df["Applicant_Marital_Status"].describe())
    print(df["Applicant_Marital_Status"].unique())

    categorical = [
        "Applicant_Gender", "Applicant_Marital_Status",
        "Applicant_Qualification", "Applicant_Occupation",
        "Manager_Joining_Designation", "Manager_Current_Designation",
        "Manager_Grade", "Manager_Status", "Manager_Gender",
    ]
    for name in categorical:
        # Build the value -> code table once and apply it in a single pass.
        # Recoding in place one value at a time is unsafe for columns that
        # already hold numbers: a freshly assigned code can equal a raw value
        # that is handled later and would then be recoded a second time.
        # NaN keeps its slot in the numbering but gets no table entry, so
        # missing cells stay NaN for the fillna() calls below.
        codes = {
            value: code
            for code, value in enumerate(df[name].unique())
            if pd.notnull(value)
        }
        df[name] = df[name].map(codes)

    # Durations in years relative to the application date; rows whose dates
    # cannot be parsed come back empty and are filled with the median below.
    df["Manager_exp"] = df.apply(ret_days, axis=1)
    df["App_age"] = df.apply(ret_app_age, axis=1)
    df["Manager_age"] = df.apply(ret_man_age, axis=1)

    print(df["App_age"].median())
    df['App_age'] = df['App_age'].fillna(df["App_age"].median())
    print(df["Manager_age"].median())
    df['Manager_age'] = df['Manager_age'].fillna(df["Manager_age"].median())
    df['Manager_exp'] = df['Manager_exp'].fillna(df["Manager_exp"].median())

    # Fixed fill codes. They refer to the integer codes assigned above and
    # therefore depend on the order in which values first appear in the file
    # (presumably chosen as the most common category -- TODO confirm).
    df['Applicant_Qualification'] = df['Applicant_Qualification'].fillna(1)
    df['Applicant_Gender'] = df['Applicant_Gender'].fillna(0)
    df['Manager_Gender'] = df['Manager_Gender'].fillna(0)
    df['Manager_Status'] = df['Manager_Status'].fillna(0)
    df['Applicant_Occupation'] = df['Applicant_Occupation'].fillna(2)

    numeric = [
        'Manager_Num_Application', 'Manager_Num_Coded', 'Manager_Business',
        'Manager_Num_Products', 'Manager_Business2', 'Manager_Num_Products2',
    ]
    for name in numeric:
        df[name] = df[name].fillna(df[name].median())

    y = df["Business_Sourced"]
    df = df.drop(
        [
            "ID", "Business_Sourced", "Applicant_BirthDate",
            "Application_Receipt_Date", "Manager_DoB", "Manager_DOJ",
            "Applicant_City_PIN", "Manager_Grade", "Office_PIN",
            "Applicant_Marital_Status", "Manager_Current_Designation",
            "Manager_Joining_Designation",
        ],
        axis=1,
    )
    print(df.columns.values)

    # Recursive feature elimination, removing one feature per iteration.
    X = df
    estimator = LogisticRegression()
    selector = RFE(estimator, step=1)
    selector = selector.fit(X, y)
    print(selector.n_features_)
    print(selector.support_)
    print(selector.ranking_)
    print(selector.estimator_)
def ret_days(df):
    """Return a manager's experience, in years, at application time.

    Args:
        df: A single record (despite the name): any mapping-like object,
            e.g. the row handed over by ``DataFrame.apply(..., axis=1)``,
            with ``"Application_Receipt_Date"`` and ``"Manager_DOJ"`` given
            as ``MM/DD/YYYY`` strings.

    Returns:
        The number of days between the two dates divided by 365.0 (negative
        if the joining date is later), or None when either field is absent,
        is not a string (e.g. NaN) or does not match the date format.
    """
    date_for = "%m/%d/%Y"
    try:
        received = datetime.strptime(df["Application_Receipt_Date"], date_for)
        joined = datetime.strptime(df["Manager_DOJ"], date_for)
    except (KeyError, TypeError, ValueError):
        # KeyError: field absent; TypeError: non-string cell such as NaN;
        # ValueError: text that is not MM/DD/YYYY. Anything else is a real
        # error and is allowed to propagate instead of being hidden.
        return None
    return (received - joined).days / 365.0
def ret_app_age(df):
    """Return the applicant's age, in years, at application time.

    Args:
        df: A single record (despite the name): any mapping-like object,
            e.g. the row handed over by ``DataFrame.apply(..., axis=1)``,
            with ``"Application_Receipt_Date"`` and ``"Applicant_BirthDate"``
            given as ``MM/DD/YYYY`` strings.

    Returns:
        The number of days between the two dates divided by 365.0, or None
        when either field is absent, is not a string (e.g. NaN) or does not
        match the date format.
    """
    date_for = "%m/%d/%Y"
    try:
        received = datetime.strptime(df["Application_Receipt_Date"], date_for)
        born = datetime.strptime(df["Applicant_BirthDate"], date_for)
    except (KeyError, TypeError, ValueError):
        # KeyError: field absent; TypeError: non-string cell such as NaN;
        # ValueError: text that is not MM/DD/YYYY. Anything else is a real
        # error and is allowed to propagate instead of being hidden.
        return None
    return (received - born).days / 365.0
def ret_man_age(df):
    """Return the manager's age, in years, at application time.

    Args:
        df: A single record (despite the name): any mapping-like object,
            e.g. the row handed over by ``DataFrame.apply(..., axis=1)``,
            with ``"Application_Receipt_Date"`` and ``"Manager_DoB"`` given
            as ``MM/DD/YYYY`` strings.

    Returns:
        The number of days between the two dates divided by 365.0, or None
        when either field is absent, is not a string (e.g. NaN) or does not
        match the date format.
    """
    date_for = "%m/%d/%Y"
    try:
        received = datetime.strptime(df["Application_Receipt_Date"], date_for)
        born = datetime.strptime(df["Manager_DoB"], date_for)
    except (KeyError, TypeError, ValueError):
        # KeyError: field absent; TypeError: non-string cell such as NaN;
        # ValueError: text that is not MM/DD/YYYY. Anything else is a real
        # error and is allowed to propagate instead of being hidden.
        return None
    return (received - born).days / 365.0
def correlation(filename="dataset/train_new.csv"):
    """Print and return each feature's correlation with ``Business_Sourced``.

    For every feature in a fixed list, a correlation coefficient and the
    covariance with the target are computed. Features with at most 12
    distinct values use Spearman's rank correlation; the others use the
    point-biserial coefficient (which presumes a two-valued target -- the
    code does not check this).

    Args:
        filename: CSV file to analyse. It must contain ``Business_Sourced``
            and every feature column listed below. Defaults to the path the
            function originally had hard-coded.

    Returns:
        A list of ``(feature, correlation, covariance)`` tuples in the order
        the features are processed. The same figures are printed: one line
        per feature (``spear``/``point``, name, coefficient) and finally the
        list of covariances.
    """
    df = pd.read_csv(filename)
    print(df.describe())

    features = [
        "Applicant_Gender", "App_age", "Applicant_Occupation",
        "Applicant_Qualification", "Manager_age", "Manager_Status",
        "Manager_Gender", "Manager_Business", "Manager_Business2",
        "Manager_Num_Application",
    ]
    target = df['Business_Sourced']

    results = []
    for name in features:
        values = df[name]
        # Few distinct values -> treat the feature as categorical/ordinal,
        # otherwise as continuous.
        if len(values.unique()) <= 12:
            label = "spear"
            corr = spearmanr(target, values)[0]
        else:
            label = "point"
            corr = pointbiserialr(target, values)[0]
        print("%s %s %s" % (label, name, corr))

        # np.cov of the stacked pair is a 2x2 matrix; the off-diagonal entry
        # is the target/feature covariance. It used to be computed and then
        # thrown away, which left the final printout permanently empty.
        covar = float(np.cov(np.vstack((target, values)))[0][1])
        results.append((name, corr, covar))

    print([row[2] for row in results])
    return results
def ana_test(filename="dataset/train_new.csv"):
    """Print and return summary statistics of the ``Business_Sourced`` column.

    Args:
        filename: CSV file to inspect; must contain a ``Business_Sourced``
            column. Defaults to the path the function originally had
            hard-coded.

    Returns:
        The pandas ``describe()`` summary of the column (also printed).
    """
    df = pd.read_csv(filename)
    summary = df["Business_Sourced"].describe()
    print(summary)
    return summary
if __name__ == "__main__":
    # Script entry point: run the feature pipeline on the training split.
    # The commented calls are alternative runs that are currently disabled.
    # load_train("dataset/Test_wyCirpO.csv","dataset/test_new.csv")
    load_train(
        filename="dataset/Train_pjb2QcD.csv",
        dest="dataset/train_new.csv",
    )
    # correlation()
    # ana_test()

# The lines below appear to be output recorded from an earlier correlation()
# run (method, feature, coefficient against Business_Sourced):
# spear Applicant_Gender 0.0501596389925
# point App_age 0.0664923157893
# spear Applicant_Occupation -0.0293786499627
# spear Applicant_Qualification -0.0402997522036
# point Manager_age 0.0236875061366
# spear Manager_Status 0.0424019758619
# spear Manager_Gender 0.0329010430742
# point Manager_Business 0.0304204461244
# point Manager_Business2 0.0299538181628
# point Manager_Num_Application -0.0362155467124