target = 'crash'

# Drop rows whose target value is missing.
# The labels to drop are taken from the frame's own index, so this works for
# any index; the earlier loop looked rows up with positional integers, which
# is only correct for a default 0..n-1 index.
drops = df1.index[pd.isnull(df1[target])].tolist()
df1 = df1.drop(drops)

# In[23]:

encoding = 'one-hot'
scale = None  # Interval scaling: Use 'std', 'robust' or None
# drop=False - do not drop last category - used for Decision Trees
rie = ReplaceImputeEncode(data_map=attribute_map, nominal_encoding=encoding,
                          interval_scale=scale, drop=False, display=True)

# In[24]:

# NOTE(review): the target column is removed before encoding and is not saved
# anywhere in this section -- confirm it is captured elsewhere.
df1.drop('crash', axis=1, inplace=True)

# In[25]:

encoded_df = rie.fit_transform(df1)

# In[26]:

#varlist = [target, 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9','points']
X = encoded_df.drop(['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7'], axis=1)
'job': [2, (1, 2, 3, 4), [0, 0]], 'housing': [2, (1, 2, 3), [0, 0]], 'foreign': [1, (1, 2), [0, 0]], 'marital': [2, (1, 2, 3, 4), [0, 0]], 'resident': [2, (1, 2, 3, 4), [0, 0]], 'savings': [2, (1, 2, 3, 4, 5), [0, 0]], 'other': [2, (1, 2, 3), [0, 0]], # 'purpose':[1,('0','1','2','3','4','5','6','7','8','9','X'),[0,0]], 'property': [2, (1, 2, 3, 4), [0, 0]], 'checking': [2, (1, 2, 3, 4), [0, 0]], 'telephon': [1, (1, 2), [0, 0]] } rie = ReplaceImputeEncode(data_map=attribute_map, drop=False, nominal_encoding='one-hot', display=True, interval_scale='std') encoded_df = rie.fit_transform(df) # In[4]: X = encoded_df.drop('good_bad', axis=1) Y = encoded_df['good_bad'] np_y = np.ravel(Y) features = X.columns classes = ['Good', 'bad'] # In[9]:
'history': [2, (0, 1, 2, 3, 4), [0, 0]], 'existcr': [2, (1, 2, 3, 4), [0, 0]], 'installp': [2, (1, 2, 3, 4), [0, 0]], 'job': [2, (1, 2, 3, 4), [0, 0]], 'housing': [2, (1, 2, 3), [0, 0]], 'foreign': [1, (1, 2), [0, 0]], 'marital': [2, (1, 2, 3, 4), [0, 0]], 'resident': [2, (1, 2, 3, 4), [0, 0]], 'savings': [2, (1, 2, 3, 4, 5), [0, 0]], 'other': [2, (1, 2, 3), [0, 0]], 'property': [2, (1, 2, 3, 4), [0, 0]], 'checking': [2, (1, 2, 3, 4), [0, 0]], 'telephon': [2, (1, 2), [0, 0]] } rie = ReplaceImputeEncode(data_map=attribute_map, display=True) encoded_df = rie.fit_transform(df) # In[5]: from collections import Counter Counter(encoded_df['employed0']) len(encoded_df.columns) # 46 columns Counter(encoded_df['good_bad']) # In[6]: X = encoded_df.drop('good_bad', axis=1) y = encoded_df['good_bad']
'MonthlyCharges': [0, (18.25, 188.75), [0, 0]], 'TotalCharges': [0, (0, 8700), [0, 0]], #int as str hides max value! 'Churn': [1, ('Yes', 'No'), [0, 0]], } #Define the target target = ['Churn'] #Logistics Regression max_f1 = 0 score_list = ['accuracy', 'recall', 'precision', 'f1'] #Encode for logistic regressions rie_l = ReplaceImputeEncode(data_map=attribute_map, nominal_encoding='one-hot', interval_scale='std', drop=True, display=True) encoded_df_l = rie_l.fit_transform(df) X_l = encoded_df_l.drop(target, axis=1) y_l = encoded_df_l[target] np_y_l = np.ravel(y_l) #convert dataframe column to flat array #Do feature selection using random forest classifiers to determine which #predictors to include in the logistic regression features = ExtraTreesClassifier(n_estimators=500) features.fit(X_l, np_y_l) print(features.feature_importances_) #Only the interval predictors are important #Try two logistic models: one with all predictors, one with only the top 3 #predictors
# NOTE(review): the first statement looks like the tail of a loop that begins
# before this section -- confirm its indentation against the enclosing code.
sentiment_score[i] = sentiment_score[i] / n_sw
df_senscore = pd.DataFrame(sentiment_score, columns=['sentiment score'])
df = df.join(df_senscore)

# classify topic based on the probability: label every row with the name of
# the topic column (T1..T7) holding its largest value.
# The lookup is restricted to the topic columns so an unrelated column that
# happens to equal the maximum can never be picked, and it avoids calling
# Series.idxmax(axis=1), which newer pandas versions reject.
topic_cols = ['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7']
df['topic'] = df[topic_cols].idxmax(axis=1)

# save the data output of NLP
df.to_csv('after_NLP_data.csv', index=False)

# encode for the tree model (interval_scale=None, so no interval scaling)
rie = ReplaceImputeEncode(data_map=attribute_map, nominal_encoding='one-hot',
                          interval_scale=None, drop=False, display=True)
df_tree = rie.fit_transform(df)
y = df_tree['crash']
X = df_tree.drop('crash', axis=1)

# find the best tree depth
depth_list = [3, 5, 6, 7, 8, 10, 12, 15, 20, 25]
score_list = ['accuracy', 'recall', 'precision', 'f1']
for d in depth_list:
    print("\nMaximum Tree Depth: ", d)
    dtc = DecisionTreeClassifier(max_depth=d, min_samples_leaf=5,
                                 min_samples_split=5, random_state=12345)
    dtc = dtc.fit(X, y)
    # 10-fold cross-validation on the four metrics in score_list
    scores = cross_validate(dtc, X, y, scoring=score_list,
                            return_train_score=False, cv=10)
'Feb_PayPercent':[0,(0, 1),[0,0]], 'Jan_PayPercent':[0,(0, 1),[0,0]] } # In[14]:df.drop(['Customer'], axis=1) # In[6]: np.sum(df['Marital_Status']==0) df.dtypes # In[15]: rie = ReplaceImputeEncode(data_map=attribute_map, nominal_encoding='one-hot', display=True) encoded_df = rie.fit_transform(df) varlist = df['Default'] X = encoded_df.drop('Default', axis=1) y = encoded_df['Default'] lgr = LogisticRegression() #Selecting the best attributes using RFE - 25 attributes chosen rfe = RFE(lgr,25) rfe = rfe.fit(X,y) print(rfe.support_)
'foreign':[1,(1,2),[0,0]], 'good_bad':[1,('bad', 'good'),[0,0]], 'history':[2,(0,1,2,3,4),[0,0]], 'installp':[2,(1,2,3,4),[0,0]], 'job':[2,(1,2,3,4),[0,0]], 'marital':[2,(1,2,3,4),[0,0]], 'other':[2,(1,2,3),[0,0]], 'property':[2,(1,2,3,4),[0,0]], # 'purpose':[1,(0,1,2,3,4,5,6,7,8,9,'X'),[0,0]], 'resident':[2,(1,2,3,4),[0,0]], 'savings':[2,(1,2,3,4,5),[0,0]], 'telephon':[1,(1,2),[0,0]] } Step 3: Replace-Impute-Encode Next, use the class ReplaceImputeEncode() to replace outliers with missing values, impute missing values and then scale interval data and encode categorical data. The ReplaceImputeEncode() class allows you to specify None for scaling and/or encoding. It also lets you select 'one-hot' or 'SAS' encoding for categorical variables. In most other software this is automatic, but for Python we need to set up our own scaling and encoding. The complete API for this class is described in the class. First you instantiate the class, then you use fit_transform() to actually process your dataframe. In [3]: encoding = 'SAS' # Categorical encoding: Use 'SAS', 'one-hot' or None scale = None # Interval scaling: Use 'std', 'robust' or None scaling = 'No' # Text description for interval scaling rie = ReplaceImputeEncode(data_map=attribute_map, nominal_encoding=encoding, \ interval_scale = scale, display=True) #features_map = rie.draft_features_map(df) encoded_df = rie.fit_transform(df) ********** Data Preprocessing *********** Features Dictionary Contains:
# Data map for the clustered data set; 'Price' is used as the target below.
attribute_map_clus = {
    'Score': [0, (80, 100), [0, 0]],
    'Year': [0, (1985, 2016), [0, 0]],
    'Region': [
        2,
        (
            'California Other', 'Central Coast', 'Central Valley',
            'Clear Lake', 'High Valley', 'Lake County',
            'Mendocino County', 'Mendocino Ridge',
            'Mendocino/Lake Counties', 'Napa', 'Napa-Sonoma',
            'North Coast', 'Red Hills Lake County', 'Redwood Valley',
            'Sierra Foothills', 'Sonoma', 'South Coast',
        ),
        [0, 0],
    ],
    'Cluster': [2, (0, 1, 2, 3, 4, 5, 6, 7, 8), [0, 0]],
    'Price': [0, (0, 625), [0, 0]],
}

varlist = ['Price']

# One-hot encode the nominal attributes (drop=True); no interval scaling.
rie_clus = ReplaceImputeEncode(
    data_map=attribute_map_clus,
    nominal_encoding='one-hot',
    interval_scale=None,
    drop=True,
    display=False,
)
encoded_df_clus = rie_clus.fit_transform(clus)

# Separate predictors from the target column(s) named in varlist.
X_clus = encoded_df_clus.drop(varlist, axis=1)
y_clus = encoded_df_clus[varlist]

# Hold out 30% of the rows for validation, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_clus, y_clus, test_size=0.3, random_state=7)
np_y_train = np.ravel(y_train)
np_y_valid = np.ravel(y_valid)

# Fit ordinary least squares on the training split.
reg = LinearRegression()
reg.fit(X_train, np_y_train)
'model': [2, ('COBALT', 'G5', 'HHR', 'ION', 'SKY', 'SOLSTICE'), [0, 0]], 'crashed': [1, ('N', 'Y'), [0, 0]], 'abs': [1, ('N', 'Y'), [0, 0]], 'mileage': [0, (0, 200000), [0, 0]], '0': [0, (0, 1), [0, 0]], '1': [0, (0, 1), [0, 0]], '2': [0, (0, 1), [0, 0]], '3': [0, (0, 1), [0, 0]], '4': [0, (0, 1), [0, 0]], '5': [0, (0, 1), [0, 0]], '6': [0, (0, 1), [0, 0]], '7': [0, (0, 1), [0, 0]], } varlist = ['crashed'] rie = ReplaceImputeEncode(data_map=attribute_map, \ nominal_encoding='one-hot', interval_scale = None, drop=True, display=False) encoded_df = rie.fit_transform(reg_df) X = encoded_df.drop(varlist, axis=1) y = encoded_df[varlist] np_y = np.ravel(y) #10 fold-cross validation to find optimum regularization value max_f1 = 0 C_list = [.1, 1, 10, 100] score_list = ['accuracy', 'recall', 'precision', 'f1'] for c in C_list: print("\nRegularization Parameter: ", c) lgr = LogisticRegression(C=c, tol=1e-8, max_iter=1000) lgr.fit(X, np_y) scores = cross_validate(lgr, X, np_y,\
'savings': [2, (1, 2, 3, 4, 5), [0, 0]], 'telephon': [1, (1, 2), [0, 0]] } sas_map = { 'duration': [0, (0, 100), [0, 0]], 'checking': [2, (1, 2, 3, 4), [0, 0]], 'coapp': [2, (1, 2, 3), [0, 0]], 'history': [2, (0, 1, 2, 3, 4), [0, 0]], 'good_bad': [1, ('good', 'bad'), [0, 0]], 'savings': [2, (1, 2, 3, 4, 5), [0, 0]], 'installp': [2, (1, 2, 3, 4), [0, 0]], 'marital': [2, (1, 2, 3, 4), [0, 0]] } #Replace, impute, and encode using SAS encoding rep_imp_enc = ReplaceImputeEncode(data_map=attribute_map, display=True) encoded_df = rep_imp_enc.fit_transform(df) # Regression requires numpy arrays containing all numeric values y = np.asarray(encoded_df['good_bad']) # Drop the target, 'object'. Axis=1 indicates the drop is for a column. X = np.asarray(encoded_df.drop('good_bad', axis=1)) #Fit a logistic regression model, use k=4 fold cross validation X_train, X_validate, y_train, y_validate = \ train_test_split(X,y,test_size = 0.3, random_state=7) logistic = LogisticRegression() logistic.fit(X, y) log_tts = LogisticRegression() log_tts.fit(X_train, y_train)