def feature_importance(url, dataloaded, rows):
    """Train a quick random forest on the cached dataset and render a
    feature-importance bar plot.

    :param url: page URL; the dataset id is parsed from its "data/<id>" part
    :param dataloaded: load sentinel; None means no dataset is available yet
    :param rows: metadata table rows; exactly one should have Target == "true"
    :return: (Dash component or message, status string)
    """
    # If dataset is not loaded there is nothing to plot.
    if dataloaded is None:
        return [], "No file"
    # Get dataset if its cached pickle exists.
    data_id = int(re.search(r"data/(\d+)", url).group(1))
    try:
        df = pd.read_pickle("cache/df" + str(data_id) + ".pkl")
    except OSError:
        return [], "No file"
    # Locate the target column and its declared type in the metadata table.
    meta_data = pd.DataFrame(rows)
    try:
        target_rows = meta_data[meta_data["Target"] == "true"]
        target_attribute = target_rows["Attribute"].values[0]
        target_type = target_rows["DataType"].values[0]
    except IndexError:
        return "No target found", "No target found"
    # Feature importance bar plot
    from category_encoders.target_encoder import TargetEncoder

    x = df.drop(target_attribute, axis=1)
    y = df[target_attribute]
    te = TargetEncoder()
    # Classifier for nominal/string targets, regressor otherwise. The
    # clean/encode steps are identical for both, so they are hoisted out
    # of the branches instead of being duplicated.
    if target_type == "nominal" or target_type == "string":
        y = pd.Categorical(y).codes
        rf = RandomForestClassifier(n_estimators=10, n_jobs=-1)
    else:
        rf = RandomForestRegressor(n_estimators=10, n_jobs=-1)
    x = clean_dataset(x)
    x = te.fit_transform(x, y)
    rf.fit(x, y)
    fi = pd.DataFrame(
        rf.feature_importances_, index=x.columns, columns=["importance"]
    )
    fi = fi.sort_values("importance", ascending=False).reset_index()
    trace = go.Bar(y=fi["index"], x=fi["importance"], name="fi", orientation="h")
    layout = go.Layout(
        autosize=False, margin={"l": 100, "t": 0}, height=500, hovermode="closest"
    )
    figure = go.Figure(data=[trace], layout=layout)
    # Cache the importances so other callbacks can reuse them.
    fi.to_pickle("cache/fi" + str(data_id) + ".pkl")
    return html.Div(dcc.Graph(figure=figure), className="twelve columns"), "done"
def feature_importance(url, tab3, rows):
    """Train a quick random forest on the cached dataset and render a
    feature-importance bar plot.

    :param url: page URL; the dataset id is parsed from its "data/<id>" part
    :param tab3: unused here; kept to match the callback signature
    :param rows: metadata table rows; exactly one should have Target == "true"
    :return: (Dash component or message, status string)
    """
    data_id = int(re.search(r'data/(\d+)', url).group(1))
    try:
        df = pd.read_pickle('cache/df' + str(data_id) + '.pkl')
    except OSError:
        return [], "No file"
    # Locate the target column and its declared type in the metadata table.
    meta_data = pd.DataFrame(rows)
    try:
        target_rows = meta_data[meta_data["Target"] == "true"]
        target_attribute = target_rows["Attribute"].values[0]
        target_type = target_rows["DataType"].values[0]
    except IndexError:
        return "No target found", "No target found"
    # Feature importance bar plot
    from category_encoders.target_encoder import TargetEncoder

    x = df.drop(target_attribute, axis=1)
    y = df[target_attribute]
    te = TargetEncoder()
    # Classifier for nominal/string targets, regressor otherwise. The
    # clean/encode steps are identical for both, so they are hoisted out
    # of the branches instead of being duplicated.
    if target_type == "nominal" or target_type == "string":
        y = pd.Categorical(y).codes
        rf = RandomForestClassifier(n_estimators=10, n_jobs=-1)
    else:
        rf = RandomForestRegressor(n_estimators=10, n_jobs=-1)
    x = clean_dataset(x)
    x = te.fit_transform(x, y)
    rf.fit(x, y)
    fi = pd.DataFrame(rf.feature_importances_, index=x.columns, columns=['importance'])
    fi = fi.sort_values('importance', ascending=False).reset_index()
    trace = go.Bar(y=fi['index'], x=fi['importance'], name='fi', orientation='h')
    layout = go.Layout(autosize=False, margin=dict(l=100), width=800, height=500,
                       hovermode='closest')
    figure = go.Figure(data=[trace], layout=layout)
    # Cache the importances so other callbacks can reuse them.
    fi.to_pickle('cache/fi' + str(data_id) + '.pkl')
    return html.Div(dcc.Graph(figure=figure)), "done"
def target_encoder(params):
    """Target-encode a categorical train/test column pair.

    *params* is a sequence of (train values, test values, target), each a
    1-D array. The encoder is fitted on the train column only and then
    applied to the test column.

    :return: (encoded train column, encoded test column), both flat arrays
    """
    train_col = params[0].astype('str')
    test_col = params[1].astype('str')
    target = params[2]
    encoder = TargetEncoder(return_df=False)
    # The encoder expects 2-D input, hence the reshape round-trip.
    encoded_train = encoder.fit_transform(
        train_col.reshape(-1, 1), target.reshape(-1, 1)
    )
    encoded_test = encoder.transform(test_col.reshape(-1, 1))
    return encoded_train.flatten(), encoded_test.flatten()
def target_encode(data, label, encoder=None):
    """Target-encode *data* against *label*.

    :param data: features to encode
    :param label: target used for the encoding
    :param encoder: if supplied, this already-fitted encoder is used to
        transform *data*; otherwise a new one is fitted
    :return: (encoder, encoded data)
    """
    # Guard clause: an existing encoder is only applied, never re-fitted.
    if encoder is not None:
        return encoder, encoder.transform(data, label)
    encoder = TargetEncoder()
    return encoder, encoder.fit_transform(data, label)
def target_encoder(self, df, configger):
    """Fit-transform a category_encoders TargetEncoder on the train dataset.

    :param df: the train dataset.
    :param configger: JSON string of settings. Recognised keys:
        drop_invariant (bool): drop zero-variance columns; default False.
        handle_missing (str): 'error', 'return_nan' or 'value' (default);
            'value' returns the target mean.
        handle_unknown (str): same options and default as handle_missing.
        min_samples_leaf (int): minimum samples to take a category
            average into account; default 1.
        smoothing (float): balance between the categorical average and
            the prior; must be strictly > 0; default 1.0.
    :return: the transform result
    """
    X, y, encode_col = self.get_Xy(df, configger)
    # Pull each option from the config, falling back to the encoder's
    # documented defaults.
    encoder = TargetEncoder(
        verbose=1,
        cols=encode_col,
        drop_invariant=set_default_vale("drop_invariant", configger, False, is_bool=True),
        return_df=True,
        handle_missing=set_default_vale("handle_missing", configger, "value"),
        handle_unknown=set_default_vale("handle_unknown", configger, "value"),
        min_samples_leaf=set_default_vale("min_samples_leaf", configger, 1),
        smoothing=set_default_vale("smoothing", configger, 1.0),
    )
    return encoder.fit_transform(X, y)
def target_encode_Stores(df, enc=None):
    """Target encode the Store variable using the category_encoders module.

    Args:
        df: Data; must contain 'Store' and 'Sales' columns. The 'Store'
            column is overwritten in place with the encoded values.
        enc: Existing Encoder / if None retrain new encoder

    Returns:
        (encoded Store column, the encoder used)
    """
    target = df['Sales'].values
    stores = df['Store'].astype(str)
    # Explicit None check instead of truthiness: a supplied encoder must
    # never be silently re-fitted just because it evaluates falsy.
    if enc is None:
        print("Fit TargetEncoder...")
        enc = TargetEncoder()
        new_store = enc.fit_transform(stores, target)
    else:
        print("Transform using existing TargetEncoder...")
        new_store = enc.transform(stores, target)
    df.loc[:, 'Store'] = new_store
    return new_store, enc
def target_encode_custom(df: pd.DataFrame, name: str, enc=None):
    """Target encode the *name* column using the category_encoders module.

    Args:
        df: Data; must contain the *name* column and a 'Sales' column.
            The *name* column is overwritten in place with encoded values.
        name (str): name of the column to encode
        enc: Existing Encoder / if None retrain new encoder

    Returns:
        (encoded column, the encoder used)
    """
    target = df['Sales'].values
    stores = df[name].astype(str)
    # Explicit None check instead of truthiness: a supplied encoder must
    # never be silently re-fitted just because it evaluates falsy.
    if enc is None:
        print("Fit TargetEncoder...")
        enc = TargetEncoder()
        new_store = enc.fit_transform(stores, target)
    else:
        print("Transform using existing TargetEncoder...")
        new_store = enc.transform(stores, target)
    df.loc[:, name] = new_store
    return new_store, enc
def target_encode(X, X_test, cols, y):
    """Target-encode the given columns of a train/test frame pair.

    The encoder is fitted on (X, y) only; the identical fitted mapping is
    then applied to X_test.

    :return: tuple of the encoded (X, X_test) DataFrames
    """
    encoder = TargetEncoder(cols=cols, return_df=True)
    encoded_train = encoder.fit_transform(X, y)
    encoded_test = encoder.transform(X_test)
    return (encoded_train, encoded_test)
,colsample_bytree=.2 ,reg_alpha=.1 ,reg_lambda=.1 ) return lgbr # 本地验证 kf = KFold(n_splits=10, shuffle=True, random_state=100) devscore = [] for tidx, didx in kf.split(train.index): tf = train.iloc[tidx] df = train.iloc[didx] tt = target.iloc[tidx] dt = target.iloc[didx] te = TargetEncoder(cols=tecols) tf = te.fit_transform(tf, tt) df = te.transform(df) lgbr = makelgb() lgbr.fit(tf, tt) pre = lgbr.predict(df) fpr, tpr, thresholds = roc_curve(dt, pre) score = auc(fpr, tpr) devscore.append(score) print(np.mean(devscore)) # # 在整个train集上重新训练,预测test,输出结果 # lgbr = makelgb() # te = TargetEncoder(cols=tecols) # tf = te.fit_transform(train, target) # df = te.transform(test) # lgbr.fit(tf, target)
def lin_model(labelled_data, unlabelled_data):
    """
    Parameters: training dataframe, unknown dataframe
    Returns: results dataframe (Instance, Income)

    Pipeline:
    - drops NaN rows from the training data,
    - replaces NaN in the unknown data with ffill,
    - target-encodes non-numeric fields,
    - scales values with StandardScaler,
    - 80/20 splits the labelled data to validate the model,
    - selects features using RFECV with a Lasso estimator, cv=5,
    - fits a KNeighborsRegressor with 11 distance-weighted neighbours.
    """
    print("cleaning data...")
    clean_labelled = labelled_data.dropna()
    # all_columns is a module-level column list defined elsewhere in the file.
    clean_unlabelled = unlabelled_data[all_columns]
    # not ideal but fillna the mean freezes for some reason
    # NOTE(review): ffill assumes row order is meaningful — confirm.
    clean_unlabelled = clean_unlabelled.fillna(method="ffill")
    # clean_unlabelled = clean_unlabelled.fillna("None")

    # remove some columns
    # clean_labelled = drop_columns(clean_labelled)
    # clean_unlabelled = drop_columns(clean_unlabelled)

    # print("one hot encoding data...")
    # One hot encoding
    # ohe = OneHotEncoder(
    #     categories="auto",
    #     handle_unknown="ignore",
    #     sparse=False
    # )
    # clean_labelled = encode_training(ohe, clean_labelled)
    # clean_unlabelled = encode_testing(ohe, clean_unlabelled)

    clean_labelled = constrain_col_vals(clean_labelled)
    clean_unlabelled = constrain_col_vals(clean_unlabelled)
    # Instance is an identifier, not a feature — drop it before modelling.
    unknown_data = clean_unlabelled.drop(["Instance"], axis=1)

    print("splitting data into train and test...")
    # 80/20 split
    split = split_data(clean_labelled)
    train_data, train_target, test_data, test_target = split

    print("target encoding data...")
    # Target encoding: fit on train only, then apply to test and unknown.
    tar_encode = TargetEncoder()
    train_data = tar_encode.fit_transform(train_data, train_target)
    test_data = tar_encode.transform(test_data)
    unknown_data = tar_encode.transform(unknown_data)

    print("scaling values...")
    # scaling values (fit on train only)
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)
    unknown_data = scaler.transform(unknown_data)

    print("selecting features...")
    # feature selection: the same selected-feature mask is applied to all
    # three datasets.
    lasso = lm.Lasso()
    selector = RFECV(lasso, cv=5)
    train_data = selector.fit_transform(train_data, train_target)
    test_data = selector.transform(test_data)
    unknown_data = selector.transform(unknown_data)

    print("fitting model...")
    # fit model
    # lasso = lm.LassoCV(cv=5)
    # lasso.fit(train_data, train_target)
    neigh = KNeighborsRegressor(
        n_neighbors=11,
        weights="distance"
    )
    neigh.fit(train_data, train_target)

    print("analysing test results...")
    # validate on the held-out test split
    test_result = neigh.predict(test_data)
    error = np.sqrt(mean_squared_error(test_target, test_result))
    variance = explained_variance_score(test_target, test_result)
    print("Root mean squared error of test data: ", error)
    print("Variance: ", variance)

    print("predicting unknown data...")
    # predict and format the submission frame
    values = neigh.predict(unknown_data)
    results = pandas.DataFrame({
        "Instance": clean_unlabelled["Instance"].values,
        "Income": values.flatten()
    })
    print("Finished.")
    return results
np.corrcoef(X[self.targetName].values, encoded_feature)[0][1])) if self.discardOriginal_col: X = X.drop(self.targetName, axis=1) return X display(df) temp = [] nfolds = [2, 3, 4] for n in nfolds: te = KFoldTargetEncoderTrain(colnames='horse', targetName='label', n_fold=n, verbosity=False) temp.append(te.fit_transform(df).copy().iloc[:, -1]) temp # 顯示結果差異很大隨著n_folds # The code beow seems have been not compatible with the code above (because I have revied the one above). class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin): def __init__(self, train, colNames, encodedName): self.train = train self.colNames = colNames self.encodedName = encodedName def fit(self, X, y=None): return self def transform(self, X): # read the encoded training data first into a dictionary
def target_encoder(self):
    """Target-encode every categorical feature against the target column.

    For each column listed in self.categorical_features a new
    '<name>_te' column is added to self.data, and that new name is
    recorded in self.encoded_features.
    """
    for col in self.categorical_features:
        encoder = TargetEncoder()
        encoded_name = f"{col}_te"
        self.data[encoded_name] = encoder.fit_transform(
            self.data[col], self.data[self.target]
        )
        self.encoded_features.append(encoded_name)
#MEE_encoder = MEstimateEncoder() #train_mee = MEE_encoder.fit_transform(train_set[feature_list], target) #test_mee = MEE_encoder.transform(test_set[feature_list]) #print(train_mee.head()) X_train, X_val, y_train, y_val = train_test_split(train_set, target, test_size=0.2, random_state=97) lr = LinearRegression() rf = RandomForestRegressor() # In[48]: TE_encoder = TargetEncoder() train_te = TE_encoder.fit_transform(train_set[feature_list], target) test_te = TE_encoder.transform(test_set[feature_list]) #print(train_te.head()) encoder_list = [ TargetEncoder(), MEstimateEncoder()] X_train, X_val, y_train, y_val = train_test_split(train_set, target, test_size=0.2, random_state=97) #X_train, X_val, y_train, y_val = dataset_test() lr = LinearRegression() for encoder in encoder_list: # print("Test {} : ".format(str(encoder).split('(')[0]), end=" ") train_enc = encoder.fit_transform(X_train[feature_list], y_train) # test_enc = encoder.transform(test[feature_list]) val_enc = encoder.transform(X_val[feature_list]) lr.fit(train_enc, y_train)
# 查看数据数据 train_data.head() from sklearn.model_selection import train_test_split # n2 与 n3 重复性较高,删除 n2 train_data.drop(['n2', 'issueDate'], axis=1, inplace=True) test_data = test_data[train_data.columns] # 获取非数值列 s = train_data.dtypes tecols = s[s == 'object'].index.tolist() # 将非数值列直接利用TargetEncoder进行离散化编码 te = TargetEncoder(cols=tecols) tf = te.fit_transform(train_data, target) df = te.transform(test_data) # 划分训练集和验证集 X_train_split, X_val, y_train_split, y_val = train_test_split(tf, target, test_size=0.2) train_matrix = lgb.Dataset(X_train_split, label=y_train_split) valid_matrix = lgb.Dataset(X_val, label=y_val) """使用优化后的参数初始化模型(参数通过网格搜索法进行优化,这里没有相关代码)""" base_params_lgb = { 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'learning_rate': 0.01, 'num_leaves': 14,
test_data["Hair Color"] = test_data["Hair Color"].fillna("N/A") test_data["Profession"] = test_data["Profession"].fillna("N/A") #test_numerics test_data["Age"] = test_data["Age"].fillna(test_data['Age'].mean()) test_data["Year of Record"] = test_data["Year of Record"].fillna( test_data['Year of Record'].mean()) numerics_and_catagorics = list(train_data.columns) X_train_data, X_val, y_train_data, y_val = train_test_split(train_data, target, test_size=0.2, random_state=97) random_forest = RandomForestRegressor() #target_encoder target_encoder = TargetEncoder() train_data_encoded = target_encoder.fit_transform( X_train_data[numerics_and_catagorics], y_train_data) test_data_encoded = target_encoder.transform(X_val[numerics_and_catagorics], y_val) test_data = target_encoder.transform(test_data[numerics_and_catagorics]) print(train_data_encoded.head()) print(test_data_encoded.head()) #random_forest_regressor r_train_data = random_forest.fit(train_data_encoded, y_train_data) prediction = r_train_data.predict(test_data_encoded) def rmse(prediction, target): #difference = prediction-target #square = **2
colsample_bytree=.2, reg_alpha=.1, reg_lambda=.1) return lgbr # 本地验证 kf = KFold(n_splits=10, shuffle=True, random_state=100) devscore = [] for tidx, didx in kf.split(train.index): tf = train.iloc[tidx] df = train.iloc[didx] tt = target.iloc[tidx] dt = target.iloc[didx] te = TargetEncoder(cols=tecols) tf = te.fit_transform(tf, tt) df = te.transform(df) lgbr = makelgb() lgbr.fit(tf, tt) pre = lgbr.predict(df) fpr, tpr, thresholds = roc_curve(dt, pre) score = auc(fpr, tpr) devscore.append(score) print(np.mean(devscore)) # 在整个train集上重新训练,预测test,输出结果 lgbr = makelgb() te = TargetEncoder(cols=tecols) tf = te.fit_transform(train, target) df = te.transform(test) lgbr.fit(tf, target)