def preprocess_xy(self, label):
    """Build a cleaned feature matrix and aligned label vector for *label*.

    Filters self.feature_set_df / self.profile_df via pc.get_filtered_x_y,
    drops all-NaN feature rows and all-NaN instance columns, and imputes
    only when NaN/inf values remain after the drop.

    :param label: profile attribute to predict (e.g. 'gender')
    :return: (x_imp, y_filtered) — features (rows) x instances (columns),
             and the labels for the surviving instance columns
    """
    x, y = pc.get_filtered_x_y(self.feature_set_df, self.profile_df, label)
    # Drop features (rows) and instances (columns) that are entirely NaN.
    x = x.dropna(how='all', axis=0)
    x = x.dropna(how='all', axis=1)
    # Impute only if non-finite values survive the dropna; otherwise keep x as-is.
    if x.isnull().any().any() or (x == np.inf).any().any() or (x == -np.inf).any().any():
        x_imp = pc.fill_nan_features(x)
    else:
        x_imp = x
    # Keep labels only for surviving instance columns. list(...) is required so
    # Series indexing also works on Python 3, where map() returns a lazy iterator.
    y_filtered = y[list(map(int, x.columns.values))]
    return x_imp, y_filtered
def compute_randomized_lr_score(data_set_df, user_info_df, label='gender'):
    """Rank features by stability-selection scores from RandomizedLogisticRegression.

    :param data_set_df: features (rows) x instances (columns)
    :param user_info_df: profile frame used to derive the label vector
    :param label: profile attribute to predict
    :return: DataFrame indexed by feature with one 'importance' column,
             sorted descending (NaNs last)
    """
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    # Drop all-NaN feature rows; impute only if NaNs remain.
    x = df_filtered.dropna(how='all')
    x_imp = pc.fill_nan_features(x) if x.isnull().any().any() else x.values
    clf = RandomizedLogisticRegression()
    # Fit on instances-as-rows (transpose of the feature matrix).
    clf.fit(x_imp.T, y_v)
    # BUG FIX: clf.scores_ has one entry per row of *x* (post-dropna); the
    # original used df_filtered.index, which mis-aligns scores with feature
    # names whenever dropna removed rows.
    feature_importances = DataFrame(clf.scores_, index=x.index, columns=['importance'])
    feature_importances.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_importances
def compute(x):
    """Fit an ExtraTrees model on *x* and record its feature importances.

    Closure: reads n_est, max_depth, y_v from the enclosing scope and writes
    into the enclosing feature_importances frame, one row per feature of *x*.

    :param x: features (rows) x instances (columns); fit on x.T
    """
    x_imp = pc.fill_nan_features(x)
    try:
        # BUG FIX: the original hard-coded max_depth=3 in the else branch,
        # silently ignoring the enclosing-scope max_depth it had just tested.
        m = ExtraTreesClassifier(n_estimators=n_est) if max_depth is None \
            else ExtraTreesClassifier(n_estimators=n_est, max_depth=max_depth)
        print("\t\t\tfitting RF model...")
        m.fit(x_imp.T, y_v)
        # Copy importances back into the shared frame, keyed by feature index.
        for order, index in enumerate(x.index):
            feature_importances.loc[index] = m.feature_importances_[order]
            # Progress heartbeat every 10k features.
            if order % 10000 == 0 and order > 0:
                print("\t\t\t%s features are done" % order)
    except ValueError:
        # Best-effort: a feature set the model cannot fit is skipped,
        # leaving its importances untouched.
        pass
def classify(data_set_df, user_info_df, feat_set_name, features=None, label='gender',
             classifier=None, reg_param=1.0, selection=False, num_feat=20,
             sel_method='LR', cv=10):
    """Cross-validated classification of *label* from a feature set.

    Cleans/imputes the feature matrix, runs k-fold CV (optionally with
    per-fold feature selection via fimp.feature_selection), appends per-fold
    scores and confusion matrices to CSV files under param.EXPERIMENT_PATH,
    and returns the mean fold accuracy.

    :param data_set_df: features (rows) x instances (columns)
    :param user_info_df: profile frame used to derive the label vector
    :param feat_set_name: name used in output file names and result columns
    :param features: optional subset of feature-row labels to use
    :param classifier: sklearn-style estimator; LogisticRegression(C=reg_param) if None
    :param selection: whether to select num_feat features per fold
    :param cv: requested number of folds (capped at the number of instances)
    :return: (score_mean, miss_clf_rate) — mean CV accuracy and the fraction
             of instances dropped during filtering
    """
    instance_num = len(data_set_df.columns)
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    x = df_filtered if features is None else df_filtered.loc[features]
    # Drop all-NaN feature rows and all-NaN instance columns.
    x = x.dropna(how='all', axis=0)
    x = x.dropna(how='all', axis=1)
    # Impute only if non-finite values remain after the drop.
    if x.isnull().any().any() or (x == np.inf).any().any() or (x == -np.inf).any().any():
        x_imp = pc.fill_nan_features(x)
    else:
        x_imp = x
    # Keep labels for surviving instance columns. list(...) so Series indexing
    # also works on Python 3, where map() returns a lazy iterator.
    y_filtered = y_v[list(map(int, x.columns.values))]
    clf = LogisticRegression(C=reg_param) if classifier is None else classifier
    cv_num = min(len(y_filtered), cv)
    score_mean = 0.0
    miss_clf_rate = 1.0
    # CV only makes sense with >1 fold and >1 class present.
    if cv_num > 1 and len(y_filtered.unique()) > 1:
        kf = KFold(y_filtered.shape[0], n_folds=cv_num, shuffle=True)
        fold = 0
        result_str = ""
        matrix_str = ""
        for tr_index, te_index in kf:
            fold += 1
            # x_imp is features x instances; transpose to instances x features.
            x_train, x_test = x_imp.T.iloc[tr_index], x_imp.T.iloc[te_index]
            y_train, y_test = y_filtered.iloc[tr_index], y_filtered.iloc[te_index]
            if selection:
                if sel_method == 'LR' or 'RF' in sel_method:
                    # LR/RF selectors take the imputed matrix.
                    feat_index = fimp.feature_selection(x_train.T, user_info_df, num_feat,
                                                        method=sel_method, label=label)
                else:
                    # Other selectors work on the raw (un-imputed) matrix.
                    x_tr_df, x_te_df = x.T.iloc[tr_index].T, x.T.iloc[te_index].T
                    feat_index = fimp.feature_selection(x_tr_df, user_info_df, num_feat,
                                                        method=sel_method, label=label)
                x_train = x_train.loc[:, feat_index].values
                x_test = x_test.loc[:, feat_index].values
            try:
                clf.fit(x_train, y_train)
                score = clf.score(x_test, y_test)
                score_mean += score
                # NOTE(review): the method column is hard-coded to 'LR' even when a
                # custom *classifier* is supplied — confirm before trusting the CSV.
                result_str += "%s, %s, %s, %s, %s, %s, %s, %s, %s, %s\n" \
                    % (label,
                       True if param.FILL_SUFFIX in feat_set_name else False,
                       True if param.SCALING_SUFFIX in feat_set_name else False,
                       selection, 'LR', reg_param, cv, fold, x_train.shape[1], score)
                cf_mat = confusion_matrix(y_test, clf.predict(x_test),
                                          labels=range(len(info.LABEL_CATEGORY[label])))
                matrix_str += np.array_str(cf_mat) + "\n"
            except ValueError:
                # Best-effort: skip folds the classifier cannot fit/score.
                pass
        print(result_str)
        # Append per-fold results and confusion matrices to the experiment CSVs.
        file_name = "%s/new_%s.csv" % (param.EXPERIMENT_PATH, feat_set_name)
        with open(file_name, mode='a') as f:
            f.write(result_str)
        file_name = "%s/new_%s_mat.csv" % (param.EXPERIMENT_PATH, feat_set_name)
        with open(file_name, mode='a') as f:
            f.write(matrix_str)
        if fold > 0:
            score_mean = score_mean / fold
    # Fraction of instances lost to filtering (labels missing, etc.).
    miss_clf_rate = (float(instance_num - len(y_filtered)) / instance_num)
    return score_mean, miss_clf_rate