class LogisticModelBuilder(object):
    """Builds a logistic-regression response model on top of randomized
    feature selection.

    Pipeline: list-of-dicts representation -> DictVectorizer ->
    RandomizedLogisticRegression feature selection -> LogisticRegression.
    """

    def __init__(self):
        # Per-interaction-attribute levels: a (min, max) tuple for numeric
        # attributes, a set of unique values for categorical ones.
        self.inter_levels = None
        # List-of-dicts representation of the training rows.
        self.dicts_rep = None
        self.dict_vectorizer = DictVectorizer()
        # Feature-selection model (fitted in set_data).
        self.ff_model = None
        # Final logistic-regression model (fitted in set_data).
        self.model = None

    def set_data(self, user_atts, inter_atts, responses):
        """Fit the full pipeline on the given attributes and responses."""
        self.build_data_representations(user_atts, inter_atts)
        # Convert from dict representation into matrix:
        predictor_rows = self.dict_vectorizer.fit_transform(self.dicts_rep).toarray()
        print(predictor_rows)
        print('Finding optimal feature set...')
        self.ff_model = RandomizedLogisticRegression()  # Finds best set of features
        # Fit data and get transformed input rows:
        X_new = self.ff_model.fit_transform(predictor_rows, responses)
        print(X_new)
        print('Done! Final Shape: ' + str(X_new.shape))
        print('Building Final model...')
        self.model = LogisticRegression().fit(X_new, responses)
        print('Done!')

    def set_data_rows(self, tuples):
        """Set data based on tuples/rows."""
        self.set_data(*ut.unzip(tuples))

    def build_data_representations(self, user_atts, inter_atts):
        """Build the list-of-dictionaries representation and the
        msg/interaction factor-level matrix."""
        print('Building internal data representations...')
        print(' Building factor level matrix...')
        # Transpose inter_atts and collect the unique values of each column.
        # Materialized as lists so the result survives repeated iteration on
        # Python 3 (the original map/filter pipeline returned one-shot
        # iterators, and len(filter(...)) raises TypeError on Python 3).
        itp = [set(col) for col in zip(*inter_atts)]
        # A column holding any string value is categorical and keeps its level
        # set; purely numeric columns are summarized as a (min, max) pair.
        self.inter_levels = [
            col if any(isinstance(v, str) for v in col) else (min(col), max(col))
            for col in itp
        ]
        print(' Building dict list representation...')
        self.dicts_rep = dict_list_representation(user_atts, inter_atts)
        print('Done!')

    def prob_f(self):
        """Return a function of form f: X x Y -> P where
        X = <user_att vals>, Y = <inter. att vals>, and P = P(R = 1)."""
        dv = self.dict_vectorizer
        dlr = lambda x, y: dict_list_representation([x], [y])
        ff = self.ff_model
        mod = self.model
        f = lambda X, Y: mod.predict_proba(ff.transform(dv.transform(dlr(X, Y)).toarray()))
        # predict_proba returns one row per input; [0][1] is P(R = 1) for the
        # single row supplied.  Equivalent to the original map(...)[0], which
        # breaks on Python 3 because map objects are not subscriptable.
        return lambda X, Y: f(X, Y)[0][1]

    def inter_attr_levels(self):
        """Return a vector of interaction attribute levels corresponding to
        each interaction attribute.

        For each attribute the following rule is applied:
        1) If the attribute is categorical the levels are a list of unique values.
        2) If the attribute is numeric a (min, max) pair bounding the values
           is returned.
        """
        return [lv if isinstance(lv, tuple) else list(lv)
                for lv in self.inter_levels]
def randomlr(train_x, train_y, cv_x, test_x, regp, alpha=0.5):
    """Select features with randomized logistic regression and project the
    train/cv/test matrices onto the selected columns.

    Args:
        train_x, train_y: training matrix and labels used to fit the selector.
        cv_x, test_x: matrices transformed with the fitted selector.
        regp: inverse regularization strength C.
        alpha: coefficient scaling used by the stability selection.

    Returns:
        (train_x, cv_x, test_x) reduced to the selected features.
    """
    # Create the randomized logistic regression selector with all fit
    # parameters.
    selector = RandomizedLogisticRegression(C=regp,
                                            scaling=alpha,
                                            fit_intercept=True,
                                            sample_fraction=0.75,
                                            n_resampling=200)
    # BUG FIX: fit exactly once.  The original called fit() and then
    # fit_transform(), which refit the randomized model a second time —
    # wasted work, and (because resampling is random) the transform could be
    # based on a different feature set than the first fit.
    selector.fit(train_x, train_y)
    train_x = selector.transform(train_x)
    cv_x = selector.transform(cv_x)
    test_x = selector.transform(test_x)
    return train_x, cv_x, test_x
def hyperparameterSearch(training_set_path, cat, rl, bu):
    """Search SVM hyper-parameters with randomized search and
    leave-one-group-out validation for four kernels (RBF, linear,
    polynomial, sigmoid).

    Features are min-max scaled and reduced with randomized logistic
    regression first.  Best parameters and per-candidate CV scores for every
    kernel are printed and written to
    "<training_set_path>best_parameters_test_<cat>_<rl>_<bu>.txt".
    """
    print("Importing descriptors from the training set.")
    X, y, labels = import_descriptors(
        training_set_path, "*_%s_%s_train_descriptors_N20.txt" % (rl, bu))
    print("Number of features: %d." % X.shape[-1])
    print("Scaling data.")
    min_max_scaler = MinMaxScaler()
    X_scale = min_max_scaler.fit_transform(X.todense())
    print("Performing feature selection with randomized logistic regression.")
    # Keep n_jobs=1: there is a bug in scikit-learn 0.18.1 which raises
    # "ValueError: assignment destination is read-only" when
    # RandomizedLogisticRegression is parallelized with n_jobs > 1.
    feature_selector = RandomizedLogisticRegression(n_jobs=1)
    X_scale = feature_selector.fit_transform(X_scale, y)
    print("Reduced number of features: %d." % X_scale.shape[-1])

    def _run_search(estimator, param_dist, kernel_label):
        # One randomized search, identical settings for every kernel;
        # refit=False because only best_params_/best_score_/cv_results_ are
        # needed afterwards.
        print(
            "Running randomized hyper-parameter search with Leave-One-Out validation for the %s kernel."
            % kernel_label)
        search = RandomizedSearchCV(estimator,
                                    param_distributions=param_dist,
                                    n_iter=100,
                                    scoring='f1',
                                    cv=LeaveOneGroupOut(),
                                    n_jobs=-1,
                                    error_score=0,
                                    iid=False,
                                    refit=False)
        search.fit(X_scale, y, groups=labels)
        return search

    param_dist_rbf = {
        'kernel': ['rbf'],
        'C': expon(scale=2000),
        'gamma': expon(scale=.01)
    }
    param_dist_linear = {'C': expon(scale=2000)}
    param_dist_poly = {
        'kernel': ['poly'],
        'C': expon(scale=2000),
        'degree': randint(2, 11),
        'coef0': uniform(loc=-2, scale=4),
        'gamma': expon(scale=.01)
    }
    param_dist_sigmoid = {
        'kernel': ['sigmoid'],
        'C': expon(scale=2000),
        'coef0': uniform(loc=-2, scale=4),
        'gamma': expon(scale=.01)
    }

    random_sv_rbf = _run_search(SVC(), param_dist_rbf, "RBF")
    random_sv_linear = _run_search(LinearSVC(), param_dist_linear, "linear")
    random_sv_poly = _run_search(SVC(), param_dist_poly, "polynomial")
    random_sv_sigmoid = _run_search(SVC(), param_dist_sigmoid, "sigmoid")

    with open(
            "%sbest_parameters_test_%s_%s_%s.txt" %
        (training_set_path, cat, rl, bu), "w") as best_params:
        # 1-based indices of the features the selector retained.
        extracted_features = [
            "%d" % (x + 1)
            for x in feature_selector.get_support(indices=True)
        ]

        def _report(kernel_label, search, kernel_value, numeric_params):
            # Print one kernel's results and mirror them to the parameter
            # file.  The file additionally gets a blank line before the
            # kernel[...] entry, and stdout gets a blank line after the
            # features[...] entry — both preserved from the original output.
            header = (
                "Best parameters found on training set with the %s kernel:\n%s %s"
                % (kernel_label, search.best_params_, search.best_score_))
            print(header)
            best_params.write(header + "\n")
            kernel_line = ("kernel[(\"%s\", \"%s\", \"%s\")] = \"%s\"" %
                           (cat, rl, bu, kernel_value))
            print(kernel_line)
            best_params.write("\n" + kernel_line + "\n")
            for name, fmt in numeric_params:
                line = ("%s[(\"%s\", \"%s\", \"%s\")] = " + fmt) % (
                    name, cat, rl, bu, search.best_params_[name])
                print(line)
                best_params.write(line + "\n")
            features_line = ("features[(\"%s\", \"%s\", \"%s\")] = [%s]" %
                             (cat, rl, bu, ", ".join(extracted_features)))
            print(features_line + "\n")
            best_params.write(features_line + "\n")
            print("Random LOOCV scores on development set:")
            best_params.write("Random LOOCV scores on development set:\n")
            means = search.cv_results_['mean_test_score']
            stds = search.cv_results_['std_test_score']
            for mean, std, params in zip(means, stds,
                                         search.cv_results_['params']):
                score_line = "%0.5f (stdev %0.5f) for %r" % (mean, std, params)
                print(score_line)
                best_params.write(score_line + "\n")

        _report("RBF", random_sv_rbf, random_sv_rbf.best_params_["kernel"],
                [("C", "%f"), ("gamma", "%f")])
        _report("linear", random_sv_linear, 'linear', [("C", "%f")])
        _report("polynomial", random_sv_poly,
                random_sv_poly.best_params_["kernel"],
                [("C", "%f"), ("gamma", "%f"), ("degree", "%d"),
                 ("coef0", "%f")])
        _report("sigmoid", random_sv_sigmoid,
                random_sv_sigmoid.best_params_["kernel"],
                [("C", "%f"), ("gamma", "%f"), ("coef0", "%f")])
# Demo: randomized-logistic-regression feature selection followed by a
# plain logistic regression fit on the reduced iris data.
#
# Useful sources:
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RandomizedLogisticRegression.html#sklearn.linear_model.RandomizedLogisticRegression
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV
from sklearn.linear_model import RandomizedLogisticRegression, LogisticRegression  #, LogisticRegressionCV
from sklearn.datasets import load_iris
import numpy as np

iris = load_iris()
X, y = iris.data, iris.target
print(X)
print(y)

# Fit the selector and obtain the training rows reduced to the best features.
ff_model = RandomizedLogisticRegression()
X_new = ff_model.fit_transform(X, y)
print(X_new)
print(X.shape)
print(X_new.shape)

# Transform the first 4 rows of data to keep only the selected features.
print(X[:4])
print(ff_model.transform(X[:4]))

# Fit logistic regression on the selected features, then predict class
# probabilities for the same 4 rows.
model = LogisticRegression().fit(X_new, y)
print(model.predict_proba(ff_model.transform(X[:4])))

# Round-trip through inverse_transform.
print(ff_model.inverse_transform(ff_model.transform(X[:4])))

# Recover the original matrix structure with 1's only in the columns of the
# retained features.
arr = np.array([[1, 1, 1]])
print(ff_model.inverse_transform(arr))
def runTest(featmat_train, outcome_train_lbl, featmat_test, outcome_test_lbl,
            sel, paramsDict, bestmodelnum):
    """Run the held-out-person test.

    Per feature group: scale, variance-threshold and feature-select the
    columns; then concatenate the surviving features, feature-select once
    more, fit the final classifier (chosen by module-level `modelname`) and
    predict for the single test subject.  A one-line CSV
    "<folderpath><did>_test_model<bestmodelnum>.csv" is written with the
    prediction and the selected feature names per feature group.

    Returns:
        (Y_pred, Y_pred_proba): predicted label and P(class 1) for the
        test row.

    Raises:
        ValueError: if `sel` or the module-level `modelname` is unrecognized.
    """

    def _fmt_dict(d):
        # "key:value" pairs joined with commas, for CSV logging.
        # (.items() instead of Python-2-only .iteritems() — works on 2 and 3.)
        return ','.join("%s:%r" % (key, val) for (key, val) in d.items())

    print("Running Test for #{0} ({1})".format(TEST_PERSON_NUM,
                                               TEST_PERSON_DEVICE_ID))
    X_train_allfg = featmat_train.values
    Y_train = outcome_train_lbl.values
    featnames_allfg = featmat_train.columns
    X_test_allfg = featmat_test.values
    Y_test = outcome_test_lbl.values
    Y_true = Y_test[0]

    sel_featnames_per_fg = {}
    sel_featnames_list_ordered = []
    sel_X_train = []
    sel_X_test = []
    countNumSel = 0
    fgi = 0
    for s in suffix_list:
        fgi = fgi + 1
        suffix_list_str = ",".join(s)
        fgidxs = fgColIdxs[suffix_list_str]
        X_train = X_train_allfg[:, fgidxs]
        X_test = X_test_allfg[:, fgidxs]
        featnames_fg = featnames_allfg[fgidxs]
        # Skip empty feature groups.
        if X_train.shape[1] == 0:
            continue
        ## scaling (fit on train only, then applied to test)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        # variance thresholding; keep the feature names in sync
        vartransform = VarianceThreshold()
        X_train = vartransform.fit_transform(X_train)
        X_test = vartransform.transform(X_test)
        varthres_support = vartransform.get_support()
        featnames_fg = featnames_fg[varthres_support]
        ## per-group feature selection
        if sel == "rlog":
            randomized_rlog = RandomizedLogisticRegression(**paramsDict)
            X_train = randomized_rlog.fit_transform(X_train, Y_train)
            X_test = randomized_rlog.transform(X_test)
            chosen_col_idxs = randomized_rlog.get_support()
            if len(chosen_col_idxs) > 0:
                featnames_fg_chosen = list(featnames_fg[chosen_col_idxs])
                sel_featnames_per_fg[suffix_list_str] = featnames_fg_chosen
                sel_featnames_list_ordered = (sel_featnames_list_ordered +
                                              featnames_fg_chosen)
                sel_X_train.append(X_train)
                sel_X_test.append(X_test)
                countNumSel = countNumSel + len(featnames_fg_chosen)
        else:
            # BUG FIX: the original raised a bare string, which is itself a
            # TypeError at raise time; raise a real exception instead.
            raise ValueError("Unrecognized sel (feature selection algorithm)")

    ## feature selection over the concatenation: sel{sel{fg1}.....sel{fg45}}
    X_train_concat = np.hstack(sel_X_train)
    X_test_concat = np.hstack(sel_X_test)
    print("\nSum of number of features selected from all fgs = {0}".format(
        countNumSel))
    print("Concatenated X_train has {0} features".format(
        X_train_concat.shape[1]))
    print("Concatenated X_test has {0} features".format(
        X_test_concat.shape[1]))
    if sel == "rlog":
        randomized_rlog = RandomizedLogisticRegression(**paramsDict)
        X_train_concat = randomized_rlog.fit_transform(X_train_concat, Y_train)
        X_test_concat = randomized_rlog.transform(X_test_concat)
        chosen_col_idxs = randomized_rlog.get_support()
        sel_featnames_list_ordered = np.array(sel_featnames_list_ordered)
        chosen_col_idxs = np.array(chosen_col_idxs)
        chosen_cols_final = sel_featnames_list_ordered[chosen_col_idxs]
    else:
        raise ValueError("Unrecognized sel (feature selection algorithm)")
    print("Final number of features in model = {0}".format(
        X_train_concat.shape[1]))

    # Final classifier; `modelname` is module-level configuration.
    if modelname == "GBC":
        clf = GradientBoostingClassifier(random_state=0)
    elif modelname == "LOGR":
        clf = LogisticRegression(random_state=0,
                                 C=paramsDict["C"],
                                 tol=1e-3,
                                 penalty="l1",
                                 n_jobs=paramsDict["n_jobs"],
                                 intercept_scaling=1,
                                 class_weight="balanced")
    else:
        raise ValueError("Unrecognized model name")
    clf.fit(X_train_concat, Y_train)
    pred = clf.predict(X_test_concat)
    pred_proba = clf.predict_proba(X_test_concat)
    Y_pred = pred[0]
    Y_pred_proba = pred_proba[0][1]

    ## Logging test_person_test.csv - outputs 1 line only
    ## did, sel, selParams, Y_pred, Y_pred_proba, Y_true, chosen_cols_final,
    ## plus sel_featnames_per_fg[suffix_list_str] in separate columns
    chosen_cols_final_str = ",".join(chosen_cols_final)
    paramsDict_str = _fmt_dict(paramsDict)
    fgIdxs_str = _fmt_dict(fgIdxs)
    cnts_per_lbl_dict = getValueCounts(outcome_train_lbl, outcome_test_lbl)
    cnts_per_lbl_str = _fmt_dict(cnts_per_lbl_dict)
    dfout = pd.DataFrame({
        "did": [TEST_PERSON_DEVICE_ID],
        "cnts_per_lbl": [cnts_per_lbl_str],
        "sel": [sel],
        "selParams": [paramsDict_str],
        "Y_pred": [Y_pred],
        "Y_pred_proba": [Y_pred_proba],
        "Y_true": [Y_true],
        "fgIdxs": [fgIdxs_str],
        "sel_final": [chosen_cols_final_str]
    })
    dfout = dfout.set_index("did")
    cols = [
        "cnts_per_lbl", "sel", "selParams", "Y_pred", "Y_pred_proba",
        "Y_true", "fgIdxs", "sel_final"
    ]
    # One extra column per feature group listing the names selected from it.
    for s in suffix_list:
        suffix_list_str = ",".join(s)
        if suffix_list_str in sel_featnames_per_fg:
            sel_feats_fg_str = ",".join(sel_featnames_per_fg[suffix_list_str])
        else:
            sel_feats_fg_str = ""
        dfcol = pd.DataFrame({
            "did": [TEST_PERSON_DEVICE_ID],
            "sel_{0}".format(suffix_list_str): [sel_feats_fg_str]
        })
        dfcol = dfcol.set_index("did")
        dfout = pd.concat([dfout, dfcol], axis=1)
        cols.append("sel_{0}".format(suffix_list_str))
    dfout.to_csv(folderpath +
                 "{0}_test_model{1}.csv".format(TEST_PERSON_DEVICE_ID,
                                                bestmodelnum),
                 columns=cols,
                 header=True)
    print("{0} minutes elapsed since start of program ".format(
        (time.time() - STARTTIME) / 60.0))
    return (Y_pred, Y_pred_proba)
# Univariate F-test filter with family-wise error control.
Fwe = SelectFwe(alpha=0.01).fit(X, y)
X = Fwe.transform(X)
featureNames = featureNames[Fwe.get_support()]
print("F-test filter ->", X.shape)

FeatSelection_SVM = True
FeatSelection_RandLogReg = False
if FeatSelection_RandLogReg == True:
    LogRegFeats = RandomizedLogisticRegression(C=5, scaling=0.5,
                                               sample_fraction=0.8,
                                               n_resampling=60,
                                               selection_threshold=0.2,
                                               n_jobs=-1)
    X = LogRegFeats.fit_transform(X, y)
    featureNames = featureNames[LogRegFeats.get_support()]
    print("RandomizedLogisticRegression Feature Selection ->:", X.shape)
elif FeatSelection_SVM == True:
    svm_selector = LinearSVC(C=1, penalty="l1", dual=False,
                             class_weight='auto').fit(X, y)
    X = svm_selector.transform(X)
    # X= LogisticRegression(C=0.01,class_weight='auto').fit_transform(X, y)
    # BUG FIX: this branch indexed featureNames with LogRegFeats.get_support(),
    # but LogRegFeats is only defined in the RandLogReg branch above, so it
    # raised NameError here.  With an L1-penalized LinearSVC, the retained
    # features are exactly those with a non-zero coefficient in any class.
    featureNames = featureNames[(svm_selector.coef_ != 0).any(axis=0)]
    print("SVC Transformed X:", X.shape)
'''
print("Plot #Feats vs Classification performance:")
PlotPerfPercentFeatures(X_LR,y,est=SVC(C=100))
'''
KFilt = None
# Step 1: keep only the features whose standard deviation exceeds 0.5.
large_std_features_index = [
    i for i in range(len(features_std)) if features_std[i] > 0.5
]
X2 = X[:, large_std_features_index]

# Step 2: variable selection with a Lasso-constrained (L1) logistic model.
# First search the validation set for the best regularization parameter C.
auc_list = []
for Ci in list(range(1, 101)):
    X21, X22, y21, y22 = model_selection.train_test_split(X2,
                                                          y,
                                                          test_size=0.2)
    # Randomized (stability-selection) logistic regression; the data passed
    # in must not contain missing values.
    lr = RandomizedLogisticRegression(C=Ci)
    # BUG FIX: fit exactly once — the original called fit() and then
    # fit_transform(), refitting the randomized model a second time.
    # inverse_transform restores the original column layout with zeros in
    # the dropped columns.
    X_new = lr.inverse_transform(lr.fit_transform(X21, y21))
    # Columns of X_new that are not identically zero are the selected ones.
    zero_columns = np.sum(np.abs(X_new), axis=0)
    nonzero_columns_index = [
        i for i in range(len(zero_columns)) if zero_columns[i] > 0.0001
    ]
    X3 = X21[:, nonzero_columns_index]
    lr_best = LogisticRegression()
    # BUG FIXES: (1) train and evaluate on the selected columns — the
    # original fit on the full X21/X22, which made the selection step dead
    # code; (2) use the public predict_proba instead of the private
    # _predict_proba_lr; (3) score with roc_auc_score — metrics.auc expects
    # curve coordinates (x, y), not labels and scores.
    lr_best.fit(X3, y21)
    prob_predict = lr_best.predict_proba(X22[:, nonzero_columns_index])[:, 1]
    auc = metrics.roc_auc_score(y22, prob_predict)
    auc_list.append(auc)
best_C_position = auc_list.index(max(auc_list))
best_C = list(range(1, 101))[best_C_position]
print "classifier:", cv.std(), cv.mean() print "majority base:", accuracy_score(labEnc.transform(labels), labEnc.transform(maj)) print "random base:", accuracy_score(labEnc.transform(labels), labEnc.transform(rand)) if args.coef: # Output file_basename = args.output sel = RandomizedLogisticRegression(n_jobs=10, n_resampling=args.iterations, sample_fraction=0.75, verbose=2) new_X = sel.fit_transform(X, enclabels) clf = LogisticRegression(class_weight='auto') clf.fit(new_X, enclabels) # this one does not get the probs # selected_feature_names = np.asarray(vectorizer.get_feature_names())[np.flatnonzero(clf.coef_[0])] # selected_feature_probs = clf.coef_[0][np.flatnonzero(clf.coef_[0])] # this one gets probs, but introduces a mismatch # selected_feature_names = np.asarray(vectorizer.get_feature_names())[np.flatnonzero(sel.scores_)] # selected_feature_probs = sel.scores_[np.flatnonzero(sel.scores_)] # this one works, it seems active_feature_mask = sel.get_support() selected_feature_names = np.asarray(
# Walkthrough of RandomizedLogisticRegression used as a feature selector in
# front of a LogisticRegression classifier, on the iris dataset.
#
# Useful sources:
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RandomizedLogisticRegression.html#sklearn.linear_model.RandomizedLogisticRegression
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV
from sklearn.linear_model import RandomizedLogisticRegression, LogisticRegression  #, LogisticRegressionCV
from sklearn.datasets import load_iris
import numpy as np

iris = load_iris()
X, y = iris.data, iris.target
print(X)
print(y)

# Fit the selector; X_new holds the input rows reduced to the best features.
ff_model = RandomizedLogisticRegression()
X_new = ff_model.fit_transform(X, y)
print(X_new)
print(X.shape)
print(X_new.shape)

# Transform the first 4 rows to keep only the selected features.
print(X[:4])
print(ff_model.transform(X[:4]))

# Fit logistic regression on the selected features and predict probabilities
# for the same first 4 rows.
model = LogisticRegression().fit(X_new, y)
print(model.predict_proba(ff_model.transform(X[:4])))

# Exercise inverse_transform: round-trip the transformed rows, then map a
# ones-row back to the original matrix structure (1's appear only in the
# columns of retained features).
print(ff_model.inverse_transform(ff_model.transform(X[:4])))
arr = np.array([[1, 1, 1]])
print(ff_model.inverse_transform(arr))
# 'Normalize/Scale features if needed. Our data is standardized by default'
# X = StandardScaler(copy=False).fit_transform(X)

# Univariate F-test filter with family-wise error control.
Fwe = SelectFwe(alpha=0.01).fit(X, y)
X = Fwe.transform(X)
featureNames = featureNames[Fwe.get_support()]
print("F-test filter ->", X.shape)

FeatSelection_SVM = True
FeatSelection_RandLogReg = False
if FeatSelection_RandLogReg == True:
    LogRegFeats = RandomizedLogisticRegression(C=5, scaling=0.5,
                                               sample_fraction=0.8,
                                               n_resampling=60,
                                               selection_threshold=0.2,
                                               n_jobs=-1)
    X = LogRegFeats.fit_transform(X, y)
    featureNames = featureNames[LogRegFeats.get_support()]
    print("RandomizedLogisticRegression Feature Selection ->:", X.shape)
elif FeatSelection_SVM == True:
    svm_selector = LinearSVC(C=1, penalty="l1", dual=False,
                             class_weight='auto').fit(X, y)
    X = svm_selector.transform(X)
    # X= LogisticRegression(C=0.01,class_weight='auto').fit_transform(X, y)
    # BUG FIX: this branch indexed featureNames with LogRegFeats.get_support(),
    # but LogRegFeats is only defined in the RandLogReg branch above, so it
    # raised NameError here.  With an L1-penalized LinearSVC, the retained
    # features are exactly those with a non-zero coefficient in any class.
    featureNames = featureNames[(svm_selector.coef_ != 0).any(axis=0)]
    print("SVC Transformed X:", X.shape)
'''
print("Plot #Feats vs Classification performance:")
PlotPerfPercentFeatures(X_LR,y,est=SVC(C=100))
'''
KFilt = None