def fisherProc(X, y):
    # obtain the score of each feature on the training set
    score = fisher_score.fisher_score(X, y)
    # rank features in descending order according to score
    idx = fisher_score.feature_ranking(score)
    return idx
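A minimal usage sketch for the fisherProc wrapper above, assuming the scikit-feature package that these snippets import as fisher_score; the random data is purely illustrative:

import numpy as np
from skfeature.function.similarity_based import fisher_score

X = np.random.rand(100, 20)        # 100 samples, 20 features (made up)
y = np.random.randint(0, 2, 100)   # binary labels (made up)

idx = fisherProc(X, y)             # feature indices, highest Fisher score first
X_top5 = X[:, idx[:5]]             # keep the five best-scoring features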
def fisher(data):
    # rank features on each of the six data splits, then aggregate the rankings;
    # samp and rankaggregate are helpers defined elsewhere
    rank = []
    for i in range(6):
        X = data[i][:, :-1]  # all columns but the last are features
        Y = data[i][:, -1]   # the last column is the label
        score = fisher_score.fisher_score(X, Y)
        idx1 = fisher_score.feature_ranking(score)
        idx = samp(idx1.tolist())
        rank.append(idx)
    R = rankaggregate(rank)
    return R
def seleciona_caracteristicas(vetor_caracteristicas, classes):
    # selects the features whose Fisher score exceeds a threshold
    # (vetor_caracteristicas: feature matrix; classes: class labels)
    caracteristicas_selecionadas = []
    rank_considerado = []
    limiar_consideracao = 0  # score threshold
    score = fisher_score.fisher_score(vetor_caracteristicas, classes)
    rank = fisher_score.feature_ranking(score)
    features_consideradas = conta_features_limiar(score, limiar_consideracao)
    if features_consideradas > 1:
        rank_considerado = rank[0:features_consideradas]
        caracteristicas_selecionadas = vetor_caracteristicas[:, rank_considerado]
    return caracteristicas_selecionadas, rank_considerado
def get_fisher_scores(self, max_dim):
    """Compute Fisher scores and recognition quality based on ROC AUC.

    Feature selection is performed for feature-space dimensionalities from
    1 to max_dim. For each dimensionality, cross-validation is run and the
    cumulative Fisher score and the mean (over all folds) ROC AUC are
    computed.

    Args:
        max_dim (int): number of features up to which selection is performed.

    Returns:
        fisher_summary_scores: the computed cumulative Fisher score values.
        auc_roc_scores: the computed areas under the ROC curve.
    """
    x_train = scale(self.features)  # normalize features
    y_train = self.targets  # target ids

    # Fisher score estimation
    f_score = fisher_score.fisher_score(x_train, y_train)  # calculate Fisher score values
    ranked_f_score = fisher_score.feature_ranking(f_score)  # rank features
    print('Sequence of selected coefficients:')
    print(*list(self.feature_header[ranked_f_score[0:max_dim]]), sep=', ')
    fisher_summary_scores = list(
        it.accumulate(f_score[ranked_f_score[0:max_dim]]))  # integral Fisher scores

    # Cross-validation
    k_fold = KFold(n_splits=5, shuffle=True)  # set up cross-validation pattern
    ar_scorer = make_scorer(roc_auc_score)  # make scorer
    clf = SGDRegressor(max_iter=100, tol=1e-3,
                       random_state=241)  # stochastic gradient descent regressor as a clf
    auc_roc_scores = []  # list for AUC ROC values
    for i in range(1, max_dim + 1):  # iterate over the number of selected features
        features = x_train[:, ranked_f_score[0:i]]  # select features
        t = y_train
        vect_auc_roc_score = cross_val_score(clf, features, t,
                                             scoring=ar_scorer, cv=k_fold)  # train
        auc_roc_scores.append(np.mean(vect_auc_roc_score))  # add mean (over CV subsets) AUC ROC value
    return fisher_summary_scores, auc_roc_scores
def get_fisher_score(data, label, k=30):
    # data is a pandas DataFrame; skfeature expects an ndarray, so pass .values
    score = fisher_score.fisher_score(data.values, label)
    ranking = fisher_score.feature_ranking(score)
    dfscores = pd.DataFrame(score)
    dfcolumns = pd.DataFrame(data.columns)
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Feature', 'Score']  # naming the dataframe columns
    result = featureScores.nlargest(k, 'Score')  # k best features
    return result, ranking
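A short hypothetical call of get_fisher_score; the wrapper reads data.columns, so it expects a pandas DataFrame, and the column names and labels below are made up:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(60, 4), columns=['f1', 'f2', 'f3', 'f4'])
labels = np.random.randint(0, 2, 60)

best, ranking = get_fisher_score(df, labels, k=2)
print(best)      # 'Feature'/'Score' rows for the two highest-scoring features
print(ranking)   # all feature indices, best first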
def run_fold(trial, P, X, y, method, dataset, parttype):
    print('Obtaining features for %s %s %s fold: %2d' % (parttype, method, dataset, trial))
    n_samples, n_features = X.shape
    train = P[:, trial] == 1
    trnX = X[train]
    trnY = y[train]
    start_time = time.time()
    if method == 'fisher':
        score = fisher_score.fisher_score(trnX, trnY)
        features = fisher_score.feature_ranking(score)
    elif method == 'chi2':
        score = chi_square.chi_square(trnX, trnY)
        features = chi_square.feature_ranking(score)
    elif method == 'relieff':
        score = reliefF.reliefF(trnX, trnY)
        features = reliefF.feature_ranking(score)
    elif method == 'jmi':
        features = JMI.jmi(trnX, trnY, n_selected_features=n_features)
    elif method == 'mrmr':
        features = MRMR.mrmr(trnX, trnY, n_selected_features=n_features)
    elif method == 'infogain':
        features = MIM.mim(trnX, trnY, n_selected_features=n_features)
    elif method == 'svmrfe':
        features = svmrfe(trnX, trnY)
    elif method == 'hdmr':
        sobol_set_all = scipy.io.loadmat('sobol_set.mat')
        sobol_set = sobol_set_all['sobol_set'].astype(float)
        params = {'sobol_set': sobol_set, 'k': 1, 'p': 3, 'M': 1000, 'b': 'L'}
        models = hdmrlearn(trnX, trnY, params)
        features, w = hdmrselect(X, models)
    elif method == 'hdmrhaar':
        sobol_set_all = scipy.io.loadmat('sobol_set.mat')
        sobol_set = sobol_set_all['sobol_set'].astype(float)
        params = {'sobol_set': sobol_set, 'k': 1, 'p': 255, 'M': 1000, 'b': 'H'}
        models = hdmrlearn(trnX, trnY, params)
        features, w = hdmrselect(X, models)
    else:
        print(method + ' does not exist')
    cputime = time.time() - start_time
    print(features)
    print('cputime %f' % cputime)
    return {'features': features, 'cputime': cputime}
def naiveBayes(processed_train_features, processed_valid_features, train_labels,
               valid_labels, processed_test_features, test_labels):
    model1 = GaussianNB()
    model1.fit(processed_train_features, train_labels)
    naive_bayes_predict_train = model1.predict(processed_train_features)
    naive_bayes_predict_valid = model1.predict(processed_valid_features)
    # print("Naive Bayes Training accuracy ", accuracy_score(train_labels, naive_bayes_predict_train))
    print("Naive Bayes Valid accuracy ",
          accuracy_score(valid_labels, naive_bayes_predict_valid))
    naive_bayes_predict_test_before_fisher = model1.predict(processed_test_features)
    print("Naive Bayes Testing accuracy ",
          accuracy_score(test_labels, naive_bayes_predict_test_before_fisher))

    # rank features by Fisher score on the test set
    XFisher = processed_test_features.to_numpy()
    score = fs.fisher_score(XFisher, test_labels)
    ranked_features = fs.feature_ranking(score)
    topFeatures = ranked_features[:50]
    print(topFeatures)
    print(score.shape)
    print(XFisher.shape)

    # map the selected feature indices back to column names
    colnamelist = [processed_train_features.columns[i] for i in topFeatures]
    test = processed_test_features.copy()
    valid_for_bayes = processed_valid_features.copy()
    test.drop(test.columns.difference(colnamelist), axis=1, inplace=True)
    valid_for_bayes.drop(valid_for_bayes.columns.difference(colnamelist),
                         axis=1, inplace=True)

    # refit on the reduced feature set
    model = GaussianNB()
    model.fit(test, test_labels)
    naive_bayes_predict_test_after_fisher = model.predict(test)
    print("Naive Bayes Testing accuracy ",
          accuracy_score(test_labels, naive_bayes_predict_test_after_fisher))
    naive_bayes_predict_valid_after_fisher = model.predict(valid_for_bayes)
    print("Naive Bayes Validation accuracy",
          accuracy_score(valid_labels, naive_bayes_predict_valid_after_fisher))
def main(): # load data mat = scipy.io.loadmat("../data/COIL20.mat") X = mat["X"] # data X = X.astype(float) y = mat["Y"] # label y = y[:, 0] n_samples, n_features = X.shape # number of samples and number of features # split data into 10 folds ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) # perform evaluation on classification task num_fea = 100 # number of selected features clf = svm.LinearSVC() # linear SVM correct = 0 for train, test in ss: # obtain the score of each feature on the training set score = fisher_score.fisher_score(X[train], y[train]) # rank features in descending order according to score idx = fisher_score.feature_ranking(score) # obtain the dataset on the selected features selected_features = X[:, idx[0:num_fea]] # train a classification model with the selected features on the training dataset clf.fit(selected_features[train], y[train]) # predict the class labels of test data y_predict = clf.predict(selected_features[test]) # obtain the classification accuracy on the test data acc = accuracy_score(y[test], y_predict) correct = correct + acc # output the average classification accuracy over all 10 folds print "Accuracy:", float(correct) / 10
def rank_features_using_fisherscore(cls, data_frame, target_key, cols_to_ignore=None):
    keys = list(data_frame.keys())
    target_col_idx = keys.index(target_key)

    # remove the target column from keys
    del keys[target_col_idx]

    # remove all columns that were asked to be ignored
    if cols_to_ignore is not None:
        for col in cols_to_ignore:
            idx = keys.index(col)
            del keys[idx]

    Y = data_frame.loc[:, target_key].values
    X = data_frame.loc[:, keys].values
    score = fisher_score.fisher_score(X, Y)
    rank = fisher_score.feature_ranking(score)
    ranked_features = [keys[i] for i in rank]
    return score, ranked_features, keys
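A hypothetical call of the classmethod above, hung on an imaginary FeatureRanker class so the cls argument is explicit; the DataFrame, the 'label' target and the ignored 'id' column are all made up:

import numpy as np
import pandas as pd

class FeatureRanker:
    # hypothetical host class for the classmethod defined above
    rank_features_using_fisherscore = classmethod(rank_features_using_fisherscore)

df = pd.DataFrame({'a': np.random.rand(40),
                   'b': np.random.rand(40),
                   'id': np.arange(40),
                   'label': np.random.randint(0, 2, 40)})

score, ranked, used = FeatureRanker.rank_features_using_fisherscore(
    df, target_key='label', cols_to_ignore=['id'])
print(ranked)  # column names ordered best first, e.g. ['b', 'a']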
def create_data_aware_features(self, train_log, test_log, ignored):
    # given log:
    # 0.0. Extract events
    # 1.1. Apriori-mine events to be used for constraints
    # 2. Find declare constraints to be used, on a limited set of declare templates
    # 2.1. Find support for all positive and negative cases for the constraints
    # 2.2. Filter the constraints according to support
    # -- Encode the data
    # 3. Sort constraints according to Fisher score (or another metric)
    # 4. Pick the constraint with the highest Fisher score
    # 5. Refine the constraint with data
    # 5.1. Together with data, try to create a better rule
    # ---- In this case, every node will become a small decision tree of its own!
    # 5.2. If the Fisher score of the new rule is greater, replace the current rule
    #      with the refined rule (a refined rule is a constraint plus a learned
    #      decision rule / tree)
    # Reorder constraints for the next level of the decision tree; this works
    # exactly like Gini impurity or a similar split criterion.
    # Templates taken from Fabrizio's article:
    """
    responded existence(A, B), data on A
    response(A, B), data on A
    precedence(A, B), data on B
    alternate response(A, B), data on A
    alternate precedence(A, B), data on B
    chain response(A, B), data on A
    chain precedence(A, B), data on B
    not resp. existence(A, B), data on A
    not response(A, B), data on A
    not precedence(A, B), data on B
    not chain response(A, B), data on A
    not chain precedence(A, B), data on B
    :param log:
    :param label:
    :return:
    """
    not_templates = [
        "not_responded_existence", "not_precedence", "not_response",
        "not_chain_response", "not_chain_precedence"
    ]
    templates = [
        "alternate_precedence", "alternate_response", "chain_precedence",
        "chain_response", "responded_existence", "response", "precedence"
    ]
    inp_templates = templates + not_templates

    # play around with thresholds
    constraint_threshold = 0.1
    candidate_threshold = 0.1

    # Extract unique activities from the log
    events_set = extract_unique_events_transformed(train_log)

    # Brute-force all possible candidates
    candidates = [(event,) for event in events_set] + [
        (e1, e2) for e1 in events_set for e2 in events_set if e1 != e2
    ]

    # Count by class
    normal_count, deviant_count = count_classes(train_log)
    print("{} deviant and {} normal traces in train set".format(
        deviant_count, normal_count))
    ev_support_norm = int(normal_count * candidate_threshold)
    ev_support_dev = int(deviant_count * candidate_threshold)

    print("Filtering candidates by support")
    candidates = filter_candidates_by_support(candidates, train_log,
                                              ev_support_norm, ev_support_dev)
    print("Support filtered candidates:", len(candidates))

    constraint_support_dev = int(deviant_count * constraint_threshold)
    constraint_support_norm = int(normal_count * constraint_threshold)
    train_results = generate_train_candidate_constraints(
        candidates, inp_templates, train_log, constraint_support_norm,
        constraint_support_dev, filter_t=True)
    test_results = generate_test_candidate_constraints(
        candidates, inp_templates, test_log, train_results)
    print("Candidate constraints generated")

    # Given the selected constraints, find fulfillments and violations for each constraint.
    # In this manner build positive and negative samples for the data.
    X_train, y_train, feature_names, train_trace_names = transform_results_to_numpy(
        train_results, train_log)
    X_test, y_test, _, test_trace_names = transform_results_to_numpy(
        test_results, test_log)

    # Turn into a pandas df
    train_df = pd.DataFrame(X_train, columns=feature_names, index=train_trace_names)
    train_df = train_df.transpose().drop_duplicates().transpose()

    # remove no-variance columns (constants)
    train_df = train_df.loc[:, (train_df != train_df.iloc[0]).any()]
    X_train = train_df.values

    # Perform selection by Fisher score
    scores = fisher_calculation(X_train, y_train)
    selected_ranks = fisher_score.feature_ranking(scores)
    threshold = 15

    real_selected_ranks = []
    # Start selecting from selected_ranks until every trace is covered N times
    trace_remaining = dict()
    for i, trace_name in enumerate(train_df.index.values):
        trace_remaining[i] = threshold

    chosen = 0
    # Go from higher to lower
    for rank in selected_ranks:
        if len(trace_remaining) == 0:
            break
        chosen += 1
        # Get column
        marked_for_deletion = set()
        added = False
        for k in trace_remaining.keys():
            if train_df.iloc[k, rank] > 0:
                if not added:
                    added = True
                    real_selected_ranks.append(rank)
                trace_remaining[k] -= 1
                if trace_remaining[k] <= 0:
                    marked_for_deletion.add(k)
        for k in marked_for_deletion:
            del trace_remaining[k]

    print("Constraints chosen {}".format(len(real_selected_ranks)))
    feature_names = train_df.columns[real_selected_ranks]
    print("Considered template count:", len(feature_names))
    train_df = train_df[feature_names]

    new_train_feature_names = []
    new_train_features = []
    new_test_feature_names = []
    new_test_features = []
    count = 0
    for key in train_df.columns:
        count += 1
        # Go over all constraints and refine them with data
        template = key[0]
        candidate = key[1]

        # First find all locations of fulfillments
        outp_train = find_fulfillments_violations(candidate, template, train_log)
        outp_test = find_fulfillments_violations(candidate, template, test_log)

        # Take data snapshots at all fulfilled indices - positive samples.
        # Take data snapshots at all unfulfilled indices - negative samples.
        # Build a decision tree with fulfilled and unfulfilled samples.
        train_positive_samples = []
        train_negative_samples = []
        test_positive_samples = []
        test_negative_samples = []
        for i, trace in enumerate(outp_train):
            fulfilled = trace[1]
            violated = trace[2]
            positive, negative = get_data_snapshots(train_log[i], fulfilled, violated)
            label = train_log[i]["label"]
            for s in positive:
                train_positive_samples.append((s, label, i))
            for s in negative:
                train_negative_samples.append((s, label, i))

        for i, trace in enumerate(outp_test):
            fulfilled = trace[1]
            violated = trace[2]
            positive, negative = get_data_snapshots(test_log[i], fulfilled, violated)
            label = test_log[i]["label"]
            for s in positive:
                test_positive_samples.append((s, label, i))
            for s in negative:
                test_negative_samples.append((s, label, i))

        # Get all where fulfilled only.
        # Train on train_positive_samples vs. the label of the log.
        ignored_features = set(ignored)  # e.g. set([('Diagnose', 'literal')])
        collected_features = set()
        # Get all possible features
        for pos_act, _, __ in train_positive_samples:
            for key2, val in pos_act.items():
                collected_features.add(key2)
        for neg_act, _, __ in train_negative_samples:
            for key2, val in neg_act.items():
                collected_features.add(key2)

        features = list(collected_features)
        # Keep only features of boolean, literal, continuous and discrete type
        features = [
            feature for feature in features
            if feature[1] in set(["boolean", "continuous", "discrete", "literal"])
        ]
        features = [
            feature for feature in features if feature[0] not in ignored_features
        ]

        # Collect positive and negative samples for finding the data condition
        positive_samples = [(sample[2], sample[0])
                            for sample in train_positive_samples if sample[1] == 1]
        negative_samples = [(sample[2], sample[0])
                            for sample in train_positive_samples if sample[1] == 0]
        pos_activations = [(sample[2], sample[0]) for sample in train_positive_samples]
        neg_activations = [(sample[2], sample[0]) for sample in train_negative_samples]

        feature_train_samples = self.create_sample(
            pos_activations, features, 1) + self.create_sample(
            neg_activations, features, 0)

        # Create pos and neg samples
        pos_samples = self.create_sample(positive_samples, features, 1)
        neg_samples = self.create_sample(negative_samples, features, 0)
        features_data = pos_samples + neg_samples
        features_label = ["id"] + features + ["Label"]

        # one-hot encode literal features
        literal_features = [feature for feature in features if feature[1] == "literal"]

        # Extract positive test samples, where fulfillments were fulfilled
        train_df = pd.DataFrame(features_data, columns=features_label)
        test_pos_smpl = [(sample[2], sample[0])
                         for sample in test_positive_samples]  # if sample[1] == 1
        test_neg_smpl = [(sample[2], sample[0])
                         for sample in test_negative_samples]  # if sample[1] == 0
        pos_test_samples = self.create_sample(test_pos_smpl, features, 1)
        neg_test_samples = self.create_sample(test_neg_smpl, features, 0)
        test_features_data = pos_test_samples + neg_test_samples

        feature_train_df = pd.DataFrame(feature_train_samples, columns=features_label)
        test_df = pd.DataFrame(test_features_data, columns=features_label)
        train_df.pop("id")
        train_ids = feature_train_df.pop("id")
        test_ids = test_df.pop("id")

        # Possible values for each literal feature are those in train_df, or missing
        if len(literal_features) > 0:
            for selection in literal_features:
                train_df[selection] = pd.Categorical(train_df[selection])
                test_df[selection] = pd.Categorical(test_df[selection])
                feature_train_df[selection] = pd.Categorical(feature_train_df[selection])
                le = LabelEncoder()
                le.fit(list(test_df[selection]) + list(feature_train_df[selection]))
                classes = le.classes_
                train_df[selection] = le.transform(train_df[selection])
                test_df[selection] = le.transform(test_df[selection])
                feature_train_df[selection] = le.transform(feature_train_df[selection])
                ohe = OneHotEncoder(categories="auto")  # Remove this for server.
                ohe.fit(np.concatenate(
                    (test_df[selection].values.reshape(-1, 1),
                     feature_train_df[selection].values.reshape(-1, 1)),
                    axis=0))
                train_transformed = ohe.transform(
                    train_df[selection].values.reshape(-1, 1)).toarray()
                test_transformed = ohe.transform(
                    test_df[selection].values.reshape(-1, 1)).toarray()
                feature_train_transformed = ohe.transform(
                    feature_train_df[selection].values.reshape(-1, 1)).toarray()

                dfOneHot = pd.DataFrame(
                    train_transformed,
                    columns=[(selection[0] + "_" + classes[i], selection[1])
                             for i in range(train_transformed.shape[1])])
                train_df = pd.concat([train_df, dfOneHot], axis=1)
                train_df.pop(selection)

                dfOneHot = pd.DataFrame(
                    test_transformed,
                    columns=[(selection[0] + "_" + classes[i], selection[1])
                             for i in range(train_transformed.shape[1])])
                test_df = pd.concat([test_df, dfOneHot], axis=1)
                test_df.pop(selection)

                dfOneHot = pd.DataFrame(
                    feature_train_transformed,
                    columns=[(selection[0] + "_" + classes[i], selection[1])
                             for i in range(train_transformed.shape[1])])
                feature_train_df = pd.concat([feature_train_df, dfOneHot], axis=1)
                feature_train_df.pop(selection)

        data_dt = DecisionTreeClassifier(max_depth=3)
        y_train = train_df.pop("Label")
        train_data = train_df.values
        y_test = test_df.pop("Label")
        data_dt.fit(train_data, y_train)

        y_train_new = feature_train_df.pop("Label")
        feature_train_data = feature_train_df.values
        train_predictions = data_dt.predict(feature_train_data)
        test_predictions = data_dt.predict(test_df.values)
        train_fts = feature_train_df.columns

        # Go through all traces again.
        # Save decision trees here for later interpretation.
        feature_train_df["id"] = train_ids
        test_df["id"] = test_ids
        feature_train_df["prediction"] = train_predictions
        test_df["prediction"] = test_predictions

        # Check for which activations the data condition holds; filter everything else out.
        feature_train_df["Label"] = y_train_new
        test_df["Label"] = y_test

        new_train_feature = []
        for i, trace in enumerate(outp_train):
            # Get from train_df by number
            trace_id = i
            freq = trace[0]
            # Find all related to the id
            if freq == 0:
                # vacuous case, no activations, will be the same here
                new_train_feature.append(0)
            else:
                # Previous violation case:
                # find samples related to the trace
                samples = feature_train_df[feature_train_df.id == trace_id]
                # find samples for which the data condition holds
                samples = samples[samples.prediction == 1]
                # count the number of positive and negative labels
                positive = samples[samples.Label == 1].shape[0]
                negative = samples[samples.Label == 0].shape[0]
                if negative > 0:
                    new_train_feature.append(-1)
                else:
                    new_train_feature.append(positive)

        new_test_feature = []
        for i, trace in enumerate(outp_test):
            # Get from test_df by number
            trace_id = i
            freq = trace[0]
            # Find all related to the id
            if freq == 0:
                # vacuous case, no activations, will be the same here
                new_test_feature.append(0)
            else:
                # Previous violation case:
                # find samples related to the trace
                samples = test_df[test_df.id == trace_id]
                # find samples for which the data condition holds
                samples = samples[samples.prediction == 1]
                # count the number of positive and negative activations
                positive = samples[samples.Label == 1].shape[0]
                negative = samples[samples.Label == 0].shape[0]
                if negative > 0:
                    new_test_feature.append(-1)
                else:
                    new_test_feature.append(positive)

        # Count fulfilled activations
        count_fulfilled_train = sum(1 for i in new_train_feature if i > 0)
        count_fulfilled_test = sum(1 for i in new_test_feature if i > 0)
        if count_fulfilled_train > 0 and count_fulfilled_test > 0:
            # only then add the new feature
            new_train_features.append(new_train_feature)
            new_train_feature_names.append(
                template + ":({},{}):Data".format(candidate[0], candidate[1]))
            new_test_features.append(new_test_feature)
            new_test_feature_names.append(
                template + ":({},{}):Data".format(candidate[0], candidate[1]))

        # Save decision tree
        save_dt = False
        if save_dt:
            export_graphviz(
                data_dt,
                out_file="sample_dwd_trees/outputfile_{}.dot".format(str(key)),
                feature_names=list(map(str, train_fts)))

    return new_train_feature_names, new_train_features, new_test_feature_names, new_test_features
X = dataset.iloc[:, 2:32]  # [all rows, cols from index 2 to the last one, excluding 'Unnamed: 32']
y = dataset.iloc[:, 1]  # [all rows, col one only, which contains the classes of cancer]
labelencoder_Y = LabelEncoder()
y = labelencoder_Y.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train = X_train.values
X_test = X_test.values

# compute fisher scores
score = fisher_score(X_train, y_train)
idx = feature_ranking(score)
np.save('features/fisher.npy', idx)
print('Features saved')
#idx = np.load('features/fisher.npy')

# create copies of the data
X_train_copy = X_train
y_train_copy = y_train
X_test_copy = X_test
y_test_copy = y_test

# train and compute accuracy of final model trained on selected features
final_list = []
for num_fea in range(30, 0, -1):
    # load the copies of the original data
    X_train = X_train_copy
def fisher_score_FS(X_train, y_train):
    score = fisher_score.fisher_score(X_train, y_train)
    idx = fisher_score.feature_ranking(score)
    return (idx, score)
def fun_classify(inputFile, groupsSel, FeatSelect, Nfeats, scaleFeats=1):
    """
    AllStatsMean, AllStatsSTD = fun_classify(inputFile, groupsSel, FeatSelect, Nfeats)

    inputFile: the .csv file containing the feature tables
    groupsSel: the selected groups to classify. The full set is
        ["S","F","Z","N","O"], but ["S","F","Z"] are of most interest for the
        article (ictal, inter-ictal and normal EEG)
    FeatSelect: feature selection method: PCA, RFE, fisher or none
    Nfeats: number of selected features

    Returns:
    AllStatsMean: mean performance values
    AllStatsSTD: standard deviation of performance values
    """
    # read input features
    dfFeats = pd.read_csv(inputFile, sep=',', header=0)
    # keep only the selected groups
    dfFeats = dfFeats[dfFeats["Group"].isin(groupsSel)]
    if "decTaime" in dfFeats:
        x = dfFeats.iloc[:, 2:]  # ignores decomposition method execution time
    else:
        x = dfFeats.iloc[:, 1:]
    y = dfFeats.iloc[:, 0].values
    if scaleFeats:  # scale feats?
        x = StandardScaler().fit_transform(x)

    # Feature selection
    if x.shape[1] > Nfeats:
        # RFE
        if FeatSelect == "RFE":
            rfeModel = SVC(kernel="linear", C=0.025, probability=True, gamma='scale')
            rfeSelect = RFE(rfeModel, n_features_to_select=Nfeats)
            rfe_fit = rfeSelect.fit(x, y)
            x = x[:, rfe_fit.support_]
        if FeatSelect == "PCA":
            pca = PCA(n_components=Nfeats)
            x = pca.fit_transform(x)
        if FeatSelect == "fisher":
            fisherScore = fisher_score.fisher_score(x, y)
            idx = fisher_score.feature_ranking(fisherScore)
            x = x[:, idx[:Nfeats]]

    names = ["KNN", "Linear SVM", "RBF SVM", "GPC", "MLP"]
    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025, probability=True, gamma='scale'),
        SVC(probability=True, gamma='scale'),
        GaussianProcessClassifier(1.0 * RBF(1.0)),
        MLPClassifier(alpha=1, max_iter=200)
    ]

    # initialize performance variables
    AllStats = {}
    AllStatsMean = {}
    AllStatsSTD = {}
    for name in names:
        AllStats[name] = {
            "Accuracy": np.zeros([realizations, K_folds]),
            "SensitivityMean": np.zeros([realizations, K_folds]),
            "SpecificityMean": np.zeros([realizations, K_folds]),
            "AUC_Mean": np.zeros([realizations, K_folds]),
            "SensitivityIctal": np.zeros([realizations, K_folds]),
            "SpecificityIctal": np.zeros([realizations, K_folds]),
            "AUC_Ictal": np.zeros([realizations, K_folds]),
            "TTtimes": np.zeros([realizations, K_folds])
        }
        AllStatsMean[name] = {
            "Accuracy": 0.,
            "SensitivityMean": 0.,
            "SpecificityMean": 0.,
            "AUC_Mean": 0.,
            "SensitivityIctal": 0.,
            "SpecificityIctal": 0.,
            "AUC_Ictal": 0.,
            "TTtimes": 0.
        }
        AllStatsSTD[name] = {
            "Accuracy": 0.,
            "SensitivityMean": 0.,
            "SpecificityMean": 0.,
            "AUC_Mean": 0.,
            "SensitivityIctal": 0.,
            "SpecificityIctal": 0.,
            "AUC_Ictal": 0.,
            "TTtimes": 0.
        }

    # for each realization
    for i in range(realizations):
        skf = StratifiedKFold(n_splits=K_folds, shuffle=True)  # K-fold validation
        for tupTemp, ki in zip(skf.split(x, y), range(K_folds)):
            train_idx, test_idx = tupTemp[0], tupTemp[1]
            X_train, X_test = x[train_idx], x[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            for name, clf in zip(names, classifiers):  # for each classifier
                tic = time.time()  # check training/testing time of each classifier
                # Fit model and predict
                modelFit = clf.fit(X_train, y_train)
                yPredicted = modelFit.predict(X_test)
                probsTest = modelFit.predict_proba(X_test)
                toc = time.time()

                # AUC, with the ictal class as positive
                if len(np.unique(y)) > 2:
                    AUCs = roc_auc_score(
                        LabelBinarizer().fit_transform(y_test), probsTest, average=None)
                else:
                    AUCs = roc_auc_score(y_test, probsTest[:, 1], average=None)

                # Sensitivity and specificity
                cMatrix = confusion_matrix(y_test, yPredicted)
                FP = cMatrix.sum(axis=0) - np.diag(cMatrix)
                FN = cMatrix.sum(axis=1) - np.diag(cMatrix)
                TP = np.diag(cMatrix)
                TN = cMatrix.sum() - (FP + FN + TP)
                # Sensitivity (true positive rate)
                TPR = TP / (TP + FN)
                # Specificity (true negative rate)
                TNR = TN / (TN + FP)

                # fill performance variables
                AllStats[name]["Accuracy"][i, ki] = accuracy_score(y_test, yPredicted)
                AllStats[name]["SensitivityMean"][i, ki] = np.mean(TPR)
                AllStats[name]["SpecificityMean"][i, ki] = np.mean(TNR)
                AllStats[name]["SensitivityIctal"][i, ki] = TPR[0]
                AllStats[name]["SpecificityIctal"][i, ki] = TNR[0]
                AllStats[name]["AUC_Mean"][i, ki] = np.mean(AUCs)
                AllStats[name]["TTtimes"][i, ki] = toc - tic
                if len(np.unique(y)) > 2:
                    AllStats[name]["AUC_Ictal"][i, ki] = AUCs[0]

    AllStatsDF = [0] * len(names)
    for idx, name in enumerate(names):
        for istat in AllStats[name].keys():
            AllStats[name][istat] = np.mean(AllStats[name][istat], axis=1)
            AllStatsMean[name][istat] = np.mean(AllStats[name][istat])
            AllStatsSTD[name][istat] = np.std(AllStats[name][istat])
        AllStatsDF[idx] = pd.DataFrame.from_dict(AllStats[name])
        AllStatsDF[idx]["Nmodes"] = Nmodes
        AllStatsDF[idx]["Classifier"] = name
    return pd.DataFrame.from_dict(AllStatsMean), pd.DataFrame.from_dict(
        AllStatsSTD), pd.concat(AllStatsDF)
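A hypothetical invocation of fun_classify following its docstring; the CSV name is made up, and realizations, K_folds and Nmodes are module-level settings the function relies on:

realizations = 10   # assumed globals used inside fun_classify
K_folds = 5
Nmodes = 4

means, stds, raw = fun_classify('featureTable.csv', ['S', 'F', 'Z'],
                                FeatSelect='fisher', Nfeats=10)
print(means)   # mean accuracy, sensitivity, specificity and AUC per classifier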
idx_rel = reliefF.feature_ranking(score_rel)

# Laplacian score
kwargs_W = {
    "metric": "euclidean",
    "neighbor_mode": "knn",
    "k": 7,
    't': 1,
    'reliefF': True
}
W = construct_W.construct_W(X_train, **kwargs_W)
score_lap = lap_score.lap_score(X_train, W=W)
idx_lap = lap_score.feature_ranking(score_lap)

# Fisher
score_fish = fisher_score.fisher_score(X_train, y_train)
print(score_fish)
idx_fish = fisher_score.feature_ranking(score_fish)

###################################### Feature Integration
idxM = idx_rel[:threshold]
idxN = idx_lap[:threshold]
idxO = idx_fish[:threshold]
if combination_method == 1:  # AND
    idx_and = reduce(np.intersect1d, (idxO, idxM, idxN))
    idx = idx_and
    print("number of selected features (bins) = ", idx.shape[0])
if combination_method == 2:  # OR
    idx = np.concatenate((idxM, idxN, idxO))
    idx = np.unique(idx)
def fit(self, X, y):
    idx = []
    if self.tp == 'ITB':
        if self.name == 'MRMR':
            idx = MRMR.mrmr(X, y, n_selected_features=self.params['num_feats'])
    elif self.tp == 'filter':
        if self.name == 'Relief':
            score = reliefF.reliefF(X, y, k=self.params['k'])
            idx = reliefF.feature_ranking(score)
        if self.name == 'Fisher':
            # obtain the score of each feature on the training set
            score = fisher_score.fisher_score(X, y)
            # rank features in descending order according to score
            idx = fisher_score.feature_ranking(score)
        if self.name == 'MI':
            idx = np.argsort(mutual_info_classif(
                X, y, n_neighbors=self.params['n_neighbors']))[::-1]
    elif self.tp == 'wrapper':
        model_fit = self.model.fit(X, y)
        model = SelectFromModel(model_fit, prefit=True)
        idx = model.get_support(indices=True)
    elif self.tp == 'SLB':
        # one-hot encode the target
        y = construct_label_matrix(y)
        if self.name == 'SMBA':
            scba = fs.SCBA(data=X, alpha=self.params['alpha'],
                           norm_type=self.params['norm_type'],
                           verbose=self.params['verbose'],
                           thr=self.params['thr'],
                           max_iter=self.params['max_iter'],
                           affine=self.params['affine'],
                           normalize=self.params['normalize'],
                           step=self.params['step'],
                           PCA=self.params['PCA'],
                           GPU=self.params['GPU'],
                           device=self.params['device'])
            nrmInd, sInd, repInd, _ = scba.admm()
            if self.params['type_indices'] == 'nrmInd':
                idx = nrmInd
            elif self.params['type_indices'] == 'repInd':
                idx = repInd
            else:
                idx = sInd
        if self.name == 'RFS':
            W = RFS.rfs(X, y, gamma=self.params['gamma'])
            idx = feature_ranking(W)
        if self.name == 'll_l21':
            # obtain the feature weight matrix
            W, _, _ = ll_l21.proximal_gradient_descent(X, y, z=self.params['z'],
                                                       verbose=False)
            # rank features in descending order according to their weights
            idx = feature_ranking(W)
        if self.name == 'ls_l21':
            # obtain the feature weight matrix
            W, _, _ = ls_l21.proximal_gradient_descent(X, y, z=self.params['z'],
                                                       verbose=False)
            # rank features in descending order according to their weights
            idx = feature_ranking(W)
        if self.name == 'LASSO':
            LASSO = Lasso(alpha=self.params['alpha'], positive=True)
            y_pred_lasso = LASSO.fit(X, y)
            if y_pred_lasso.coef_.ndim == 1:
                coeff = y_pred_lasso.coef_
            else:
                coeff = np.asarray(y_pred_lasso.coef_[0, :])
            idx = np.argsort(-coeff)
        if self.name == 'EN':
            # elastic net L1
            enet = ElasticNet(alpha=self.params['alpha'], l1_ratio=1, positive=True)
            y_pred_enet = enet.fit(X, y)
            if y_pred_enet.coef_.ndim == 1:
                coeff = y_pred_enet.coef_
            else:
                coeff = np.asarray(y_pred_enet.coef_[0, :])
            idx = np.argsort(-coeff)
    return idx
"neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1 } W = construct_W.construct_W(X_train, **kwargs_W) score = lap_score.lap_score(X_train, W=W) idx = lap_score.feature_ranking(score) selected_fea_train = X_train[:, idx[0:num_features]] selected_fea_test = X_test[:, idx[0:num_features]] clf.fit(selected_fea_train, y_train) acc.append(accuracy_score(y_test, clf.predict(selected_fea_test))) # fisher_score score = fisher_score.fisher_score(X_train, y_train) idx = fisher_score.feature_ranking(score) selected_fea_train = X_train[:, idx[0:num_features]] selected_fea_test = X_test[:, idx[0:num_features]] clf.fit(selected_fea_train, y_train) acc.append(accuracy_score(y_test, clf.predict(selected_fea_test))) # reliefF score = reliefF.reliefF(X_train, y_train) idx = reliefF.feature_ranking(score) selected_fea_train = X_train[:, idx[0:num_features]] selected_fea_test = X_test[:, idx[0:num_features]] clf.fit(selected_fea_train, y_train) acc.append(accuracy_score(y_test, clf.predict(selected_fea_test))) # chi_square score = chi_square.chi_square(np.abs(X_train), y_train)
def Fisher_Score(self):
    # ranks features by Fisher score; X_train and y_train come from the
    # enclosing scope, and the ranked indices are returned best first
    score = fisher_score.fisher_score(X_train, y_train)
    idx = fisher_score.feature_ranking(score)
    return idx
def fisher(train, test, K):
    # train is an (X, y) pair; returns the indices of the K best features
    score = fisher_score.fisher_score(train[0], train[1])
    indices = fisher_score.feature_ranking(score)[:K]
    return indices
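A brief sketch of applying the helper above so that train and test are sliced by the same columns; the data and split below are illustrative:

import numpy as np

X = np.random.rand(120, 10)
y = np.random.randint(0, 3, 120)
train = (X[:80], y[:80])           # (features, labels) pairs, as the helper expects
test = (X[80:], y[80:])

top = fisher(train, test, K=4)     # indices of the 4 best features (ranked on train)
X_train_sel = train[0][:, top]     # apply the same columns to both splits
X_test_sel = test[0][:, top]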