class UnivariateSelectChiFPRPrim(primitive):
    """Feature-selection primitive: ``SelectFpr`` scored with ``chi2``.

    ``fit`` builds and fits the selector with ``alpha=0.05``; ``produce``
    keeps only the columns the fitted selector supports.
    """

    def __init__(self, random_state=0):
        # ``random_state`` is accepted but not used by this primitive.
        super(UnivariateSelectChiFPRPrim, self).__init__(name='UnivariateSelectChiFPR')
        self.id = 27
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = (
            "Filter: Select the pvalues below alpha based on a FPR test with Chi-square. "
            "FPR test stands for False Positive Rate test. "
            "It controls the total amount of false detections."
        )
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'd'

    def can_accept(self, data):
        """Delegate the acceptance check to ``can_accept_d`` for 'Classification'."""
        return self.can_accept_d(data, 'Classification')

    def is_needed(self, data):
        """Return True only when ``data['X']`` has at least three columns."""
        return data['X'].shape[1] >= 3

    def fit(self, data):
        """Fit a ``SelectFpr(chi2, alpha=0.05)`` selector on ``data['X']`` / ``data['Y']``."""
        prepared = handle_data(data)
        self.selector = SelectFpr(chi2, alpha=0.05)
        self.selector.fit(prepared['X'], prepared['Y'])

    def produce(self, data):
        """Return ``{0: output}`` where ``output['X']`` holds the selected columns.

        Any exception raised while selecting is printed and ``output['X']``
        is left as ``handle_data`` returned it.
        """
        output = handle_data(data)
        column_names = list(output['X'].columns)
        try:
            support = self.selector.get_support(indices=False)
            kept_names = [name for name, keep in zip(column_names, support) if keep]
            reduced = self.selector.transform(output['X'])
            output['X'] = pd.DataFrame(reduced, columns=kept_names)
        except Exception as e:
            print(e)
        return {0: output}
class f_regressionFPRPrim(primitive):
    """Feature-selection primitive: ``SelectFpr`` scored with ``f_regression``.

    ``fit`` builds and fits the selector with its default alpha; ``produce``
    keeps only the columns the fitted selector supports.
    """

    def __init__(self, random_state=0):
        # ``random_state`` is accepted but not used by this primitive.
        super(f_regressionFPRPrim, self).__init__(name='f_regressionFPR')
        self.id = 29
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = (
            "Filter: Select the pvalues below alpha based on a FPR test with F-value between label/feature for regression tasks. "
            "FPR test stands for False Positive Rate test. "
            "It controls the total amount of false detections."
        )
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'c_r'

    def can_accept(self, data):
        """Delegate the acceptance check to ``can_accept_c`` for 'Regression'."""
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        """Return True only when ``data['X']`` has at least three columns."""
        return data['X'].shape[1] >= 3

    def fit(self, data):
        """Fit a ``SelectFpr(f_regression)`` selector on ``data['X']`` / ``data['Y']``."""
        prepared = handle_data(data)
        self.selector = SelectFpr(f_regression)
        self.selector.fit(prepared['X'], prepared['Y'])

    def produce(self, data):
        """Return ``{0: output}`` where ``output['X']`` holds the selected columns.

        Unlike a best-effort variant, errors raised by the selector propagate
        to the caller.
        """
        output = handle_data(data)
        column_names = list(output['X'].columns)
        support = self.selector.get_support(indices=False)
        kept_names = [name for name, keep in zip(column_names, support) if keep]
        reduced = self.selector.transform(output['X'])
        output['X'] = pd.DataFrame(reduced, columns=kept_names)
        return {0: output}
def multisplit(skf, X, y, stepsize=1000):
    """Cross-validate a two-stage ensemble trained on column chunks of ``X``.

    For every fold, one ``plib.classif`` model is trained per block of
    ``stepsize`` consecutive columns. The models' decision scores are placed
    side by side, filtered with ``SelectFpr()`` and used to train an
    ``AdaBoostClassifier(n_estimators=100)``. The same models and selector
    are then applied to the test rows and the accuracy is recorded.

    Args:
        skf: Iterable of ``(train_index, test_index)`` pairs. Any iterable
            works (an old-style fold object, a list of splits, or the
            generator returned by ``splitter.split(X, y)``).
        X: 2-D array indexable as ``X[rows, start:stop]``.
        y: 1-D label array indexable with an index array.
        stepsize: Number of columns handed to each first-stage model.

    Returns:
        The mean, over all folds, of the fraction of correctly predicted
        test labels.

    Raises:
        ValueError: If ``skf`` yields no folds.
    """

    def _as_columns(scores):
        # ``decision_function`` returns a 1-D vector for binary problems.
        # It has to become ONE COLUMN (one row per sample); wrapping it in
        # ``np.matrix`` produced a single row instead, which broke the
        # sample count. Plain ndarrays are used because current
        # scikit-learn rejects ``np.matrix`` input.
        scores = np.asarray(scores)
        return scores.reshape(-1, 1) if scores.ndim == 1 else scores

    def _stack_scores(models, rows):
        # One block of score columns per first-stage model, in chunk order.
        blocks = [
            _as_columns(model.decision_function(X[rows, start:start + stepsize]))
            for model, start in zip(models, chunk_starts)
        ]
        return np.hstack(blocks)

    chunk_starts = list(range(0, X.shape[1], stepsize))
    total_score = 0.0
    n_folds = 0

    for train_index, test_index in skf:
        y_train = y[train_index]
        y_test = y[test_index]

        # Training: first-stage models, then selector + booster on their scores.
        models = [
            plib.classif(X[train_index, start:start + stepsize], y_train)
            for start in chunk_starts
        ]
        train_scores = _stack_scores(models, train_index)
        selectf = SelectFpr().fit(train_scores, y_train)
        clf3 = AdaBoostClassifier(n_estimators=100)
        clf3.fit(selectf.transform(train_scores), y_train)

        # Testing: reuse the fitted models and selector on the held-out rows.
        predfinal = clf3.predict(selectf.transform(_stack_scores(models, test_index)))

        # %-formatting prints the same text on Python 2 and Python 3.
        print("Target :  %s" % (y_test,))
        print("Prediction :  %s" % (predfinal,))

        total_score += np.mean(np.equal(predfinal, y_test))
        n_folds += 1

    if n_folds == 0:
        raise ValueError("skf yielded no (train_index, test_index) folds")
    # Folds are counted while iterating so no ``n_folds`` attribute is needed.
    return total_score / n_folds
def multisplit(skf, X, y, stepsize=1000):
    """Cross-validate a two-stage ensemble trained on column chunks of ``X``.

    For every fold, one ``plib.classif`` model is trained per block of
    ``stepsize`` consecutive columns. The models' decision scores are placed
    side by side, filtered with ``SelectFpr()`` and used to train an
    ``AdaBoostClassifier(n_estimators=100)``. The same models and selector
    are then applied to the test rows and the accuracy is recorded.

    Args:
        skf: Iterable of ``(train_index, test_index)`` pairs. Any iterable
            works (an old-style fold object, a list of splits, or the
            generator returned by ``splitter.split(X, y)``).
        X: 2-D array indexable as ``X[rows, start:stop]``.
        y: 1-D label array indexable with an index array.
        stepsize: Number of columns handed to each first-stage model.

    Returns:
        The mean, over all folds, of the fraction of correctly predicted
        test labels.

    Raises:
        ValueError: If ``skf`` yields no folds.
    """

    def _as_columns(scores):
        # ``decision_function`` returns a 1-D vector for binary problems.
        # It has to become ONE COLUMN (one row per sample); wrapping it in
        # ``np.matrix`` produced a single row instead, which broke the
        # sample count. Plain ndarrays are used because current
        # scikit-learn rejects ``np.matrix`` input.
        scores = np.asarray(scores)
        return scores.reshape(-1, 1) if scores.ndim == 1 else scores

    def _stack_scores(models, rows):
        # One block of score columns per first-stage model, in chunk order.
        blocks = [
            _as_columns(model.decision_function(X[rows, start:start + stepsize]))
            for model, start in zip(models, chunk_starts)
        ]
        return np.hstack(blocks)

    chunk_starts = list(range(0, X.shape[1], stepsize))
    total_score = 0.0
    n_folds = 0

    for train_index, test_index in skf:
        y_train = y[train_index]
        y_test = y[test_index]

        # Training: first-stage models, then selector + booster on their scores.
        models = [
            plib.classif(X[train_index, start:start + stepsize], y_train)
            for start in chunk_starts
        ]
        train_scores = _stack_scores(models, train_index)
        selectf = SelectFpr().fit(train_scores, y_train)
        clf3 = AdaBoostClassifier(n_estimators=100)
        clf3.fit(selectf.transform(train_scores), y_train)

        # Testing: reuse the fitted models and selector on the held-out rows.
        predfinal = clf3.predict(selectf.transform(_stack_scores(models, test_index)))

        # %-formatting prints the same text on Python 2 and Python 3.
        print("Target :  %s" % (y_test,))
        print("Prediction :  %s" % (predfinal,))

        total_score += np.mean(np.equal(predfinal, y_test))
        n_folds += 1

    if n_folds == 0:
        raise ValueError("skf yielded no (train_index, test_index) folds")
    # Folds are counted while iterating so no ``n_folds`` attribute is needed.
    return total_score / n_folds
def train_decisiontree_FPR(configurationname, train_data, score_function,
                           undersam=False, oversam=False, export=False):
    """Train a decision tree on features kept by a ``SelectFpr`` filter.

    Args:
        configurationname: Label used in log output and in the exported
            ``.dot`` file name.
        train_data: Triple ``(X_train, y_train, id_to_a_train)``; the third
            element is unpacked but not used here.
        score_function: Scoring function handed to ``SelectFpr``.
        undersam: Apply ``RepeatedEditedNearestNeighbours`` under-sampling.
        oversam: Apply ``SMOTE`` over-sampling. When both flags are set,
            ``SMOTEENN`` is used instead.
        export: When true, write the fitted tree as a Graphviz ``.dot`` file
            and call ``transform(fitted_ids)``.

    Returns:
        ``(selector, dtc)``: the fitted selector and the fitted tree.
    """
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    dtc = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    selector = SelectFpr(score_function)
    result = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    fitted_ids = list(result.get_support(indices=True))

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and oversam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    elif undersam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    elif oversam:
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    # The guard was commented out, so ``export`` was ignored and every call
    # wrote a .dot file. It is restored so the parameter is honoured.
    if export:
        print("Exporting decision tree image...")
        export_graphviz(dtc, out_file=DATAP + "/temp/trees/sltree_" + configurationname + ".dot", filled=True)
        # NOTE(review): ``transform`` is a helper defined elsewhere; it is
        # presumably a post-processing step for the exported tree - confirm.
        transform(fitted_ids)

    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))
    return selector, dtc
def feature_Univarselection(data, y, Alpha):
    """Filter the columns of ``data`` with an ANOVA-F ``SelectFpr`` test.

    Both frames are ordered by ``'pid'`` first. The selector is fitted on
    the ordered values with the ``sep`` column of ``y`` as labels, then the
    reduced matrix is mapped back to the original width with
    ``inverse_transform`` (scikit-learn documents that step as inserting
    zeros where features were removed).

    Args:
        data: Feature frame that can be sorted by ``'pid'``.
        y: Label frame that can be sorted by ``'pid'`` and has the column
            named by the module-level ``sep``.
        Alpha: Value passed to ``SelectFpr`` as ``alpha``.

    Returns:
        A DataFrame with the index and columns of the ``'pid'``-sorted
        ``data``.
    """
    ordered = data.sort_values('pid')
    features = ordered.values
    labels = y.sort_values('pid')[sep].values

    selector = SelectFpr(f_classif, alpha=Alpha)
    selector.fit(features, labels)

    restored = selector.inverse_transform(selector.transform(features))
    return pd.DataFrame(restored, index=ordered.index, columns=ordered.columns)
def train_decisiontree_FPR(configurationname, train_data, score_function,
                           undersam=False, oversam=False, export=False):
    """Fit a ``SelectFpr`` filter, optionally resample, then fit a decision tree.

    Args:
        configurationname: Label used in log output and in the exported
            ``.dot`` file name.
        train_data: Triple ``(X_train, y_train, id_to_a_train)``; the third
            element is unpacked but not used here.
        score_function: Scoring function handed to ``SelectFpr``.
        undersam: Use ``RepeatedEditedNearestNeighbours`` when set alone.
        oversam: Use ``SMOTE`` when set alone; with ``undersam`` also set,
            ``SMOTEENN`` is used.
        export: When true, export the fitted tree with ``export_graphviz``.

    Returns:
        ``(selector, dtc)``: the fitted selector and the fitted tree.
    """
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    dtc = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    selector = SelectFpr(score_function)
    result = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    fitted_ids = list(result.get_support(indices=True))

    print("Apply Resampling")
    print(Counter(y_train))
    # The three original conditions are mutually exclusive, so at most one
    # resampler is ever chosen.
    if undersam and oversam:
        resampler = SMOTEENN(random_state=0)
    elif undersam:
        resampler = RepeatedEditedNearestNeighbours()
    elif oversam:
        resampler = SMOTE(random_state=42)
    else:
        resampler = None
    if resampler is not None:
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    if export:
        dot_path = DATAP + "/temp/trees/sltree_" + configurationname + ".dot"
        export_graphviz(dtc, out_file=dot_path, filled=True)
        # NOTE(review): the source line is whitespace-collapsed; this call is
        # assumed to belong to the export branch - confirm against history.
        transform(fitted_ids)

    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))
    return selector, dtc
class ExamDropExtractor:
    """ The Exam Drop Extractor deals with obtaining the train data and predict data for the Exam Layer. The pipeline
    implemented here is: discretize every feature into one-hot bins (KBinsDiscretizer), add the answered_on feature
    together with its interaction with every other feature, apply an ANOVA F filter (SelectFpr) and finally PCA.
    Every sample also gets a weight of 1 / (number of answers given by that player in that episode).

    The discretizer, ANOVA F filter and PCA are fitted in get_train_data, so get_train_data has to be called before
    get_predict_data. """

    BIN_STRATEGY = 'kmeans'  # The method used to select the splits between the bins.

    def __init__(self, predict_season: int, predict_episode: int, train_seasons: Set[int],
                 anova_f_significance: float, pca_explain: float, max_splits: int):
        """ Constructor of the Exam Drop Extractor

        Arguments:
            predict_season (int): The season for which we make the prediction.
            predict_episode (int): The latest episode in the predict season that could be used.
            train_seasons (Set[int]): The seasons which are used as train data.
            anova_f_significance (float): Only features with a p-value lower than this value will be selected by the
                ANOVA F filter.
            pca_explain (float): PCA will select the least number of components that at least explain this amount
                of variance in the features.
            max_splits (int): How many additional bins should be used to discretize the features.
        """
        self.__predict_season = predict_season
        self.__predict_episode = predict_episode
        self.__train_seasons = train_seasons
        self.__anova_f_significance = anova_f_significance
        self.__pca_explain = pca_explain
        self.__max_splits = max_splits

    def get_train_data(self) -> Tuple[np.array, np.array, np.array]:
        """ Get the formatted and sampled train data with train weights useable for machine learning algorithms.
        As a side effect this fits the discretizer, the ANOVA F filter and the PCA used by get_predict_data.

        Returns:
            The train input, train output and train weights in this order. The train input is a 2d array where each
            row represents a different train element. The train output is 1d array of labels, such that the ith row
            of the train input corresponds to the ith element of the train output.
        """
        train_data = []
        for season in self.__train_seasons:
            train_data.extend(self.__get_season_data(season, sys.maxsize, True))
        train_input = np.array([ExamDropEncoder.extract_features(sample, sys.maxsize) for sample in train_data])
        train_output = np.array([1.0 if get_is_mol(sample.selected_player) else 0.0 for sample in train_data])

        num_bins = self.get_num_bins(train_input, self.__max_splits)
        self.__discretizer = KBinsDiscretizer(n_bins=num_bins, encode="onehot-dense",
                                              strategy=ExamDropExtractor.BIN_STRATEGY)
        train_input = self.__discretizer.fit_transform(train_input)
        train_input = self.__add_answered_on_feature(train_data, train_input)

        self.__anova_f_filter = SelectFpr(f_classif, alpha=self.__anova_f_significance)
        train_input = self.__anova_f_filter.fit_transform(train_input, train_output)

        self.__pca = PCA(n_components=self.__pca_explain)
        train_input = self.__pca.fit_transform(train_input)
        return train_input, train_output, self.__get_train_weights(train_data)

    def get_predict_data(self) -> List["PredictSample"]:
        """ Get all formatted predict data useable for the machine learning algorithms to do a prediction.

        Returns:
            A list of prediction samples, where a prediction sample consists of a set of players included in the
            answer and not included in the answer. Also a prediction sample consist of the features for the
            participants included in the answer and not included in the answer, and of the weight of that answer.
            The list is empty if there is no predict data.
        """
        predict_data = self.__get_season_data(self.__predict_season, self.__predict_episode, False)
        if not predict_data:
            return []

        predict_input = np.array([ExamDropEncoder.extract_features(sample, self.__predict_episode)
                                  for sample in predict_data])
        predict_input = self.__discretizer.transform(predict_input)
        predict_input = self.__add_answered_on_feature(predict_data, predict_input)
        predict_input = self.__anova_f_filter.transform(predict_input)
        predict_input = self.__pca.transform(predict_input)

        predict_samples = []
        weights = self.__get_train_weights(predict_data)
        for data, in_features, out_features, weight in self.__pair_predict_rows(predict_data, predict_input, weights):
            in_answer = data.answer
            out_answer = set(data.exam_episode.players).difference(data.answer)
            predict_samples.append(PredictSample(in_answer, out_answer, in_features, out_features, weight))
        return predict_samples

    @staticmethod
    def __pair_predict_rows(predict_data, predict_input, weights) -> list:
        """ Group the predict rows per answer. The predict data holds two consecutive rows for every answer: first
        the row with answered_on False and then the row with answered_on True.

        Arguments:
            predict_data: All raw predict samples, two consecutive samples per answer.
            predict_input: The features corresponding to each sample in predict_data.
            weights: The weight corresponding to each sample in predict_data.

        Returns:
            A list with for every answer a tuple of the raw sample, the features with answered_on True, the features
            with answered_on False and the weight of that answer.
        """
        # The weights contain one value per row, so they are strided exactly like the rows. Without the stride the
        # ith answer would get the weight of row i, which belongs to answer i // 2.
        return list(zip(predict_data[::2], predict_input[1::2], predict_input[::2], weights[::2]))

    @staticmethod
    def __get_season_data(season_num: int, max_episode: int, training_data: bool) -> List["TrainSample"]:
        """ Get all raw answer data from a season.

        Arguments:
            season_num (int): The season from which we obtain this data.
            max_episode (int): The latest episode which can still be extracted. If this value is sys.maxsize then
                all raw answer data is obtained from this season.
            training_data (bool): True if the data is used as training data and false if the data is used as
                prediction data. The difference is that in case it is used for predictions we use a bool value as
                selected_player and otherwise selected_player is a Player.

        Returns:
            A list of train samples. Only answers of players that have a drop in the exam episode or later are
            included. For training data there is one sample per answer per player of the exam episode. For
            prediction data there are two consecutive samples per answer: first with selected_player False and then
            with selected_player True.
        """
        season = EXAM_DATA[season_num]
        drop_players = season.get_drop_mapping(DropType.EXECUTION_DROP, max_episode)
        all_answers = season.get_all_answers(set(drop_players.keys()), max_episode)
        season_data = []
        for answer in all_answers:
            exam_episode = answer.episode
            drop_episodes = [episode for episode in drop_players[answer.player] if exam_episode <= episode]
            if not drop_episodes:
                continue

            first_drop = min(drop_episodes)
            if training_data:
                selections = exam_episode.players
            else:
                selections = [False, True]
            for selected in selections:
                season_data.append(TrainSample(answer.player, season_num, first_drop, answer.episode,
                                               answer.question, answer.answer, selected))
        return season_data

    @staticmethod
    def get_num_bins(train_input: np.array, max_splits: int) -> List[int]:
        """ Get the number of bins for all features. To determine this we use a forward stepwise information gain
        algorithm, which gives the feature with the highest entropy increase an additional bin.

        Arguments:
            train_input (np.array): All non-transformed train input.
            max_splits (int): How many times an additional bin should be added.

        Returns:
            A list of integers, which represent the number of bins for each feature.
        """
        num_bins = [2 for _ in train_input[0]]
        max_bins = [len(set(column)) for column in train_input.T]
        entropies = [ExamDropExtractor.__entropy(np.expand_dims(column, axis=1), 2) for column in train_input.T]

        # The queue holds (negated entropy gain of one extra bin, feature index), so the best gain comes out first.
        options = PriorityQueue()
        for i, data in enumerate(train_input.T):
            if max_bins[i] > 2:
                data = np.expand_dims(data, axis=1)
                new_entropy = ExamDropExtractor.__entropy(data, 3)
                options.put((-(new_entropy - entropies[i]), i))

        for _ in range(max_splits):
            if options.empty():
                break

            negated_gain, i = options.get()
            num_bins[i] = num_bins[i] + 1
            # Subtracting the negated gain adds the gain, which gives the entropy with the new number of bins.
            entropies[i] = entropies[i] - negated_gain
            if num_bins[i] != max_bins[i]:
                data = np.expand_dims(train_input[:, i], axis=1)
                new_entropy = ExamDropExtractor.__entropy(data, num_bins[i] + 1)
                options.put((-(new_entropy - entropies[i]), i))
        return num_bins

    @staticmethod
    def __entropy(data: np.array, bins: int) -> float:
        """ Compute the entropy of a KBinsDiscretizer split using a certain number of bins.

        Arguments:
            data (np.array): A list of all values for a particular feature.
            bins (int): In how many bins the values should be split.

        Returns:
            The entropy of this split.
        """
        discretizer = KBinsDiscretizer(n_bins=bins, encode="onehot-dense",
                                       strategy=ExamDropExtractor.BIN_STRATEGY)
        data = discretizer.fit_transform(data)
        bin_counts = np.sum(data, axis=0)
        return sc.stats.entropy(bin_counts / np.sum(bin_counts))

    @staticmethod
    def __add_answered_on_feature(samples: List["TrainSample"], all_features: np.array) -> np.array:
        """ Translate the features such that answered_on is also included as feature which represents whether the
        player is included in the answer or not. This translation adds a new feature which is 1 if the player is in
        the answer and 0 if not. Also it adds all features multiplied by this answered on features.

        Arguments:
            samples (List[TrainSample]): All raw data corresponding to each row in all_features. This data is used
                to check if a player was included in the answer or not.
            all_features (np.array): All feature values so far.

        Returns:
            All new feature values translated by adding the answered_on feature to it.
        """
        new_features = []
        for sample, features in zip(samples, all_features):
            # For prediction data selected_player is already the answered_on flag itself.
            if isinstance(sample.selected_player, bool):
                answered_on = 1.0 if sample.selected_player else 0.0
            else:
                answered_on = 1.0 if sample.selected_player in sample.answer else 0.0
            features = np.append(features, answered_on * features)
            features = np.append(features, [answered_on])
            new_features.append(features)
        return np.array(new_features)

    @staticmethod
    def __get_train_weights(train_data: List["TrainSample"]) -> np.array:
        """ Get the weight for the training data, which is 1 / num_answers where num_answers is the number of
        answers given by that player in that episode.

        Arguments:
            train_data (List[TrainSample]): All raw train data from which it is extracted to which episode an answer
                belongs.

        Returns:
            An 1d array of weights which pairwise corresponds to the train_data (and therefore pairwise corresponds
            with each row in the train input).
        """
        train_weights = []
        for sample in train_data:
            num_answers = sample.exam_episode.get_all_answers({sample.player}, sys.maxsize)
            train_weights.append(1 / len(num_answers))
        return np.array(train_weights)
# Single-argument print(...) calls produce the same output on Python 2 and
# Python 3; the bare print statements were a syntax error on Python 3.
print("SelectPercentile -- chi2")
print(X_fitted_4.scores_)
print(X_fitted_4.pvalues_)
print(X_fitted_4.get_support())
X_transformed_4 = X_fitted_4.transform(X)
print(X_transformed_4.shape)

# SelectFpr --- chi2
from sklearn.feature_selection import SelectFpr
from sklearn.feature_selection import chi2
X_fitted_5 = SelectFpr(chi2, alpha=2.50017968e-15).fit(X, y)
print("SelectFpr --- chi2")
print(X_fitted_5.scores_)
print(X_fitted_5.pvalues_)
print(X_fitted_5.get_support())
X_transformed_5 = X_fitted_5.transform(X)
print(X_transformed_5.shape)

# SelectFpr --- f_classif
from sklearn.feature_selection import SelectFpr
from sklearn.feature_selection import f_classif
X_fitted_6 = SelectFpr(f_classif, alpha=1.66966919e-31).fit(X, y)
print("SelectFpr --- f_classif")
print(X_fitted_6.scores_)
print(X_fitted_6.pvalues_)
print(X_fitted_6.get_support())
X_transformed_6 = X_fitted_6.transform(X)
print(X_transformed_6.shape)

# SelectFdr and SelectFwe are used in the same way as above; only the
# criterion used to select features differs. What really makes the scores
# differ is the statistical test: as seen above, every selector that uses
# f_classif yields the same values.
# NOTE(review): the selector is fitted on ALL rows before the folds are
# made, so the held-out rows of every fold influenced the feature choice.
# Left unchanged because moving the fit would change the reported scores.
selector.fit(features, labels)

# Single-argument print(...) calls produce the same output on Python 2 and
# Python 3; the bare print statements were a syntax error on Python 3.
print('performing 6-fold cross-validation')
# NOTE(review): this is the legacy KFold(n, n_folds) call signature; kept
# as-is so the fragment keeps working with the KFold it was written for.
kf = KFold(len(features), 6, shuffle=False, random_state=None)
roc_scores = []
for train_indices, test_indices in kf:
    X_train = selector.transform([features[index] for index in train_indices])
    X_test = selector.transform([features[index] for index in test_indices])
    y_train = [labels[index] for index in train_indices]
    y_test = [labels[index] for index in test_indices]

    test_model = LogisticRegression()
    test_model.fit(X_train, y_train)
    # Column 1 of predict_proba is used as the positive-class score; the
    # unused predict() call was dropped.
    positive_probabilities = [
        probabilities[1] for probabilities in test_model.predict_proba(X_test)
    ]
    roc_scores.append(roc_auc_score(y_test, positive_probabilities))

# X_train still refers to the last fold here; its row width is the number
# of features the selector kept.
print("Features left: " + str(len(X_train[0])) + " out of " + str(len(features[0])))
print("ROC auc score: " + str(sum(roc_scores) / float(len(roc_scores))))
print("")
# Single-argument print(...) calls produce the same output on Python 2 and
# Python 3; the bare print statements were a syntax error on Python 3.
print(X_fitted_4.scores_)
print(X_fitted_4.pvalues_)
print(X_fitted_4.get_support())
X_transformed_4 = X_fitted_4.transform(X)
print(X_transformed_4.shape)

# SelectFpr --- chi2
from sklearn.feature_selection import SelectFpr
from sklearn.feature_selection import chi2
X_fitted_5 = SelectFpr(chi2, alpha=2.50017968e-15).fit(X, y)
print("SelectFpr --- chi2")
print(X_fitted_5.scores_)
print(X_fitted_5.pvalues_)
print(X_fitted_5.get_support())
X_transformed_5 = X_fitted_5.transform(X)
print(X_transformed_5.shape)

# SelectFpr --- f_classif
from sklearn.feature_selection import SelectFpr
from sklearn.feature_selection import f_classif
X_fitted_6 = SelectFpr(f_classif, alpha=1.66966919e-31).fit(X, y)
print("SelectFpr --- f_classif")
print(X_fitted_6.scores_)
print(X_fitted_6.pvalues_)
print(X_fitted_6.get_support())
X_transformed_6 = X_fitted_6.transform(X)
print(X_transformed_6.shape)

# SelectFdr and SelectFwe are used in the same way as above; only the
# criterion used to select features differs. What really makes the scores differ is