def recursive_feature_elimination(input_data, feature_names,
                                  estimator=SVC(kernel='linear'),
                                  n_features_to_select=None, step=0.1):
    """
    Recursively eliminates features from x_train and x_test using
    scikit-learn's RFE, see documentation:
    http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html

    The selector is fitted on x_train/y_train only and then applied
    unchanged to x_test.

    If feature_names is given it is also returned with any features removed
    from x_train and x_test also removed from feature_names.

    Args:
        input_data (tuple): x_train, y_train, x_test, y_test
        feature_names (list or array): The names of all features before
                                       feature selection or None.
        estimator (object): Passed to RFE, see documentation
        n_features_to_select (int or None): Passed to RFE, see documentation
        step (int or float): Passed to RFE, see documentation

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, input_args
               feature_names comes back as a list when a list/tuple was
               given, otherwise as the result of boolean-mask indexing.
    """
    x_train, y_train, x_test, y_test = input_data[:4]

    # 3D input is collapsed with flatten() for the selector and restored
    # with make3D() afterwards.
    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)

    # Keyword arguments: recent scikit-learn releases accept only the
    # estimator positionally.
    feature_selector = RFE(estimator,
                           n_features_to_select=n_features_to_select,
                           step=step)
    x_train = feature_selector.fit_transform(x_train, y_train)
    x_test = feature_selector.transform(x_test)

    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    output_data = (x_train, y_train, x_test, y_test)

    if feature_names is not None:
        mask = feature_selector.get_support()
        if isinstance(feature_names, (list, tuple)):
            # Plain sequences cannot be indexed with a boolean mask.
            feature_names = [name for name, keep in zip(feature_names, mask)
                             if keep]
        else:
            feature_names = feature_names[mask]

    args = {
        'estimator': estimator,
        'n_features_to_select': n_features_to_select,
        'step': step
    }
    return output_data, feature_names, args
def neural_network(input_data, feature_names=None, validate=True):
    """
    Constructs, compiles, trains and tests/makes predictions with a neural
    network (one Conv1D layer, Flatten, then a softmax Dense layer).

    Args:
        input_data (tuple): x_train, y_train, x_test, y_test
        feature_names (list): Ignored, here for compatibility.
        validate (bool): If True, an accuracy is returned, if False the
                         model's predictions for x_test are returned.

    Returns:
        tuple: (output, None). output is the accuracy metric from
               model.evaluate when validate is True, otherwise the raw
               result of model.predict(x_test) (softmax layer outputs, not
               class labels). The second element is always None.
    """
    # Accepted only so the signature matches the other pipeline steps.
    feature_names = None

    x_train, y_train, x_test, y_test = input_data[:4]

    # The class count is taken from the labels of both splits, so y_test is
    # read even when validate is False.
    all_labels = np.concatenate((y_train, y_test))
    num_classes = np.unique(all_labels).shape[0]

    y_train = to_categorical(y_train, num_classes=num_classes)
    if validate:
        y_test = to_categorical(y_test, num_classes=num_classes)

    # 2D input is converted with make3D() before being fed to Conv1D.
    if len(x_train.shape) == 2:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    model = Sequential()
    model.add(
        Conv1D(filters=10,
               kernel_size=3,
               activation='relu',
               input_shape=(x_train.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(num_classes, activation='softmax'))

    # One-hot targets with a softmax output call for categorical
    # cross-entropy; binary cross-entropy scores each output unit
    # independently and can inflate the reported accuracy.
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    model.fit(x_train, y_train, epochs=50, batch_size=10, verbose=0)

    if validate:
        evaluation = model.evaluate(x_test, y_test, batch_size=1, verbose=0)
        # evaluation is [loss, accuracy]; only the metric is returned.
        output = evaluation[1]
    else:
        output = model.predict(x_test)

    return (output, feature_names)
def select_percentile(input_data, feature_names, score_func=chi2,
                      percentile=5):
    """
    Selects the percentile best features in x_train, removes the rest of the
    features from x_train and x_test. Selects the best features by using the
    score function score_func and scikit-learn's SelectPercentile.

    When score_func is f_classif the data is first passed through
    remove_constant (presumably to drop constant features - verify against
    that helper).

    If feature_names is given it is also returned with any features removed
    from x_train and x_test also removed from feature_names.

    Args:
        input_data (tuple): x_train, y_train, x_test, y_test
        feature_names (list or array): The names of all features before
                                       selection or None.
        score_func (function): The score function to be passed to
                               SelectPercentile
        percentile (int): Percentile of features to keep.

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, input_args
               feature_names comes back as a list when a list/tuple was
               given, otherwise as the result of boolean-mask indexing.
    """
    if score_func == f_classif:
        input_data, feature_names, _ = remove_constant(input_data,
                                                       feature_names)

    x_train, y_train, x_test, y_test = input_data[:4]

    # 3D input is collapsed with flatten() for the selector and restored
    # with make3D() afterwards.
    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)

    # Fit on the training split only, then apply the same mask to x_test.
    feature_selector = SelectPercentile(score_func=score_func,
                                        percentile=percentile)
    x_train = feature_selector.fit_transform(x_train, y_train)
    x_test = feature_selector.transform(x_test)

    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    output_data = (x_train, y_train, x_test, y_test)

    if feature_names is not None:
        mask = feature_selector.get_support()
        if isinstance(feature_names, (list, tuple)):
            # Plain sequences cannot be indexed with a boolean mask.
            feature_names = [name for name, keep in zip(feature_names, mask)
                             if keep]
        else:
            feature_names = feature_names[mask]

    return output_data, feature_names, {
        'score_func': score_func,
        'percentile': percentile
    }
def select_fdr(input_data, feature_names=None, score_func=f_classif,
               alpha=0.05):
    """
    Selects features in x_train and x_test with scikit-learn's SelectFdr,
    fitted on x_train/y_train only.

    If fewer than two features survive, alpha is raised by its initial value
    (capped at 1.0) and the selection is retried; each retry is logged as a
    warning.

    When score_func is f_classif the data is first passed through
    remove_constant (presumably to drop constant features - verify against
    that helper).

    If feature_names is given it is also returned with any features removed
    from x_train and x_test also removed from feature_names.

    Args:
        input_data (tuple): x_train, y_train, x_test, y_test
        feature_names (list or array): The names of all features before
                                       selection or None.
        score_func (function): The score function to be passed to SelectFdr
        alpha (float): Initial alpha passed to SelectFdr; also used as the
                       increment between retries.

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, final_args
               final_args holds the alpha that was actually used.
               feature_names comes back as a list when a list/tuple was
               given, otherwise as the result of boolean-mask indexing.

    Raises:
        ValueError: If fewer than two features are selected and alpha cannot
                    be raised any further (alpha already at 1.0 or above, or
                    a non-positive starting alpha).
    """
    if score_func == f_classif:
        input_data, feature_names, _ = remove_constant(input_data,
                                                       feature_names)

    x_train, y_train, x_test, y_test = input_data[:4]

    # 3D input is collapsed with flatten() for the selector and restored
    # with make3D() afterwards.
    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)

    increment = alpha
    while True:
        feature_selector = SelectFdr(score_func=score_func, alpha=alpha)
        temp_x_train = feature_selector.fit_transform(x_train, y_train)
        temp_x_test = feature_selector.transform(x_test)

        if temp_x_train.shape[1] > 1 and temp_x_test.shape[1] > 1:
            x_train, x_test = temp_x_train, temp_x_test
            break

        # alpha bounds a false discovery rate, so going past 1.0 cannot
        # admit more features; without this guard the loop never ends.
        if increment <= 0 or alpha >= 1.0:
            raise ValueError(
                'SelectFdr kept fewer than two features with alpha={}; '
                'alpha cannot be increased further'.format(alpha))

        next_alpha = min(alpha + increment, 1.0)
        msg = 'Feature selection was too aggresive, '
        msg += 'increasing alpha from {} to {}'.format(alpha, next_alpha)
        alpha = next_alpha
        logging.warning(msg)

    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    output_data = (x_train, y_train, x_test, y_test)

    if feature_names is not None:
        mask = feature_selector.get_support()
        if isinstance(feature_names, (list, tuple)):
            # Plain sequences cannot be indexed with a boolean mask.
            feature_names = [name for name, keep in zip(feature_names, mask)
                             if keep]
        else:
            feature_names = feature_names[mask]

    logging.info('Selected {} features'.format(x_train.shape[1]))

    final_args = {'score_func': score_func, 'alpha': alpha}
    return output_data, feature_names, final_args
def recursive_feature_elimination_cv(input_data, feature_names, step=0.1,
                                     cv=3, estimator=SVC(kernel='linear')):
    """
    Recursively eliminates features from x_train and x_test with cross
    validation, uses scikit-learn's RFECV see documentation:
    http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html

    The selector is fitted on x_train/y_train only and then applied
    unchanged to x_test.

    If feature_names is given it is also returned with any features removed
    from x_train and x_test also removed from feature_names.

    Args:
        input_data (tuple): x_train, y_train, x_test, y_test
        feature_names (list or array): The names of all features before
                                       feature selection or None.
        step (int or float): Passed to RFECV, see documentation
        cv (int): Passed to RFECV, see documentation
        estimator (object): Passed to RFECV, see documentation

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, input_args
               feature_names comes back as a list when a list/tuple was
               given, otherwise as the result of boolean-mask indexing.
    """
    x_train, y_train, x_test, y_test = input_data[:4]

    # 3D input is collapsed with flatten() for the selector and restored
    # with make3D() afterwards.
    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)

    # Keyword arguments are required here: RFECV's positional order is not
    # (estimator, step, cv) in every scikit-learn release, so passing cv
    # positionally can bind it to a different parameter or be rejected.
    feature_selector = RFECV(estimator, step=step, cv=cv)
    x_train = feature_selector.fit_transform(x_train, y_train)
    x_test = feature_selector.transform(x_test)

    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    output_data = (x_train, y_train, x_test, y_test)

    if feature_names is not None:
        mask = feature_selector.get_support()
        if isinstance(feature_names, (list, tuple)):
            # Plain sequences cannot be indexed with a boolean mask.
            feature_names = [name for name, keep in zip(feature_names, mask)
                             if keep]
        else:
            feature_names = feature_names[mask]

    args = {'step': step, 'cv': cv, 'estimator': estimator}
    return output_data, feature_names, args
def variance_threshold(input_data, feature_names, threshold=0.16):
    """
    Removes all features from x_train and x_test whose variances in x_train
    is less than threshold. Uses scikit-learn's VarianceThreshold.

    The selector is fitted on x_train only (labels are not used) and then
    applied unchanged to x_test.

    If feature_names is given it is also returned with any features removed
    from x_train and x_test also removed from feature_names.

    Args:
        input_data (tuple): x_train, y_train, x_test, y_test
        feature_names (list or array): The names of all features before
                                       selection or None.
        threshold (float): Lower limit of variance for a feature to be kept

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, input_args
               feature_names comes back as a list when a list/tuple was
               given, otherwise as the result of boolean-mask indexing.
    """
    x_train, y_train, x_test, y_test = input_data[:4]

    # 3D input is collapsed with flatten() for the selector and restored
    # with make3D() afterwards.
    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)

    feature_selector = VarianceThreshold(threshold=threshold)
    x_train = feature_selector.fit_transform(x_train)
    x_test = feature_selector.transform(x_test)

    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    output_data = (x_train, y_train, x_test, y_test)

    if feature_names is not None:
        mask = feature_selector.get_support()
        if isinstance(feature_names, (list, tuple)):
            # Plain sequences cannot be indexed with a boolean mask.
            feature_names = [name for name, keep in zip(feature_names, mask)
                             if keep]
        else:
            feature_names = feature_names[mask]

    return output_data, feature_names, {'threshold': threshold}
def setUp(self):
    """Create paired 3D/2D fixtures and run both shape helpers on them."""
    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

    # 3D fixture: every scalar of `rows` wrapped in its own length-1 list.
    self.input = np.array([[[value] for value in row] for row in rows])
    # 2D fixture holding the same values.
    self.output = np.array(rows)

    # Results of the helpers under test, one per direction.
    self.flat = flatten(self.input)
    self.threeD = make3D(self.output)