import logging

import numpy as np
from sklearn import svm
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import (RFE, RFECV, SelectFdr, SelectPercentile,
                                       VarianceThreshold, chi2, f_classif)

# flatten, make3D, convert_well_index and remove_constant are project-local
# helpers assumed to be defined elsewhere in this package.


def support_vector_machine(input_data, kernel='linear', C=1,
                           feature_names=None, validate=True):
    """
    Fits, trains, and tests/makes predictions with a support vector machine.

    Args:
        input_data (tuple): x_train, y_train, x_test, y_test
        kernel (str): The kernel to be used by the SVM.
        C (int or float): The regularization parameter for the SVM.
        feature_names (list): The names of every feature in input_data. If
            given, a dict mapping each feature name to the absolute value of
            its coefficient is also returned.
        validate (bool): If True the model accuracy is returned, if False a
            list of predicted classifications for x_test is returned.

    Returns:
        tuple: (accuracy or predictions, dict of feature names to importance,
            or None if feature_names was not given)
    """
    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]
    if len(x_train.shape) == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)
    model = svm.SVC(kernel=kernel, C=C)
    model.fit(x_train, y_train)
    if validate:
        output_data = model.score(x_test, y_test)
    else:
        output_data = model.predict(x_test)
    if feature_names is not None:
        # coef_ is only defined for linear kernels.
        coefs = model.coef_
        if coefs.ndim > 1:
            coefs = coefs.sum(axis=0)
        coefs = coefs.ravel()
        absolute_coefs = np.absolute(coefs)
        absolute_coefs = [float(x) for x in absolute_coefs]
        feature_names = [convert_well_index(x) for x in feature_names]
        features_coefs = dict(zip(feature_names, absolute_coefs))
        output = (output_data, features_coefs)
    else:
        output = (output_data, None)
    return output
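
# A minimal usage sketch for support_vector_machine, assuming synthetic data;
# the shapes and labels below are illustrative, not from the project. Only
# standard scikit-learn names are used outside this module.
def _demo_support_vector_machine():
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=200, n_features=20, random_state=0)
    x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)
    accuracy, _ = support_vector_machine((x_train, y_train, x_test, y_test),
                                         kernel='linear', C=1)
    print('accuracy: {:.3f}'.format(accuracy))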
def recursive_feature_elimination(input_data, feature_names,
                                  estimator=SVC(kernel='linear'),
                                  n_features_to_select=None, step=0.1):
    """
    Recursively eliminates features from x_train and x_test using
    scikit-learn's RFE, see documentation:
    http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html

    If feature_names is given it is also returned, with any features removed
    from x_train and x_test also removed from feature_names.

    Args:
        input_data (tuple): x_train, y_train, x_test, y_test
        feature_names (list): The names of all features before feature
            selection, or None.
        estimator (object): Passed to RFE, see documentation.
        n_features_to_select (int or None): Passed to RFE, see documentation.
        step (int or float): Passed to RFE, see documentation.

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, input_args
    """
    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]
    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)
    # Pass these by keyword; newer scikit-learn releases make them
    # keyword-only.
    feature_selector = RFE(estimator,
                           n_features_to_select=n_features_to_select,
                           step=step)
    x_train = feature_selector.fit_transform(x_train, y_train)
    x_test = feature_selector.transform(x_test)
    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)
    output_data = (x_train, y_train, x_test, y_test)
    if feature_names is not None:
        # Boolean-mask indexing requires an ndarray, not a plain list.
        mask = feature_selector.get_support()
        feature_names = np.asarray(feature_names)[mask]
    args = {
        'estimator': estimator,
        'n_features_to_select': n_features_to_select,
        'step': step
    }
    return output_data, feature_names, args
def select_percentile(input_data, feature_names, score_func=chi2,
                      percentile=5):
    """
    Selects the percentile best features in x_train and removes the rest of
    the features from x_train and x_test. Selects the best features by using
    the score function score_func and scikit-learn's SelectPercentile.

    If feature_names is given it is also returned, with any features removed
    from x_train and x_test also removed from feature_names.

    Args:
        input_data (tuple): x_train, y_train, x_test, y_test
        feature_names (list): The names of all features before selection, or
            None.
        score_func (function): The score function to be passed to
            SelectPercentile.
        percentile (int): Percentile of features to keep.

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, input_args
    """
    if score_func == f_classif:
        input_data, feature_names, _ = remove_constant(input_data,
                                                       feature_names)
    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]
    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)
    feature_selector = SelectPercentile(score_func=score_func,
                                        percentile=percentile)
    x_train = feature_selector.fit_transform(x_train, y_train)
    x_test = feature_selector.transform(x_test)
    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)
    output_data = (x_train, y_train, x_test, y_test)
    if feature_names is not None:
        mask = feature_selector.get_support()
        feature_names = np.asarray(feature_names)[mask]
    return output_data, feature_names, {
        'score_func': score_func,
        'percentile': percentile
    }
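
# chi2 (the default score_func) requires non-negative feature values, so this
# sketch uses count data rather than the Gaussian features produced by
# make_classification. Purely illustrative; the feature names are hypothetical.
def _demo_select_percentile():
    rng = np.random.RandomState(0)
    x_train = rng.poisson(lam=3.0, size=(80, 40))  # non-negative counts
    y_train = rng.randint(0, 2, size=80)
    x_test = rng.poisson(lam=3.0, size=(20, 40))
    y_test = rng.randint(0, 2, size=20)
    names = ['f{}'.format(i) for i in range(40)]
    data, kept_names, args = select_percentile(
        (x_train, y_train, x_test, y_test), names, percentile=10)
    print('kept {} of 40 features'.format(data[0].shape[1]))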
def select_fdr(input_data, feature_names=None, score_func=f_classif,
               alpha=0.05):
    """
    Selects features with scikit-learn's SelectFdr, which keeps the features
    whose p-values fall below an estimated false discovery rate. If the
    selection leaves fewer than two features, alpha is repeatedly increased
    until enough features survive.

    If feature_names is given it is also returned, with any features removed
    from x_train and x_test also removed from feature_names.

    Args:
        input_data (tuple): x_train, y_train, x_test, y_test
        feature_names (list): The names of all features before selection, or
            None.
        score_func (function): Passed to SelectFdr, see documentation.
        alpha (float): The starting FDR upper bound, passed to SelectFdr.

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, final_args
    """
    if score_func == f_classif:
        input_data, feature_names, _ = remove_constant(input_data,
                                                       feature_names)
    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]
    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)
    done = False
    increment = alpha
    while not done:
        feature_selector = SelectFdr(score_func=score_func, alpha=alpha)
        temp_x_train = feature_selector.fit_transform(x_train, y_train)
        temp_x_test = feature_selector.transform(x_test)
        if temp_x_train.shape[1] > 1 and temp_x_test.shape[1] > 1:
            done = True
            x_train = temp_x_train
            x_test = temp_x_test
        else:
            msg = 'Feature selection was too aggressive, '
            msg += 'increasing alpha from {} to {}'.format(
                alpha, alpha + increment)
            alpha += increment
            logging.warning(msg)
    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)
    output_data = (x_train, y_train, x_test, y_test)
    if feature_names is not None:
        mask = feature_selector.get_support()
        feature_names = np.asarray(feature_names)[mask]
    logging.info('Selected {} features'.format(x_train.shape[1]))
    final_args = {'score_func': score_func, 'alpha': alpha}
    return output_data, feature_names, final_args
def recursive_feature_elimination_cv(input_data, feature_names, step=0.1,
                                     cv=3, estimator=SVC(kernel='linear')):
    """
    Recursively eliminates features from x_train and x_test with cross
    validation, uses scikit-learn's RFECV, see documentation:
    http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html

    If feature_names is given it is also returned, with any features removed
    from x_train and x_test also removed from feature_names.

    Args:
        input_data (tuple): x_train, y_train, x_test, y_test
        feature_names (list): The names of all features before feature
            selection, or None.
        estimator (object): Passed to RFECV, see documentation.
        step (int or float): Passed to RFECV, see documentation.
        cv (int): Passed to RFECV, see documentation.

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, input_args
    """
    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]
    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)
    # Pass step and cv by keyword; newer scikit-learn releases make them
    # keyword-only.
    feature_selector = RFECV(estimator, step=step, cv=cv)
    x_train = feature_selector.fit_transform(x_train, y_train)
    x_test = feature_selector.transform(x_test)
    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)
    output_data = (x_train, y_train, x_test, y_test)
    if feature_names is not None:
        mask = feature_selector.get_support()
        feature_names = np.asarray(feature_names)[mask]
    args = {'step': step, 'cv': cv, 'estimator': estimator}
    return output_data, feature_names, args
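
# Unlike recursive_feature_elimination above, RFECV picks the number of
# surviving features itself via cross-validated scoring, so there is no
# n_features_to_select argument. A minimal sketch on synthetic data with
# hypothetical feature names:
def _demo_rfecv():
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=120, n_features=25, n_informative=5,
                               random_state=0)
    x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)
    names = ['f{}'.format(i) for i in range(25)]
    data, kept_names, args = recursive_feature_elimination_cv(
        (x_train, y_train, x_test, y_test), names)
    print('RFECV kept {} features'.format(data[0].shape[1]))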
def f_test_threshold(input_data, feature_names=None, threshold=0.01,
                     increment=0.01, min_keep=100):
    """
    Keeps only the features of x_train and x_test whose ANOVA F-test p-values
    (from scikit-learn's f_classif) are at or below threshold. If fewer than
    min_keep features survive, the threshold is raised by increment until
    enough do.

    If feature_names is given it is also returned, with any features removed
    from x_train and x_test also removed from feature_names.

    Args:
        input_data (tuple): x_train, y_train, x_test, y_test
        feature_names (list): The names of all features before selection, or
            None.
        threshold (float): The starting p-value cutoff.
        increment (float): How much to raise the cutoff on each retry.
        min_keep (int): The minimum number of features to keep.

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, args
    """
    input_data, feature_names, _ = remove_constant(input_data, feature_names)
    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]
    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)
    F, pval = f_classif(x_train, y_train)
    # Guard against an infinite loop when fewer than min_keep features exist.
    min_keep = min(min_keep, x_train.shape[1])
    while True:
        keep = pval <= threshold
        new_x_train = x_train[:, keep]
        new_x_test = x_test[:, keep]
        if new_x_train.shape[1] >= min_keep:
            break
        threshold += increment
    logging.info('Selected {} features'.format(new_x_train.shape[1]))
    logging.info('Final p-value threshold: {}'.format(threshold))
    output_data = (new_x_train, y_train, new_x_test, y_test)
    if feature_names is not None:
        feature_names = np.asarray(feature_names)[keep]
    args = {
        'threshold': threshold,
        'increment': increment,
        'min_keep': min_keep
    }
    return output_data, feature_names, args
def variance_threshold(input_data, feature_names, threshold=0.16):
    """
    Removes all features from x_train and x_test whose variance in x_train is
    less than threshold. Uses scikit-learn's VarianceThreshold.

    If feature_names is given it is also returned, with any features removed
    from x_train and x_test also removed from feature_names.

    Args:
        input_data (tuple): x_train, y_train, x_test, y_test
        feature_names (list): The names of all features before selection, or
            None.
        threshold (float): Lower limit of variance for a feature to be kept.

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, input_args
    """
    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]
    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)
    feature_selector = VarianceThreshold(threshold=threshold)
    x_train = feature_selector.fit_transform(x_train)
    x_test = feature_selector.transform(x_test)
    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)
    output_data = (x_train, y_train, x_test, y_test)
    if feature_names is not None:
        mask = feature_selector.get_support()
        feature_names = np.asarray(feature_names)[mask]
    return output_data, feature_names, {'threshold': threshold}
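
# The 0.16 default matches the scikit-learn user-guide example for Boolean
# features: Var(Bernoulli(p)) = p * (1 - p), and 0.8 * (1 - 0.8) = 0.16, so
# binary features constant in more than 80% of samples are dropped. A small
# sketch with hypothetical data:
def _demo_variance_threshold():
    x_train = np.array([[0, 1], [0, 0], [0, 1], [0, 1], [0, 0]])
    y_train = np.array([0, 1, 0, 1, 0])
    x_test = np.array([[0, 1]])
    data, names, args = variance_threshold(
        (x_train, y_train, x_test, np.array([0])), feature_names=None)
    print(data[0].shape)  # the constant first column is removed -> (5, 1)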
def random_forest(input_data, n_estimators=50, feature_names=None,
                  validate=True):
    """
    Fits, trains, and tests/makes predictions with a random forest classifier.

    Args:
        input_data (tuple): x_train, y_train, x_test, y_test
        n_estimators (int): How many trees to use in the forest.
        feature_names (list): The names of every feature in input_data. If
            given, a dict mapping each feature name to its importance is also
            returned.
        validate (bool): If True the model accuracy is returned, if False a
            list of predicted classifications for x_test is returned.

    Returns:
        tuple: (accuracy or predictions, dict of feature names to importance,
            or None if feature_names was not given)
    """
    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]
    if len(x_train.shape) == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)
    kwargs = {
        'n_estimators': n_estimators,
        'criterion': 'entropy',
        'max_features': 'log2',
        'max_depth': 100,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'min_weight_fraction_leaf': 0.01,
        'max_leaf_nodes': 25,
        'min_impurity_decrease': 0.001,
        'bootstrap': False,
        'n_jobs': -1
    }
    model = RandomForestClassifier(**kwargs)
    model.fit(x_train, y_train)
    if validate:
        output_data = model.score(x_test, y_test)
    else:
        output_data = model.predict(x_test)
    if feature_names is not None:
        importances = model.feature_importances_.ravel()
        importances = [float(x) for x in importances]
        feature_names = [convert_well_index(x) for x in feature_names]
        features_importances = dict(zip(feature_names, importances))
        output = (output_data, features_importances)
    else:
        output = (output_data, None)
    return output
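
# A minimal sketch, again on synthetic data; the hyperparameters hard-coded
# inside random_forest (max_depth, max_leaf_nodes, etc.) are used as-is.
def _demo_random_forest():
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=200, n_features=20, random_state=0)
    x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)
    accuracy, _ = random_forest((x_train, y_train, x_test, y_test),
                                n_estimators=50)
    print('accuracy: {:.3f}'.format(accuracy))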
def setUp(self):
    self.input = np.array([[[1], [2], [3]],
                           [[4], [5], [6]],
                           [[7], [8], [9]]])
    self.output = np.array([[1, 2, 3],
                            [4, 5, 6],
                            [7, 8, 9]])
    self.flat = flatten(self.input)
    self.threeD = make3D(self.output)