Example #1
from sklearn import svm
import numpy as np

# flatten and convert_well_index are project-local helpers.
def support_vector_machine(input_data,
                           kernel='linear',
                           C=1,
                           feature_names=None,
                           validate=True):
    """
    Fits, trains, and tests/makes predictions with a support vector machine.

    Args:
        input_data (tuple):     x_train, y_train, x_test
        kernel (str):           The kernel to be used by the SVM
        C (int or float):       The regularization parameter for the SVM
        feature_names (list):   The names of every feature in input_data, if
                                given, a sorted [high to low] list of the most
                                important features used to make predictions is
                                also returned.
        validate (bool):        If True a model accuracy is returned, if False
                                a list of predicted classifications for x_test
                                is returned.

    Returns:
        list: Predicted classifications for each x_test
        or
        float: Model accuracy
        or
        tuple: accuracy/predictions, features ranked by importance
    """
    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]

    if len(x_train.shape) == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)
    model = svm.SVC(kernel=kernel, C=C)
    model.fit(x_train, y_train)
    if validate:
        output_data = model.score(x_test, y_test)
    else:
        output_data = model.predict(x_test)

    if feature_names is not None:
        coefs = model.coef_
        if coefs.ndim > 1:
            coefs = coefs.sum(axis=0)
        coefs = coefs.ravel()
        absolute_coefs = np.absolute(coefs)
        absolute_coefs = [float(x) for x in absolute_coefs]
        feature_names = [convert_well_index(x) for x in feature_names]
        features_coefs = dict(zip(feature_names, absolute_coefs))
        output = (output_data, features_coefs)
    else:
        output = (output_data, None)

    return output
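
A minimal usage sketch with assumed toy data; since the arrays are 2-D and
feature_names is omitted, the project-local flatten and convert_well_index
helpers are never invoked.

import numpy as np

x_train = np.array([[0.0, 1.0], [0.1, 0.9], [1.0, 0.0], [0.9, 0.1]])
y_train = np.array([0, 0, 1, 1])
x_test = np.array([[0.05, 0.95], [0.95, 0.05]])
y_test = np.array([0, 1])

# validate=True returns (accuracy, None); validate=False returns
# (predictions, None).
accuracy, _ = support_vector_machine((x_train, y_train, x_test, y_test))
predictions, _ = support_vector_machine((x_train, y_train, x_test, y_test),
                                        validate=False)
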
Example #2
from sklearn.svm import SVC
from sklearn.feature_selection import RFE

# flatten and make3D are project-local helpers.
def recursive_feature_elimination(input_data,
                                  feature_names,
                                  estimator=SVC(kernel='linear'),
                                  n_features_to_select=None,
                                  step=0.1):
    """
    Recursively eliminates features from x_train and x_test using
    scikit-learn's RFE, see documentation:
    http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
    If feature_names is given, it is returned as well, with the eliminated
    features removed from it.

    Args:
        input_data (tuple):                   x_train, y_train, x_test, y_test
        feature_names (np.ndarray):          The names of all features before
                                              feature selection, or None; an
                                              array is required because it is
                                              filtered with a boolean mask.
        estimator (object):                   Passed to RFE, see documentation
        n_features_to_select (int or None):   Passed to RFE, see documentation
        step (int or float):                  Passed to RFE, see documentation

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, input_args
    """
    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]

    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)
    # RFE's parameters after estimator are keyword-only in current
    # scikit-learn, so pass them by name.
    feature_selector = RFE(estimator,
                           n_features_to_select=n_features_to_select,
                           step=step)
    x_train = feature_selector.fit_transform(x_train, y_train)
    x_test = feature_selector.transform(x_test)
    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    output_data = (x_train, y_train, x_test, y_test)

    if feature_names is not None:
        mask = feature_selector.get_support()
        feature_names = feature_names[mask]

    args = {
        'estimator': estimator,
        'n_features_to_select': n_features_to_select,
        'step': step
    }

    return output_data, feature_names, args
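
A usage sketch with assumed toy data. feature_names is passed as a NumPy
array because the function filters it with a boolean mask.

import numpy as np

rng = np.random.default_rng(0)
x_train = rng.normal(size=(20, 10))
y_train = np.array([0, 1] * 10)
x_test = rng.normal(size=(5, 10))
y_test = np.array([0, 1, 0, 1, 0])
feature_names = np.array(['f{}'.format(i) for i in range(10)])

data, names, args = recursive_feature_elimination(
    (x_train, y_train, x_test, y_test),
    feature_names,
    n_features_to_select=3)
# data[0].shape == (20, 3); names holds the three surviving feature names.
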
Example #3
from sklearn.feature_selection import SelectPercentile, chi2, f_classif

# flatten, make3D and remove_constant are project-local helpers.
def select_percentile(input_data,
                      feature_names,
                      score_func=chi2,
                      percentile=5):
    """
    Keeps the percentile highest-scoring features in x_train and removes the
    rest from both x_train and x_test, scoring features with score_func via
    scikit-learn's SelectPercentile. If feature_names is given, it is
    returned as well, with the removed features also dropped from it.

    Args:
        input_data (tuple):         x_train, y_train, x_test, y_test
        feature_names (np.ndarray): The names of all features before
                                    selection, or None.
        score_func (function):      The score function passed to
                                    SelectPercentile
        percentile (int):           Percentile of features to keep.

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, input_args
    """
    if score_func == f_classif:
        input_data, feature_names, _ = remove_constant(input_data,
                                                       feature_names)

    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]

    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)
    feature_selector = SelectPercentile(score_func=score_func,
                                        percentile=percentile)
    x_train = feature_selector.fit_transform(x_train, y_train)
    x_test = feature_selector.transform(x_test)
    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    output_data = (x_train, y_train, x_test, y_test)

    if feature_names is not None:
        mask = feature_selector.get_support()
        feature_names = feature_names[mask]

    return output_data, feature_names, {
        'score_func': score_func,
        'percentile': percentile
    }
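
A usage sketch with assumed toy data; chi2 requires non-negative features,
so values are drawn from [0, 1). With the default f_classif path, the
project-local remove_constant helper would also be required.

import numpy as np

rng = np.random.default_rng(0)
x_train = rng.random((20, 40))          # chi2 needs non-negative values
y_train = np.array([0, 1] * 10)
x_test = rng.random((5, 40))
y_test = np.array([0, 1, 0, 1, 0])
names = np.array(['f{}'.format(i) for i in range(40)])

data, kept, args = select_percentile((x_train, y_train, x_test, y_test),
                                     names, percentile=10)
# percentile=10 keeps the top 10%: 4 of the 40 features.
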
Example #4
import logging

from sklearn.feature_selection import SelectFdr, f_classif

# flatten, make3D and remove_constant are project-local helpers.
def select_fdr(input_data,
               feature_names=None,
               score_func=f_classif,
               alpha=0.05):
    """
    Keeps the features in x_train that pass scikit-learn's SelectFdr
    false-discovery-rate test and removes the rest from both x_train and
    x_test. If fewer than two features survive, alpha is raised until enough
    do. If feature_names is given, it is returned as well, with the removed
    features also dropped from it.

    Args:
        input_data (tuple):         x_train, y_train, x_test, y_test
        feature_names (np.ndarray): The names of all features before
                                    selection, or None.
        score_func (function):      The score function passed to SelectFdr
        alpha (float):              Passed to SelectFdr, see documentation

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, final_args
    """
    if score_func == f_classif:
        input_data, feature_names, _ = remove_constant(input_data,
                                                       feature_names)

    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]

    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)

    done = False
    increment = alpha
    while not done:
        feature_selector = SelectFdr(score_func=score_func, alpha=alpha)
        temp_x_train = feature_selector.fit_transform(x_train, y_train)
        temp_x_test = feature_selector.transform(x_test)
        if temp_x_train.shape[1] > 1 and temp_x_test.shape[1] > 1:
            done = True
            x_train = temp_x_train
            x_test = temp_x_test
        else:
            msg = 'Feature selection was too aggressive, '
            msg += 'increasing alpha from {} to {}'.format(
                alpha, alpha + increment)
            alpha += increment
            logging.warning(msg)

    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    output_data = (x_train, y_train, x_test, y_test)
    if feature_names is not None:
        mask = feature_selector.get_support()
        feature_names = feature_names[mask]

    logging.info('Selected {} features'.format(x_train.shape[1]))

    final_args = {'score_func': score_func, 'alpha': alpha}

    return output_data, feature_names, final_args
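
A usage sketch with assumed toy data, using chi2 to sidestep the
project-local remove_constant helper; the first three columns carry a strong
class signal, so the FDR test should pass on the first try.

import numpy as np
from sklearn.feature_selection import chi2

rng = np.random.default_rng(0)
y_train = np.array([0, 1] * 20)
y_test = np.array([0, 1] * 5)
# Three informative non-negative columns plus seventeen noise columns.
signal_train = y_train[:, None] * np.array([[5.0, 4.0, 3.0]])
signal_test = y_test[:, None] * np.array([[5.0, 4.0, 3.0]])
x_train = np.hstack([signal_train + rng.random((40, 3)),
                     rng.random((40, 17))])
x_test = np.hstack([signal_test + rng.random((10, 3)),
                    rng.random((10, 17))])
names = np.array(['f{}'.format(i) for i in range(20)])

data, kept, args = select_fdr((x_train, y_train, x_test, y_test),
                              names, score_func=chi2)
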
Example #5
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV

# flatten and make3D are project-local helpers.
def recursive_feature_elimination_cv(input_data,
                                     feature_names,
                                     step=0.1,
                                     cv=3,
                                     estimator=SVC(kernel='linear')):
    """
    Recursively eliminates features from x_train and x_test with
    cross-validation, using scikit-learn's RFECV; see the documentation:
    http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
    If feature_names is given, it is returned as well, with the eliminated
    features removed from it.

    Args:
        input_data (tuple):         x_train, y_train, x_test, y_test
        feature_names (np.ndarray): The names of all features before feature
                                    selection, or None.
        estimator (object):         Passed to RFECV, see documentation
        step (int or float):        Passed to RFECV, see documentation
        cv (int):                   Passed to RFECV, see documentation

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, input_args
    """
    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]

    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)
    # RFECV's parameters after estimator are keyword-only in current
    # scikit-learn, so pass them by name.
    feature_selector = RFECV(estimator, step=step, cv=cv)
    x_train = feature_selector.fit_transform(x_train, y_train)
    x_test = feature_selector.transform(x_test)
    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    output_data = (x_train, y_train, x_test, y_test)

    if feature_names is not None:
        mask = feature_selector.get_support()
        feature_names = feature_names[mask]

    args = {'step': step, 'cv': cv, 'estimator': estimator}

    return output_data, feature_names, args
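
A usage sketch with assumed toy data; RFECV chooses the number of surviving
features itself via cross-validation, so only step and cv are worth varying.

import numpy as np

rng = np.random.default_rng(0)
x_train = rng.normal(size=(30, 10))
y_train = np.array([0, 1] * 15)
x_test = rng.normal(size=(6, 10))
y_test = np.array([0, 1] * 3)
names = np.array(['f{}'.format(i) for i in range(10)])

data, kept, args = recursive_feature_elimination_cv(
    (x_train, y_train, x_test, y_test), names)
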
Example #6
import logging

from sklearn.feature_selection import f_classif

# flatten, make3D and remove_constant are project-local helpers.
def f_test_threshold(input_data,
                     feature_names=None,
                     threshold=0.01,
                     increment=0.01,
                     min_keep=100):
    """
    Keeps only the features whose f_classif (ANOVA F-test) p-value is at or
    below threshold, raising the threshold by increment until at least
    min_keep features pass. If feature_names is given, it is returned as
    well, with the removed features also dropped from it.

    Args:
        input_data (tuple):         x_train, y_train, x_test, y_test
        feature_names (np.ndarray): The names of all features before
                                    selection, or None.
        threshold (float):          Highest p-value a kept feature may have.
        increment (float):          How much to raise the threshold when too
                                    few features pass.
        min_keep (int):             Minimum number of features to keep.

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, args
    """
    input_data, feature_names, _ = remove_constant(input_data, feature_names)

    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]

    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)

    F, pval = f_classif(x_train, y_train)

    while True:
        keep = pval <= threshold
        new_x_train = x_train[:, keep]
        new_x_test = x_test[:, keep]
        if new_x_train.shape[1] >= min_keep:
            break
        else:
            threshold += increment
    logging.info('Selected {} features'.format(new_x_train.shape[1]))
    logging.info('Final p-value threshold: {}'.format(threshold))

    if dims == 3:
        # Restore the original 3-D shape, matching the other selectors.
        new_x_train = make3D(new_x_train)
        new_x_test = make3D(new_x_test)

    output_data = (new_x_train, y_train, new_x_test, y_test)

    if feature_names is not None:
        feature_names = feature_names[keep]

    args = {
        'threshold': threshold,
        'increment': increment,
        'min_keep': min_keep
    }

    return output_data, feature_names, args
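
A usage sketch with assumed toy data. remove_constant is project-local, so a
minimal stand-in with the same apparent contract (data, names, args) is
defined here purely so the sketch runs.

import numpy as np

def remove_constant(data, names):
    # Hypothetical stand-in: drop zero-variance columns, mirror the mask
    # onto the names, and return an empty args dict.
    x_tr, y_tr, x_te, y_te = data
    keep = x_tr.std(axis=0) > 0
    if names is not None:
        names = names[keep]
    return (x_tr[:, keep], y_tr, x_te[:, keep], y_te), names, {}

rng = np.random.default_rng(0)
x_train = rng.normal(size=(30, 120))
y_train = np.array([0, 1] * 15)
x_test = rng.normal(size=(10, 120))
y_test = np.array([0, 1] * 5)

data, names, args = f_test_threshold((x_train, y_train, x_test, y_test),
                                     min_keep=20)
# args['threshold'] reports the loosened p-value cutoff that kept >= 20.
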
Example #7
from sklearn.feature_selection import VarianceThreshold

# flatten and make3D are project-local helpers.
def variance_threshold(input_data, feature_names, threshold=0.16):
    """
    Removes every feature from x_train and x_test whose variance in x_train
    is less than threshold, using scikit-learn's VarianceThreshold. If
    feature_names is given, it is returned as well, with the removed features
    also dropped from it.

    Args:
        input_data (tuple):         x_train, y_train, x_test, y_test
        feature_names (np.ndarray): The names of all features before
                                    selection, or None.
        threshold (float):          Lowest variance a kept feature may have.

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, input_args
    """
    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]

    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)
    feature_selector = VarianceThreshold(threshold=threshold)
    x_train = feature_selector.fit_transform(x_train)
    x_test = feature_selector.transform(x_test)
    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    output_data = (x_train, y_train, x_test, y_test)

    if feature_names is not None:
        mask = feature_selector.get_support()
        feature_names = feature_names[mask]

    return output_data, feature_names, {'threshold': threshold}
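
A usage sketch with assumed toy data; the first five columns are scaled down
so their variance falls below the 0.16 default and they should be dropped.

import numpy as np

rng = np.random.default_rng(0)
x_train = rng.normal(size=(20, 30))
x_train[:, :5] *= 0.05           # variance ~0.0025, below the threshold
x_test = rng.normal(size=(5, 30))
y_train = np.array([0, 1] * 10)
y_test = np.array([0, 1, 0, 1, 0])
names = np.array(['f{}'.format(i) for i in range(30)])

data, kept, args = variance_threshold((x_train, y_train, x_test, y_test),
                                      names)
# The five low-variance columns 'f0'..'f4' should be gone from kept.
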
Example #8
from sklearn.ensemble import RandomForestClassifier

# flatten and convert_well_index are project-local helpers.
def random_forest(input_data,
                  n_estimators=50,
                  feature_names=None,
                  validate=True):
    """
    Fits, trains, and tests/makes predictions with s a random forest
    classifier.

    Args:
        input_data (tuple):     x_train, y_train, x_test, y_test
        n_estimators (int):     How many trees to use in the forest.
        feature_names (list):   The names of every feature in input_data, if
                                given, a sorted [high to low] list of the most
                                important features used to make predictions is
                                also returned.
        validate (bool):        If True a model accuracy is returned, if False
                                a list of predicted classifications for x_test
                                is returned.

    Returns:
        list: Predicted classifications for each x_test
        or
        float: Model accuracy
        or
        tuple: accuracy/predictions, features ranked by importance
    """
    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]

    if len(x_train.shape) == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)

    kwargs = {
        'n_estimators': n_estimators,
        'criterion': 'entropy',
        'max_features': 'log2',
        'max_depth': 100,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'min_weight_fraction_leaf': 0.01,
        'max_leaf_nodes': 25,
        'min_impurity_decrease': 0.001,
        'bootstrap': False,
        'n_jobs': -1
    }

    model = RandomForestClassifier(**kwargs)
    model.fit(x_train, y_train)
    if validate:
        output_data = model.score(x_test, y_test)
    else:
        output_data = model.predict(x_test)

    if feature_names is not None:
        importances = model.feature_importances_.ravel()
        importances = [float(x) for x in importances]
        feature_names = [convert_well_index(x) for x in feature_names]
        features_importances = dict(zip(feature_names, importances))
        output = (output_data, features_importances)
    else:
        output = (output_data, None)
    return output
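
A usage sketch with assumed toy data, mirroring the SVM example; the heavy
regularization baked into kwargs (capped depth and leaves) still fits this
toy problem.

import numpy as np

rng = np.random.default_rng(0)
x_train = rng.normal(size=(40, 8))
y_train = np.array([0, 1] * 20)
x_test = rng.normal(size=(10, 8))
y_test = np.array([0, 1] * 5)

accuracy, _ = random_forest((x_train, y_train, x_test, y_test),
                            n_estimators=10)
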
Example #9
# From a unittest.TestCase; assumes numpy is imported as np and that flatten
# and make3D are the project-local helpers under test.
def setUp(self):
    # A 3-D array with a trailing singleton axis and its 2-D equivalent.
    self.input = np.array([[[1], [2], [3]],
                           [[4], [5], [6]],
                           [[7], [8], [9]]])
    self.output = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    self.flat = flatten(self.input)
    self.threeD = make3D(self.output)
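
A hypothetical companion test, assuming the fixture's intent is that flatten
drops the trailing singleton axis and make3D restores it.

def test_round_trip(self):
    # The fixture's arrays should convert into each other exactly.
    np.testing.assert_array_equal(self.flat, self.output)
    np.testing.assert_array_equal(self.threeD, self.input)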