def drop_low_var_columns(train_X, test_df, threshold):

    select_features = feature_selection.VarianceThreshold(threshold=threshold)
    select_features.fit(train_X)
    cols_to_drop = train_X.columns[~select_features.get_support()]
    train_X.drop(columns=cols_to_drop, inplace=True)
    test_df.drop(columns=cols_to_drop, inplace=True)
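
A minimal usage sketch of the function above (the toy DataFrames, the 0.01 threshold, and the explicit import are illustrative assumptions):

import pandas as pd
from sklearn import feature_selection

train_X = pd.DataFrame({"a": [1.0, 1.0, 1.0], "b": [0.1, 0.9, 0.5]})
test_df = pd.DataFrame({"a": [1.0, 1.0], "b": [0.3, 0.7]})
drop_low_var_columns(train_X, test_df, threshold=0.01)
print(train_X.columns.tolist())  # ['b']: the zero-variance column "a" was dropped in place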
Example #2
def feature_select_low_var(dataframe,
                           vars_name=None,
                           threshold_=1,
                           ret_normal=False):
    '''
    Feature selection using the low-variance method on the standardized numeric features of the dataframe.
    Features whose variance falls below the threshold are eliminated.
    Parameters
    ----------
    dataframe = features-only dataframe (response variable excluded)
    vars_name = list/array of strings with each feature name
    threshold_ = 1 by default. Beware the data is standardized first, so every non-constant feature has unit variance; a threshold of 1 or more removes everything, while values below 1 mainly drop constant features.
    ret_normal = False by default. If True the returned dataframe holds the standardized values, otherwise the original data is returned.
    '''

    # Obtain the dataframe column names automatically unless provided:
    if vars_name is None:
        names = dataframe.columns
    else:
        names = vars_name
    # Drop non-numeric variables:
    numeric_mask = [not isinstance(x, str) for x in dataframe.iloc[0, :]]
    df = dataframe.iloc[:, numeric_mask]
    names = pd.Index(names)[numeric_mask]
    # Standardize numeric variables to remove scale effects on the variance:
    df_t = preprocessing.StandardScaler().fit_transform(df)
    # Low-variance variable selection:
    sel_fit = fs.VarianceThreshold(threshold=threshold_).fit(df_t)
    selection = sel_fit.transform(df_t)
    # Retrieve the selected variable names:
    selec_vars = names[sel_fit.get_support()]

    if ret_normal:
        df_sel = pd.DataFrame(selection, columns=selec_vars)
        print('Returned variables are normalized')
    else:
        df_sel = pd.DataFrame(dataframe[selec_vars], columns=selec_vars)
        print('Returned variables are original')

    return df_sel
Example #3
def get_low_var_cols(df, threshold):
    selector = feature_selection.VarianceThreshold(threshold=threshold)
    selector.fit(df)
    
    cols = df.columns[~selector.get_support()]
    cols = [col for col in cols if col not in ["Field9", "Field8", 'Field11', 'Field12']]
    
    return cols
Example #4
def variance_threshold(arr0, threshold):
    matrix = np.array(arr0)
    temp = feature_selection.VarianceThreshold(threshold=threshold).fit(matrix)
    scores = [np.var(el) for el in matrix.T]
    indx = temp.get_support().tolist()
    # result = data_utility.retrieve_nan_index(temp.transform(matrix).tolist(), index)
    result = temp.transform(matrix).tolist()
    return scores, indx, result
Example #5
def stand():
    """
    归一化处理
    :return: none
    """
    value = [[10, 30, 80], [0, 0.5, 1], [1, 1, 2]]
    std = feature_selection.VarianceThreshold(200)
    print(std.fit_transform(value))
def removeZeroVarianceFeatures(X):
    '''Function to remove features which have zero variance
    Parameters: X (np.array): Features for the dataset
    Return: X (np.array): Modified array of features
    with no zero-variance features
    '''
    varianceSelector = feature_selection.VarianceThreshold()
    X = varianceSelector.fit_transform(X)
    return X
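
A minimal usage sketch of the helper above (the toy array is an assumption):

import numpy as np
from sklearn import feature_selection

X = np.array([[1.0, 0.2], [1.0, 0.8], [1.0, 0.5]])
X = removeZeroVarianceFeatures(X)
print(X.shape)  # (3, 1): the constant first column has been removed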
Example #7
def get_fs_model(model, method, train, target=None, cv=None):
    """Connects given model with specified feature selection method and trains
    the final structure.
    """
    if method == "RFE":
        model = fs_scikit.RFE(model, 2, step=5)
        if target is not None:
            return model.fit(train, target)
        else:
            return model.fit(train)
    if method == "RFECV":
        model = fs_scikit.RFECV(model, 3, cv=cv)
        if target is not None:
            return model.fit(train, target)
        else:
            return model.fit(train)
    elif method == "linearSVC":
        sel = SelectFromModel(LinearSVC(penalty='l1', dual=False))
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "fromModel":
        fm = fs_scikit.SelectFromModel(model)
        if target is not None:
            fm.fit(train, target)
        else:
            fm.fit(train)
        model = Pipeline([('feature_selection', fm), ('data_mining', model)])

    # elif method == "Anova":
    # ANOVA SVM-C
    # anova_filter = fs_scikit.SelectKBest(f_regression, k=5)
    # model = Pipeline([
    #     ('feature_selection', anova_filter),
    #     ('data_mining', model)
    # ])
    elif method == "VarianceThreshold":
        sel = fs_scikit.VarianceThreshold(threshold=(.8 * (1 - .8)))
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectPercentile":
        sel = fs_scikit.SelectPercentile(fs_scikit.f_classif, percentile=30)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFpr":
        sel = fs_scikit.SelectFpr(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFdr":
        sel = fs_scikit.SelectFdr(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFwe":
        sel = fs_scikit.SelectFwe(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "ch2":
        sel = fs_scikit.SelectKBest(fs_scikit.chi2, k=2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    else:
        print("Feature selection method was not found: " + method)
        sys.exit(1)
    return model
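
A hypothetical call sketch (LogisticRegression and the random arrays are assumptions; the module's own imports such as fs_scikit and Pipeline are expected to be present). Note that for the Pipeline-based methods the returned structure still needs to be fitted by the caller:

import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.random.rand(30, 10)
y = np.random.randint(0, 2, size=30)
pipe = get_fs_model(LogisticRegression(max_iter=1000), "SelectPercentile", X, y)
pipe.fit(X, y)
print(pipe.predict(X[:3]))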
Example #8
    def near_zero_var_df_sklearn(
            self,
            df: PandasDataFrame,
            excludes: List,
            file_name: str,
            thresh_variance: float = 0.05,
            to_search: bool = True
    ) -> [PandasDataFrame, CollectionsOrderedDict]:
        """Find and optionally remove the selected near-zero-variance features (Scikit algorithm).
        Feature selector that removes all low-variance features.
        This feature selection algorithm looks only at the features (X), not the desired outputs (y), and can thus be
        used for unsupervised learning.
        :param df: the features dataframe.
        :param excludes: the name of excluded features.
        :param file_name: the name of the summary output file.
        :param thresh_variance: Features with a training-set variance lower than this threshold will be removed.
        The default is to keep all features with non-zero variance, i.e. remove the features that have the same
        value in all samples.
        :param to_search: to search or use the saved configuration.
        :return: the inputted dataframe with exclusion of features that were selected to be removed.
        """
        self.__logger.debug(
            "Remove features with near-zero-variance (if applicable), using Scikit algorithm."
        )
        df_excludes = df[excludes]
        excludes = set(excludes)
        matches = []
        indices = OrderedDict()
        summaries = OrderedDict()

        # find indices
        for label in df.columns.values:
            indices[df.columns.get_loc(label)] = label

        # search
        if to_search is True:
            variances_ = feature_selection.VarianceThreshold(thresh_variance)
            variances_.fit(df)
            matches_indices = np.where(~variances_.get_support())[0]
            matches_labels = [indices[index] for index in matches_indices]
            for match in matches_labels:
                if match not in excludes:
                    matches += [match]

        # delete
        df = self.__remove(
            df, {'NZV': list(matches)}, to_search,
            os.path.join(self.__output_path, file_name + ".ini"))
        for name in excludes:
            df[name] = df_excludes[name]
        if any(np.isnan(df.index)):
            df = df.reset_index(drop=True)

        # summaries
        if to_search is True:
            summaries["Features Matches"] = matches
        return df, summaries
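
The core scikit-learn step the method above wraps is roughly this sketch (the toy DataFrame and the 0.05 threshold are assumptions):

import pandas as pd
from sklearn import feature_selection

df = pd.DataFrame({"a": [1, 1, 1, 1], "b": [0.0, 0.3, 0.9, 0.4]})
sel = feature_selection.VarianceThreshold(threshold=0.05)
sel.fit(df)
print(list(df.columns[~sel.get_support()]))  # ['a']: near-zero-variance column flagged for removal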
def remove_low_variance(data):
    #https://scikit-learn.org/stable/modules/feature_selection.html
    #X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
    # remove near-constant boolean features: the threshold is the Bernoulli variance p*(1-p)
    # with p = 0.05, i.e. features taking the same value in roughly 95% of samples or more are trimmed
    sel = feature_selection.VarianceThreshold(threshold=(1 - 0.05) * 0.05)
    new_dataset = sel.fit_transform(data)
    selected_feature_indicies = sel.get_support(indices=True)
    print(len(selected_feature_indicies))
    print(selected_feature_indicies)
    np.save("list of chosen features_05", selected_feature_indicies)
    np.save("low_variance_dataset_alpha05_new", new_dataset)
    return new_dataset
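
The threshold above equals the Bernoulli variance p*(1-p) with p = 0.05; a quick toy check of what it removes (the array is an assumption):

import numpy as np
from sklearn import feature_selection

X = np.array([[0, 0], [0, 1], [0, 0], [0, 1]])  # first column constant, second balanced
sel = feature_selection.VarianceThreshold(threshold=(1 - 0.05) * 0.05)
print(sel.fit_transform(X))  # only the second column (variance 0.25) survives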
Example #10
def lars():
    behavior_data, conn_data = pu.load_data_full_subjects()
    conn_data = conn_data.astype(float)

    categorical_variables = ['smoking', 'deanxit_antidepressants', 'rivotril_antianxiety', 'sex']
    categorical_data = behavior_data[categorical_variables]
    dummy_coded_categorical = pu.dummy_code_binary(categorical_data)
    covariate_data = pd.concat([behavior_data['age'], dummy_coded_categorical], axis=1)

    ml_data = pd.concat([conn_data, covariate_data], axis=1)
    target = behavior_data['distress_TQ'].values.astype(float)

    feature_names = list(ml_data)
    continuous_features = [f for f in feature_names if 'categorical' not in f]
    continuous_indices = [ml_data.columns.get_loc(cont) for cont in continuous_features]

    categorical_features = [f for f in feature_names if 'categorical' in f]
    categorical_indices = [ml_data.columns.get_loc(cat) for cat in categorical_features]

    ml_continuous = ml_data.values[:, continuous_indices]
    ml_categorical = ml_data.values[:, categorical_indices]

    # Standardization for continuous data
    preproc = preprocessing.StandardScaler().fit(ml_continuous)
    ml_z = preproc.transform(ml_continuous)

    # Variance threshold for categorical data
    varthresh = feature_selection.VarianceThreshold(threshold=0).fit(ml_categorical)
    ml_v = varthresh.transform(ml_categorical)

    ml_preprocessed = np.hstack((ml_z, ml_v))

    # Feature selection with extra trees
    clf = ensemble.ExtraTreesRegressor()
    model = feature_selection.SelectFromModel(clf, threshold="2*mean")
    # Transform train and test data with feature selection model
    ml_cleaned = model.fit_transform(ml_preprocessed, target)
    feature_indices = model.get_support(indices=True)
    cleaned_features = [feature_names[i] for i in feature_indices]

    lars_classifier = linear_model.LarsCV(cv=3, normalize=False, fit_intercept=False)

    lars_classifier.fit(ml_cleaned, target)
    predicted = lars_classifier.predict(ml_cleaned)

    r2 = lars_classifier.score(ml_cleaned, target)

    exp_var = metrics.explained_variance_score(target, predicted)
    max_err = metrics.max_error(target, predicted)
    mae = metrics.mean_absolute_error(target, predicted)
    mse = metrics.mean_squared_error(target, predicted)
    print(r2, exp_var, max_err, mae, mse)
Example #11
def variance_threshold(data, alpha=0.05):
    """A wrapper of scikit-learn VarianceThreshold."""

    X_train, X_test, y_train, y_test = data

    # Z-scores.
    X_train_std, X_test_std = utils.train_test_z_scores(X_train, X_test)

    selector = feature_selection.VarianceThreshold(threshold=alpha)
    # NB: Cannot filter variance from standardized data.
    selector.fit(X_train, y_train)
    support = _check_support(selector.get_support(indices=True), X_train_std)

    return _check_feature_subset(X_train_std, X_test_std, support)
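
A quick illustration of the note above (the toy array is an assumption): after StandardScaler every non-constant column has unit variance, so the variance filter has to be fitted on the raw values.

import numpy as np
from sklearn import feature_selection, preprocessing

X = np.array([[1.0, 100.0], [1.1, 200.0], [0.9, 300.0]])
X_std = preprocessing.StandardScaler().fit_transform(X)
print(np.var(X_std, axis=0))  # ~[1. 1.]: the scale information is gone
print(feature_selection.VarianceThreshold(0.05).fit(X).get_support())  # [False  True] on raw variances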
Example #12
def key_features(X_train, y_train, sub, variance_test=True):
    print('Features before reduction: ' + str(len(X_train[0])))
    if variance_test:
        # remove features with low variance
        sel = feature_selection.VarianceThreshold(threshold=(.8 * (1 - .8)))
        X_train = sel.fit_transform(X_train)
        sub = sel.transform(sub)
        print('Features after variance reduction: ' + str(len(X_train[0])))

    estimator = linear_model.SGDClassifier(n_jobs=-1, class_weight='balanced')
    selector = feature_selection.RFECV(estimator, step=1, cv=5)
    features = selector.fit_transform(X_train, y_train)
    submission = selector.transform(sub)

    print('Features after recursive elimination: ' + str(len(features[0])))

    return (features, submission)
Example #13
def clean_features(data, header, **kwargs):

    #extract parameters
    min_feature_variance = kwargs.get('min_feature_variance', .8 * (1 - .8))

    #remove features with variance below the threshold
    feature_selector = feature_selection.VarianceThreshold(
        threshold=min_feature_variance)
    reduced_data = feature_selector.fit_transform(data)

    #create a mask of features selected
    mask = feature_selector.get_support(indices=True)

    #select the same indexes from the header
    reduced_header = np.take(header, mask)

    return reduced_data, reduced_header
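
A minimal usage sketch of clean_features above (the toy array and header are assumptions; numpy and feature_selection are expected to be imported at module level):

import numpy as np

data = np.array([[0.0, 1.2], [0.0, 3.4], [0.0, 2.2]])
header = np.array(["constant", "useful"])
reduced_data, reduced_header = clean_features(data, header)
print(reduced_header)  # ['useful']: the zero-variance column is dropped with the default threshold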
Example #14
def learning(X, y, estimator, search_params):

    scoring = dict(roc_auc='roc_auc',
                   pr_auc=metrics.make_scorer(pr_auc_score),
                   accuracy='accuracy',
                   balanced_accuracy='balanced_accuracy',
                   precision='precision',
                   recall='recall',
                   f1='f1',
                   mcc=metrics.make_scorer(metrics.matthews_corrcoef))

    pipe = Pipeline([
        ('var', feature_selection.VarianceThreshold(threshold=0)),  # remove constant features
        ('bal', None),  # balance the training data (see search_params['bal'] below)
        ('pre', None),  # scale (and center) the data (see search_params['pre'] below)
        ('clf', estimator)
    ])

    groups = X.group.tolist()
    X = X.drop(['group'], axis=1)

    search_params['bal'] = [
        None,
        RandomUnderSampler(sampling_strategy='majority', random_state=42),
        RandomOverSampler(sampling_strategy='minority', random_state=42)
    ]
    search_params['pre'] = [
        None,
        preprocessing.MinMaxScaler(),
        preprocessing.StandardScaler()
    ]

    search = model_selection.RandomizedSearchCV(pipe,
                                                search_params,
                                                cv=release_split(X, y, groups),
                                                scoring=scoring,
                                                refit='pr_auc',
                                                verbose=0)
    search.fit(X, y)

    return search.cv_results_, search.best_index_
Example #15
    def fit_transform(self, data):
        """Fit and transform using feature filtering.

        Fit and transform using several kind of feature filtering methods to
        select features in data.

        :param data: Dataframe. The Pandas dataframe, to be converted.

        :return: Dataframe. The converted dataframe after feature filtering.
        """
        # Removing features with low variance.
        threshold = 0.0
        var_thre = fe.VarianceThreshold(threshold=threshold)
        result = var_thre.fit_transform(
            data[data.columns.difference([self.target_column])])
        feature_select = data.columns.difference(
            [self.target_column])[var_thre.get_support()]
        result = pd.DataFrame(columns=feature_select, data=result)
        result[self.target_column] = data[self.target_column]
        # Store converter.
        self.variance_threshold = var_thre

        # Univariate feature selection, using univariate statistical tests.
        data = result
        univar_select = fe.GenericUnivariateSelect(
            score_func=fe.mutual_info_classif, mode='fwe', param=0.05)

        # Check whether it's regression or classification: if the target has
        # two or fewer distinct values (classification), skip the univariate step.
        if len(data[self.target_column].value_counts()) <= 2:
            return result
        # If Regression.
        result = univar_select.fit_transform(
            data[data.columns.difference([self.target_column])],
            np.asarray(data[self.target_column]))
        feature_select = data.columns.difference(
            [self.target_column])[univar_select.get_support()]
        result = pd.DataFrame(columns=feature_select, data=result)
        result[self.target_column] = data[self.target_column]
        # Store converter.
        self.univar_select = univar_select

        return result
Example #16
def get_low_var_cols(df, threshold):
    """Analyse a Pandas DataFrame, extract the numeric columns and
    return a list of those which have a variance below the threshold.

    Args:
        :param df: Pandas DataFrame.
        :param threshold: Variance threshold.

    Returns:
        List of columns with variance below the threshold.

    Example:
        low_var_cols = get_low_var_cols(df, 0.01)

    """
    df = df.select_dtypes(['number'])
    selector = feature_selection.VarianceThreshold(threshold=threshold)
    selector.fit(df)
    return df.columns[~selector.get_support()]
Example #17
def fitness(data, metric='euclidean', k=10, seed=None):
    if metric == 'variance':
        sel = fs.VarianceThreshold()
        sel.fit(data)
        return np.average(np.array(sel.variances_))

    if seed is None or seed < 0:
        random_seed = None
    else:
        random_seed = seed

    km = KMeans(n_clusters=k, random_state=random_seed, n_jobs=-1)
    labels = km.fit_predict(data)
    if metric == 'euclidean':
        return silhouette_score(data, labels)
    elif metric == 'cosine':
        return silhouette_score(data, labels, metric='cosine')
    else:
        return km.inertia_
Example #18
def get_features(X, y, fsm):
    if fsm == '1':
        # SelectKBest
        kBest = math.ceil(len(X[0]) / 2)
        feature_scores = f_selection.SelectKBest(chi2,
                                                 k=kBest).fit_transform(X, y)
        return feature_scores
    elif fsm == '2':
        # VarianceThreshold
        feature_scores = f_selection.VarianceThreshold(
            threshold=vThreshold).fit_transform(X)
        return feature_scores
    elif fsm == '3':
        # SelectFromModel
        clf = ExtraTreesClassifier(random_state=200)
        clf = clf.fit(X, y)
        model = f_selection.SelectFromModel(clf).fit(X, y)
        feature_scores = model.transform(X)
        return feature_scores
    else:
        raise ValueError("invalid fsm")
Example #19
    def get_model(self, resume=False):
        if not resume:
            if self.method == 'variance':  # Unsupervised
                p = .5
                selector = feature_selection.VarianceThreshold(
                    threshold=(p * (1 - p)))
            elif self.method == 'rfe':
                estimator = LogisticRegression()
                selector = feature_selection.RFE(
                    estimator,
                    n_features_to_select=self.feat_limit,
                    step=1,
                    verbose=0)
            elif self.method == 'forward':
                estimator = ExtraTreesClassifier(n_estimators=100)
                selector = SelectFromModel(estimator)
            elif self.method == 'seq_bwd':
                estimator = LogisticRegression(solver='lbfgs')
                selector = SFS(estimator,
                               k_features=self.feat_limit,
                               forward=False,
                               floating=False,
                               scoring='roc_auc',
                               cv=4,
                               n_jobs=-1)
            elif self.method == 'seq_fwd':
                estimator = LogisticRegression(solver='lbfgs')
                selector = SFS(estimator,
                               k_features=self.feat_limit,
                               forward=True,
                               floating=False,
                               scoring='roc_auc',
                               cv=4,
                               n_jobs=-1)
        else:
            selector = joblib.load(self.model_save_path)

        if self.verbose > 2:
            print(selector)
        return selector
Example #20
    def train(self, features):

        # Setup:
        start_time = time.time()

        # Check feature set:
        assert (np.isfinite(features).all())

        # Normalizer:
        if self.normalize:
            standardizer = preprocessing.StandardScaler()
            features = standardizer.fit_transform(features)
            self.normalizer = standardizer

        # Option 1 (Random Projection):
        if self.reducer_type == Reducers.random_projection:
            transformer = random_projection.GaussianRandomProjection()
            transformer.fit(features)
            self.reducer = transformer

        # Option 2 (Feature Selection):
        if self.reducer_type == Reducers.feature_selection:
            threshold = self.explained_variance * (1 - self.explained_variance)
            selector = feature_selection.VarianceThreshold(threshold=threshold)
            selector.fit(features)
            self.reducer = selector

        # Option 3 (PCA):
        if self.reducer_type == Reducers.pca:
            pca = decomposition.PCA(n_components=self.explained_variance,
                                    svd_solver="full")
            pca.fit(features)
            self.reducer = pca

        # Calculate elapsed time:
        end_time = time.time()
        elapsed_time = end_time - start_time
        print("Training preprocessor took %.2f seconds" % elapsed_time)
Example #21
    def pre_process(X, X_pred, mode='full'):
             
        preprocessings = {
                'standards':preprocessing.StandardScaler(),
                'minmaxs':preprocessing.MinMaxScaler(),
                'robusts':preprocessing.RobustScaler(),
                'PCA': PCA(),
                'PowerTransformer':preprocessing.Normalizer(), 
                'variance_threshold':feature_selection.VarianceThreshold(threshold=0.5),        
                      }
        
        if mode == 'full':
            pipe_preprocessing = make_pipeline(
            #                        preprocessings['variance_threshold'],
                                     preprocessings['standards'],
                                     preprocessings['robusts'],
                                     preprocessings['PCA'],
                                     preprocessings['PowerTransformer'],
                                     preprocessings['minmaxs'],
                                    )
        elif mode == 'min_max_scale':
            pipe_preprocessing = make_pipeline(
                    preprocessings['minmaxs'],
                    preprocessings['PCA'],
                    )
            
        full = np.concatenate((X, X_pred), axis=0)
        pipe_preprocessing.fit(full)
        X = pipe_preprocessing.transform(X)
        X_pred = pipe_preprocessing.transform(X_pred)
        
        return X, X_pred
def feature_selection_with_covariates(x_train, x_test, y_train,
                                      continuous_indices, categorical_indices,
                                      feature_names):
    # Split data for continuous, categorical preprocessing
    x_train_cont, x_test_cont = x_train[:, continuous_indices], x_test[:, continuous_indices]
    x_train_cat, x_test_cat = x_train[:, categorical_indices], x_test[:, categorical_indices]

    # Standardization for continuous data
    preproc = preprocessing.StandardScaler().fit(x_train_cont)
    x_train_z = preproc.transform(x_train_cont)
    x_test_z = preproc.transform(x_test_cont)

    # Variance threshold for categorical data
    varthresh = feature_selection.VarianceThreshold(
        threshold=0).fit(x_train_cat)
    x_train_v = varthresh.transform(x_train_cat)
    x_test_v = varthresh.transform(x_test_cat)

    x_train_data = np.hstack((x_train_z, x_train_v))
    x_test_data = np.hstack((x_test_z, x_test_v))

    # Feature selection with extra trees
    extra_tree_fs = ensemble.ExtraTreesClassifier(random_state=seed)
    feature_model = feature_selection.SelectFromModel(extra_tree_fs,
                                                      threshold="2*mean")

    # Transform train and test data with feature selection model
    x_train_feature_selected = feature_model.fit_transform(
        x_train_data, y_train)
    x_test_feature_selected = feature_model.transform(x_test_data)
    feature_indices = feature_model.get_support(indices=True)
    cleaned_features = [feature_names[i] for i in feature_indices]

    return x_train_feature_selected, x_test_feature_selected, cleaned_features
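
A hypothetical call with synthetic data (the arrays, the feature names, and the module-level seed are illustrative assumptions):

import numpy as np
seed = 0  # assumed module-level random seed used by ExtraTreesClassifier above

x = np.hstack([np.random.rand(60, 3), np.random.randint(0, 2, size=(60, 2)).astype(float)])
y = (x[:, 0] > 0.5).astype(int)  # make the first continuous feature clearly informative
feature_names = ["c0", "c1", "c2", "cat0_categorical", "cat1_categorical"]
x_tr_fs, x_te_fs, kept = feature_selection_with_covariates(
    x[:50], x[50:], y[:50], [0, 1, 2], [3, 4], feature_names)
print(kept)  # typically ['c0'], the informative feature retained by SelectFromModel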
Example #23
def clean_features(data, name):
    df = data[name]
    X_columns = df.columns

    # remove stellar classes
    flt = ~(("nbg" == X_columns) | X_columns.str.endswith("_id")
            | X_columns.str.contains("_scls_")
            | X_columns.str.endswith("AndersonDarling")
            | X_columns.str.endswith("StetsonJ")
            | X_columns.str.endswith("StetsonK"))

    X_columns = X_columns[flt]

    # remove signatures
    X_columns = X_columns[~X_columns.str.startswith("Signature_")]

    # columns with nan and null
    with_nulls = set()
    for df in data.values():
        for c in X_columns:
            if df[c].isnull().any():
                with_nulls.add(c)
    print("Removing {} because null".format(list(with_nulls)))
    X_columns = X_columns[~X_columns.isin(with_nulls)]

    # low variance
    df = pd.concat(data.values())
    y = df["nbg"].values

    vt = fs.VarianceThreshold()
    vt.fit(df[X_columns].values, y)
    print("Removing {} because lowvariance".format(
        list(X_columns[~vt.get_support()])))
    X_columns = X_columns[vt.get_support()]

    return X_columns
Example #24
    def fitness(self):
        data_projected = self.data[self.attributes]

        if self.metric == 'variance':
            sel = fs.VarianceThreshold()
            sel.fit(data_projected)
            return np.average(np.array(
                sel.variances_)) - np.log(1 + self.violations)

        km = KMeans(n_clusters=self.k,
                    random_state=self.random_seed,
                    n_jobs=-1)
        labels = km.fit_predict(data_projected)
        if self.metric == 'euclidean':
            return silhouette_score(data_projected,
                                    labels) - np.log(1 + self.violations)
        elif self.metric == 'cosine':
            return silhouette_score(
                data_projected, labels,
                metric=self.metric) - np.log(1 + self.violations)
        else:
            inertia = km.inertia_
            order = self.getOrder(inertia)
            return inertia + self.violations * order * 10**2
Example #25
import scipy.stats as ss
import math
import numpy as np
import numpy.random as nr
import pandas as pd
from sklearn import feature_selection as fs
from sklearn import model_selection as ms

%matplotlib inline

Features = np.array(pd.read_csv('Credit_Features.csv'))
Labels = np.array(pd.read_csv('Credit_Labels.csv'))
print(Features.shape)
print(Labels.shape)

## -->> Eliminate low variance features
# VarianceThreshold function 
print(Features.shape)

## Define the variance threshold and fit the threshold to the feature array.
sel = fs.VarianceThreshold(threshold=(.8 * (1 - .8)))
Features_reduced = sel.fit_transform(Features)

## Print the support and shape for the transformed features
print(sel.get_support())
print(Features_reduced.shape)

## -->>Select k best features
# RFECV function
## Reshape the Label array
Labels = Labels.reshape(Labels.shape[0],)

## Set folds for nested cross validation
nr.seed(988)
feature_folds = ms.KFold(n_splits=10, shuffle = True)
Example #26
def get_zero_variance_filter(X_train):
    tmp = feature_selection.VarianceThreshold()
    return tmp.fit(X_train)
Example #27
# Feature transformation
for i in range(len(names_without_sex)):
    data[names_without_sex[i]+'_log1p']=data[names_without_sex[i]].map(np.log1p)
    data[names_without_sex[i]+'_sqrt']=data[names_without_sex[i]].map(np.sqrt)
    for j in range(i+1, len(names_without_sex)):
        data[names_without_sex[i]+'*'+names_without_sex[j]]=data[names_without_sex[i]]*data[names_without_sex[j]]
del i,j
# Feature selection
names=data.columns.drop(['id', '血糖'])
    # Split into training and test sets
train_xs=data.loc[data['血糖'] != 'unknown', names]
train_ys=data.loc[data['血糖'] != 'unknown', '血糖']
test_x  =data.loc[data['血糖'] == 'unknown', names]
test_y  =data.loc[data['血糖'] == 'unknown', ['id', '血糖']]
    # Feature selection: variance threshold
VarianceThreshold=feature_selection.VarianceThreshold(threshold=0.1).fit(train_xs)
train_xs=VarianceThreshold.transform(train_xs)
test_x  =VarianceThreshold.transform(test_x)
    # Feature selection: SelectPercentile
SelectPercentile=feature_selection.SelectPercentile(feature_selection.f_regression, percentile=50).fit(
                                        train_xs, train_ys.map(np.float64))
train_xs=SelectPercentile.transform(train_xs)
test_x  =SelectPercentile.transform(test_x)
# Dimensionality reduction
PCA=decomposition.PCA(n_components=50).fit(train_xs)
train_xs=PCA.transform(train_xs)
test_x  =PCA.transform(test_x)
# Standardization
scaler=preprocessing.StandardScaler().fit(train_xs)
train_xs=scaler.transform(train_xs)
test_x  =scaler.transform(test_x)
Example #28
    def get_variances(self):
        sel = fs.VarianceThreshold()
        sel.fit(self.data)
        return zip(self.attributes, sel.variances_)
Example #29
dir = 'E:/10.kaggle(dont-overfit2)'
train = pd.read_csv(os.path.join(dir, 'train.csv'))
print(train.info())
print(train.columns)

sns.countplot(x='target',data=train)

#filter unique value features
train1 = train.iloc[:,2:] 
y = train['target'].astype(int)

X_train, X_eval, y_train, y_eval = model_selection.train_test_split(train1, y, test_size=0.1, random_state=1)

stages = [  ('imputer', preprocessing.Imputer()),
            ('zv_filter', feature_selection.VarianceThreshold()),
            ('feature_selector', feature_selection.SelectKBest(score_func=feature_selection.f_classif)),
            ('classifier', linear_model.LogisticRegression())
        ]
pipeline_ml = pipeline.Pipeline(stages)
pipeline_grid = {'feature_selector__k': [70, 75, 100],
                 'classifier__C': [0.001, 0.01, 0.1, 0.2, 0.5],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__class_weight': ['balanced', None]}
pipeline_generated = cutils.grid_search_best_model(pipeline_ml, pipeline_grid, X_train, y_train, scoring="roc_auc")
final_estimator = pipeline_generated.named_steps['classifier']
print(pipeline_generated.score(X_eval, y_eval))

test = pd.read_csv(os.path.join(dir, 'test.csv'))
print(test.info())
print(test.columns)

test1 = test.iloc[:,1:] 
test['target'] = np.round(pipeline_generated.predict_proba(test1)[:,1], 2)
Example #30
print('Loading Trainset')
DATA_FOLDER = '../../../data/'
RES_FOLDER = '../../../results/base_features/'

train_x, gene_names_x, cell_names_x = dl.load_data(DATA_FOLDER +
                                                   'train_data.csv.gz')

# TODO: FIX DATA_LOAD FUNCTION SUCH THAT THIS ISN'T NECESSARY
train_x = train_x[:, 1:]
gene_names_x = gene_names_x[1:]
cell_names_x = cell_names_x[1:]

train_y, cell_names_y = dl.load_response(DATA_FOLDER + 'response.csv.gz')

# Preprocessing
varThresh = fsel.VarianceThreshold(threshold=0.1).fit(train_x)
scaled_x = prep.scale(varThresh.transform(train_x))

print('Loading Herring 2017')
herring_x, gene_names_herring, cell_names_herring = dl.load_data(
    DATA_FOLDER + 'herring2017_data.csv.gz')
# TODO: FIX DATA_LOAD FUNCTION SUCH THAT THIS ISN'T NECESSARY
herring_x = herring_x[:, 1:]
gene_names_herring = gene_names_herring[1:]
herring_scaled = prep.scale(varThresh.transform(herring_x))

# Load Joost 2016 Data
print('Loading Joost 2016')
joost_x, gene_names_joost, cell_names_joost = dl.load_data(
    DATA_FOLDER + 'joost2016_data.csv.gz')
# TODO: FIX DATA_LOAD FUNCTION SUCH THAT THIS ISN'T NECESSARY