Example #1
import lightgbm
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.feature_selection import SelectKBest, mutual_info_regression


class CustomRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, **estimator_params):
        super().__init__()
        self.selector = SelectKBest(score_func=mutual_info_regression, k="all")
        self.base_model = lightgbm.LGBMRegressor(random_state=24,
                                                 objective="regression_l1")
        self.set_params(**estimator_params)

    def fit(self, X, y=None):
        X_tr = self.selector.fit_transform(X, y)
        self.base_model.fit(X_tr, y)
        return self

    def predict(self, X):
        X_tr = self.selector.transform(X)
        y = self.base_model.predict(X_tr)
        # Domain-specific overrides driven by flag columns of the input frame.
        y[X.false_positive == 1] = 1
        y[X.false_negative == 1] = 1
        y[(X.area_model + X.area_expert) == 0] = 5
        y = np.round(y, 0)
        return y

    def get_params(self, deep=True):
        # Merge the LGBM parameters with the selector's k so that clone()
        # and grid search can round-trip them through set_params().
        pars = self.base_model.get_params(deep=deep)
        pars["k"] = self.selector.k
        return pars

    def set_params(self, **params):
        if "k" in params:
            k = params.pop("k")
            self.selector = self.selector.set_params(k=k)
        self.base_model = self.base_model.set_params(**params)
        return self
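
Because get_params/set_params expose both the selector's k and the LGBM parameters, the class can be dropped straight into a grid search. A minimal, hypothetical usage sketch (the toy frame, including the flag columns that predict() expects, is an assumption, not part of the original project):

import pandas as pd
from sklearn.model_selection import GridSearchCV

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 5)), columns=[f"f{i}" for i in range(5)])
X["false_positive"] = 0   # flag columns consumed by predict()
X["false_negative"] = 0
X["area_model"] = 1.0
X["area_expert"] = 1.0
y = pd.Series(rng.normal(size=200))

search = GridSearchCV(CustomRegressor(),
                      param_grid={"k": [2, 4], "n_estimators": [50, 100]},
                      cv=3)
search.fit(X, y)
print(search.best_params_)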
Example #2
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest


class FeatureSel(BaseEstimator, TransformerMixin):
    def __init__(self, k_best=5, pca_comp=8):
        self.k_best = k_best
        self.pca_comp = pca_comp
        if pca_comp > 0:
            self.pca = PCA(n_components=self.pca_comp)
        if k_best > 0:
            self.skb = SelectKBest(k=self.k_best)

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        # Keep the wrapped transformers in sync with the updated values.
        if self.pca_comp > 0:
            self.pca.set_params(n_components=self.pca_comp)
        if self.k_best > 0:
            self.skb.set_params(k=self.k_best)
        return self

    def transform(self, X):
        X1 = self.pca.transform(X)
        X2 = self.skb.transform(X)

        return np.hstack((X1, X2))

    def fit_transform(self, X, y):

        X1 = self.pca.fit_transform(X, y)
        X2 = self.skb.fit_transform(X, y)

        return np.hstack((X1, X2))

    def fit(self, X, y):
        if self.pca_comp > 0:
            self.pca.fit(X, y)
        if self.k_best > 0:
            self.skb.fit(X, y)
        return self
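
A minimal, hypothetical sketch of the transformer above inside a Pipeline, with set_params driven through sklearn's double-underscore routing (the synthetic data is an assumption):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

X, y = make_classification(n_samples=200, n_features=12, random_state=0)
pipe = Pipeline([('sel', FeatureSel(k_best=3, pca_comp=2)),
                 ('clf', LogisticRegression(max_iter=1000))])
pipe.set_params(sel__k_best=4)   # routed to FeatureSel.set_params
pipe.fit(X, y)
print(pipe.score(X, y))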
Example #3
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import StratifiedKFold


def runXVal(X1, Y1, n_ds=None, CLF=None, cv_nfolds=10, DS_top=pd.DataFrame()):
    Perf = []
    FS = SelectKBest(f_classif)  # Feature selection
    SKF = StratifiedKFold(n_splits=cv_nfolds, shuffle=True)
    if DS_top.shape[0] == 0:
        DS_top = pd.DataFrame(np.zeros(X1.shape[1]), index=X1.columns)

    k_fold = 0

    for I1, I2 in SKF.split(X1, Y1):
        k_fold += 1
        I_train, I_test = X1.index[I1], X1.index[I2]
        FS.set_params(k=n_ds)
        FS.fit_transform(X1.loc[I_train], y=Y1.loc[I_train])

        # Count how often each descriptor gets selected across folds.
        DS = X1.columns[FS.get_support()]
        DS_top.loc[DS] += 1

        X_train, Y_train = X1.loc[I_train, DS], Y1.loc[I_train]
        X_test, Y_test = X1.loc[I_test, DS], Y1.loc[I_test]
        CLF = getClassifiers()  # (re)build fresh classifiers each fold

        for Nm, Clf in CLF.items():
            Clf.fit(X_train, Y_train)
            try:
                Y_pred = Clf.predict(X_test)
            except Exception:
                print(" >> Failed", Nm)
            else:
                P = dict(lr=Nm,
                         pt='cvt',
                         n_train=len(I_train),
                         n_test=len(I_test),
                         n_ds=n_ds,
                         cv_kfold=cv_nfolds)
                P.update(evalPred(Y_test, Y_pred))
                Perf.append(P)

    return Perf
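
runXVal relies on two project helpers that are not shown here. Purely as hypothetical stand-ins consistent with how they are called above (a classifier registry and a metric bundle):

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score


def getClassifiers():
    # Hypothetical stand-in: the project's real registry is not shown.
    return {"rf": RandomForestClassifier(n_estimators=100),
            "logreg": LogisticRegression(max_iter=1000)}


def evalPred(y_true, y_pred):
    # Hypothetical stand-in: any dict of metrics works with P.update() above.
    return {"acc": accuracy_score(y_true, y_pred),
            "f1": f1_score(y_true, y_pred)}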
Example #4
File: cw1.py Project: arlyon/dmml
def feature_sweep(features,
                  boolean_labels,
                  labels,
                  seed,
                  save_plot,
                  show_plot,
                  n_features=20):
    """
    Performs a sweep of top 'n_features' bayesian features per label, top 'n_features' bayesian features overall
    and the top 'n_features' selected by the k-best selector using the f_classifier function.
    """
    model = KMeans(n_clusters=10)

    # Finds the best performing features per label from the naive bayesian net
    print("Performing sweep of top bayesian features per label...")
    bayes_classifiers = fit_labels(features, boolean_labels)
    bayes_per_label_analysis = []
    with click.progressbar(range(n_features // 10)) as feature_range:
        for n in feature_range:
            top_n_features = set(
                itertools.chain.from_iterable(
                    x.top_features[:n + 1]
                    for x in bayes_classifiers.values()))
            numpy.random.set_state(seed)
            predictions = model.fit_predict(
                features[[str(x) for x in top_n_features]])
            bayes_per_label_analysis.append(
                (len(top_n_features), score_clustering(labels, predictions)))

    # Finds the best performing features overall from the naive bayesian net
    print("Performing sweep of top bayesian features overall...")
    bayes_classifier = bayesian_classification(features,
                                               labels,
                                               n_correlated=n_features + 1)
    bayes_overall_analysis = []
    with click.progressbar(range(1, n_features + 1)) as feature_range:
        for n in feature_range:
            top_n_features = bayes_classifier.top_features[:n]
            numpy.random.set_state(seed)
            predictions = model.fit_predict(
                features[[str(x) for x in top_n_features]])
            bayes_overall_analysis.append(
                (n, score_clustering(labels, predictions)))

    # Finds the best performing features from the K-Best algorithm using the f_classif scoring function
    print("Performing sweep of K-best features...")
    k_best_analysis = []
    selector = SelectKBest()
    selector.fit(features, column_or_1d(labels))
    with click.progressbar(range(1, n_features + 1)) as feature_range:
        for n in feature_range:
            selector.set_params(k=n)
            selected_features = selector.transform(features)
            numpy.random.set_state(seed)
            predictions = model.fit_predict(selected_features)
            k_best_analysis.append((n, score_clustering(labels, predictions)))

    # Plots the data gathered from the sweeps above
    if show_plot or save_plot:
        handles = []
        plot_info = [(bayes_per_label_analysis, 'r', "Bayes per Label"),
                     (bayes_overall_analysis, 'g', "Bayes Overall"),
                     (k_best_analysis, 'b', "K Best")]
        for data, colour, name in plot_info:
            data = list(zip(*[(x, *y.values()) for x, y in data]))
            handles += plt.plot(data[0],
                                data[3],
                                '-' + colour,
                                label=name + " V Score")
            handles += plt.plot(data[0],
                                data[4],
                                '--' + colour,
                                label=name + " Rand")
        plt.legend(handles, loc="lower left")
        plt.xlabel("Number of Features")
        plt.title("Comparison of Feature Selection Algorithms")
        if save_plot is not None:
            path = os.path.join(save_plot, "feature_sweep.png")
            plt.savefig(path)
            print("")
            print("saved figure to " + path)
        if show_plot:
            plt.show()
        plt.clf()

    # Plots a heatmap of the results of the f_classifier scoring function
    plt.imshow(selector.scores_.reshape(48, -1),
               cmap='hot',
               interpolation='lanczos')
    plt.title("K-Best Feature Heatmap")
    if save_plot is not None:
        path = os.path.join(save_plot, "k_best_heatmap.png")
        plt.savefig(path)
        print("")
        print("saved figure to " + path)
    if show_plot:
        plt.show()
    plt.clf()

    # Returns the best performing number of features for the K-best selector
    score = [
        v_score + rand for _, _, v_score, rand in
        [scores.values() for (_, scores) in k_best_analysis]
    ]
    score = numpy.argmax(score) + 1  # k_best_analysis starts at n=1 features
    print(f"Best performance out of {n_features} features: {score}")
    return score
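
feature_sweep unpacks four values from each score_clustering result (data[3] and data[4] are plotted as the V score and Rand index). The helper itself is not shown; a hypothetical stand-in consistent with that usage:

from sklearn.metrics import adjusted_rand_score, homogeneity_completeness_v_measure
from sklearn.utils import column_or_1d


def score_clustering(labels, predictions):
    # Hypothetical stand-in: return the four metrics in the order the
    # plotting code above unpacks them (homogeneity, completeness,
    # v_score, rand).
    y = column_or_1d(labels)
    h, c, v = homogeneity_completeness_v_measure(y, predictions)
    return {"homogeneity": h, "completeness": c,
            "v_score": v, "rand": adjusted_rand_score(y, predictions)}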
Example #5
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif


def runML1(Data_cv,
           y,
           dt_out=None,
           dt_in=None,
           n_ds=10,
           cv_nfolds=10,
           cv_iters=5,
           CLF=None,
           DS_top=None,
           save=False,
           Col_ds=None,
           Col_lr=None):
    """
    Data_cv: Data for cross validation
    y      : Column of Data_cv with classification attribute
    n_ds   : number of top descriptors to use
    """

    # Data For CV
    X1, Y1 = Data_cv.drop(y, axis=1), Data_cv[y]
    ID_cv = X1.index
    cls_ds = y
    Neg = Data_cv.index[Data_cv[cls_ds] == 0]
    Pos = Data_cv.index[Data_cv[cls_ds] == 1]
    n_n = len(Neg)
    n_p = len(Pos)

    if DS_top is None:
        DS_top = pd.DataFrame(np.zeros(X1.shape[1]),
                              index=X1.columns,
                              columns=['N'])

    # The dataset
    R_ds = dict(pred=y,
                dt_in=dt_in,
                dt_out=dt_out,
                n_ds=n_ds,
                n_obs=X1.shape[0],
                n_neg=n_n,
                n_pos=n_p,
                cvt=dict(pos=list(Pos), neg=list(Neg)))

    if Col_ds:
        ds_id = Col_ds.save(R_ds)

    Perf = []

    # Iterate through cross-validation
    for k_cv in range(cv_iters):
        P = runXVal(X1, Y1, n_ds=n_ds, cv_nfolds=cv_nfolds, DS_top=DS_top)
        Perf += P

    P1 = summarizePerf(Perf)
    P1['n_iter'] = cv_iters
    P1a = {i['lr']: i for i in P1.to_dict('records')}

    # Store the full classifiers
    CLF = getClassifiers()
    FS = SelectKBest(f_classif)
    FS.set_params(k=n_ds)
    FS.fit_transform(X1, y=Y1)
    DS = X1.columns[FS.get_support()]
    # Prefer the descriptors that were selected most often across the CV folds.
    DS_top.sort_values('N', ascending=False, inplace=True)
    DS = DS_top.iloc[:n_ds].index

    LR_db = {}
    for Nm, Clf in CLF.items():
        Clf.fit(X1[DS], Y1)
        try:
            Y = Clf.predict(X1[DS])
        except Exception:
            print(" >> Training evaluation failed", Nm)
        else:
            R = dict(pred=y,
                     dt_in=dt_in,
                     dt_out=dt_out,
                     n_ds=n_ds,
                     lr=Nm,
                     n_pos=n_p,
                     n_neg=n_n,
                     n_obs=X1.shape[0])
            P = dict(pt='all_obs')
            P.update(evalPred(Y1, Y))
            P['ds'] = list(DS)
            R['perf_trn'] = P
            R['perf_cvt'] = P1a[Nm]
            #R['clf_pkl']=pickle.dumps(Clf)
            #R['qmrf']   = 'TBD'
            #R['ds_id'] = ds_id,
            LR_db[Nm] = Col_lr.insert(R)

    return LR_db
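
summarizePerf (like getClassifiers and evalPred in Example #3) is a project helper that is not shown. A hypothetical stand-in consistent with the P1.to_dict('records') call above:

def summarizePerf(perf):
    # Hypothetical stand-in: average the per-fold metrics for each learner.
    df = pd.DataFrame(perf)
    return df.groupby('lr', as_index=False).mean(numeric_only=True)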
Example #6
"""
### INITIAL TUNING OF PARAMETERS, including selection of features. Optimum k found to be 8.

### set params for gridsearch
params = dict(feat_select__k = range(4, len(features_list)-1), svm__gamma=[0.1, 0.5, 1], 
svm__C=[1,10,100], svm__kernel=['rbf', 'poly', 'sigmoid'])

estimator = GridSearchCV(pipe, param_grid=params, scoring='f1', cv=cv)
estimator.fit(features_train, labels_train)
print(estimator.best_params_)
#pprint.pprint(estimator.grid_scores_)

"""

### identify the best k features used in the POI identifier and save them to updated_features
feat_select.set_params(k=13)
feat_select.fit(features_train, labels_train)
support = feat_select.get_support()  # boolean mask of the selected features
scores = feat_select.scores_

print("Scores for features selected using SelectKBest: ")
updated_features = ['poi']
for i in range(len(support)):
    if support[i]:
        print(features_list[i + 1], scores[i])
        updated_features.append(features_list[i + 1])

### store updated_features in features_list for export
features_list = updated_features
print("Features used in classifier: ", features_list)
Example #7
class SelectKBest(FeatureSelectionAlgorithm):
    r"""Implementation of feature selection using selection of k best features according to used score function.
    
    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT
    
    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
    
    See Also:
        * :class:`niaaml.preprocessing.feature_selection.feature_selection_algorithm.FeatureSelectionAlgorithm`
    """
    Name = 'Select K Best'

    def __init__(self, **kwargs):
        r"""Initialize SelectKBest feature selection algorithm.

        Notes:
            _params['k'] is initialized to None as it is included in the optimization process later; a proper value range cannot be determined until the length of the feature vector is known.
        """
        self._params = dict(score_func=ParameterDefinition(
            [chi2, f_classif, mutual_info_classif]),
                            k=None)
        self.__k = None
        self.__select_k_best = SelectKB()

    def set_parameters(self, **kwargs):
        r"""Set the parameters/arguments of the algorithm.
        """
        self.__select_k_best.set_params(**kwargs)

    def select_features(self, x, y, **kwargs):
        r"""Perform the feature selection process.

        Arguments:
            x (pandas.core.frame.DataFrame): Array of original features.
            y (pandas.core.series.Series): Expected classifier results.

        Returns:
            numpy.ndarray[bool]: Mask of selected features.
        """
        if self.__k is None:
            self.__k = x.shape[1]
            self._params['k'] = ParameterDefinition(MinMax(1, self.__k), int)
            val = int(np.around(np.random.uniform(1, self.__k)))
            self.__select_k_best.set_params(k=val)

        self.__select_k_best.fit(x, y)
        return self.__select_k_best.get_support()

    def to_string(self):
        r"""User friendly representation of the object.

        Returns:
            str: User friendly representation of the object.
        """
        return FeatureSelectionAlgorithm.to_string(self).format(
            name=self.Name,
            args=self._parameters_to_string(self.__select_k_best.get_params()))
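
A minimal, hypothetical usage sketch of the wrapper above on synthetic data (it assumes the niaaml-internal imports the class relies on, such as ParameterDefinition and MinMax, are already in scope):

import numpy as np
import pandas as pd

fs = SelectKBest()                       # the niaaml wrapper defined above
X = pd.DataFrame(np.random.rand(100, 6))
y = pd.Series(np.random.randint(0, 2, 100))
mask = fs.select_features(X, y)          # boolean mask over the 6 columns
print(X.loc[:, mask].shape)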