Example #1
def select_features(data, neg_cmpd, pos_cmpd, compound_col="Metadata_compound",
                    C=0.01):
    """
    Return selected features based on an L1-penalized linear SVC.

    Parameters
    -----------
    data : pandas DataFrame
    neg_cmpd : string
        name of negative control in compound_col
    pos_cmpd : string
        name of positive control in compound_col
    compound_col : string
        name of column in data that contains compound labels
    C : float (default=0.01)
        Sparsity parameter; the lower the value, the fewer features are selected.

    Returns
    -------
    selected_features : list
        Selected features
    """
    X, Y = _split_classes(data, neg_cmpd, pos_cmpd, compound_col)
    lin_svc = LinearSVC(C=C, penalty="l1", dual=False).fit(X, Y)
    model = SelectFromModel(lin_svc, prefit=True)
    feature_mask = np.array(model.get_support())
    feature_names = np.array(X.columns.tolist())
    selected_features = list(feature_names[feature_mask])
    return selected_features
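# A minimal, self-contained sketch of the same L1/LinearSVC selection pattern on
# synthetic data (the helper _split_classes and the real compound columns are not
# assumed here; every name below is made up for illustration):
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

X_demo, y_demo = make_classification(n_samples=100, n_features=8,
                                     n_informative=3, random_state=0)
X_demo = pd.DataFrame(X_demo, columns=["feat_%d" % i for i in range(8)])
svc_demo = LinearSVC(C=0.1, penalty="l1", dual=False).fit(X_demo, y_demo)
demo_mask = SelectFromModel(svc_demo, prefit=True).get_support()
print(list(X_demo.columns[demo_mask]))  # names of the features kept by the L1 model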
Example #2
class ModelFeatureSelectionWrapper(BaseEstimator):
    def __init__(self, estimator, inner_model, feature_selection_threshold_coef=3):
        self.estimator = estimator
        self.inner_model = inner_model
        self.feature_selector = None
        self.feature_selection_threshold_coef = feature_selection_threshold_coef

    def _get_feature_selector(self):
        if self.feature_selector is None:
            self.feature_selector = SelectFromModel(self.estimator,
                                                    threshold='{}*mean'.format(float(self.feature_selection_threshold_coef)))
        return self.feature_selector

    def get_support(self, indices=False):
        feature_selector_support = self.feature_selector.get_support(indices=True)
        inner_support = self.inner_model.get_support(indices=True)
        return get_support_for_feature_selection_wrapper(
            feature_selector_support,
            inner_support,
            indices,
        )


    def fit(self, X, y):
        print(X, X.shape)
        X = self._get_feature_selector().fit(X.copy(), y.copy()).transform(X.copy())
        self.inner_model.fit(X.copy(), y)
        return self

    def predict(self, X):
        X = self._get_feature_selector().transform(X.copy())
        return self.inner_model.predict(X.copy())
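# A minimal usage sketch for the wrapper above with synthetic data (assumes the
# class definition and its imports -- BaseEstimator, SelectFromModel -- are in
# scope; get_support_for_feature_selection_wrapper is an external helper and is
# not exercised here, only fit/predict are):
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=200, n_features=20,
                                     n_informative=5, random_state=0)
wrapper = ModelFeatureSelectionWrapper(
    estimator=RandomForestClassifier(n_estimators=50, random_state=0),
    inner_model=LogisticRegression(max_iter=1000),
    feature_selection_threshold_coef=1)  # threshold becomes '1.0*mean'
wrapper.fit(X_demo, y_demo)              # selects features, then fits the inner model
print(wrapper.predict(X_demo[:5]))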
Example #3
def final_feats(df_data):
    x_train = df_data.iloc[:,1:370] #removing the "ID" and the "Target" columns

    

    """Getting the first 2 PCs""" 
    pca = PCA(n_components=2)
    x_train_projected = pca.fit_transform(normalize(x_train, axis=0))
   
    x_train, del_constants = remove_feat_constants(x_train) 
    """ removing columns with no 
    variance; in our case the all-zero columns"""
    x_train, del_identicals = remove_feat_identicals(x_train)
    """removing columns that are identical to each other, and retainining
    only one of them"""
    y_train = df_data["TARGET"]


    # Using L1-based feature selection on x_train with 308 columns
    lsvc = svm.LinearSVC(C=0.01, penalty="l1", dual=False).fit(x_train, y_train)
    model = SelectFromModel(lsvc, prefit=True)
    feat_ix_keep = model.get_support(indices=True)  # getting indices of selected features
    # so that I don't have to use "transform" and convert the data frame to a matrix.
    orig_feat_ix = np.arange(x_train.columns.size)
    feat_ix_delete = np.delete(orig_feat_ix, feat_ix_keep)

    X_train_new = x_train.drop(labels=x_train.columns[feat_ix_delete],
                                 axis=1)
    X_train_new.insert(1, 'PCAOne', x_train_projected[:, 0])
    X_train_new.insert(1, 'PCATwo', x_train_projected[:, 1])
    return X_train_new, y_train, feat_ix_keep, pca, del_constants, del_identicals                             
Example #4
class SelectFromModelSelection(SelectionModel):
    name = "SelectFromModel"

    def __init__(self, *args):
        SelectionModel.__init__(self, *args)
        self.selector = SelectFromModel(self.estimator)
        self.selector.fit(self.x_array, self.y_array)
        self.support_ = self.selector.get_support()
Example #5
def rf_feat_reduction(rf_model, features):

    print " Reducing number of input features based on feature importance."
    subset_model = SelectFromModel(rf_model, prefit=True)
    feat_subset = subset_model.transform(features)
    feat_bool = subset_model.get_support()
    print " " + str(len(feat_subset[0])) + " features chosen after model selection."
    return feat_subset, feat_bool
Example #6
def logistic_l1(X, y, tol):
    DEBUG = False
    # if DEBUG: print 'X ',X,' y ',y,' tol ',tol
    lr = LogisticRegression(penalty='l1', C=0.65,
                            solver='liblinear', dual=False)
    model = SelectFromModel(lr,
                            prefit=False,
                            threshold=tol)
    if DEBUG: print X.shape, y.shape
    x_select = model.fit_transform(X, y)
    x_logreg = lr.fit(X, y)
    x_logreg_trans = lr.predict(X)
    x_irls = irls(X, y)
    support = model.get_support(indices=True)
    if DEBUG: print 'support', support, 'x_select', x_select, 'x_logreg', x_logreg_trans, 'x_irls', x_irls
    if DEBUG: print 'len_support', len(support), 'len_x_select', len(x_select), 'len_x_logreg', len(
        x_logreg_trans), 'len_x_irls', len(x_irls)
    if DEBUG: print 'x_logreg_coef', x_logreg.coef_, 'len', len(x_logreg.coef_[0]), 'intercept', x_logreg.intercept_

    return x_logreg.coef_[0]
Example #7
def feature_select(clf):
    cvscore = np.mean(cross_val_score(clf, X, t))
    clf.fit(X, t)
    try:
        feature_importances = list(reversed(np.array(features)[np.argsort(clf.feature_importances_)]))
    except AttributeError:
        feature_importances = None
    selection_results = {'mean' : dict(), 'median' : dict()}
    scalings = [0, 0.25, 0.5, 0.75, 0.9, 1, 1.1, 1.25, 1.5, 1.75, 2]
    for scaling in scalings:
        X_new = SelectFromModel(clf, threshold=str(scaling)+'*mean', prefit=True).transform(X)
        selection_results['mean'][scaling] = np.mean(cross_val_score(clf, X_new, t))
        X_new = SelectFromModel(clf, threshold=str(scaling)+'*median', prefit=True).transform(X)
        selection_results['median'][scaling] = np.mean(cross_val_score(clf, X_new, t))
    best_select = max(itertools.product(['mean', 'median'], scalings), key=lambda ms: selection_results[ms[0]][ms[1]])
    model = SelectFromModel(clf, threshold=str(best_select[1]) + '*' + best_select[0], prefit=True)
    X_new = model.transform(X)
    feature_mask = model.get_support()
    cvscore_selected = np.mean(cross_val_score(clf, X_new, t))
    clf.fit(X_new, t)
    return cvscore, feature_importances, best_select, feature_mask, cvscore_selected, clf
Example #8
def how_many_variables_used(word_list, inputs, outputs, num_vars, l1_step=LinearSVC(penalty='l1', dual=False, C=1)):
    kf = KFold(inputs.shape[0], n_folds=10, shuffle=True)
    for train_indices, val_indices in kf:
        # pipeline = Pipeline([('chi2_top_k', SelectKBest(chi2, num_vars)),
        #                      ('l1_step', SelectFromModel(l1_step))])
        kbest = SelectKBest(chi2, num_vars)
        l1_selector = SelectFromModel(l1_step)

        x_new = kbest.fit_transform(inputs[train_indices], outputs[train_indices].ravel())
        indices = kbest.get_support(indices=True)

        x_new = l1_selector.fit_transform(x_new, outputs[train_indices].ravel())
        new_indices = l1_selector.get_support(indices=True)

        from sklearn.ensemble import ExtraTreesClassifier
        model = ExtraTreesClassifier()
        model.fit(x_new, outputs[train_indices].ravel())
        importance = np.argsort(model.feature_importances_)[::-1]

        print([word_list[indices[i]] for i in new_indices])
        print([word_list[indices[new_indices[i]]] for i in importance])
        print(x_new.shape)
Example #9
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import numpy as np

rng = np.random.RandomState(1)
X = rng.randint(0, 2, (200, 20))
y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)

fs_univariate = SelectKBest(k=10)
fs_modelbased = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold='median')

fs_univariate.fit(X, y)
print('Features selected by univariate selection:')
print(fs_univariate.get_support())
print('')

fs_modelbased.fit(X, y)
print('Features selected by model-based selection:')
print(fs_modelbased.get_support())
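# Follow-up sketch (uses only the variables defined above): the fitted selectors
# can also reduce X directly, and the two boolean masks can be compared.
X_univariate = fs_univariate.transform(X)
X_modelbased = fs_modelbased.transform(X)
print('Shapes after selection:', X_univariate.shape, X_modelbased.shape)
print('Selected by both methods:',
      np.flatnonzero(fs_univariate.get_support() & fs_modelbased.get_support()))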
Example #10
def get_C_panel(method, features, labels, C):
    model = SelectFromModel(method(C).fit(
        mt.normalize_features(features, normal="scaled"), labels),
                            prefit=True)
    return model.get_support(indices=True)
Example #11
def logistic_dimension(data, label, parameter=1):
    logistic_ = LogisticRegression(penalty="l1", C=parameter, solver="liblinear", max_iter=30)
    model = SelectFromModel(logistic_)
    new_data = model.fit_transform(data, label)
    mask = model.get_support(indices=True)
    return new_data, mask
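# A minimal usage sketch with synthetic data (assumes LogisticRegression and
# SelectFromModel are imported, as the function above already requires):
from sklearn.datasets import make_classification

data_demo, label_demo = make_classification(n_samples=150, n_features=10,
                                            n_informative=4, random_state=0)
reduced_demo, kept_idx = logistic_dimension(data_demo, label_demo, parameter=0.5)
print(reduced_demo.shape, kept_idx)  # reduced matrix and indices of kept columns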
Example #12
#df.TARGET.describe()

y = df["TARGET"].values
X = df.loc[:, "var3":"var38"].values
X_labels = df.loc[:, "var3":"var38"].columns.values

lr = LassoLarsCV()
sfm = SelectFromModel(lr, threshold=1e-3)
X_std = StandardScaler().fit_transform(X, y)
sfm.fit(X_std,y)
lr.fit(X_std, y)

#feat_imp = pd.DataFrame(lr.coef_, index=X_labels)
#feat_imp.plot(kind="bar", title="Feature Importance", use_index=False)

chosen_feat = [ f for i,f in enumerate(X_labels) if sfm.get_support()[i] ]
#chosen_feat = pickle.load(open("feat", "rb"))
print(len(chosen_feat))
chosen_feat

# kaggle forum
df.var3 = df.var3.replace(-999999,2)
y = df["TARGET"].values
X = df.loc[:, "var3":"var38"].values
X_labels = df.loc[:, "var3":"var38"].columns.values


test = pd.read_csv("processed_test.csv", header=0, index_col="ID")
test.var3 = test.var3.replace(-999999,2)

X_test = test[chosen_feat].values
Example #13
def regressor(raster_path, vector_path, vector_field, newRasterfn, global_src_extent=False):

    # open raster file
    rds = gdal.Open(raster_path, GA_ReadOnly)

    # make sure the raster opened correctly, then get geo data
    assert rds
    rgt = rds.GetGeoTransform()

    # get number of bands
    bands = rds.RasterCount
    myBlockSize = rds.GetRasterBand(1).GetBlockSize()
    x_block_size = myBlockSize[0]
    y_block_size = myBlockSize[1]

    # Get image sizes
    cols = rds.RasterXSize
    rows = rds.RasterYSize
    geotransform = rds.GetGeoTransform()
    originX = geotransform[0]
    originY = geotransform[3]
    pixelWidth = geotransform[1]
    pixelHeight = geotransform[5]

    # we need this for file creation
    outRasterSRS = osr.SpatialReference()
    outRasterSRS.ImportFromWkt(rds.GetProjectionRef())
    # get datatype and transform to numpy readable
    data_type = rds.GetRasterBand(1).DataType
    data_type_name = gdal.GetDataTypeName(data_type)

    if data_type_name == "Byte":
        data_type_name = "uint8"

    # open vector file
    vds = ogr.Open(vector_path, GA_ReadOnly)  # TODO maybe open update if we want to write stats
    assert(vds)
    # get the layer
    vlyr = vds.GetLayer(0)

    # count valid features
    c = 0
    feat = vlyr.GetNextFeature()
    while feat is not None:
        label = feat.GetField(vector_field)
        if label is not None:
            c = c + 1
            nr_of_feat = c

        feat = vlyr.GetNextFeature()

    feat = vlyr.ResetReading()
    print " Reading the training data ..."
    print " Number of training samples:" + str(nr_of_feat)

    # create a python list to fill during subsequent loop for mean and training data
    mean = [[0 for _ in range(bands)] for _ in range(nr_of_feat)]
    training = []

    print " Extracting band values for each training sample ..."
    # extract mean value for each polygon on each band (zonal stats function)
    for x in xrange (1, bands + 1):

        rb = rds.GetRasterBand(x)
        nodata_value = rb.GetNoDataValue()

        if nodata_value:
            nodata_value = float(nodata_value)
            rb.SetNoDataValue(nodata_value)

        # create an in-memory numpy array of the source raster data
        # covering the whole extent of the vector layer
        if global_src_extent:
            # use global source extent
            # useful only when disk IO or raster scanning inefficiencies are your limiting factor
            # advantage: reads raster data in one pass
            # disadvantage: large vector extents may have big memory requirements
            src_offset = bbox_to_pixel_offsets(rgt, vlyr.GetExtent())
            src_array = rb.ReadAsArray(*src_offset)

            # calculate new geotransform of the layer subset
            new_gt = (
                (rgt[0] + (src_offset[0] * rgt[1])),
                rgt[1],
                0.0,
                (rgt[3] + (src_offset[1] * rgt[5])),
                0.0,
                rgt[5]
            )

        mem_drv = ogr.GetDriverByName('Memory')
        driver = gdal.GetDriverByName('MEM')

        # reset feature reading for every band
        feat = vlyr.ResetReading()
        # get the first feature (subsequent features call is in the loop)
        feat = vlyr.GetNextFeature()

        # loop through the features and keep an index i per loop
        i = 0
        while feat is not None:

            label = feat.GetField(vector_field)
            #print "label: " + str(label)

            #label2 = feat.GetField("Id")
            #print "label2: " + str(label2)


            if label is not None:
                # extract the training labels (we only need them once)
                if x == 1:
                    label = feat.GetField(vector_field)
                    training.append(feat.GetField(vector_field))

                # extract the band values
                if not global_src_extent:
                    # use local source extent
                    # fastest option when you have fast disks and well indexed raster (ie tiled Geotiff)
                    # advantage: each feature uses the smallest raster chunk
                    # disadvantage: lots of reads on the source raster
                    src_offset = bbox_to_pixel_offsets(rgt, feat.geometry().GetEnvelope())
                    src_array = rb.ReadAsArray(*src_offset)

                    # calculate new geotransform of the feature subset
                    new_gt = (
                        (rgt[0] + (src_offset[0] * rgt[1])),
                        rgt[1],
                        0.0,
                        (rgt[3] + (src_offset[1] * rgt[5])),
                        0.0,
                        rgt[5]
                    )

                # Create a temporary vector layer in memory
                mem_ds = mem_drv.CreateDataSource('out')
                mem_layer = mem_ds.CreateLayer('poly', None, ogr.wkbPolygon)
                mem_layer.CreateFeature(feat.Clone())

                # Rasterize it
                rvds = driver.Create('', src_offset[2], src_offset[3], 1, gdal.GDT_Byte)
                rvds.SetGeoTransform(new_gt)
                gdal.RasterizeLayer(rvds, [1], mem_layer, burn_values=[1])
                rv_array = rvds.ReadAsArray()

                # Mask the source data array with our current feature
                # we take the logical_not to flip 0<->1 to get the correct mask effect
                # we also mask out nodata values explicitly
                masked = np.ma.MaskedArray(
                    src_array,
                    mask=np.logical_or(
                        src_array == nodata_value,
                        np.logical_not(rv_array)
                    )
                )

                band_mean = str(x) + "_mean"

                # index array by bands and feature
                ar_row = i
                ar_col = x - 1
                i = i + 1

                # fill the array with the respective values
                mean[ar_row][ar_col] = float(masked.mean())

            rvds = None
            mem_ds = None
            feat = vlyr.GetNextFeature()

    # python lists to numpy array
    training = np.array(training)
    mean = np.array(mean)

    # we do not need the vector layer anymore
    vds = None

    # prepare parameter testing
    PARAMETER_GRID = [
        (50, 75), #, 100, 125, 150, 200), # nr. of estimators
        ('auto', 'sqrt'), #, 'log2'),     # max nr. of features
        (1, 2), #, 3, 5)                  # min nr. of leaves
    ]

    # set a preliminary score
    best_score = float("-inf")
    best_tot_score = float("-inf")
    best_r2 = float("-inf")

    print " Testing for different parameter sets of the RF classifier with all features ..."
    for n, f, l in product(*PARAMETER_GRID):
        print " Combination of " + str(n) + " estimators, " + str(f) + " feature subset and " + str(l) + " as minimum number of leaves"
        # create the rf classifier
        rf_initial = RandomForestRegressor(n_estimators=n,
                                   max_features=f ,
                                   min_samples_leaf=l,
                                   oob_score=True,
                                   n_jobs=-1)
        # Fit our model to training data
        rf_initial.fit(mean,training)
        splits = int(round(nr_of_feat / 5))
        cv_predicted = cross_validation.cross_val_predict(rf_initial, mean, training,  cv=splits)
        r2 = r2_score(training, cv_predicted)
        print " oob model: " + str(rf_initial.oob_score_)
        print " r^2 model: " + str(r2)
        tot_score = (2 * r2 + rf_initial.oob_score_) / 3

        if tot_score > best_tot_score:
            best_tot_score=tot_score
            best_r2 = r2
            best_score = rf_initial.oob_score_
            print " best oob: " + str(best_score)
            print " best r^2: " + str(r2)
            rf = rf_initial
            est, features, leaves = n, f, l

    # get OOB score and score
    oob = rf.oob_score_
    score = rf.score(mean, training)

    # print results of best model
    print( '-------------------------------- ')
    print( ' 1)     Best model using all features:')
    print( '-------------------------------- ')
    print( '   RF parameters: ')
    print( '      Number of estimators: ' + str(est))
    print( '      Max. number of features: ' + str(features))
    print( '      Min. number of samples per leaf: ' + str(leaves))
    print( '' )
    print( '   R^2 model score: ' + str(score))
    print( '   OOB prediction score: ' + str(oob))
    print( '   R^2 cross-val score: ' + str(best_r2))
    print( '--------------------------------')

    # create stats and figure files
    outname_fi = os.path.basename(newRasterfn)
    outname_fi = outname_fi.replace(' ', '')[:-4]
    outpath_fi = os.path.dirname(newRasterfn)
    text_file = outpath_fi + '/Stats.' + outname_fi + '.txt'
    fig_file = outpath_fi + '/FeatImp.' + outname_fi + '.jpg'
    fig_file2 = outpath_fi + '/FeatImp.reduced.' + outname_fi + '.jpg'

    # write to stats file
    f = open( text_file, 'w' )
    f.write( '-------------------------------- \n')
    f.write( '1) Best model using all features: \n')
    f.write( '-------------------------------- \n')
    f.write( '   RF parameters: \n')
    f.write( '      Number of estimators: ' + str(est) + ' \n')
    f.write( '      Max. number of features: ' + str(features) + ' \n')
    f.write( '      Min. number of samples per leaf: ' + str(leaves) + ' \n')
    f.write( '\n' )
    f.write( '   R^2 model score: ' + str(score) + '\n')
    f.write( '   OOB prediction score: ' + str(oob) + ' \n' )
    f.write( '   R^2 cross-val score: ' + str(best_r2) + '\n' )
    f.write( '-------------------------------- \n')

    print('   The importance of our bands are:')
    f.write('   The importance of our bands are:\n')
    #get band importance
    imps=[]
    bands = range(1, bands + 1)
    for b, imp in zip(bands, rf.feature_importances_):
        print('      Band {b} importance: {imp}'.format(b=b, imp=imp))
        f.write('      Band ' + str(b) + ' importance: ' + str(imp) + '\n' )
        imps.append(imp)

    if "SEPAL" not in os.environ:
        # create a plot for the feature importance
        index = np.arange(b)
        bar_width=0.8
        fig, ax = plt.subplots()
        plt.bar(index + 0.6, imps, bar_width,
                alpha=0.4,
                color='b')
        ax.set_xlabel('Band number')
        ax.set_ylabel('Score')
        plt.xticks(index + 1)
        ax.set_title('Feature importance for RF regressor')

        # save plot to file
        plt.savefig(fig_file)
        plt.show()

    print " Reducing number of input features based on feature importance."
    feat_subset = SelectFromModel(rf, prefit=True)
    mean_new = feat_subset.transform(mean)
    feat_bool = feat_subset.get_support()
    print " " + str(len(mean_new[0])) + " features chosen after model selection."


    print " Testing for different parameter sets of the RF classifier with the selected features ..."
    for n, mf, l in product(*PARAMETER_GRID):
        print " Combination of " + str(n) + " estimators, " + str(mf) + " feature subset and " + str(l) + " as minimum number of leaves"
        # create the rf classifier
        rf_opt = RandomForestRegressor(n_estimators=n,
                                   max_features=mf ,
                                   min_samples_leaf=l,
                                   oob_score=True,
                                   n_jobs=-1)
        # Fit our model to training data
        rf_opt.fit(mean_new,training)
        splits = int(round(nr_of_feat / 5))
        cv_predicted = cross_validation.cross_val_predict(rf_opt, mean_new, training,  cv=splits)
        r2 = r2_score(training, cv_predicted)
        print " oob model: " + str(rf_opt.oob_score_)
        print " r^2 model: " + str(r2)
        tot_score = (2 * r2 + rf_opt.oob_score_) / 3

        if tot_score > best_tot_score:
            best_tot_score=tot_score
            best_r2 = r2
            best_score = rf_opt.oob_score_
            print " best oob : " + str(best_score)
            print " best r^2 : " + str(best_r2)
            rf = rf_opt
            print rf
            est, features, leaves = n, mf, l
            mean = mean_new

    # get OOB score and score
    oob = rf.oob_score_
    score = rf.score(mean, training)

    print rf.feature_importances_

    if mean.shape != mean_new.shape:
        print( '------------------------------------- ')
        print( ' No improvements by feature reduction. ')
        print( '------------------------------------- ')

        f.write( '------------------------------------- \n')
        f.write( ' No improvements by feature reduction. \n')
        f.write( '------------------------------------- \n')
    else:
        # print results of best model
        print( '-------------------------------- ')
        print( ' 2) Best model using reduced set of features:')
        print( '-------------------------------- ')
        print( '   RF parameters: ')
        print( '      Number of estimators: ' + str(est))
        print( '      Max. number of features: ' + str(features))
        print( '      Min. number of samples per leaf: ' + str(leaves) + '\n')
        print( '   R^2 model score: ' + str(score))
        print( '   OOB prediction score: ' + str(oob))
        print( '   R^2 cross-val score: ' + str(best_r2))
        print( '-------------------------------- ')

        f.write( '\n')
        f.write( '\n')
        f.write( '-------------------------------- \n')
        f.write( ' 2) Best model using reduced set of features: \n')
        f.write( '-------------------------------- \n')
        f.write( '   RF parameters: \n')
        f.write( '      Number of estimators: ' + str(est) + ' \n')
        f.write( '      Max. number of features: ' + str(features) + ' \n')
        f.write( '      Min. number of samples per leaf: ' + str(leaves) + ' \n')
        f.write( '' )
        f.write( '   R^2 model score: ' + str(score) + ' \n' )
        f.write( '   OOB prediction score: ' + str(oob) + ' \n' )
        f.write( '   R^2 cross-val score: ' + str(best_r2) + '\n' )
        f.write( '-------------------------------- \n')

        print('   The importance of our bands are:')
        f.write('   The importance of our bands are:\n')

        imps=[]
        bands=[]
        j=0
        for i in xrange(len(feat_bool)):
            if feat_bool[i] == True:
                band=i+1
                #get band importance
                imp=rf.feature_importances_[j]
                print('      Band ' + str(band) + ' importance: ' + str(imp))
                f.write('      Band ' + str(band) + ' importance: ' + str(imp) + '\n')
                j = j + 1
                imps.append(imp)
                bands.append(band)

        if "SEPAL" not in os.environ:
            # create a plot for the feature importance
            index = np.arange(j)
            bar_width=0.8
            fig, ax = plt.subplots()
            plt.bar(index + 0.6, imps, bar_width,
                    alpha=0.4,
                    color='b')
            ax.set_xlabel('Band number')
            ax.set_ylabel('Score')
            plt.xticks(index + 1, bands)
            #plt.xticks(bands)
            ax.set_title('Feature importance for RF regressor')

            # save plot to file
            plt.savefig(fig_file2)
            plt.show()

    print " Cross-validating the final model (Leave-5-out CV) ..."
    splits = int(round(nr_of_feat / 5))
    cv_predicted = cross_validation.cross_val_predict(rf, mean, training,  cv=splits)
    cv_score = cross_validation.cross_val_score(rf, mean, training, cv=splits, scoring='r2', n_jobs=-1)

    # calculate some quality criteria
    r2 = r2_score(training, cv_predicted)
    mse = mean_squared_error(training, cv_predicted)
    rmse = sqrt(mse)
    mae = mean_absolute_error(training, cv_predicted)
    mape = np.mean(np.abs((training - cv_predicted) / training)) * 100
    evs =  explained_variance_score(training, cv_predicted, multioutput = 'uniform_average')

    print('--------------------------------')
    print(' Final Model cross-validation')
    print('--------------------------------')
    print( " R^2: " + str(r2))
    print( " MAE: " + str(mae))
    print( " MAPE: " + str(mape))
    print( " MSE: " + str(mse))
    print( " RMSE: " + str(rmse))
    print( " EVS: " + str(evs))
    print('--------------------------------')
    print( " Accuracy: %0.2f (+/- %0.2f)" % (cv_score.mean(), cv_score.std() * 2))
    print('--------------------------------')

    f.write('-------------------------------- \n')
    f.write(' Final Model cross-validation\n')
    f.write('--------------------------------\n')
    f.write( " R^2: " + str(r2) + '\n')
    f.write( " MAE: " + str(mae) + '\n')
    f.write( " MAPE: " + str(mape) + '\n')
    f.write( " MSE: " + str(mse) + '\n')
    f.write( " RMSE: " + str(rmse) + '\n')
    f.write( " EVS: " + str(evs) + '\n')
    f.write('--------------------------------\n')
    f.write( " Accuracy: %0.2f (+/- %0.2f)\n" % (cv_score.mean(), cv_score.std() * 2))
    f.write('--------------------------------\n')
    # close our stats file
    f.close()


    # write cross validation data to file


    d = {'measured': training, 'predicted': cv_predicted}
    df = DataFrame(data=d)
    df.to_csv(outpath_fi + '/CV.' + outname_fi + '.csv', ';')

    if "SEPAL" not in os.environ:
        # create a cross-val plot
        y = training
        fig, ax = plt.subplots()
        ax.scatter(training, cv_predicted, edgecolors=(0, 0, 0))
        ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
        ax.set_xlabel('Measured')
        ax.set_ylabel('Predicted')
        plt.show()

    print " Create empty output file ..."
    # create out array
    driver = gdal.GetDriverByName('GTiff')
    outRaster = driver.Create(newRasterfn, cols, rows, 1, gdal.GDT_Float32,
        options=[           # Format-specific creation options.
        'BIGTIFF=IF_SAFER',
        'BLOCKXSIZE=128',   # must be a power of 2
        'BLOCKYSIZE=128',   # also a power of 2, need not match BLOCKXSIZE
#        'COMPRESS=LZW'
        ] )
    outRaster.SetGeoTransform((originX, pixelWidth, 0, originY, 0, pixelHeight))
    outband = outRaster.GetRasterBand(1)
    outRaster.SetProjection(outRasterSRS.ExportToWkt())
    outRaster.GetRasterBand(1).SetNoDataValue(0)

    #
    print " Predicting the model to the dataset and write to output band ..."
    #classify by raster blocksize
    #loop through y direction
    r = 1
    for y in xrange(0, rows, y_block_size):
        if y + y_block_size < rows:
            ysize = y_block_size
        else:
            ysize = rows - y

        # loop through x direction
        for x in xrange(0, cols, x_block_size):
            if x + x_block_size < cols:
                xsize = x_block_size
            else:
                xsize = cols - x

            # create empty
            img = np.empty((ysize, xsize, len(mean[0])), dtype=data_type_name)

            # loop through the timeseries and fill the stacked array part
            if mean.shape == mean_new.shape:
                # read input according to feature reduction
                j=0
                for i in xrange(len(feat_bool)):
                    if feat_bool[i] == True:
                        i += 0
                        img[:,:,j] = np.array(rds.GetRasterBand(i+1).ReadAsArray(x,y,xsize,ysize))
                        bands[j]=i+1
                        j = j + 1
            else:
                # read full input
                for i in xrange( rds.RasterCount ):
                    i += 0
                    img[:,:,i] = np.array(rds.GetRasterBand(i+1).ReadAsArray(x,y,xsize,ysize))

            # for later masking
            min_val = np.min(img, axis=2)

            # reshape the stacked array for actual classification
            new_shape = (img.shape[0] * img.shape[1], img.shape[2])
            img_as_array = img.reshape(new_shape)

            # do the classification
            classification = rf.predict(img_as_array)

            # Reshape our classification map
            classification = np.array(classification.reshape(img[:, :, 0].shape))

            # mask out data where one of the values is 0
            classification[min_val == 0] = 0.

            # write part of the array to file
            outband.WriteArray(classification, x, y)

            print (" Run: " + str(r) )
            r = r + 1
Example #14
        train_Y = label_dict.transform(train_Y)
        pd.to_pickle([train_X, train_Y], util.features_prefix + "/size_XY.pkl")
    else:
        [train_X, train_Y] = pd.read_pickle(util.features_prefix + "/size_XY.pkl")
        # 99 + 380 + 7*5*2 + 2
        print len(train_X[0]), len(train_Y)

    if os.path.exists(util.features_prefix + "/salary_XY.pkl") is False:
        train_Y = list(train["predict_salary"].values)
        label_dict = LabelEncoder().fit(train_Y)
        label_dict_classes = len(label_dict.classes_)
        train_Y = label_dict.transform(train_Y)
        pd.to_pickle([train_X, train_Y], util.features_prefix + "/salary_XY.pkl")
    else:
        [train_X, train_Y] = pd.read_pickle(util.features_prefix + "/salary_XY.pkl")
        # 99 + 380 + 7*5*2 + 2
        from sklearn.tree import DecisionTreeClassifier

        clf = DecisionTreeClassifier(max_depth=3)
        clf.fit(np.array(train_X[:100]), np.array(train_Y[:100]))
        print clf.predict(np.array(train_X[100:200]))
        print train_Y[100:200]
        from sklearn.feature_selection import SelectFromModel

        model = SelectFromModel(clf, prefit=True)
        list_1 = model.get_support()
        for i in range(len(list_1)):
            if list_1[i] == True:
                print i
    print 'pickle end'
Example #15
                   step=10,
                   verbose=5)
rfe_selector.fit(X_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:, rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

## 4) Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1", solver="liblinear"),
                                      max_features=num_feats)
embeded_lr_selector.fit(X_norm, y)

embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:, embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')

## 5) Tree-based SelectFromModel
# RandomForest is used to calculate feature importance using node impurities in each decision tree;
# the final feature importance is calculated as the average over all decision trees

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100),
                                      max_features=num_feats)
embeded_rf_selector.fit(X, y)

embeded_rf_support = embeded_rf_selector.get_support()
Example #16
        x_test_trans = sel.transform(x_test)

        vali_auc = np.mean(
            cross_val_score(clf,
                            x_train_trans,
                            y_train,
                            cv=skf,
                            scoring='roc_auc'))

        clf.fit(x_train_trans, y_train)
        predict_result = clf.predict_proba(x_test_trans)[:, 1]
        total_predict += predict_result

        test_auc = roc_auc_score(y_test, predict_result)

        soft_rank = [vali_auc if i == True else 0 for i in sel.get_support()]

        record.append([
            clf.__class__.__name__,
            sum(sel.get_support()), test_auc, times, soft_rank
        ])
        #print(clf.__class__.__name__ +" "+ str(sum(sel.get_support())) +" "+ str(test_auc))
    total_test_auc = roc_auc_score(y_test, total_predict)
    record.append(["merge", 0, total_test_auc, times, []])

df_record2 = pd.DataFrame(record)
df_record2.columns = ['clf', 'FeatureCount', 'AUC', 'Time', 'SoftFeatureRank']
# get mean
df_record2.groupby('clf')['AUC', 'FeatureCount'].agg({
    'AUC': 'mean',
    'FeatureCount': 'mean'
Example #17
y_train = train['target']

feat_labels = X_train.columns

rf = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)

rf.fit(X_train, y_train)

importances = rf.feature_importances_

indices = np.argsort(rf.feature_importances_)[::-1]

for f in range(X_train.shape[1]):

    print("%2d) %-*s %f" %
          (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))

sfm = SelectFromModel(rf, threshold='median', prefit=True)

print('Number of features before selection: {}'.format(X_train.shape[1]))

n_features = sfm.transform(X_train).shape[1]

print('Number of features after selection: {}'.format(n_features))

selected_vars = list(feat_labels[sfm.get_support()])

train = train[selected_vars + ['target']]
scaler = StandardScaler()

scaler.fit_transform(train.drop(['target'], axis=1))
Example #18
# %% Prepare data for logistic regression
y = df["is_red"]
X = df.drop(["is_red", 'GH', 'GR', 'L', 'O', 'S', 'V'], axis=1)
X.assign(y=y).to_csv(f"../data/2020-10-21_vcf-model.csv")

# %% Calculate variable frequency for plotting
var_freq = X.sum() / len(X)
var_freq.rename("variant_frequency").to_csv(
    f"../data/2020-10-21_variant-freq.csv")

# %% Fit logistic regression
lr = LogisticRegression(penalty="l1", solver="liblinear")
lr.fit(X, y)
model = SelectFromModel(lr, prefit=True)
indices = model.get_support()
colnames = X.columns[indices]
X_new = X.loc[:, indices]
X_new.assign(y=y).to_csv(
    f"../data/2020-10-21_logistic-regression-lasso-selected-features.csv")

coef_df = pd.DataFrame(lr.coef_, columns=X.columns)
ors = coef_df.squeeze().transform("exp")
ors = ors[ors != 1]
ors.sort_values().tail(20)
ors.to_csv(f"../data/2020-10-21_odds-ratios.csv")

# %% Figure 2: Plot ROC curve
prefix = f"2020-10-21_vcf_logistic-regression-model"

suffixes = [
Example #19
def feature_selection(
        context,
        df_artifact,
        k=2,
        min_votes=0.5,
        label_column: str = 'Y',
        stat_filters=[
            'f_classif', 'mutual_info_classif', 'chi2', 'f_regression'
        ],
        model_filters={
            'LinearSVC': 'LinearSVC',
            'LogisticRegression': 'LogisticRegression',
            'ExtraTreesClassifier': 'ExtraTreesClassifier'
        },
        max_scaled_scores=True):
    """Applies selected feature selection statistical functions
    or models on our 'df_artifact'.

    Each statistical function or model will vote for its best K selected features.
    If a feature has >= 'min_votes' votes, it will be selected.

    :param context:           the function context
    :param k:                 number of top features to select from each statistical
                              function or model
    :param min_votes:         minimal number of votes (from a model or by statistical
                              function) needed for a feature to be selected.
                              Can be specified by percentage of votes or absolute
                              number of votes
    :param label_column:      ground-truth (y) labels
    :param stat_filters:      statistical functions to apply to the features
                              (from sklearn.feature_selection)
    :param model_filters:     models to use for feature evaluation, can be specified by
                              model name (ex. LinearSVC), formalized json (contains 'CLASS',
                              'FIT', 'META') or a path to such json file.
    :param max_scaled_scores: produce feature scores table scaled with max_scaler
    """

    # Read input DF
    df_path = str(df_artifact)
    context.logger.info(f'input dataset {df_path}')
    if df_path.endswith('csv'):
        df = pd.read_csv(df_path)
    elif df_path.endswith('parquet') or df_path.endswith('pq'):
        df = pd.read_parquet(df_path)

    # Set feature vector and labels
    y = df.pop(label_column)
    X = df

    # Create selected statistical estimators
    stat_functions_list = {
        stat_name:
        SelectKBest(create_class(f'sklearn.feature_selection.{stat_name}'), k=k)
        for stat_name in stat_filters
    }
    requires_abs = ['chi2']

    # Run statistic filters
    selected_features_agg = {}
    stats_df = pd.DataFrame(index=X.columns)
    for stat_name, stat_func in stat_functions_list.items():
        try:
            # Compute statistics
            params = (abs(X), y) if stat_name in requires_abs else (X, y)
            stat = stat_func.fit(*params)

            # Collect stat function results
            stat_df = pd.DataFrame(index=X.columns,
                                   columns=[stat_name],
                                   data=stat.scores_)
            plot_stat(context, stat_name, stat_df)
            stats_df = stats_df.join(stat_df)

            # Select K Best features
            selected_features = X.columns[stat_func.get_support()]
            selected_features_agg[stat_name] = selected_features
        except Exception as e:
            context.logger.info(
                f"Couldn't calculate {stat_name} because of: {e}")

    # Create models from class name / json file / json params
    all_sklearn_estimators = dict(
        all_estimators()) if len(model_filters) > 0 else {}
    selected_models = {}
    for model_name, model in model_filters.items():
        if '.json' in model:
            current_model = json.load(open(model, 'r'))
            ClassifierClass = create_class(current_model["META"]["class"])
            selected_models[model_name] = ClassifierClass(
                **current_model["CLASS"])
        elif model in all_sklearn_estimators:
            selected_models[model_name] = all_sklearn_estimators[model]()
        else:
            try:
                current_model = json.loads(model) if isinstance(
                    model, str) else model
                ClassifierClass = create_class(current_model["META"]["class"])
                selected_models[model_name] = ClassifierClass(
                    **current_model["CLASS"])
            except:
                context.logger.info(f'unable to load {model}')

    # Run model filters
    models_df = pd.DataFrame(index=X.columns)
    for model_name, model in selected_models.items():
        # Train model and get feature importance
        select_from_model = SelectFromModel(model).fit(X, y)
        feature_idx = select_from_model.get_support()
        feature_names = X.columns[feature_idx]
        selected_features_agg[model_name] = feature_names.tolist()

        # Collect model feature importance
        if hasattr(select_from_model.estimator_, 'coef_'):
            stat_df = select_from_model.estimator_.coef_
        elif hasattr(select_from_model.estimator_, 'feature_importances_'):
            stat_df = select_from_model.estimator_.feature_importances_

        stat_df = pd.DataFrame(index=X.columns,
                               columns=[model_name],
                               data=stat_df[0])
        models_df = models_df.join(stat_df)

        plot_stat(context, model_name, stat_df)

    # Create feature_scores DF with stat & model filters scores
    result_matrix_df = pd.concat([stats_df, models_df], axis=1, sort=False)
    context.log_dataset(key='feature_scores',
                        df=result_matrix_df,
                        local_path='feature_scores.parquet',
                        format='parquet')
    if max_scaled_scores:
        normalized_df = result_matrix_df.replace([np.inf, -np.inf],
                                                 np.nan).values
        min_max_scaler = MinMaxScaler()
        normalized_df = min_max_scaler.fit_transform(normalized_df)
        normalized_df = pd.DataFrame(data=normalized_df,
                                     columns=result_matrix_df.columns,
                                     index=result_matrix_df.index)
        context.log_dataset(
            key='max_scaled_scores_feature_scores',
            df=normalized_df,
            local_path='max_scaled_scores_feature_scores.parquet',
            format='parquet')

    # Create feature count DataFrame
    for test_name in selected_features_agg:
        result_matrix_df[test_name] = [
            1 if x in selected_features_agg[test_name] else 0
            for x in X.columns
        ]
    result_matrix_df.loc[:, 'num_votes'] = result_matrix_df.sum(axis=1)
    context.log_dataset(key='selected_features_count',
                        df=result_matrix_df,
                        local_path='selected_features_count.parquet',
                        format='parquet')

    # How many votes are needed for a feature to be selected?
    if isinstance(min_votes, int):
        votes_needed = min_votes
    else:
        num_filters = len(stat_filters) + len(model_filters)
        votes_needed = int(np.floor(num_filters * max(min(min_votes, 1), 0)))
    context.logger.info(f'votes needed to be selected: {votes_needed}')

    # Create final feature dataframe
    selected_features = result_matrix_df[
        result_matrix_df.num_votes >= votes_needed].index.tolist()
    good_feature_df = df.loc[:, selected_features]
    final_df = pd.concat([good_feature_df, y], axis=1)
    context.log_dataset(key='selected_features',
                        df=final_df,
                        local_path='selected_features.parquet',
                        format='parquet')
Example #20
    param_grid=param_grid,
    scoring=make_scorer(roc_auc_score),
    # n_jobs=4,
    iid=False,
    cv=5)

start_time = time.time()
gsearch.fit(X_trn, y_trn)
elapsed_time = time.time() - start_time
print elapsed_time

gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_

sfm = SelectFromModel(lr, threshold=0.2)
sfm.fit(trn[use_columns], trn[target])
support = sfm.get_support()

new_use_columns = [c for c, s in zip(use_columns, support) if s]
del_columns = [c for c, s in zip(use_columns, support) if not s]
use_columns = new_use_columns

trn.drop(del_columns, axis=1, inplace=True)
tst.drop(del_columns, axis=1, inplace=True)

# sfm = SelectFromModel(lr, threshold=0.2)
# sfm.fit(trn[all_tfidf_columns], trn[target])
# sfm.fit(trn[all_tfidf_columns], trn[target])
# support = sfm.get_support()

# new_tfidf_columns = [c for c, s in zip(all_tfidf_columns, support) if s]
Example #21
# In[23]:

from sklearn.feature_selection import SelectFromModel
# Create a selector object that will use the random forest classifier to identify
# features that have an importance of more than 0.01
sfm = SelectFromModel(clf, threshold=0.01)

# Train the selector
sfm.fit(X_train, y_train)

# In[24]:

selected_features = []
# Print the names of the most important features
for feature_list_index in sfm.get_support(indices=True):
    selected_features.append(feat_labels[feature_list_index])

data_selected = data[selected_features]
data_selected.head()

# In[25]:

selected_features

# In[29]:

data_selected.set_index('battery_power').to_csv('scale.csv')

# In[26]:
# model = ElasticNet(l1_ratio = 0.5)
# model.fit(features, labels)
# print(list(zip(features, model.coef_.tolist())))

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# TRANSFORMER METHODS
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Now we'll grab the transformer code and wave our magic wand to select
# features based on the wisdom of Python
# For LASSO
model = Lasso()
sfm = SelectFromModel(model)
sfm.fit(features, labels)
print ("LASSO Results")
print(list(features[sfm.get_support(indices=True)]))

# For Ridge
model = Ridge()
sfm = SelectFromModel(model)
sfm.fit(features, labels)
print ("Ridge Results")
print(list(features[sfm.get_support(indices=True)]))

# For ElasticNet
model = ElasticNet()
sfm = SelectFromModel(model)
sfm.fit(features, labels)
print ("ElasticNet Results")
print(list(features[sfm.get_support(indices=True)]))
# Train the selector
rF.fit(train,response)


# In[66]:


features = train.columns.tolist()


# In[67]:



model_features=[]
for f_index in rF.get_support(indices=True):
    model_features.append(features[f_index])


# In[68]:



model_features.append( 'SK_ID_CURR')
model_features.append('TARGET')
model_features


# In[69]:

Example #24
# loop through algorithms and append the score into the list
model.fit(X_train, y_train)
prediction5 = model5.predict(X_test)
#score = model.score(X_test, y_test)
print("The accuracy score of ensemble  is {:.2%}".format(
    accuracy_score(Y_test, prediction5)))
print(classification_report(Y_test, prediction5))
# %%
#%%
### ENSEMBLE -2
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

rfc = RandomForestClassifier(n_estimators=200, random_state=5678)
model_feature_selection = SelectFromModel(rfc)
model_feature_selection.fit(X, y)
model_feature_selection.get_support()
selected_features = X.columns[model_feature_selection.get_support()]
print("Number of selected features: ", len(selected_features))
print("Selected features are: ", list(selected_features))
# %%
## Modifying our test data and splitting
X = X[selected_features]
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=1122)
print("Train data shape: X:", X_train.shape, ", Y: ", Y_train.shape)
print("Test data shape: X:", X_test.shape, ", Y: ", Y_test.shape)
#%%
model2_0 = DecisionTreeClassifier(random_state=687)
model2_0.fit(X_train, Y_train)
Example #25
    predictions = predictions
    prec = sklearn.metrics.precision_score(ground_truth, predictions)
    rec = sklearn.metrics.recall_score(ground_truth, predictions)
    f1 = sklearn.metrics.f1_score(ground_truth, predictions)

    print "prec: " + str(prec)
    print "rec: " + str(rec)
    print "f1: " + str(f1)
    
    return f1

# Build linear SVM classifier, l1 regularization to perform implicit feature selection
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_train, Y_train)
model = SelectFromModel(lsvc, prefit=True)

features_selected = [elem for selected, elem in zip(model.get_support(), data_loader.get_feature_names()) if selected]
print "Feature names:"
print features_selected

Y_pred = lsvc.predict(X_test)

score = custom_scorer(Y_test, Y_pred)


## Conclusions:
#
# prec: 0.714285714286
# rec: 0.111607142857
# f1: 0.19305019305
#
# By L1-regularizing feature-specific weights, we achieve feature selection, since the weights of some features turn to zero. By inspecting 
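# A short illustrative sketch on synthetic data (not the original X_train/Y_train):
# after fitting an L1-penalized LinearSVC, the coefficients of dropped features are
# exactly zero, which is what SelectFromModel keys on.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X_demo, y_demo = make_classification(n_samples=200, n_features=20,
                                     n_informative=3, random_state=0)
svc_demo = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_demo, y_demo)
print("non-zero coefficient indices:", np.flatnonzero(svc_demo.coef_[0]))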
Example #26
from sklearn.feature_selection import SelectFromModel
from tensorflow.python.keras.utils import to_categorical

x_train, y_train = Dataloader().getTrain()
x_test, y_test = Dataloader().getTest()

y_train = to_categorical(y_train)
x_train.pop("start_time")
x_train.pop("end_time")
x_test.pop("start_time")
x_test.pop("end_time")

print(x_train.shape)
print(y_train.shape)

clf = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)

# Apply The Full Featured Classifier To The Test Data
clf.fit(x_train, y_train)

sel = SelectFromModel(clf)
sel.fit(x_train, y_train)
selected_feat = x_train.columns[(sel.get_support())]
len(selected_feat)

print(selected_feat)


def getSelectedFeature():
    return selected_feat
Example #27
def split_and_encode_Xy(X,
                        y,
                        encoding='le',
                        feat_scaler=True,
                        tgt_scaler=True,
                        freqs=None,
                        dummy_cols=10,
                        ohe_dates=False,
                        test_size=.25,
                        feat_select=True,
                        shuffle=True,
                        enc_Xy=False,
                        X_test=None,
                        scoring='r2'):
    """
    Splits X, y into train and test subsets and encodes them

    ---

    shuffle: set it to False to preserve items order
    """
    X_train, y_train, y_test = (None, None, None)
    # do not shuffle the data before splitting to respect row order
    if not enc_Xy:
        # check X, y are valid dataframes or numpy arrays...
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, shuffle=shuffle)
    else:
        print()
        print("Encoding full data set 'X' -> 'X_train'")
        X_train = X
        y_train = y

    print("Let's have a look at the first row and output")
    print("X_train\n", X_train.head())
    print("y_train\n", y_train.head())
    print()

    if list(X.select_dtypes(include=["datetime"]).columns):
        print("datetime type found.")
        X_train = lc.get_date_features(X_train, freqs)
        if X_test is not None:
            X_test = lc.get_date_features(X_test, freqs)

        # print(X_train["Month"].head(3))

    if encoding == 'le':
        X_train = lc.dummy_encode(X_train.copy()).astype(np.float32)
        if X_test is not None:
            X_test = lc.dummy_encode(X_test.copy()).astype(np.float32)
    elif encoding == 'ohe':
        # do this for mixed label-onehot encoding !
        # X_train.reset_index(drop=True, inplace=True)
        # X_test.reset_index(drop=True, inplace=True)

        X_train = lc.get_dummies_or_label_encode(X_train.copy(),
                                                 dummy_cols=dummy_cols,
                                                 ohe_dates=ohe_dates).astype(
                                                     np.float32)
        # print("oheencoded X_train['month'] \n", X_train["Month"].head(3))

        if X_test is not None:
            X_test = lc.get_dummies_or_label_encode(
                X_test.copy(), dummy_cols=dummy_cols,
                ohe_dates=ohe_dates).astype(np.float32)

            X_test = eu.reorder_ohencoded_X_test_columns(X_train, X_test)
    else:
        raise ValueError("%r is not a valid value for var 'encoding', \n"
                         "valid values are in ['le', 'ohe']" % encoding)

    print()

    if X_train.isnull().values.any():
        X_train = X_train.fillna(X_train.median())

    if X_test is not None and X_test.isnull().values.any():
        X_test = X_test.fillna(X_test.median())

    print("After encoding, first row and output")
    print("X_train\n", X_train.head())
    print("X_train.columns\n", list(X_train.columns))
    print("y_train\n", y_train.head())
    print()

    scalers = (None, None)
    data_and_scalers = {"scalers": scalers}

    if feat_scaler:

        print("scaling train and test data")

        scaler = StandardScaler()
        # you're going to perform scaling at training time before finalization
        if not enc_Xy:
            X_train_scaled = scaler.fit_transform(X_train)
            X_train = DataFrame(data=X_train_scaled,
                                columns=X_train.columns,
                                index=X_train.index)

            print()
            print("X_train shape:", X_train.shape)
            if X_test is not None:
                X_test_scaled = scaler.transform(X_test)
                X_test = DataFrame(data=X_test_scaled,
                                   columns=X_test.columns,
                                   index=X_test.index)
                print("X_test shape:", X_test.shape)

            print()
            print("After scaling...")
            print("X_train\n", X_train[:1])
            print("X_train type", type(X_train))
            if X_test is not None:
                print("X_test\n", X_test[:1])
                print("X_test type", type(X_test))
            print()

        scalers = (scaler, None)
        data_and_scalers["scalers"] = scalers

    print("scoring:", scoring)
    # tgt_scaler = False if scoring == 'neg_rmsle' else True
    # standard scaling introduces negative values,
    # which can't be fed to log, hence to rmsle

    if tgt_scaler:
        print("Scaling target...")

        if scoring != 'neg_rmsle':
            y_scaler = StandardScaler()
            y_train = y_scaler.fit_transform(y_train.values.reshape(
                -1, 1)).ravel()
        else:
            y_scaler = MinMaxScaler()
            y_train = y_scaler.fit_transform(y_train.values.reshape(-1, 1))

        print("y_train and its type\n", (y_train[:1], type(y_train)))

        if not enc_Xy:
            if scoring != 'neg_rmsle':
                y_test = y_scaler.transform(y_test.values.reshape(-1,
                                                                  1)).ravel()
            else:
                y_test = y_scaler.fit_transform(y_test.values.reshape(-1, 1))

            print("y_test and its type\n", (y_test[:3], type(y_test)))

        scalers = (scalers[0], y_scaler)
        data_and_scalers["scalers"] = scalers

        print()

    # this works for classifiers
    # featsel_tuple = eu.create_feature_selector(X_train, None, seed)

    if feat_select and X_train.shape[1] > 10:

        lsvr = LinearSVR(max_iter=1e4)
        lsvr = lsvr.set_params(C=0.01,
                               loss="squared_epsilon_insensitive",
                               dual=False)
        # threshold=[1e-2, 1e-1] or in ["mean", "median"]
        thsd = "median"  # "median", "median"
        featselector = SelectFromModel(lsvr, threshold=thsd)
        # tscv_fs = TimeSeriesSplit(n_splits=5)
        # featselector = RFECV(lsvr, step=1, cv=tscv_fs)

        data_and_scalers["f_selector"] = featselector

        if not enc_Xy:
            # featselector = featsel_tuple[1]
            X_train_selected = featselector.fit_transform(X_train, y_train)
            xtr_indices = featselector.get_support()
            X_train = DataFrame(data=X_train_selected,
                                columns=X_train.columns[xtr_indices],
                                index=X_train.index)

            print("After feature selection...")
            print("X_train shape:", X_train.shape)
            if X_test is not None:
                X_test_selected = featselector.transform(X_test)
                xtt_indices = featselector.get_support()
                X_test = DataFrame(data=X_test_selected,
                                   columns=X_test.columns[xtt_indices],
                                   index=X_test.index)

                print("X_test shape:", X_test.shape)

    data_and_scalers["data"] = (X_train, X_test, y_train, y_test)

    return data_and_scalers
Example #28
def select_from_model(df):
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    clf = RandomForestClassifier(random_state=9)
    model = SelectFromModel(clf)
    model.fit_transform(X, y)
    return X.columns.values[model.get_support()].tolist()
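# A minimal usage sketch for select_from_model (hypothetical DataFrame; assumes
# pandas plus the RandomForestClassifier / SelectFromModel imports the function needs):
import pandas as pd
from sklearn.datasets import make_classification

X_arr, y_arr = make_classification(n_samples=200, n_features=6,
                                   n_informative=3, random_state=0)
demo_df = pd.DataFrame(X_arr, columns=["f%d" % i for i in range(6)])
demo_df["label"] = y_arr  # the last column is treated as the target
print(select_from_model(demo_df))  # list of selected column names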
Example #29
class ExtraTreeBasedSelectorRegression(Transformer):
    def __init__(self,
                 n_estimators=100,
                 criterion='mse',
                 min_samples_leaf=1,
                 min_samples_split=2,
                 max_features=1.,
                 bootstrap='False',
                 max_leaf_nodes='None',
                 max_depth='15',
                 min_weight_fraction_leaf=0.,
                 oob_score=False,
                 n_jobs=-1,
                 random_state=1,
                 verbose=0):
        super().__init__("extra_trees_based_selector_regression", 31)
        self.input_type = [NUMERICAL, DISCRETE, CATEGORICAL]
        self.compound_mode = 'only_new'

        self.n_estimators = n_estimators
        self.estimator_increment = 10
        if criterion not in ("mse", "friedman_mse", "mae"):
            raise ValueError("'criterion' is not in ('mse', 'friedman_mse', "
                             "'mae'): %s" % criterion)
        self.criterion = criterion
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.max_leaf_nodes = max_leaf_nodes
        self.max_depth = max_depth
        self.min_weight_fraction_leaf = min_weight_fraction_leaf

        self.oob_score = oob_score
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose

    def operate(self, input_datanode, target_fields=None, sample_weight=None):
        feature_types = input_datanode.feature_types
        X, y = input_datanode.data
        if target_fields is None:
            target_fields = collect_fields(feature_types, self.input_type)
        X_new = X[:, target_fields]

        n_fields = len(feature_types)
        irrelevant_fields = list(range(n_fields))
        for field_id in target_fields:
            irrelevant_fields.remove(field_id)

        if self.model is None:
            from sklearn.feature_selection import SelectFromModel
            from sklearn.ensemble import ExtraTreesRegressor
            self.n_estimators = int(self.n_estimators)
            self.min_samples_leaf = int(self.min_samples_leaf)
            self.min_samples_split = int(self.min_samples_split)
            self.max_features = float(self.max_features)
            self.bootstrap = check_for_bool(self.bootstrap)
            self.n_jobs = int(self.n_jobs)
            self.verbose = int(self.verbose)

            if check_none(self.max_leaf_nodes):
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(self.max_leaf_nodes)

            if check_none(self.max_depth):
                self.max_depth = None
            else:
                self.max_depth = int(self.max_depth)

            self.min_weight_fraction_leaf = float(
                self.min_weight_fraction_leaf)

            num_features = X.shape[1]
            max_features = int(
                float(self.max_features) * (np.log(num_features) + 1))
            # Use at most half of the features
            max_features = max(1, min(int(X.shape[1] / 2), max_features))

            estimator = ExtraTreesRegressor(
                n_estimators=self.n_estimators,
                criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                max_features=max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                oob_score=self.oob_score,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                random_state=self.random_state)
            estimator.fit(X_new, y, sample_weight=sample_weight)
            self.model = SelectFromModel(estimator=estimator,
                                         threshold='mean',
                                         prefit=True)

        _X = self.model.transform(X_new)
        is_selected = self.model.get_support()

        irrelevant_types = [feature_types[idx] for idx in irrelevant_fields]
        selected_types = [
            feature_types[idx] for idx in target_fields if is_selected[idx]
        ]
        selected_types.extend(irrelevant_types)

        new_X = np.hstack((_X, X[:, irrelevant_fields]))
        new_feature_types = selected_types
        output_datanode = DataNode((new_X, y), new_feature_types,
                                   input_datanode.task_type)
        output_datanode.trans_hist = input_datanode.trans_hist.copy()
        output_datanode.trans_hist.append(self.type)
        self.target_fields = target_fields.copy()

        return output_datanode

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None,
                                        optimizer='smac'):
        if optimizer == 'smac':
            cs = ConfigurationSpace()

            n_estimators = Constant("n_estimators", 100)
            criterion = CategoricalHyperparameter("criterion",
                                                  ["mse", "friedman_mse"])
            max_features = UniformFloatHyperparameter("max_features",
                                                      0.1,
                                                      1.0,
                                                      default_value=1.0,
                                                      q=0.05)

            max_depth = UnParametrizedHyperparameter(name="max_depth",
                                                     value="15")
            max_leaf_nodes = UnParametrizedHyperparameter(
                "max_leaf_nodes", "None")

            min_samples_split = UniformIntegerHyperparameter(
                "min_samples_split", 2, 20, default_value=2)
            min_samples_leaf = UniformIntegerHyperparameter("min_samples_leaf",
                                                            1,
                                                            20,
                                                            default_value=1)
            min_weight_fraction_leaf = Constant('min_weight_fraction_leaf', 0.)

            bootstrap = CategoricalHyperparameter("bootstrap",
                                                  ["True", "False"],
                                                  default_value="False")

            cs.add_hyperparameters([
                n_estimators, criterion, max_features, max_depth,
                max_leaf_nodes, min_samples_split, min_samples_leaf,
                min_weight_fraction_leaf, bootstrap
            ])

            return cs
        elif optimizer == 'tpe':
            from hyperopt import hp
            space = {
                'n_estimators': 100,
                'criterion': hp.choice('etsreg_criterion',
                                       ['mse', 'friedman_mse']),
                'max_features': hp.uniform('etsreg_max_features', 0, 1),
                'max_depth': "15",
                'max_leaf_nodes': "None",
                'min_samples_leaf': hp.randint('etsreg_samples_leaf', 20) + 1,
                'min_samples_split':
                hp.randint('etsreg_samples_split', 19) + 2,
                'min_impurity_decrease': 0.,
                'bootstrap': hp.choice('etsreg_bootstrap', ['True', 'False'])
            }
            return space
Example #30
0

###--------------------------------------###
###------------COMPARE SCORES------------###
###--------------------------------------###

print('Accuracy without any features, but stars: ' + str(score_nofeatures*100) + '%' )
print('Accuracy after adding basic features: ' + str(score_basicfeatures*100) + '%' )
print('Accuracy after adding readability features: ' + str(score_readable*100) + '%' )
print('Accuracy after adding pos features: ' + str(score_pos*100) + '%' )
print('Accuracy after adding review counts features: ' + str(score_review_count*100) + '%' )
print('Accuracy after adding sentiment features from Vader: ' + str(score_sentiment_vader*100) + '%' )
print('Accuracy after adding review sentiment features from textblob: ' + str(score_sentiment_textblob*100) + '%' )
print('Accuracy after adding rating deviance features: ' + str(score_rating_deviance*100) + '%' )
print('Accuracy after adding concreteness feature: ' + str(score_concreteness*100) + '%' )


# Maybe use sklearn.feature_selection

from sklearn.feature_selection import SelectFromModel

df = df.dropna()
df = df.drop('text', axis =1)
y = df['useful_dummy'].values
X = df.drop('useful_dummy', axis=1).values
# Fit the LogisticRegression-based selector to the data
selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)
selector.estimator_.coef_    # coefficients of the fitted estimator
selector.threshold_          # importance threshold actually applied
selector.get_support()       # boolean mask of the selected features
selector.transform(X)        # X reduced to the selected features
    data.append(tmp[1:])

import numpy as np
data = np.array(data, dtype=np.float32).T
labels = np.array(labels, dtype=np.float32)

from sklearn.model_selection import train_test_split
data_train, data_test, labels_train, labels_test = train_test_split(
    data, labels, test_size=0.2, stratify=labels)

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

clf = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=5)
clf = clf.fit(data_train, labels_train)

model = SelectFromModel(clf, threshold=0.01, prefit=True)
data_train = model.transform(data_train)
features = [genes[i] for i in model.get_support(indices=True)]

from scipy.cluster.hierarchy import linkage
Z = linkage(data_train.T, method='single', metric='correlation')

import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram

plt.xlabel('Gene Symbol')
plt.ylabel('Distance')
dendrogram(Z, labels=features, color_threshold=0.6)
plt.axhline(y=0.6, c='k', linestyle='--')
plt.show()
Example #32
0
test = odps.get_table('jz_combine_tl_test_6_2').to_df().to_pandas()

predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl']
use_features = [
    t for t in train.columns if t != 'vid' and t not in predict_features
]
x_train = train.loc[:, use_features]
label = train['tl']

gbdt = GradientBoostingRegressor(random_state=1)
rf = RandomForestRegressor(random_state=1)
l2 = RidgeCV()

sfm_gbdt = SelectFromModel(gbdt, threshold=0.001)
sfm_gbdt.fit_transform(x_train, label)
gbdt_features = set(x_train.columns[sfm_gbdt.get_support()])
print('*************************************')
print(gbdt_features)

sfm_rf = SelectFromModel(rf, threshold=0.001)
sfm_rf.fit_transform(x_train, label)
rf_features = set(x_train.columns[sfm_rf.get_support()])
print('*************************************')
print(rf_features)

print(gbdt_features & rf_features)
sfm_l2 = SelectFromModel(l2, threshold=0.5)
sfm_l2.fit_transform(x_train, label)
l2_features = set(x_train.columns[sfm_l2.get_support()])
print('*************************************')
print(l2_features)
    DecisionTreeClassifier(max_depth=10),
    ensemble.RandomForestClassifier(max_depth=10, n_estimators=100),
    ensemble.AdaBoostClassifier(DecisionTreeClassifier(max_depth=10), n_estimators=300),
    ensemble.GradientBoostingClassifier(n_estimators = 500, learning_rate = 0.1, max_depth = 10 , random_state = 0)]

    scores = np.zeros((6, 6))

    i = 0 

    for clf in classifiers:
        lsvc = clf.fit(X_dummytrain,y_dummytrain)
        model = SelectFromModel(lsvc, prefit = True)

        Train_new = model.transform(X_dummytrain)
        print(Train_new.shape)
        newindices = model.get_support(True)

        FinalTrainLessFeature = X_dummytrain[np.ix_(np.arange(40000), newindices)]
        FinalTestLessFeature = X_dummytest[np.ix_(np.arange(10000), newindices)]

        print(FinalTrainLessFeature.shape)
        print(FinalTestLessFeature.shape)

        j = 0

        for clf in classifiers:
            rng = np.random.RandomState(1)
            estimate = clf.fit(FinalTrainLessFeature,y_dummytrain)
            predictions = estimate.predict(FinalTestLessFeature)
            scores[i][j] = accuracy_score(y_dummytest, predictions)
            print(scores)
            j += 1

        i += 1
#%%
'''
Create different combinations of features to give a better training result
'''
poly = PolynomialFeatures()#default = 2
X_train = poly.fit_transform(X_train)
feature_name = poly.get_feature_names(data.columns)#show the combinations
print(feature_name)

'''
Select from the above feature combinations by running the random forest technique once
and dropping the low-weight ones.
'''
select = SelectFromModel(RandomForestClassifier(), max_features=30)  # retain at most 30 features
select = select.fit(X_train,Y_train)
featuresSupport = select.get_support()
X_train = select.transform(X_train)
'''
Find the names of the selected features
'''
selected_feature_name = []
for i in range(len(featuresSupport)):
    if featuresSupport[i]:
        selected_feature_name.append(feature_name[i])
print(selected_feature_name)
#%%
'''
Create many random forest models with different split and leaf sizes,
run each forest on some sample data, and keep the best-performing one
'''
gs_b = GridSearchCV(
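# A minimal sketch of such a grid search (the grid, estimator and cv below are
# assumptions, not the original settings):
# from sklearn.model_selection import GridSearchCV
# param_grid = {'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}
# gs_sketch = GridSearchCV(RandomForestClassifier(), param_grid, cv=3)
# gs_sketch.fit(X_train, Y_train)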

#Performing PCA
pca = PCA(n_components='mle')
pca.fit(train_panel)

#Explained variance ratio
NO_cols = len(pca.explained_variance_ratio_[pca.explained_variance_ratio_ > 0.05])
print(str(NO_cols) + " columns have variance ratio greater than 0.05")


#Feature selection using Lasso
Lass = Lasso(alpha = 0.1)
Lass = Lass.fit(train_panel,train_target)
model_selecting = SelectFromModel(Lass, prefit=True)
features_selected = train_panel.columns[model_selecting.get_support()]
train_features_subset = model_selecting.transform(train_panel)

print(str(train_features_subset.shape[1]) + " columns selected")

#After Feature Selection

LR_Cross_val = cross_val_score(LR,train_features_subset,train_target,cv=10,scoring = 'mean_squared_error').mean()
print "CV Score for Linear Regression : "+str(-1*LR_Cross_val)


#Tuning alpha for Lasso
for i in np.arange(0.01, 0.5, 0.05):
    las = Lasso(alpha=i)
    print("Alpha = " + str(i) + "    CV: " + str(
        -1 * cross_val_score(las, train_features_subset, train_target, cv=10,
                             scoring='neg_mean_squared_error').mean()))
Example #36
0
r2 = metrics.r2_score(y_test, y_pred)
rmse = metrics.mean_squared_error(y_test, y_pred, squared=False)
print("### LOG LASSO REGRESSION ###")
print("Test Lasso r2-score is {}".format(r2))
print("Test Lasso RMSE is {}".format(rmse))

y_pred = lasso.predict(X_train_chi_sel)
r2 = metrics.r2_score(y_train, y_pred)
rmse = metrics.mean_squared_error(y_train, y_pred, squared=False)
print("Train Lasso r2-score is {}".format(r2))
print("Train Lasso RMSE is {}".format(rmse))

# lasso feature selection
sel = SelectFromModel(lasso)
sel.fit(X_train_chi_sel, y_train)
selected_feat = X_train_chi_sel[:, sel.get_support()]

X_train_selected = sel.transform(X_train_chi_sel)
X_test_selected = sel.transform(test_chi_sel[:, :-1])
print("datasets trasformed to {} features...".format(X_train_selected.shape[1]))

# Random Forest
clf = RandomForestRegressor(random_state=0)
clf.fit(X_train_selected, y_train)

y_pred = clf.predict(X_test_selected)
r2 = metrics.r2_score(y_test, y_pred)
rmse = metrics.mean_squared_error(y_test, y_pred, squared=False)

print("\nRANDOM FOREST")
print('selected features by lasso: {}'.format(X_train_selected.shape[1]))
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(1)
X = rng.randint(0, 2, (200, 20))
y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)

fs_univariate = SelectKBest(k=10)
fs_modelbased = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold='median')

fs_univariate.fit(X, y)
print('Features selected by univariate selection:')
print(fs_univariate.get_support())
plt.matshow(fs_univariate.get_support().reshape(1, -1), cmap='gray_r')

fs_modelbased.fit(X, y)
print('Features selected by model-based selection:')
print(fs_modelbased.get_support())
plt.matshow(fs_modelbased.get_support().reshape(1, -1), cmap='gray_r');
Example #38
0
dummies_carbin = pd.get_dummies(test['Cabin'], prefix='Cabin')
dummies_embarked = pd.get_dummies(test['Embarked'], prefix='Embarked')
dummies_sex = pd.get_dummies(test['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(test['Pclass'], prefix='Pclass')
test = pd.concat(
    [test, dummies_carbin, dummies_embarked, dummies_sex, dummies_Pclass],
    axis=1)

from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier, plot_importance

selected_feature = SelectFromModel(estimator=XGBClassifier()).fit(
    train.iloc[:, 2:].values, train.iloc[:, 1])
print(selected_feature)
print(selected_feature.get_support())
train_x = train.iloc[:, 2:]
print(train_x.columns[selected_feature.get_support()])
model_XGB = XGBClassifier()
model_XGB.fit(train.iloc[:, 2:].values, train.iloc[:, 1])

plot_importance(model_XGB)
import matplotlib.pyplot as plt

plt.show()

# test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
#
# test.to_csv("./test0.csv", index=False)
# train.to_csv("./train0.csv", index=False)
Example #39
0
print "Variance Threshold"
sel = VarianceThreshold(threshold=(0.90 * (1 - 0.90)))
selector = sel.fit(training[features])
print(selector.get_support(indices=True))

for i in range(0,len(features)):
    if i in selector.get_support(indices=True):
        print(features[i])


print "Select from Model - Logistic"
modelLReg = LogisticRegression()
modelLReg = modelLReg.fit(training[features], training['crime'])
model = SelectFromModel(modelLReg, prefit=True)
print model.get_support(indices=True)

for i in range(0,len(features)):
    if i in model.get_support(indices=True):
        print features[i]


print "Tree Based Feature Selection"
clf = ExtraTreesClassifier()
clf = clf.fit(training[features], training['crime'])
model = SelectFromModel(clf, prefit=True)
print model.get_support(indices=True)

for i in range(0,len(features)):
    if i in model.get_support(indices=True):
        print features[i]
Example #40
0
X, y = df1.iloc[:, :-1], df1.iloc[:, -1]
print("Input data with columns", X.shape)
print("Predictions shape: ", y.shape)
#%%
print("WITHOUT PREPROCESSING")
test_all(*train_test_split(X, y, test_size=0.25, random_state=2606))
#%%
print("WITH PREPROCESSING")
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()  # standardizes features to zero mean and unit variance
X = scaler.fit_transform(X)
test_all(*train_test_split(X, y, test_size=0.25, random_state=2606))

#%%
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

sel = SelectFromModel(RandomForestClassifier(n_estimators=100))
cols = df1.iloc[:, :-1].columns  # Get the column name
sel.fit(df1.iloc[:, :-1], df1.iloc[:, -1])  # X, y
# sel.get_support()
selected_feat = df1[cols].columns[(
    sel.get_support())]  # select only True columns
len(selected_feat)
print(selected_feat)
columnsData = df1[selected_feat]
#%%
print("After preprocessing, and FEATURE SELECTION")
test_all(*train_test_split(columnsData, y, test_size=0.25, random_state=2606))
#%%
def get_esembel_score(name):
    if os.path.exists(util.features_prefix + name + "_XXXYYY.pkl") is False:
        print('file does not exist')
        exit()
    [X_train, X_validate, X_test, y_train, y_validate, y_test] = pd.read_pickle(
        util.features_prefix + name + '_XXXYYY.pkl')
    import xgboost as xgb

    rf_clf_2 = pd.read_pickle(util.models_prefix + name+'_rf.pkl')
    list_all = []
    rf_2_list = rf_clf_2.predict(X_test)
    from sklearn.feature_selection import SelectFromModel

    model = SelectFromModel(rf_clf_2, prefit=True)
    temp = model.get_support()
    print(sum(temp))
    list_all.append(rf_2_list)
    print(rf_clf_2.score(X_test, y_test))
    xgb_2 = xgb.Booster({'nthread': 4})  # init model
    xgb_2.load_model(util.models_prefix +name+ '_xgb.pkl')  # load data
    print(len(xgb_2.get_fscore().keys()))
    dtest = xgb.DMatrix(X_test)
    xgb_2_test = xgb_2.predict(dtest)
    list_all.append(xgb_2_test)
    print(score_lists(xgb_2_test, y_test))
    from keras.utils import np_utils
    import copy
    [train_X, train_Y] = pd.read_pickle(util.features_prefix + name + '_XY.pkl')
    X_semantic = np.array(copy.deepcopy(X_test[:, range(95, 475)]))
    X_manual = np.array(copy.deepcopy(X_test[:, range(0, 95)]))
    X_cluster = np.array(copy.deepcopy(X_test[:, range(475, 545)]))
    X_document = np.array(copy.deepcopy(X_test[:, range(545, 547)]))
    X_document[:, [0]] = X_document[:, [0]] + train_X[:, [-1]].max()
    X_semantic = X_semantic.reshape(X_semantic.shape[0], 10, -1)
    X_semantic_1 = np.zeros((X_semantic.shape[0], X_semantic.shape[2], X_semantic.shape[1]))
    for i in range(int(X_semantic.shape[0])):
        X_semantic_1[i] = np.transpose(X_semantic[i])
    json_string = pd.read_pickle(util.models_prefix +name+ '_json_string_cnn.pkl')
    model_cnn = model_from_json(json_string)
    model_cnn.load_weights(util.models_prefix + name+'_nn_weight_cnn.h5')
    cnn_list = model_cnn.predict_classes([X_document, X_cluster, X_manual, X_semantic_1])
    # cnn_list_prob = model_cnn.predict_proba([X_document, X_cluster, X_manual, X_semantic_1])
    kk = list(cnn_list)
    list_all.append(kk)
    print(score_lists(kk, y_test))
    json_string = pd.read_pickle(util.models_prefix + name + '_json_string_lstm.pkl')
    model_lstm = model_from_json(json_string)
    model_lstm.load_weights(util.models_prefix + name + '_nn_weight_lstm.h5')
    lstm_list = model_lstm.predict_classes([X_document, X_cluster, X_manual, X_semantic_1])
    # cnn_list_prob = model_cnn.predict_proba([X_document, X_cluster, X_manual, X_semantic_1])
    kk = list(lstm_list)
    list_all.append(kk)
    print(score_lists(kk, y_test))
    list_ensemble = []
    for i in range(len(y_test)):
        dict_all = {}
        for z in range(len(list_all)):
            dict_all[list_all[z][i]] = dict_all.setdefault(list_all[z][i], 0) + 1
            tmp_list = dict_all.items()
        list_ensemble.append(sorted(tmp_list, key=lambda kv: kv[1], reverse=True)[0][0])
    print(score_lists(list_ensemble, y_test))
    print('**************************')
Example #42
0
y_train = y_train.reshape((307,))
y_test = y_test.reshape((77,))

X=np.matrix(test_sample.iloc[:,1:201])
y=np.matrix(test_sample[['target']])

from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

lgbc=LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
            reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

embeded_lgb_selector = SelectFromModel(lgbc, max_features=199)
embeded_lgb_selector.fit(X, y)

embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = test_sample.loc[:,embeded_lgb_support].columns.tolist()
print(str(len(embeded_lgb_feature)), 'selected features')

embeded_lgb_feature.remove('target')
embeded_lgb_feature

# importing the classifier model libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
Example #43
0
# -*- coding: utf-8 -*-

import pandas
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectFromModel

data = pandas.read_csv('D:\\PDM\\6.2\\data2.csv')

feature = data[['月份', '季度', '广告费用', '客流量']]

lrModel = LinearRegression()

selectFromModel = SelectFromModel(lrModel)

selectFromModel.fit_transform(
    feature, 
    data['销售额']
)

feature.columns[selectFromModel.get_support()]
Example #44
0
def feature_selection(df, train_y):
    ################
    # Data preprocessing: feature selection
    global to_drop
    # Drop features whose values barely vary
    variances = VarianceThreshold().fit(df).variances_.tolist()
    drop_variance = []
    for i in range(len(variances)):
        if variances[i] < 0.25:
            drop_variance.append(feature_kinds2[i])
    to_drop['variance'] = drop_variance

    # Compute Pearson correlations and drop features that are too highly correlated
    coef = df.corr()
    # Take the upper triangle of the correlation matrix
    upper = coef.where(np.triu(np.ones(coef.shape), k=1).astype(bool))
    drop_corr = [
        column for column in upper.columns
        if any(upper[column].abs() > correlation_threshold)
    ]
    to_drop['corr'] = drop_corr

    # Use LightGBM to drop zero-importance features
    features = pd.get_dummies(df)
    feature_names = list(features)
    features = np.array(features)
    labels = np.array(train_y).reshape((-1, ))
    feature_importance_values = np.zeros(len(feature_names))
    for iter in range(n_iterations):
        model = lgb.LGBMClassifier(n_estimators=1000,
                                   learning_rate=0.05,
                                   verbose=-1)
        print("Start lgbm fit ", iter, " times")
        model.fit(features, labels.astype('int'))

        # Record the feature importances
        feature_importance_values += model.feature_importances_ / n_iterations

    feature_importances = pd.DataFrame({
        'feature': feature_names,
        'importance': feature_importance_values
    })

    # Sort the features by importance
    feature_importances = feature_importances.sort_values(
        'importance', ascending=False).reset_index(drop=True)

    # Normalize the importances
    feature_importances['normalized_importance'] = feature_importances[
        'importance'] / feature_importances['importance'].sum()
    feature_importances['cumulative_importance'] = np.cumsum(
        feature_importances['normalized_importance'])

    # Extract the features with zero importance
    record_zero_importance = feature_importances[
        feature_importances['importance'] == 0.0]

    drop_importance_zero = list(record_zero_importance['feature'])
    to_drop['importance_zero'] = drop_importance_zero
    print("lightgbm finished.")

    # Drop the features with low importance
    feature_importances = feature_importances.sort_values(
        'cumulative_importance')
    record_low_importance = feature_importances[
        feature_importances['cumulative_importance'] > cumulative_importance]
    drop_importance_low = list(record_low_importance['feature'])
    to_drop['importance_low'] = drop_importance_low

    # L1 regularization
    labels = np.array(train_y).reshape((-1, ))
    print("df.shape: ", df.shape)
    print("labels.shape: ", labels.shape)
    clf = LassoCV(max_iter=10000)
    clf.fit(df, labels)
    selector = SelectFromModel(estimator=clf, prefit=True)
    support = selector.get_support()
    print(support)
    drop_feature_lassocv = []
    for idx in range(len(feature_kinds2)):
        if not support[idx]:
            drop_feature_lassocv.append(feature_kinds2[idx])
    to_drop['L1'] = drop_feature_lassocv

    print(to_drop)
    # Merge all the features to drop and remove duplicates
    features_to_drop = set(list(chain(*list(to_drop.values()))))
    features_to_drop = list(features_to_drop)

    print("Selected features.")
    return features_to_drop
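# Hypothetical usage (df and train_y are assumed to be the training features
# and labels this function expects):
# to_drop_list = feature_selection(df, train_y)
# df_reduced = df.drop(columns=to_drop_list)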
Example #45
0
for label, y in (('All', y0), ):
    #*((key, np.where(y0==key, key, 'Other')) for key in np.unique(y0))):
    #%%
    transcripts = []
    scores = []
    trials = 1000
    for seed in tqdm(range(trials)):
        clf = ExtraTreesClassifier(n_estimators=50,
                                   random_state=seed,
                                   max_depth=5,
                                   criterion='entropy',
                                   min_impurity_decrease=0.05)
        clf.fit(X, y)
        dimred = SelectFromModel(clf, prefit=True, max_features=50)

        transcripts.extend(ft.T.index[dimred.get_support()])
        scores.extend(clf.feature_importances_[dimred.get_support()])

    df0 = pd.DataFrame()
    df0['transcripts'] = transcripts
    df0['scores'] = scores

    #%%
    def f(gp):
        return pd.DataFrame([[len(gp), max(gp['scores'])]],
                            columns=['count', 'max'])

    df1 = df0.groupby('transcripts').apply(f).reset_index()
    df1['frequency'] = df1['count'] / trials
    df1['selection'] = (df1['max'] > 0.101) | (df1['frequency'] > 0.06)
    fig1 = px.scatter(df1,
Example #46
0
forest.fit(X_train, y_train) 
importances = forest.feature_importances_

print('R2 for Train', forest.score(X_train, y_train))
print('R2 for Test (cross validation)', forest.score(X_test, y_test))

## Feature Selection
"SelectFromModel is a meta-transformer that can be used along with any estimator that has a coef_ or feature_importances_ attribute after fitting. The features are considered unimportant and removed, if the corresponding coef_ or feature_importances_ values are below the provided threshold parameter. Apart from specifying the threshold numerically, there are built-in heuristics for finding a threshold using a string argument."





from sklearn.feature_selection import SelectFromModel
model = SelectFromModel(forest, prefit=True, max_features=3)
feature_idx = model.get_support()
feature_names = X.columns[feature_idx]
X_NEW = model.transform(X)
pd.DataFrame(X_NEW, columns= feature_names)


# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X_NEW, y, test_size=0.3, random_state=0)
lm = LinearRegression()
lm.fit( X_train, y_train )
print('R2 for Train', lm.score(X_train, y_train))
print('R2 for Test (cross validation)', lm.score(X_test, y_test))



from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectFromModel
import sys
import pandas as pd
import json
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier

if __name__ == "__main__":
    #python selectfeatures.py [input_file] [output_file - new_data.csv]
    if len(sys.argv) == 3:
        df = pd.read_csv(sys.argv[1], sep=',', header=0)
        

        X = df.iloc[:, :-1].to_numpy()
        y = df.iloc[:, -1].to_numpy().astype('U')

        lsvc = ExtraTreesClassifier().fit(X, y)
        model = SelectFromModel(lsvc, prefit=True, threshold=0.01)
        idx_important_features = model.get_support()

        features = df.iloc[:,:-1]

        print(np.where(idx_important_features)[0])

        X_2 = features.iloc[:,idx_important_features]

        df2 = pd.DataFrame(X_2)
        df2['class'] = y

        df2.to_csv(sys.argv[2], sep=',', index=False)
Example #48
0
def rf_importance(X_train, y_train, threshold):
    clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
    sfm = SelectFromModel(clf, threshold=threshold)
    sfm.fit(X_train, y_train.reshape(-1))
    return sfm.get_support(indices=True)
f.close()
print(len(headers))
train = np.array(train)
train_X = train[:40000, :-1]
norm_X = normalize(train_X, axis = 0, norm = 'max')
train_y = train[:40000, -1]
valid_X = train[40000:, :-1]
norm_vX = normalize(valid_X, axis = 0, norm = 'max')
valid_y = train[40000:, -1]

# Perform logistic regression for feature selection
logreg_feat_sel = LogisticRegression(penalty='l1', C=0.01, solver='liblinear')
logreg_feat_sel.fit(norm_X, train_y)
model = SelectFromModel(logreg_feat_sel, prefit = True)
X_new = model.transform(norm_X)
X_inds = model.get_support(indices = True)
vX_new = model.transform(norm_vX)

print(X_new.shape)
print(X_inds)
rel_cols = [headers[i] for i in X_inds]
print(rel_cols)

v_score = [0.] * 5
v_score = np.array(v_score)
# Perform logistic regression on the restricted list of features
# Modify penalty term and plot the cross-validation errors of each
for i in range(5):
	restr_logreg = LogisticRegression(penalty = 'l2', C = np.power(10., -i))
	restr_logreg.fit(X_new, train_y)
	logreg_pred_y = restr_logreg.predict(X_new)
Example #50
0
# Feature selection by selecting K best features
feature_selection_skb = SelectKBest(score_func=f_regression, k=150)
feature_selection_skb.fit(X_train, y_train)
# Get new data set containing only selected features
X_selected_skb = X_selected[X_selected.columns[feature_selection_skb.get_support()]]
# Print feature selection results
print("SelectKBest, selected features:", len(X_selected_skb.columns))
X_train_fs_skb, X_test_fs_skb = train_test_split(X_selected_skb, train_size=0.8, test_size=0.2, shuffle=False)


print("Feature selection - SelectFromModel")
# Feature selection by threshold
feature_selection_sfm = SelectFromModel(ridge_reg_cv, threshold="median")
feature_selection_sfm.fit(X_train, y_train)
# Get new data set containing only selected features
X_selected_sfm = X_selected[X_selected.columns[feature_selection_sfm.get_support()]]
# Print feature selection results
print("SelectFromModel, selected features:", len(X_selected_sfm.columns))
X_train_fs_sfm, X_test_fs_sfm = train_test_split(X_selected_sfm, train_size=0.8, test_size=0.2, shuffle=False)


print("Perform test - basic data")
results = utilities.test_regressions(reg_list, X_train, X_test, y_train, y_test, '',
                                     plot_learning_curves=True, plot_histogram=True, save_path=output_directory)
results_log = utilities.test_regressions(reg_list, X_train, X_test, y_train_log, y_test_log, '_log',
                                         plot_learning_curves=True, plot_histogram=True, save_path=output_directory)
print("Perform test - PCA")
results_pca = utilities.test_regressions(reg_list, X_train_pca, X_test_pca, y_train, y_test, '_pca',
                                         plot_learning_curves=True, plot_histogram=True, save_path=output_directory)
results_pca_log = utilities.test_regressions(reg_list, X_train_pca, X_test_pca, y_train_log, y_test_log, '_pca_log',
                                             plot_learning_curves=True, plot_histogram=True, save_path=output_directory)
Example #51
0
def feature_selection_embeded(data,
                              label,
                              feature_return='embeded_rf_feature'):
    """
    data: pandas DataFrame, loaded from train_data.csv
    label: pandas DataFrame, sample city label
    feature_return: optional, which embedded selector to use:
        ['embeded_rf_feature', 'embeded_lr_selector', 'embeded_lgb_selector']
    return: list of selected feature names
    """
    tmp = pd.merge(label, data, left_index=True, right_index=True)
    X, y = tmp[data.columns].values, tmp["city"].values

    assert feature_return in [
        'embeded_rf_feature', 'embeded_lr_selector', 'embeded_lgb_selector'
    ]
    # feature selected by Random Forest model
    if feature_return == 'embeded_rf_feature':
        embeded_rf_selector = SelectFromModel(RandomForestClassifier(
            criterion='gini',
            max_features='auto',
            random_state=np.random.seed(13),
            n_jobs=-1,
            class_weight='balanced',
            n_estimators=500),
                                              threshold='3.7*mean')
        #1.5
        embeded_rf_selector.fit(X, y)
        embeded_rf_support = embeded_rf_selector.get_support()
        embeded_rf_feature = data.loc[:, embeded_rf_support].columns.tolist()
        print(str(len(embeded_rf_feature)),
              'RandomForestClassifier selected features')
        return embeded_rf_feature
    # feature selected by Logistic Regression model
    elif feature_return == 'embeded_lr_selector':

        embeded_lr_selector = SelectFromModel(LogisticRegression(
            penalty='l1', C=0.6, solver='liblinear', class_weight='balanced'),
                                              threshold='2*mean')
        #X, y = tmp[embeded_rf_feature].values, tmp["city"].values
        embeded_lr_selector.fit(X, y)
        embeded_lr_selector = embeded_lr_selector.get_support()
        embeded_lr_selector = data.loc[:, embeded_lr_selector].columns.tolist()

        print(str(len(embeded_lr_selector)),
              'LogisticRegression selected features')
        return embeded_lr_selector
    # feature selected by LGBM model
    else:
        lgbc = LGBMClassifier(n_estimators=500,
                              learning_rate=0.05,
                              num_leaves=32,
                              colsample_bytree=0.2,
                              reg_alpha=3,
                              reg_lambda=1,
                              min_split_gain=0.01,
                              min_child_weight=40)

        embeded_lgb_selector = SelectFromModel(lgbc, threshold='mean')
        embeded_lgb_selector.fit(X, y)
        embeded_lgb_support = embeded_lgb_selector.get_support()
        embeded_lgb_feature = data.loc[:, embeded_lgb_support].columns.tolist()
        print(str(len(embeded_lgb_feature)),
              'LGBMClassifier selected features')
        return embeded_lgb_feature
    return None
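# Hypothetical usage (train_df and city_label are assumed DataFrames shaped as
# the docstring above describes):
# rf_features = feature_selection_embeded(train_df, city_label,
#                                         feature_return='embeded_rf_feature')
# X_reduced = train_df[rf_features]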
Example #52
0
from sklearn.datasets import load_iris

iris = load_iris()
ix, iy = iris.data, iris.target
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel

model1 = ExtraTreesClassifier()
model2 = GradientBoostingClassifier()
model1.fit(ix, iy)
model2.fit(ix, iy)
model1.feature_importances_
model2.feature_importances_
clf1 = SelectFromModel(model1, prefit=True)
clf2 = SelectFromModel(model2, prefit=True)
clf1.get_support()
clf2.get_support()

#---
# sklearn cross-validation
from sklearn.model_selection import cross_val_score
#cross_val_score(model, X, y, cv=10)
from sklearn.model_selection import cross_val_predict
#cross_val_predict(model, X, y, cv=10)
from sklearn.model_selection import LeaveOneOut
#scores = cross_val_score(model, X, y, cv=LeaveOneOut())

# ---
from sklearn.pipeline import Pipeline, make_pipeline

Example #53
0
pool.close()
pool.join()
y_data_labels = np.append(y_data_labels, np.zeros(len(neg_data)))
print("Done")

# Concatenate training data
x_data_train = pos_data + neg_data

# Fit + transform data
print("Fitting model: ", end="")
clf.fit_transform(x_data_train, y_data_labels)
print("Done")

# Calculate and print cv scores
scores = cross_validation.cross_val_score(clf, x_data_train, y_data_labels, cv=cv, scoring='f1_weighted')
print("scores: " + str(np.average(scores)))

# Feature importance table
feature_names = vect.get_feature_names()
selected_feature_names = [feature_names[i] for i in sel.get_support(True)]
importances = rfc.feature_importances_
indices = np.argsort(importances)[::-1]
table = [[selected_feature_names[f], importances[f]] for f in indices]

# Pretty-print the table.
df = pd.DataFrame(table)
print(df.to_string(header=False))

# Enter interactive mode
code.interact(local=locals())
Example #54
0
def get_support(X,y,C):
    lsvc = LinearSVC(C=C, penalty="l1", dual=False).fit(X, y)
    model = SelectFromModel(lsvc, prefit=True)
    X_support = model.get_support()
    return X_support