Example #1
def variance_threshold(features_train, features_valid):
    """Return the initial dataframes after dropping some features according to variance threshold

    Parameters:
    ----------
    features_train: pd.DataFrame
        features of training set

    features_valid: pd.DataFrame
        features of validation set

    Output:
    ------
    features_train: pd.DataFrame

    features_valid: pd.DataFrame
    """
    from sklearn.feature_selection import VarianceThreshold    

    threshold=0.01
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(features_train)

    ## Instead of using the transform() method, we look at which columns fall below the threshold, so that the same features can be dropped from both the training and the validation set. This way we also keep the column names, which makes interpretation easier
    variances = selector.variances_
    dropped_features = features_train.columns.values[variances < threshold] #name of features to drop
    features_train.drop(dropped_features, axis=1, inplace=True)
    features_valid.drop(dropped_features, axis=1, inplace=True)

    return features_train, features_valid
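A quick usage sketch of the wrapper above; the toy DataFrames are illustrative and not taken from the original project:

import pandas as pd

features_train = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [0.5, 0.5, 0.5], 'c': [10.0, 20.0, 15.0]})
features_valid = pd.DataFrame({'a': [4.0, 5.0, 6.0], 'b': [0.5, 0.5, 0.5], 'c': [12.0, 18.0, 11.0]})
features_train, features_valid = variance_threshold(features_train, features_valid)
print(features_train.columns.tolist())  # ['a', 'c'] -- the zero-variance column 'b' is dropped from both frames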
Example #2
    def vectorize_EX(self, columns, variance_thresh=0, train_only=False):

        print('Start vectorizing')
        start_time = time.time()
        hasher = CountVectorizer(binary=True, tokenizer=LemmaTokenizer(), stop_words='english')

        train_dtm = hasher.fit_transform(
            self.ga_bm_train[columns].apply(lambda x: ','.join(x), axis=1))
        print(hasher.get_feature_names())
        print('dtm train shape: ', train_dtm.shape)

        selector = VarianceThreshold(variance_thresh)
        train_dtm = selector.fit_transform(train_dtm)
        print('dtm train shape after variance thresh: ', train_dtm.shape)

        if not train_only:
            test_dtm = hasher.transform(
                self.ga_bm_test[columns].apply(lambda x: ','.join(x), axis=1))

            print('dtm test shape: ', test_dtm.shape)
            test_dtm = selector.transform(test_dtm)
            print('dtm test shape after variance thresh: ', test_dtm.shape)

        print("Time: ", round(((time.time() - start_time)/60), 2))
        print('Complete vectorizing')
        if train_only:
            return train_dtm
        else:
            return (train_dtm, test_dtm)
Example #3
def feature_select(word,instance_dic,feature_dic, thre_hold=0.01, num_feature=100):
    instances_list = instance_dic[word]
    feature_words=feature_dic[word]
    feature_xs = []
    labels = []

    for instance in instances_list:
        label = ' '.join(instance.senseid)
        feature_x_dic = feature_vector(instance,feature_words)
        feature_vals=[]
        for word in feature_words:
            feature_vals.append(feature_x_dic[word])
        feature_xs.append(feature_vals)
        labels.append(label)

    # 1st round feature selection by removing low variance features
    sel_lowvr = VarianceThreshold(threshold=(thre_hold))
    feature_xs_selected = sel_lowvr.fit(feature_xs)
    lowvr_index = feature_xs_selected.get_support(indices=True).tolist()
    feature_xs_selected = feature_xs_selected.transform(feature_xs).tolist()



    # 2nd round feature selection using sklearn's SelectKBest()
    if num_feature < len(feature_xs_selected[0]):
        sel_chi2 = SelectKBest(chi2, k= num_feature).fit(feature_xs_selected, labels)
        chi2_index= sel_chi2.get_support(indices=True).tolist()
        #feature_xs_selected = sel_chi2.transform(feature_xs_selected).tolist()# transform from numpy array back to lis
        return lowvr_index, chi2_index
    else:
        print str(word) + ": chi2 selection not executed due to low # of features"
        return lowvr_index, [i for i in range(len(lowvr_index))]
Example #4
def main():
    args = getOptions()
    print args

    print "train file read"
    train_x, train_y = readfile_noid(args.train,'train',4)
    train_x_new, id = extractID(train_x)
    del train_x
    train_x_clean, contentdict = cityclean(train_x_new)
    del id, train_x_new
    
    #remove feature with no distinction and less important
    print "remove feature with no distinction and less important"
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_clean)
    del train_x_clean
    
    #normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    del train_x_uniq
    
    #feature selection and modeling
    print "feature selection and modeling"
    exclusivefs(train_x_nor, train_y)
Example #5
def interactive_pipeline(X, Y, pca_n_components, random_forest_n):

    #remove missing values columns

    X.dropna(axis=1,  inplace=True)

    # standardize X
    X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X))
    #cutoff by variance
    variance_threshold = 0.03
    variance_cutoff = VarianceThreshold(threshold=variance_threshold)
    variance_cutoff.fit(X)
    X = X.loc[:, variance_cutoff.get_support()]  # keep only the columns above the threshold
    #cutoff high correlation
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]
    X.drop(to_drop, axis=1, inplace=True)
    #random forest
    k_best_features = random_forest_n
    feature_importance = random_forest_selection.get_feature_importance(X,Y)
    processed_dataframe, X = random_forest_selection.get_k_most_important_features(X,Y,k_best_features,feature_importance)
    #PCA
    pca = PCA_Obj(X)
    X = pca.create_pca(pca_n_components)
    print("X.shape", X.shape)
    return X, Y

#feature_selection_pipeline_from_file()
Example #6
def select_features(x, y, methods=('variance', 'correlation', 'l1', 'forest')):
    ''' methods = ('variance', 'correlation', 'l1', 'forest')
        - variance: use variance threshold to discard features that are mostly 0 or 1
        - correlation: use chi2 test to remove most very correlated features
        - l1: use l1 penalty to remove features that make solution sparse
        - forest: use ExtraTreesClassifier to point out importance of features
                    select important ones
    '''
    features = x.loc[:,'Feature_1':'Feature_2']
    idx_list = []  # one set of selected feature indices per method

    if 'variance' in methods:
        vt = VT(threshold=(0.99*(1-0.99)))
        vt.fit(features)
        idx_list.append(set(vt.get_support(indices=True)))

    if 'correlation' in methods:
        cr = SP(f_regression, percentile=80)
        cr.fit(features, y)
        idx_list.append(set(cr.get_support(indices=True)))

    if 'l1' in methods:
        rgr = MultiTaskLassoCV(cv=5, n_jobs=-1)
        m = SFM(rgr)
        m.fit(features.values, y.values)
        idx_list.append(set(m.get_support(indices=True)))

    if 'forest' in methods:
        clf = RandomForestRegressor(n_estimators=300, max_features=0.7, n_jobs=-1)
        m = SFM(clf)
        m.fit(features.values, y.values)
        idx_list.append(set(m.get_support(indices=True)))

    # keep only the indices selected by every requested method
    x_indices = set(range(features.shape[1]))
    for indices in idx_list:
        x_indices = x_indices & indices
    print('All: %s' % len(x_indices))

    return list(x_indices)
Example #7
    def _variance_threshold(self, input_df, threshold):
        """Uses Scikit-learn's VarianceThreshold feature selection to learn the subset of features that pass the threshold

        Parameters
        ----------
        input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
            Input DataFrame to perform feature selection on
        threshold: float
            The variance threshold that removes features that fall under the threshold

        Returns
        -------
        subsetted_df: pandas.DataFrame {n_samples, n_filtered_features + ['guess', 'group', 'class']}
            Returns a DataFrame containing the features that are above the variance threshold

        """

        training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1)

        selector = VarianceThreshold(threshold=threshold)
        try:
            selector.fit(training_features)
        except ValueError:
            # No features are above the variance threshold
            return input_df[['guess', 'class', 'group']].copy()

        mask = selector.get_support(True)
        mask_cols = list(training_features.iloc[:, mask].columns) + ['guess', 'class', 'group']
        return input_df[mask_cols].copy()
Example #8
 def doFeatureSelection(self,features,target,k):
     features_int = np.array(features,dtype=float)
     target_int = np.array(target,dtype=float)
     sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
     features_new = sel.fit_transform(features_int)
     #features_new = SelectKBest(chi2,k=10).fit_transform(features_int,target_int)
     return features_new
Example #9
def variance_cutoff(X,cutoff=0.8):
    """
    Set variance cutoff for variables
    """
    sel = VarianceThreshold(threshold=(cutoff * (1 - cutoff)))
    X = sel.fit_transform(X)
    return X
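The cutoff * (1 - cutoff) expression used here (and in several other examples on this page) is the variance of a Bernoulli variable with success probability cutoff, so the default 0.8 removes boolean features that take the same value in more than roughly 80% of the samples. A minimal sketch with made-up boolean data:

import numpy as np
from sklearn.feature_selection import VarianceThreshold

X_bool = np.array([[1, 0, 1],
                   [1, 1, 0],
                   [1, 0, 1],
                   [1, 0, 0],
                   [1, 1, 1]])  # column 0 is constant; columns 1 and 2 vary
sel = VarianceThreshold(threshold=0.8 * (1 - 0.8))  # 0.16, the variance of Bernoulli(0.8)
print(sel.fit_transform(X_bool).shape)  # (5, 2): only the constant column is removed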
Example #10
    def test_same_variances(self):
        local = VarianceThreshold()
        dist = SparkVarianceThreshold()

        shapes = [((10, 5), None),
                  ((1e3, 20), None),
                  ((1e3, 20), 100),
                  ((1e4, 100), None),
                  ((1e4, 100), 600)]

        for shape, block_size in shapes:
            X_dense, X_dense_rdd = self.make_dense_rdd()
            X_sparse, X_sparse_rdd = self.make_sparse_rdd()
            Z = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

            local.fit(X_dense)
            dist.fit(X_dense_rdd)
            assert_array_almost_equal(local.variances_, dist.variances_)

            local.fit(X_sparse)
            dist.fit(X_sparse_rdd)
            assert_array_almost_equal(local.variances_, dist.variances_)

            dist.fit(Z)
            assert_array_almost_equal(local.variances_, dist.variances_)
Example #11
def main():
    args = getOptions()
    print args

    print "train file read"
    train_x, train_y = readfile_noid(args.train,'train')
    train_x_new, id = extractID(train_x)
    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test,'test')
    test_x_new, id = extractID(test_x)
    
    #remove feature with no distinction and less important
    print "remove feature with no distinction and less important"
    
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)

    #normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    #feature selection
    print "feature selection"
    # Create the RFE object and compute a cross-validated score.
    svc = SVC(kernel="linear")
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(train_y, 10),
                  scoring='accuracy')
    rfecv.fit(train_x_nor, train_y)
    
    print("Optimal number of features : %d" % rfecv.n_features_)
Example #12
def main():
    parser = argparse.ArgumentParser(description='Normalize the feature values')
    required = parser.add_argument_group('required options')

    required.add_argument('-x', '--outlist', required=True, help='File containing feature values')
    required.add_argument('-y', '--execlist', required=True, help='File containing exec list')
    
    args = parser.parse_args()

    #X = np.loadtxt(args.outlist, skiprows=1)
    np.set_printoptions(precision=2)
    X = np.genfromtxt(args.outlist, skip_header=1)
    X=np.nan_to_num(X)
    Y = np.loadtxt(args.execlist, ndmin=2)

    #f = open("trainlist","wb")
    #newResult = X/Y
    #sel = VarianceThreshold(threshold=(.8*(1-.8)))
    sel = VarianceThreshold(threshold=(.8*(1-.8)))
    result1 = sel.fit_transform(X)
    newResult = result1/Y
    #result2 = sel.fit_transform(newResult)

    #feature collection for test programs
    if os.path.isfile('eventlist'):
       features = np.genfromtxt('eventlist',dtype='str')
       featureFromVariance = sel.get_support(indices=True)
       text_file = open("variancefeatures.txt","w")
       for i in featureFromVariance:
           text_file.write(features[i])
           text_file.write("\n")
       text_file.close()

    np.savetxt('normfeaturelist', newResult, fmt='%.2f', delimiter='\t')
Example #13
def remove_feat_constants(data_frame):
    # Remove feature vectors containing one unique value,
    # because such features do not have predictive value.
    print("")
    print("Deleting zero variance features...")
    # Let's get the zero variance features by fitting VarianceThreshold
    # selector to the data, but let's not transform the data with
    # the selector because it will also transform our Pandas data frame into
    # NumPy array and we would like to keep the Pandas data frame. Therefore,
    # let's delete the zero variance features manually.
    n_features_originally = data_frame.shape[1]
    selector = VarianceThreshold()
    selector.fit(data_frame)
    # Get the indices of zero variance feats
    feat_ix_keep = selector.get_support(indices=True)
    orig_feat_ix = np.arange(data_frame.columns.size)
    feat_ix_delete = np.delete(orig_feat_ix, feat_ix_keep)
    # Delete zero variance feats from the original pandas data frame
    data_frame = data_frame.drop(labels=data_frame.columns[feat_ix_delete],
                                 axis=1)
    # Print info
    n_features_deleted = feat_ix_delete.size
    print("  - Deleted %s / %s features (~= %.1f %%)" % (
        n_features_deleted, n_features_originally,
        100.0 * (float(n_features_deleted) / n_features_originally)))
    return data_frame
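An equivalent, more compact variant of the helper above (a sketch, same intent): the boolean mask from get_support() keeps the result a pandas DataFrame without deleting columns manually.

from sklearn.feature_selection import VarianceThreshold

def remove_feat_constants_compact(data_frame):
    # Fit only, then mask the columns so the result stays a pandas DataFrame
    selector = VarianceThreshold()
    selector.fit(data_frame)
    return data_frame.loc[:, selector.get_support()]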
Example #14
def feature_selection(features, ideal_num=None):
    from sklearn.feature_selection import VarianceThreshold
    selected = []
    for i in range(8):
        sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
        # keep the reduced block; the original code discarded the fit_transform result
        selected.append(sel.fit_transform(features[i]))
    return selected
Example #15
def feature_selection_pipeline_from_file():
    #get data
    dataset = refactor_labels(get_data(path, 'Sheet1'), group_column)

    # all the visualizations
    auto_visualize_features(dataset.drop(subject_number_column, axis = 1))

    #remove missing values columns
    non_missing_values_treshold = len(dataset.index) * 0.99
    dataset.dropna(axis=1, thresh=non_missing_values_treshold, inplace=True)

    #impute missing values
    dataset.fillna(dataset.mean(), inplace=True)

    #set X
    X = dataset.drop([group_column, subject_number_column], 1)
    sbj = dataset[subject_number_column]
    Y = dataset[group_column]
    names = list(X)

    # standardize X
    X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X))
    X.columns = names
    print("p0", X.shape)

    #cutoff by variance
    variance_threshold = 0.05
    variance_cutoff = VarianceThreshold(threshold=variance_threshold)
    variance_cutoff.fit(X)
    X = X.loc[:, variance_cutoff.get_support()]  # keep only the columns above the threshold

    print("p1", X.shape)

    #cutoff high correlation
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]

    X.drop(to_drop, axis = 1, inplace=True)
    print("p2",X.shape)


    #random forest
    k_best_features = 42
    feature_importance = random_forest_selection.get_feature_importance(X,Y)
    random_forest_selection.save_feature_importance(feature_importance_txt_path, feature_importance, list(X))
    processed_dataframe, X = random_forest_selection.get_k_most_important_features(X,Y,k_best_features,feature_importance)
    print("p3", processed_dataframe.shape)
    processed_dataframe.to_csv(processed_dataframe_path)


    #PCA
    pca = PCA_Obj(X)
    pca.explained_variance_graph(pca_explained_variance_graph_path)
    pca.print_components()
    n_components = 12
    X = pca.create_pca(n_components)
    pca.save_pca_data(features_after_pca, Y=Y)
    print("p4", X.shape)
Example #16
 def varianceSelection(self, df, threashold=.8):
     if not isinstance(df, pandas.core.frame.DataFrame):
         logger.error('[%s] : [ERROR] Variance selection only possible on Dataframe not %s',
                                      datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(df))
         sys.exit(1)
     sel = VarianceThreshold(threshold=(threashold * (1 - threashold)))
     sel.fit_transform(df)
     return df[[c for (s, c) in zip(sel.get_support(), df.columns.values) if s]]
Example #17
def main():
    args = getOptions()
    fn = ("submission_cor_%s_%s_%s.csv" % (str(args.lrate).replace('.','dian'),str(args.nest),str(args.maxdepth)))
    print fn
    print "train file read"
    train_x, train_y = readfile_noid(args.train,'train')
    train_x_new, id = extractID(train_x)
    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test,'test')
    test_x_new, id = extractID(test_x)
    
    #remove feature with no distinction and less important
    print "remove feature with no distinction and less important"
    
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)
#     indices = [i for i in range(len(train_x[0]))]
#     frqIndex = trimfrq(train_x)
#     for i in frqIndex:
#         indices.remove(i)
#     train_x_uniq = indexTodata(train_x, indices)
#     test_x_uniq = indexTodata(test_x, indices)

    #normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    #feature selection
    print "feature selection"
    train_x_sel, test_x_sel = correlationSelect(train_x_nor, train_y, test_x_nor)
#     ftsel = correlationSel()
#     ftsel.dosel(train_x_nor,train_y)
#     train_x_sel = ftsel.transform(train_x_nor)
#     test_x_sel = ftsel.transform(test_x_nor)
    print "modelsing"
    clf = GradientBoostingClassifier(loss='deviance', 
                                     learning_rate=args.lrate,
                                     n_estimators=args.nest,
                                     max_depth=args.maxdepth,
                                     verbose=1)
    clf.fit(train_x_sel, train_y)
    train_pdt = clf.predict(train_x_sel)
    MCC, Acc_p , Acc_n, Acc_all = get_Accs(train_y, train_pdt) 
    print "MCC, Acc_p , Acc_n, Acc_all(train): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))
    test_pdt = clf.predict_proba(test_x_sel)
#     MCC, Acc_p , Acc_n, Acc_all = get_Accs(test_y, test_pdt) 
#     print "MCC, Acc_p , Acc_n, Acc_all(test): "
#     print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))   
    
    fout=open(fn,'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(id[index])),str(test_pdt[index][1])))
    fout.close()
Example #18
def main():
    args = getOptions()
    print args
    fn = "destreeSub.csv"
    print fn
    print "train file read"
    train_x, train_y = readfile_noid(args.train,'train')
    train_x_new, id = extractID(train_x)
    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test,'test')
    test_x_new, id = extractID(test_x)
    
    #remove feature with no distinction and less important
    print "remove feature with no distinction and less important"
    
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)
#     indices = [i for i in range(len(train_x[0]))]
#     frqIndex = trimfrq(train_x)
#     for i in frqIndex:
#         indices.remove(i)
#     train_x_uniq = indexTodata(train_x, indices)
#     test_x_uniq = indexTodata(test_x, indices)

    #normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    #feature selection
    print "feature selection"
    if args.fts == 'cor':
        train_x_sel, test_x_sel = correlationSelect(train_x_nor, train_y, test_x_nor)
    elif args.fts == 'extraTrees':
        train_x_sel, test_x_sel = ExtraTreesSelect(train_x_nor, train_y, test_x_nor)
    else:
        train_x_sel = copy.deepcopy(train_x_nor)
        test_x_sel = copy.deepcopy(test_x_nor)
    del train_x_nor, test_x_nor, train_x_uniq, test_x_uniq
    print "modelsing"
    clf = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0, class_weight='auto')
    clf.fit(train_x_sel, train_y)
    train_pdt = clf.predict(train_x_sel)
    MCC, Acc_p , Acc_n, Acc_all = get_Accs(train_y, train_pdt) 
    print "MCC, Acc_p , Acc_n, Acc_all(train): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))
    test_pdt = clf.predict_proba(test_x_sel)
#     MCC, Acc_p , Acc_n, Acc_all = get_Accs(test_y, test_pdt) 
#     print "MCC, Acc_p , Acc_n, Acc_all(test): "
#     print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))   
    
    fout=open(fn,'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(id[index])),str(test_pdt[index][1])))
    fout.close()
Example #19
def featureSelectionVarianceThreshold(data, probability = 0.8):
    dataRaw = data[:, 2:]
    sel = VarianceThreshold(threshold=(probability*(1 - probability)))
    dataNew = sel.fit_transform(dataRaw)
    fd = open('History.txt','a')
    history = 'Feature Selection: Variance Threshold' + '\n' + 'Selected Feature: ' + str(sel.get_support(True)) + '\n'
    fd.write(history)
    fd.close()
    return np.c_[data[:, :2], dataNew]
Example #20
def test_zero_variance():
    """Test VarianceThreshold with default setting, zero variance."""

    for X in [data, csr_matrix(data), csc_matrix(data), bsr_matrix(data)]:
        sel = VarianceThreshold().fit(X)
        assert_array_equal([0, 1, 3, 4], sel.get_support(indices=True))

    assert_raises(ValueError, VarianceThreshold().fit, [0, 1, 2, 3])
    assert_raises(ValueError, VarianceThreshold().fit, [[0, 1], [0, 1]])
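The data fixture is not shown in this snippet; a small array like the one below, where only column 2 is constant, satisfies both assertions (it mirrors the fixture used in scikit-learn's own test suite):

data = [[0, 1, 2, 3, 4],
        [0, 2, 2, 3, 5],
        [1, 1, 2, 4, 0]]  # column 2 has zero variance, so indices [0, 1, 3, 4] survive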
Example #21
def select_centroids(centroids):
    """
    :param centroids: learned centroids
    :return: new_centroids: (without centroids with variance < avg_variance(centroids))
    """
    sel = VarianceThreshold(threshold=np.var(centroids))
    new_centroids = sel.fit_transform(centroids.T)
    new_centroids = new_centroids.T
    return new_centroids
Example #22
 def featureReduction(self, data,threshold_input = 0.99):
     '''
     Feature reduction that only keeps the variables whose variance
     is greater than the threshold.
     '''
     selector = VarianceThreshold(threshold = threshold_input)
     data = selector.fit_transform(data)
     print 'Feature Selected with threshold ', threshold_input, data.shape
     return data
Example #23
    def variance_threshold(self, dframe=None, columns=None, skip_columns=None, thresh=0.0, autoremove=False):
        """
         Wrapper around sklearn's VarianceThreshold for pandas DataFrames
        :param dframe:
        :param columns:
        :param skip_columns:
        :param thresh:
        :param autoremove:
        :return:
        """
        logging.debug("Finding low-variance features")
        removed_features=[]
        try:
            all_columns = dframe.columns

            # remove the skip columns
            remaining_cols = all_columns.drop(skip_columns)

            # get length of new index.
            max_index = len(remaining_cols) - 1

            skipped_idx = [all_columns.get_loc(column) for column in skip_columns]

            for idx, item in enumerate(skipped_idx):
                if item > max_index:
                    diff = item - max_index
                    skipped_idx[idx] -= diff
                if item == max_index:
                    diff = item - len(skip_columns)
                    skipped_idx[idx] -= diff
                if idx == 0:
                    skipped_idx[idx] = item

            skipped_values = dframe.iloc[:, skipped_idx].values

            X = dframe.loc[:, remaining_cols].values

            vt = VarianceThreshold(threshold=thresh)

            vt.fit(X)

            feature_indices = vt.get_support(indices=True)

            feature_names = [remaining_cols[idx] for idx, _ in enumerate(remaining_cols) if idx in feature_indices]

            removed_features = list(np.setdiff1d(remaining_cols, feature_names))

            logging.debug("Found %d low - variance columns " % len(removed_features))

        except Exception as e:
            logging.error(e)
            logging.error("Could not remove low variance features, some thing went wrong")
            print(e)
            pass

        return dframe, removed_features
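A hedged usage sketch of this wrapper; the DataFrame, the column names and the obj instance (whatever class exposes the method) are made up for illustration:

import pandas as pd

df = pd.DataFrame({'id': [1, 2, 3],
                   'constant': [7.0, 7.0, 7.0],
                   'signal': [0.1, 0.9, 0.5]})
# 'id' is excluded from the check; 'constant' has zero variance
frame, removed = obj.variance_threshold(dframe=df, skip_columns=['id'], thresh=0.0)
print(removed)  # reports the zero-variance column: 'constant'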
Example #24
def test_variance_threshold():
        tpot_obj = TPOT()
        non_feature_columns = ['class', 'group', 'guess']
        training_features = training_testing_data.loc[training_testing_data['group'] == 'training'].drop(non_feature_columns, axis=1)
        selector = VarianceThreshold(threshold=0)
        selector.fit(training_features)
        mask = selector.get_support(True)
        mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns

        assert np.array_equal(tpot_obj._variance_threshold(training_testing_data, 0), training_testing_data[mask_cols])
Example #25
def feature_selection_with_scikit():
    """
    1-VarianceThreshold is a simple baseline approach to feature selection. It removes all features whose variance doesn’t
     meet some threshold. By default, it removes all zero-variance features, i.e. features that have the same value in
     all samples.
    2-Univariate feature selection works by selecting the best features based on univariate statistical tests.
     It can be seen as a preprocessing step to an estimator
    """
    p=0.8
    selector = VarianceThreshold(threshold=(p * (1 - p)))
    c=selector.fit_transform(X)
    print  "Number of the attribute before: ",X.shape[1]
    print "number of the attribute after:",c.shape[1]

    # selecting k best attribute instead of chi2, f_classif can also be used
    skb=SelectKBest(chi2, k=10)
    X_new=skb.fit_transform(X, y)
    attr=np.where(skb._get_support_mask(),attributeNames,'-1')

    print "Best attribute choosen with SelectKBest: "
    i=1
    for att in attr:
        if att!='-1':
            print i, ": ",att
            i+=1

    #using  ExtraTreesClassifier
    print "Using feature importance..."
    etc=ExtraTreesClassifier()
    etc.fit(X,y).transform(X)
    print etc.feature_importances_
    print etc.max_features
    print etc.max_depth

    print "Recursive feature selection : "
    from sklearn.svm import SVC
    import sklearn.linear_model as lm
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    # Create the RFE object and compute a cross-validated score.
    estim=lm.LinearRegression()
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=estim, step=1, cv=StratifiedKFold(y, 2),
                  scoring='accuracy')
    rfecv.fit(X, y)

    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
Example #26
def feature_selection(train_instances):
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info('Crossvalidation started... ')
    selector = VarianceThreshold()
    selector.fit(train_instances)
    logger.info('Number of features used... ' +
                str(Counter(selector.get_support())[True]))
    logger.info('Number of features ignored... ' +
                str(Counter(selector.get_support())[False]))
    return selector
Example #27
def feat_selec(tra_val_data, testing_data, thred=0.8):
    """
    Feature selection.
    """
    num_tv = tra_val_data.shape[0]

    total_data = np.vstack((tra_val_data, testing_data))

    selec = VarianceThreshold(threshold=thred)
    total_selected_data = selec.fit_transform(total_data)

    return total_selected_data[:num_tv, :], total_selected_data[num_tv:, :]
Example #28
 def FeatureSelection( self ):
     """Main feature selection method"""
     
     if 'Variance' in self.FeatureSelectionMethod:
         selector = VarianceThreshold(threshold=0.0001)
         self.Features = selector.fit_transform(self.Features)
 #         pyplot.figure(), pyplot.hist(numpy.var(features, axis = 0), bins = 64), pyplot.show()
     elif 'Trees' in self.FeatureSelectionMethod:
         forestFeatures = ExtraTreesClassifier(n_estimators = 512, random_state = 32)
         forestFeaturesFit = forestFeatures.fit(self.Features, self.Classes)
         featureImportance = 0.001
         featureBool = (forestFeaturesFit.feature_importances_ > featureImportance)
         self.Features = self.Features[:,featureBool]
Example #29
def test_variancethreshold_vs_sklearn():
    trajectories = AlanineDipeptide().get_cached().trajectories
    fs = FeatureSelector(FEATS)

    vt = VarianceThreshold(0.1)
    vtr = VarianceThresholdR(0.1)

    y = fs.partial_transform(trajectories[0])

    z1 = vt.fit_transform([y])[0]
    z_ref1 = vtr.fit_transform(y)

    np.testing.assert_array_almost_equal(z_ref1, z1)
Example #30
def main():
    args = getOptions()
    print "train file read"
    train_x, train_y = readfile_noid(args.train,'train')
    train_x_new, id = extractID(train_x)
    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test,'test')
    test_x_new, id = extractID(test_x)

    #remove feature with no distinction and less important
    print "remove feature with no distinction and less important"
    
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)
    
    #normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)
    
    #feature selection
    print "feature selection"
    ftsel = ExtraTreesClassifier()
    ftsel.fit(train_x_nor, train_y)
#     importances = ftsel.feature_importances_
#     indices_test = np.argsort(importances)[::-1]
#     indices_test = indices_test.tolist()
    train_x_trans = ftsel.transform(train_x_nor)
    test_x_trans = ftsel.transform(test_x_nor)
    
    #modelsing
    print "modelsing"
    train = xgb.DMatrix(train_x_trans,label=train_y)
    test = xgb.DMatrix(test_x_trans,label=test_y)
    gbm = xgb.train({'max_depth':3, 'n_estimators':1500, 'learning_rate':0.1 ,'objective':'binary:logistic','eval_metric':'auc'},train)
    train_pdt = gbm.predict(train)
    MCC, Acc_p , Acc_n, Acc_all = get_Accs(train_y, train_pdt) 
    print "MCC, Acc_p , Acc_n, Acc_all(train): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))
    test_pdt = gbm.predict(test)
    MCC, Acc_p , Acc_n, Acc_all = get_Accs(test_y, test_pdt) 
    print "MCC, Acc_p , Acc_n, Acc_all(test): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))   
    
    fout=open("submission_xgbtrain.csv",'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(id[index])),str(test_pdt[index])))
    fout.close()
Example #31
# In[28]:


mutual_info=mutual_info_classif(x,y)
mutual_data=pd.Series(mutual_info,index=x.columns)
mutual_data.sort_values(ascending=False)


# # Variance threshold
# It removes the features whose variance does not exceed the chosen threshold

# In[29]:


from sklearn.feature_selection import VarianceThreshold
vt=VarianceThreshold(threshold=1)
vt.fit(x)


# In[30]:


x.columns[vt.get_support()]


# In[31]:


zero_threshold=[i for i in x.columns
                if i not in x.columns[vt.get_support()]]
Example #32
print(len(dup_list))

train_df = train_df.drop(dup_list, axis=1)
print(train_df.shape)
test_df =  test_df.drop(dup_list, axis=1)
print(test_df.shape)

X = train_df.iloc[:, 1:-1].values
y = train_df['TARGET'].values
X_test = test_df.iloc[:, 1:].values
print("X shape", X.shape)
print("X_test shape", X_test.shape)


# remove constant features
selector = VarianceThreshold(threshold = 0.001)
X = selector.fit_transform(X)
X_test = selector.transform(X_test)
print("After removing low variance features")
print("X shape:", X.shape)
print("X_test shape:", X_test.shape)


import xgboost as xgb
from sklearn.ensemble import BaggingClassifier

dtrain = xgb.DMatrix(X, label=y)
dtest = xgb.DMatrix(X_test)

evallist  = [(dtrain,'train')]
Example #33
from sklearn.feature_selection import VarianceThreshold

data = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]

vt = VarianceThreshold()
# By default, features with zero variance are filtered out
result = vt.fit_transform(data)
print(result)
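For reference, columns 0 and 3 are constant, so the expected output is:

# [[2 0]
#  [1 4]
#  [1 1]]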
Example #34
def test_variance_threshold():
    # Test VarianceThreshold with custom variance.
    for X in [data, csr_matrix(data)]:
        X = VarianceThreshold(threshold=.4).fit_transform(X)
        assert (len(data), 1) == X.shape
Example #35
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold

# Load iris data
iris = datasets.load_iris()

# Create features and target
X = iris.data
y = iris.target

# Create VarianceThreshold object with a variance with a threshold of 0.5
thresholder = VarianceThreshold(threshold=.5)

# Conduct variance thresholding
X_high_variance = thresholder.fit_transform(X)

# View first five rows with features with variances above threshold
X_high_variance[0:5]
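For reference, the per-feature variances of the iris data are roughly [0.68, 0.19, 3.10, 0.58]; only sepal width falls below 0.5, so three of the four columns survive.

# thresholder.variances_ is approximately [0.68, 0.19, 3.10, 0.58]; X_high_variance.shape == (150, 3)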
Example #36
def removeAtipicalData(X, umbral, exp_min_gen):
    sel = VarianceThreshold(threshold=(umbral * (1 - umbral)))
    sel.fit_transform(X)
    return X[X.columns[sel.get_support(indices=True)]]
Example #37
    def perform_variance_threshold(self, v_threshold):
        selector = VarianceThreshold(v_threshold)
        self.train_x = selector.fit_transform(self.train_x, self.train_y)

        self.test_x = selector.transform(self.test_x)
Example #38
train_data = np.loadtxt('TinyMNIST/trainData.csv',
                        dtype=np.float32,
                        delimiter=',')
train_labels = np.loadtxt('TinyMNIST/trainLabels.csv',
                          dtype=np.int32,
                          delimiter=',')
test_data = np.loadtxt('TinyMNIST/testData.csv',
                       dtype=np.float32,
                       delimiter=',')
test_labels = np.loadtxt('TinyMNIST/testLabels.csv',
                         dtype=np.int32,
                         delimiter=',')
class_names = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

# Feature Selection
tr_samples_size, _ = train_data.shape
all_data = np.vstack((train_data, test_data))
sel = VarianceThreshold(threshold=0.90 * (1 - 0.90))
all_data = sel.fit_transform(all_data)
train_data = all_data[:tr_samples_size]
test_data = all_data[tr_samples_size:]

tr_samples_size, feature_size = train_data.shape
te_samples_size, _ = test_data.shape
print('Train Data Samples:', tr_samples_size, ', Test Data Samples',
      te_samples_size, ', Feature Size(after feature-selection):',
      feature_size)

# In[4]:

types = []
for i in range(10):
    types.append([])
Example #39
 def variance(self, X, threshold):
     from sklearn.feature_selection import VarianceThreshold
     sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
     sel_var = sel.fit_transform(X)
     X = self.X[X.columns[sel.get_support(indices=True)]]
     return X
Example #40
PDXC = PDXC.loc[:, ~PDXC.columns.duplicated()]

GDSCM = pd.read_csv("GDSC_mutations.Paclitaxelv2.tsv",
                    sep="\t",
                    index_col=0,
                    decimal=".")
GDSCM = pd.DataFrame.transpose(GDSCM)

GDSCC = pd.read_csv("GDSC_CNA.Paclitaxelv2.tsv",
                    sep="\t",
                    index_col=0,
                    decimal=".")
GDSCC = GDSCC.drop_duplicates(keep='last')
GDSCC = pd.DataFrame.transpose(GDSCC)

selector = VarianceThreshold(0.05)
selector.fit_transform(GDSCE)
GDSCE = GDSCE[GDSCE.columns[selector.get_support(indices=True)]]

PDXC = PDXC.fillna(0)
PDXC[PDXC != 0.0] = 1
PDXM = PDXM.fillna(0)
PDXM[PDXM != 0.0] = 1
GDSCM = GDSCM.fillna(0)
GDSCM[GDSCM != 0.0] = 1
GDSCC = GDSCC.fillna(0)
GDSCC[GDSCC != 0.0] = 1

ls = GDSCE.columns.intersection(GDSCM.columns)
ls = ls.intersection(GDSCC.columns)
ls = ls.intersection(PDXE.columns)
Example #41
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif, SelectKBest, SelectFromModel, RFE, VarianceThreshold
from sklearn.model_selection import cross_val_score as cvs
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import time
from matplotlib import pyplot as plt

data = pd.read_csv(r'../data/digit_recognizor.csv')
'''
Variance filtering: drop the features whose variance is below a given threshold. It does not use the label.
This method is mainly useful for models that traverse all features, such as kNN, SVM, logistic regression and linear regression; it deletes useless features to cut the running time.
It has little effect on random forests, because a random forest already samples a random subset of features. In sklearn, a single decision tree also splits nodes on a randomly chosen subset of features.
'''
threshold = 0.1
vt_filter = VarianceThreshold(threshold=threshold)
vt_filter.fit(data)
remove_features = [
    i for i, j in zip(data.columns, vt_filter.variances_) if j <= threshold
]
print(remove_features)
data_new = vt_filter.transform(data.iloc[10000:, :])
'''
Statistical filtering:
chi2: chi-square test for feature selection; only works with discrete (non-negative) features.
f_classif: F-test for selecting classification features
mutual_info_classif: mutual information for feature selection

All of these statistics measure the strength of the relation between a feature and the label.
'''
SelectKBest(score_func=chi2, k=400).fit_transform(X=data_new[:, 1:],
Example #42
 def variance_filter(self,x_train,x_test):
     # After tuning, a threshold of 6e-5 gave the best results
     selector = VarianceThreshold(6e-5)
     # Return the variance-filtered train and test sets
     return selector.fit_transform(x_train),selector.transform(x_test)
Example #43
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.9800000000000001
exported_pipeline = make_pipeline(
    VarianceThreshold(threshold=0.1),
    BernoulliNB(alpha=1.0, fit_prior=True)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #44
# -*- coding: utf-8 -*-
import sklearn
from sklearn.datasets import load_iris

# Load the IRIS dataset
iris = load_iris()

from sklearn.feature_selection import VarianceThreshold
print VarianceThreshold(threshold=3).fit_transform(iris.data)

# Feature matrix
iris.data

# Target vector
print iris.target
Example #45
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFwe, VarianceThreshold, f_classif
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Binarizer, MinMaxScaler
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.8285375806358702
exported_pipeline = make_pipeline(
    make_union(
        MinMaxScaler(),
        make_pipeline(SelectFwe(score_func=f_classif, alpha=0.048),
                      Binarizer(threshold=0.05))),
    StackingEstimator(
        estimator=MLPClassifier(alpha=0.01, learning_rate_init=0.001)),
    VarianceThreshold(threshold=0.0001),
    MultinomialNB(alpha=0.1, fit_prior=True))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #46
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE',
                          delimiter='COLUMN_SEPARATOR',
                          dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'),
                     axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    VarianceThreshold(threshold=0.24),
    ExtraTreesClassifier(criterion="entropy",
                         max_features=0.16,
                         n_estimators=500))

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
Example #47
#Missing values or little variance
    #use a minimal variance threshold
    #normalize the variance before feature selection: divide each value by its column mean, then fit
        #this makes the variances comparable (and lower)
    #drop features that contain a lot of missing values
################################################
#Finding a good variance threshold
normalized_df = head_df / np.mean(head_df) #normalized data
normalized_df.boxplot()
plt.show()
print(normalized_df.var()) #variance of normalized data
    #the two lowest-variance features should be removed

#successfully removed the 2 low-variance features
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=0.001)# Create a VarianceThreshold feature selector
sel.fit(head_df / head_df.mean())# Fit the selector to normalized head_df
mask = sel.get_support()# Create a boolean mask
reduced_df = head_df.loc[:, mask]# Apply the mask to create a reduced dataframe

#Removing features with many missing values
df.isna().sum()#counting missing values
df.isna().sum()/len(df) #ratio of missing value
mask = df.isna().sum()/len(df) < 0.3
print(mask) #True or False
reduced_df = df.loc[:,mask] # Create a reduced dataset 
reduced_df.head()
################################################
#Pairwise correlation
    #measure strength of the correlation
################################################
Example #48
df = pd.get_dummies(data=df, columns=cols_with_categories)
print("Tamaño antes del conjunto de datos despues de recodificar las variables: {}".format(df.shape))

input("\n--- Pulsar tecla para continuar ---\n")
#https://stackoverflow.com/questions/44867219/pandas-filling-na-values-to-be-filled-based-on-distribution-of-existing-values

# create the training and test sets
X, y = df[df.columns.difference(['income'])], df['income']
X, y = shuffle(X, y, random_state=SEED)
train_x, test_x, train_y, test_y = train_test_split(X,y, test_size=0.3, stratify=y)


# build the preprocessing pipeline

preproc = [
    ("var", VarianceThreshold(0.01)),   
    ("standardize", StandardScaler()),      
    ("lasso", SelectFromModel(estimator=LassoCV(tol=0.01))),
]

p = Pipeline(preproc)

x_train_prep = p.fit_transform(train_x, train_y)
print("Descripción de los datos antes y después del preprocesado")
print("Antes: {}".format(train_x.shape))
print("Despues: {}".format(x_train_prep.shape))

input("\n--- Pulsar tecla para continuar ---\n")

# Linear model
Example #49
def variance_threshold_selector(data, threshold = 0.5):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices = True)]]
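A small usage sketch (toy DataFrame, illustrative only); unlike a bare fit_transform, the helper keeps the surviving column names:

import pandas as pd

df = pd.DataFrame({'f1': [0.0, 0.0, 0.0],  # zero variance
                   'f2': [1.0, 3.0, 5.0],
                   'f3': [2.0, 2.1, 1.9]})  # variance well below 0.5
print(variance_threshold_selector(df, threshold=0.5).columns.tolist())  # ['f2']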
Example #50
import numpy as np
X=np.arange(0,30).reshape(10,3)#arange() is the array analogue of range() and builds the sample values
# print(X)#we can consider the array X as a dataset of 10 instances, each with 3 features
X[:,1]=-1#set every element in the second column to -1
# print(X)#the second feature of every instance is now -1, so the variance of features 1 and 3 is much bigger than that of feature 2


from sklearn.feature_selection import VarianceThreshold
vt=VarianceThreshold()#create an instance that keeps only the features whose variance is big enough
Xt=vt.fit_transform(X)
# print(Xt)#fit_transform() drops feature 2
# print(vt.variances_)#.variances_ holds the variance of each feature: [ 74.25   0.    74.25]
# Before analyzing the data we should omit the features whose variance is 0, otherwise the whole process will be slowed down





Example #51
# Separate dataset for validation
data_submit = data_cl[unknown_mask]

# Separate dataset for training
X = data_cl[~unknown_mask]
Y = target[~unknown_mask]


# ### Variance Threshold
# Keep the features whose variance exceeds threshold * (1 - threshold), i.e. the variance of a Bernoulli(0.9) variable.

# In[ ]:


threshold = 0.90
vt = VarianceThreshold().fit(X)

# Find feature names
feat_var_threshold = data_cl.columns[vt.variances_ > threshold * (1-threshold)]
feat_var_threshold


# ### Top 20 most important features
# According to `RandomForestClassifier`

# In[ ]:


model = RandomForestClassifier()
model.fit(X, Y)
Example #52
Y = wins.copy().reset_index(drop=True)
X = pd.concat([means, stds], axis=1).reset_index(drop=True)

# fill in missing values
X = X.fillna(method="bfill").fillna(method="ffill")

# split the data into training and testing
np.random.seed(1)
test_idx = np.random.choice(a=X.index.values,
                            size=int(X.shape[0] / 5),
                            replace=False)
train_idx = np.array(list(set(X.index.values) - set(test_idx)))

# set up a machine learning pipeline
pipeline = Pipeline([
    ('var', VarianceThreshold()),
    ('scale', MinMaxScaler()),
    # ('model', LassoCV(eps=1e-9, n_alphas=16, n_jobs=-1)),
    # ('model', BayesianRidge()),
    ('model',
     RandomForestRegressor(n_estimators=50,
                           max_depth=8,
                           min_samples_leaf=1,
                           n_jobs=-1,
                           random_state=42)),
    # ('model', MLPRegressor(max_iter=200, hidden_layer_sizes=(128, 128), learning_rate_init=0.001, batch_size=32, activation="relu", solver="adam", learning_rate="adaptive", random_state=42)),
])

# train the model
pipeline.fit(X.iloc[train_idx, :], Y.iloc[train_idx, :])
Example #53
def main():
    """Función principal. Ejecuta el proyecto paso a paso.

       NOTA: Por motivos de unificar el código, todos los clasificadores
             considerados son un Pipeline, cuyo último paso es el
             clasificador en sí, con nombre 'clf'."""

    # Start the timer
    start = default_timer()

    # Ignore convergence warnings
    os.environ["PYTHONWARNINGS"] = "ignore::UserWarning"

    # Random seed for reproducibility
    np.random.seed(SEED)

    # Fixed number of decimals when printing vectors
    np.set_printoptions(formatter={'float': lambda x: "{:0.3f}".format(x)})

    print(
        "------- PROYECTO FINAL: AJUSTE DE MODELOS DE CLASIFICACIÓN -------\n")

    #
    # DATA LOADING
    #

    # Load the training, validation and test data (50-20-30 split)
    print("Leyendo datos de " + DATASET_NAME + "... ", end="", flush=True)
    X, y, attr_names = read_data(PATH + DATASET_NAME)
    X_train, X_val, X_test, y_train, y_val, y_test = \
        split_data(X, y, val_size = 0.2, test_size = 0.3)
    X_train_full = np.vstack((X_train, X_val))
    y_train_full = np.concatenate((y_train, y_val))
    print("Hecho.\n")

    #
    # DATA INSPECTION
    #

    if SHOW != Show.NONE:
        print("--- VISUALIZACIÓN DE LOS DATOS ---\n")

        # Show the class distribution in training and test
        print("Mostrando gráfica de distribución de clases...")
        vs.plot_class_distribution(y_train_full, y_test, N_CLASSES,
                                   SAVE_FIGURES, IMG_PATH)

        # Visualize feature importance according to a random forest
        print("Mostrando gráfica de importancia de características...")
        pipe = Pipeline([("var", VarianceThreshold()),
                         ("std", StandardScaler())])
        X_train_full_pre = pipe.fit_transform(X_train_full)
        rf = RandomForestClassifier(200,
                                    random_state=SEED,
                                    max_depth=20,
                                    n_jobs=-1)
        rf.fit(X_train_full_pre, y_train_full)

        vs.plot_feature_importance(rf.feature_importances_,
                                   n=X_train_full_pre.shape[1],
                                   pca=False,
                                   save_figures=SAVE_FIGURES,
                                   img_path=IMG_PATH)

        # Show the preprocessing plots
        print(
            "Mostrando matrices de correlación antes y después de cada preprocesado..."
        )
        preprocess_graphs(X_train_full)

        if SHOW == Show.ALL:
            # Visualize the training set in two dimensions
            print(
                "Mostrando proyección del conjunto de entrenamiento en dos dimensiones..."
            )
            vs.plot_tsne(X_train_full, y_train_full, SAVE_FIGURES, IMG_PATH)

    if DO_MODEL_SELECTION:
        clfs = fit_model_selection(X_train, X_val, y_train, y_val)
    else:
        clfs = fit_models(X_train_full, y_train_full)

    #
    # MODEL COMPARISON
    #

    print("--- COMPARACIÓN DE LOS MEJORES MODELOS ---\n")

    compare(clfs, X_train_full, X_test, y_train_full, y_test)

    # Print the total running time
    elapsed = default_timer() - start
    print("Tiempo total de ejecución: {:.3f} min".format(elapsed / 60.0))
Example #54
data_minmax['V1'] = data_minmax['V1'].apply(lambda x: math.exp(x))
data_minmax['V6'] = data_minmax['V6'].apply(lambda x: math.exp(x))
data_minmax['V30'] = np.log1p(data_minmax['V30'])
X_scaled = pd.DataFrame(preprocessing.scale(data_minmax),
                        columns=data_minmax.columns)
train_x = X_scaled.iloc[0:len(df_train)]
test = X_scaled.iloc[len(df_train):]
Y = df_train['target']

## feature selection---Through the variance threshold
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

threshold = 0.85
vt = VarianceThreshold().fit(train_x)
feat_var_threshold = train_x.columns[vt.variances_ > threshold *
                                     (1 - threshold)]
train_x = train_x[feat_var_threshold]
test = test[feat_var_threshold]

## single feature---Select features according to the k highest scores.
# see detail -- https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
X_scored = SelectKBest(score_func=f_regression, k=10).fit(
    train_x, Y)  # F-value between label/feature for regression tasks.
print(X_scored)
feature_scoring = pd.DataFrame({
    'feature': train_x.columns,
    'score': X_scored.scores_
})
head_feature_num = 18
Example #55
    for elem in dictionary:
        target_vec.append(dictionary[elem]["SIN"])
        del dictionary[elem]["SIN"]

print("Vectorizando...")
wordindex = list([word for word in dictionary])  # keep the indices
feat_dict = [dictionary[word] for word in dictionary.keys()]
dv = DictVectorizer(sparse=False)
word_vectors = dv.fit_transform(feat_dict)

print("Normalizando...")
vec_sums = word_vectors.sum(axis=1)
word_vectors = word_vectors / vec_sums[:, numpy.newaxis]

print("Reduciendo dimensionalidad...")
selector = VarianceThreshold(threshold=0.000000001)
new_word_vecs = selector.fit_transform(word_vectors)

#selected = SelectPercentile(chi2, percentile = 10)
#word_vecs_new=selected.fit_transform(new_word_vecs,target_vec)

if mode:
    selected = SelectKBest(chi2, k=800)
else:
    selected = SelectKBest(chi2, k=5000)
word_vecs_new = selected.fit_transform(new_word_vecs, target_vec)

print("Kmeans...")
kmwv = numpy.array(word_vecs_new)
clusters_size = 45  # NUMBER OF CLUSTERS TO USE
kmeans = KMeans(clusters_size, max_iter=500, random_state=0).fit(kmwv)
Example #56
# -*- coding: utf-8 -*-
##########################################################################
# Project: COMP6004 - Machine learning pipeline for data analysis
# File: 03-featureExtraction.py
# Author: Diego Bueno - [email protected] 
# Date: 20/04/2021
# Description: Applying feature extraction to step03 of ML pipeline.
#
##########################################################################
# Maintenance                            
# Author: 
# Date:  
# Description: A
#
##########################################################################>
import numpy as np
import pandas as pd
from functions import openfile
from functions import savefile
from functions import convert
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import mean_absolute_error
#calling the function to Load data pre-reading on task 1
print("\nReading the step02 file\n")
db = openfile('data/step02.csv')
print("\nChecking the current shape of the data:")
rows, columns = db.shape
Example #57
def run_main():
    """
        Main function
    """
    if is_first_run:
        # 1. Split the dataset
        print('分割数据集')
        all_gender_age = pd.read_csv(
            os.path.join(dataset_path, gender_age_filename))
        df_train, df_test = split_train_test(all_gender_age)
        # Inspect the basic class counts of the train and test sets
        print('训练集中各类的数据个数:', df_train.groupby('group').size())
        print('测试集中各类的数据个数:', df_test.groupby('group').size())

        # Save the split datasets
        df_train.to_csv(os.path.join(dataset_path, train_gender_age_filename),
                        index=False)
        df_test.to_csv(os.path.join(dataset_path, test_gender_age_filename),
                       index=False)

    # 2. Load the data
    print('加载数据')
    # Load the data
    gender_age_train = pd.read_csv(os.path.join(dataset_path,
                                                train_gender_age_filename),
                                   index_col='device_id')
    gender_age_test = pd.read_csv(os.path.join(dataset_path,
                                               test_gender_age_filename),
                                  index_col='device_id')

    # Use only part of the data for the experiment
    percent = 0.1
    gender_age_train = get_part_data(gender_age_train, percent=percent)
    gender_age_test = get_part_data(gender_age_test, percent=percent)

    phone_brand_device_model = pd.read_csv(
        os.path.join(dataset_path, phone_brand_device_model_filename))
    # Drop duplicate rows
    phone_brand_device_model = phone_brand_device_model.drop_duplicates(
        'device_id').set_index('device_id')

    events = pd.read_csv(os.path.join(dataset_path, events_filename),
                         usecols=['device_id', 'event_id'],
                         index_col='event_id')
    app_events = pd.read_csv(os.path.join(dataset_path, app_events_filename),
                             usecols=['event_id', 'app_id'])
    # app_labels = pd.read_csv(os.path.join(dataset_path, app_labels_filename))

    # 3. Feature engineering
    # 3.1 Phone brand feature
    # Use LabelEncoder to convert the categories to integer codes
    brand_label_encoder = LabelEncoder()
    brand_label_encoder.fit(phone_brand_device_model['phone_brand'].values)
    phone_brand_device_model['brand_label_code'] = \
        brand_label_encoder.transform(phone_brand_device_model['phone_brand'].values)
    gender_age_train['brand_label_code'] = phone_brand_device_model[
        'brand_label_code']
    gender_age_test['brand_label_code'] = phone_brand_device_model[
        'brand_label_code']

    # Use OneHotEncoder to convert the integer codes to one-hot vectors
    brand_onehot_encoder = OneHotEncoder()
    brand_onehot_encoder.fit(
        phone_brand_device_model['brand_label_code'].values.reshape(-1, 1))
    tr_brand_feat = brand_onehot_encoder.transform(
        gender_age_train['brand_label_code'].values.reshape(-1, 1))
    te_brand_feat = brand_onehot_encoder.transform(
        gender_age_test['brand_label_code'].values.reshape(-1, 1))

    print('[手机品牌]特征维度:', tr_brand_feat.shape[1])

    # 3.2 Phone model feature
    # Concatenate the phone brand and model strings
    phone_brand_device_model['brand_model'] = \
        phone_brand_device_model['phone_brand'].str.cat(phone_brand_device_model['device_model'])

    # Use LabelEncoder to convert the categories to integer codes
    model_label_encoder = LabelEncoder()
    model_label_encoder.fit(phone_brand_device_model['brand_model'].values)
    phone_brand_device_model['brand_model_label_code'] = \
        model_label_encoder.transform(phone_brand_device_model['brand_model'].values)
    gender_age_train['brand_model_label_code'] = phone_brand_device_model[
        'brand_model_label_code']
    gender_age_test['brand_model_label_code'] = phone_brand_device_model[
        'brand_model_label_code']

    # Use OneHotEncoder to convert the integer codes to one-hot vectors
    model_onehot_encoder = OneHotEncoder()
    model_onehot_encoder.fit(
        phone_brand_device_model['brand_model_label_code'].values.reshape(
            -1, 1))
    tr_model_feat = model_onehot_encoder.transform(
        gender_age_train['brand_model_label_code'].values.reshape(-1, 1))
    te_model_feat = model_onehot_encoder.transform(
        gender_age_test['brand_model_label_code'].values.reshape(-1, 1))

    print('[手机型号]特征维度:', tr_model_feat.shape[1])

    # 3.3 Installed-app features
    device_app = app_events.merge(events,
                                  how='left',
                                  left_on='event_id',
                                  right_index=True)
    # Total number of app runs per device
    n_run_s = device_app['app_id'].groupby(device_app['device_id']).size()

    # Number of distinct apps run per device
    n_app_s = device_app['app_id'].groupby(device_app['device_id']).nunique()

    gender_age_train['n_run'] = n_run_s
    gender_age_train['n_app'] = n_app_s

    # Fill in missing values
    gender_age_train['n_run'].fillna(0, inplace=True)
    gender_age_train['n_app'].fillna(0, inplace=True)

    gender_age_test['n_run'] = n_run_s
    gender_age_test['n_app'] = n_app_s

    # Fill in missing values
    gender_age_test['n_run'].fillna(0, inplace=True)
    gender_age_test['n_app'].fillna(0, inplace=True)

    tr_run_feat = gender_age_train['n_run'].values.reshape(-1, 1)
    tr_app_feat = gender_age_train['n_app'].values.reshape(-1, 1)

    te_run_feat = gender_age_test['n_run'].values.reshape(-1, 1)
    te_app_feat = gender_age_test['n_app'].values.reshape(-1, 1)

    # 3.4 Combine all features
    tr_feat = np.hstack((tr_brand_feat.toarray(), tr_model_feat.toarray(),
                         tr_run_feat, tr_app_feat))
    te_feat = np.hstack((te_brand_feat.toarray(), te_model_feat.toarray(),
                         te_run_feat, te_app_feat))
    print('特征提取结束')
    print('每个样本特征维度:', tr_feat.shape[1])

    # 3.5 Feature scaling
    scaler = StandardScaler()
    tr_feat_scaled = scaler.fit_transform(tr_feat)
    te_feat_scaled = scaler.transform(te_feat)

    # 3.6 Feature selection
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    tr_feat_scaled_sel = sel.fit_transform(tr_feat_scaled)
    te_feat_scaled_sel = sel.transform(te_feat_scaled)

    # 3.7 PCA dimensionality reduction
    pca = PCA(n_components=0.95)  # keep enough components to explain 95% of the variance
    tr_feat_scaled_sel_pca = pca.fit_transform(tr_feat_scaled_sel)
    te_feat_scaled_sel_pca = pca.transform(te_feat_scaled_sel)
    print('特征处理结束')
    print('处理后每个样本特征维度:', tr_feat_scaled_sel_pca.shape[1])

    # 4. Encode the labels
    group_label_encoder = LabelEncoder()
    group_label_encoder.fit(gender_age_train['group'].values)
    y_train = group_label_encoder.transform(gender_age_train['group'].values)
    y_test = group_label_encoder.transform(gender_age_test['group'].values)

    # 5. Train the models
    # 5.1 Logistic regression model
    print('训练逻辑回归模型...')
    lr_param_grid = [{'C': [1e-3, 1e-2, 1e-1, 1, 10, 100]}]
    lr_model = LogisticRegression()
    best_lr_model = get_best_model(lr_model,
                                   tr_feat_scaled_sel_pca,
                                   y_train,
                                   lr_param_grid,
                                   cv=3)
    y_pred_lr = best_lr_model.predict_proba(te_feat_scaled_sel_pca)

    # 5.2 SVM
    print('训练SVM模型...')
    svm_param_grid = [
        {
            'C': [1e-2, 1e-1, 1, 10, 100],
            'gamma': [0.001, 0.0001],
            'kernel': ['rbf']
        },
    ]

    # Set probability=True so that predicted probabilities can be output
    svm_model = svm.SVC(probability=True)
    best_svm_model = get_best_model(svm_model,
                                    tr_feat_scaled_sel_pca,
                                    y_train,
                                    svm_param_grid,
                                    cv=3)
    y_pred_svm = best_svm_model.predict_proba(te_feat_scaled_sel_pca)

    # 6. Check the results
    print('逻辑回归模型 logloss:', log_loss(y_test, y_pred_lr))
    print('SVM logloss:', log_loss(y_test, y_pred_svm))
"""
# Variance-based selection
First compute the variance of each feature, then, given a threshold, keep the features whose variance is larger than the threshold.

Feature selection uses the VarianceThreshold class from the feature_selection module.
The code is as follows:
"""
from sklearn.feature_selection import VarianceThreshold
from sklearn.datasets import load_iris
iris = load_iris()
print(iris.data[0:5])
""" 
# Variance-based selection; the return value is the data after feature selection
# The threshold parameter is the variance threshold
"""
# Fit the data
selector = VarianceThreshold(threshold=3).fit(iris.data, iris.target)
# Transform the data
data = selector.transform(iris.data)
print(data[0:5])
print(selector.variances_)
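For reference, selector.variances_ for the iris data comes out as roughly [0.68, 0.19, 3.10, 0.58], so with threshold=3 only petal length survives.

# data.shape == (150, 1); only the petal length column has a variance above 3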
Example #59
    f"var_thresh={var_thresh}-train_size={train_size}-n_components={n_components}"
    f"-max_iter={max_iter}-with_std={with_std}-seed={seed}")
output_dir = output_dir / global_params

if not os.path.isdir(output_dir):
    print(f"{output_dir} is not a directory... creating.")
    os.mkdir(output_dir)
    os.mkdir(output_dir / "data")
    os.mkdir(output_dir / "models")

#%%
sequencing_df, annotation_df = load_scRNAseq(fillna=True)

#%% throw out some genes with low variance
X = sequencing_df.values.copy()
var_thresh = VarianceThreshold(threshold=var_thresh)
X = var_thresh.fit_transform(X)
gene_index = sequencing_df.columns
original_n_genes = len(gene_index)
gene_index = gene_index[var_thresh.get_support()]
sequencing_df = sequencing_df[gene_index]
new_n_genes = len(gene_index)
print(f"Number of genes removed: {original_n_genes - new_n_genes} "
      f"out of {original_n_genes}")

#%%
np.random.seed(seed)

neuron_index = sequencing_df.index
y = sequencing_df.index.get_level_values(level="Neuron_type").values
Example #60
 def __init__(self):
     #pca_boi = PCA(n_components=8)
     filtering = VarianceThreshold(threshold=4)
     #self.transformer = Pipeline([("first", filtering),("second", pca_boi)])
     #self.transformer = pca_boi
     self.transformer = filtering