def select_features_without_label(features: pd.DataFrame,
                                  missing_threshold=0.99,
                                  correlation_threshold=1.0) -> pd.DataFrame:
    fs = FeatureSelector(data=features)
    fs.identify_missing(missing_threshold)
    fs.identify_single_unique()
    if correlation_threshold < 1:
        fs.identify_collinear(correlation_threshold)
        return fs.remove(methods=['missing', 'single_unique', "collinear"])
    else:
        return fs.remove(methods=['missing', 'single_unique'])
Example #2
def prepare_data():
	"""
	Main entry point of this module: calls the functions above to read, clean,
	and save our data in a usable form.
	I created this function so that dataset_prepare.py can be used as a Python
	module in our main program.

	Return values: the training X, Y datasets, the testing X, Y datasets, and the labels
	"""

	# read our csv files
	features_df = pd.read_csv("UNSW_NB15_features.csv",encoding = "ISO-8859-1")
	training_df = pd.read_csv("training.csv").drop("id",axis=1)
	testing_df = pd.read_csv("testing.csv").drop("id",axis=1)

	fs = FeatureSelector(data = training_df)
	fs.identify_collinear(correlation_threshold=0.85)
	training_df = fs.remove(methods = ['collinear'],keep_one_hot = True)
	columnlist = list(training_df)
	testing_df = testing_df[columnlist]
	
	training_df = training_df.sample(frac=1)
	testing_df = testing_df.sample(frac=1)
	train_x,train_y,test_x,test_y, labels = clean_nominals_and_create_our_datasets(training_df,testing_df)

	training_df = training_df.drop(["attack_cat","label"], axis=1)
	print("The features we will use are: ", np.array(list(training_df)))

	return train_x,train_y,test_x,test_y,labels
Example #3
def transform_to_nominal(): 
    # read our csv files
    training_df = pd.read_csv("training.csv").drop("id",axis=1)
    
    # Feature selector
    fs = FeatureSelector(data = training_df)
    fs.identify_collinear(correlation_threshold=0.85)
    training_df = fs.remove(methods = ['collinear'],keep_one_hot = True)

    training_df = training_df.sample(frac=1)
    training_df = training_df.drop(["attack_cat","label"], axis=1)
    columnList = list(training_df)
    labels,nominal_cols = retrieve_classes(training_df)
    
    return labels,nominal_cols,columnList
Example #4
def featureselect(datas, target):
    import os
    os.chdir('c:\\Users\\SA\\python\\練習py')
    from feature_selector import FeatureSelector
    fs = FeatureSelector(data=datas, labels=target)

    fs.identify_missing(missing_threshold=0.6)
    fs.identify_collinear(correlation_threshold=0.9)
    fs.identify_zero_importance(task='classification',
                                eval_metric='auc',
                                n_iterations=10,
                                early_stopping=False)
    fs.identify_low_importance(cumulative_importance=0.9)

    train_removed = fs.remove(methods='all')
    return train_removed
Example #5
def Bestfeature_from_cummulative_importance(inFile, outFile):

    df = pd.read_csv(inFile, sep='\t')
    print(df.shape)
    train_labels = df['class_label']
    train = df.drop(columns=['class_label'])
    fs = FeatureSelector(data=train, labels=train_labels)
    fs.identify_zero_importance(task='classification',
                                eval_metric='auc',
                                n_iterations=10,
                                early_stopping=True)
    zero_importance_features = fs.ops['zero_importance']
    #fs.plot_feature_importances(threshold = 0.99, plot_n = 12)
    importance_index = np.min(
        np.where(fs.feature_importances['cumulative_importance'] > 0.99))
    fs.identify_low_importance(cumulative_importance=0.99)
    print(importance_index)
    train_removed_all = fs.remove(methods=['zero_importance'],
                                  keep_one_hot=False)
    train_removed_all = pd.concat([train_removed_all, train_labels], axis=1)
    train_removed_all.to_csv(outFile, sep='\t', index=False)
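Example #6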
 def feature_engineering(self, x_data, y_data, train=None):
     # Feature selection: group the columns by category
     cols = x_data.columns
     # consumption
     consume_col = cols[0:10]
     # recruitment
     recruit_col = cols[10:22]
     # speed-ups
     acceleration_col = cols[22:32]
     # buildings
     build_col = cols[32:48]
     # technology
     science_col = cols[48:97]
     # pvp
     pvp_col = cols[97:103]
     # payment
     pay_col = cols[103:106]
     # label
     # label_col = cols[108]
     if train:
         fs = FeatureSelector(data=x_data, labels=DataFrame(y_data))
         fs.identify_all(
             selection_params={
                 'missing_threshold': 0.6,
                 'correlation_threshold': 0.98,
                 'task': 'classification',
                 'eval_metric': 'auc',
                 'cumulative_importance': 0.99
             })
         self.drop_columns = fs.ops
         with open('drop_columns.pkl', 'wb') as file:
             pickle.dump(self.drop_columns, file)
         self.feature_df = fs.remove(methods='all', keep_one_hot=False)
     else:
         # Flatten the column lists recorded by each identification method
         drop_list = []
         for key in self.drop_columns.keys():
             for value in self.drop_columns[key]:
                 drop_list.append(value)
         self.feature_df = x_data.drop(drop_list, axis=1)
     print(self.drop_columns)
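On a later, non-training call, the recorded lists would have to be reloaded before the else branch runs; a minimal sketch assuming the drop_columns.pkl written above (not code from the original):

with open('drop_columns.pkl', 'rb') as file:
    drop_columns = pickle.load(file)  # dict of {method: [columns]} as dumped during training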
Example #7
    def runFeatureSelector(self, df):
        logging.info("Running Feature Selection")
        fs = FeatureSelector(data=df, labels=self.targets)

        # Identify Missing Values
        fs.identify_missing(missing_threshold=0.6)

        # Identify Collinearity
        fs.identify_collinear(correlation_threshold=0.98)
        fs.record_collinear.to_csv(".\\utils\\csv\\record_collinear.csv")

        # Identify Single Unique
        fs.identify_single_unique()
        fs.record_single_unique.to_csv(
            ".\\utils\\csv\\record_single_unique.csv")

        # Zero importance
        fs.identify_zero_importance(task='classification',
                                    eval_metric='multi_logloss',
                                    n_iterations=10,
                                    early_stopping=True)
        fs.record_zero_importance.to_csv(
            ".\\utils\\csv\\record_zero_importance.csv")

        # Low Importance
        fs.identify_low_importance(cumulative_importance=0.99)
        fs.feature_importances.to_csv(".\\utils\\csv\\feature_importance.csv")

        # Generate a summary of all operations
        summary = pd.DataFrame.from_dict(fs.ops, orient='index')
        summary.to_csv(".\\utils\\csv\\summary.csv")

        # If the drop flag is set, remove the suggested features
        if self.drop == 1:
            df = fs.remove(methods='all')

        return df
Example #8
    def remove_unnecessary_features(self, auto=False):
        if auto:
            self.processed_data = self.processed_data.drop(
                columns=self.predefined_skip_features)
        else:
            fs = FeatureSelector(data=self.processed_data.drop("label",
                                                               axis=1),
                                 labels=self.processed_data["label"])
            fs.identify_missing(missing_threshold=0.6)
            fs.identify_collinear(correlation_threshold=0.98)

            fs.identify_zero_importance(task='classification',
                                        eval_metric='auc',
                                        n_iterations=10,
                                        early_stopping=False)

            fs.identify_low_importance(cumulative_importance=0.99)
            fs.identify_single_unique()
            # Remove the features from all methods (returns a df)
            labels = self.processed_data["label"]
            self.processed_data = fs.remove(methods='all')
            self.processed_data["label"] = labels
Example #9
def select_best_features(data_file_path, saveto_path="Default"):

    mod_data_file_path = strip_header(data_file_path)

    if saveto_path == "Default":
        saveto_path = replace_ext(data_file_path, '_reduced.csv')

    X = pd.read_csv(mod_data_file_path)
    y = X['Label']
    X = X.drop(columns=['Label'])

    feature_selector = FeatureSelector(data=X, labels=y)
    feature_selector.identify_single_unique()
    feature_selector.identify_collinear(correlation_threshold=0.98)
    feature_selector.identify_zero_importance(task='classification',
                                              eval_metric='auc',
                                              n_iterations=10,
                                              early_stopping=True)
    features_1hot = feature_selector.one_hot_features
    features_base = feature_selector.base_features
    feature_selector.identify_low_importance(cumulative_importance=0.99)

    X_dash = feature_selector.remove(methods=[
        'single_unique', 'collinear', 'zero_importance', 'low_importance'
    ],
                                     keep_one_hot=False)
    X_dash['Label'] = y

    X_dash.to_csv(saveto_path, index=False)

    meta_data = [str(X_dash.shape[0]), str(X_dash.shape[1] - 1)]
    with open(saveto_path, 'r') as fh:
        contents = fh.read()
    contents = ','.join(meta_data) + '\n' + contents
    with open(saveto_path, 'w') as fh:
        fh.write(contents)

    if os.path.exists(mod_data_file_path):
        os.remove(mod_data_file_path)  # clean up the temporary header-stripped file
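With the metadata prepended, the saved file begins with a line of the form num_samples,num_features ahead of the normal CSV header; e.g. (illustrative values only):

150,4
feat_a,feat_b,feat_c,feat_d,Label
...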
Example #10
fs.identify_single_unique()
single_unique = fs.ops['single_unique']

fs.identify_collinear(correlation_threshold=0.95)
correlated_features = fs.ops['collinear']

fs.identify_zero_importance(task='classification',
                            eval_metric='auc',
                            n_iterations=10,
                            early_stopping=True)
zero_importance_features = fs.ops['zero_importance']

fs.identify_low_importance(cumulative_importance=0.99)
low_importance_features = fs.ops['low_importance']

X_train = fs.remove(methods='all', keep_one_hot=False)

X_test = X_test.drop(columns=fs.removed_features)
test = test.drop(columns=fs.removed_features)

clf1 = RandomForestClassifier(n_estimators=8000, max_depth=8)
clf1.fit(X_train, np.ravel(y_train))
pred = clf1.predict_proba(X_test)[:, 1]  # AUC needs scores/probabilities, not hard labels
score = roc_auc_score(y_test, pred)
final = clf1.predict(test)

final = pd.Series(final)
answer = pd.concat([test1['index'], final], axis=1)
answer.columns = ['index', 'TARGET']
answer.to_csv("submission.csv", index=False)
Example #11
# Feature scaling
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(sc.transform(X_test), columns=X_test.columns)

# Feature selection (remove highly correlated features)
from feature_selector import FeatureSelector

n = X_train.shape[1]  # number of features before selection
fs = FeatureSelector(data=X_train)
fs.identify_collinear(correlation_threshold=0.7)  # identify collinear features on the training set
corr = fs.ops['collinear']
X_train = fs.remove(methods=['collinear'])  # drop them from the training set
to_remove = pd.unique(fs.record_collinear['drop_feature'])  # names of the dropped features
X_test = X_test.drop(columns=to_remove)  # drop the same features from the test set

# Create the artificial neural network
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

num_input_nodes = X_train.shape[1]
num_output_nodes = 1
num_hidden_nodes = (num_input_nodes + num_output_nodes) // 2  # a typical heuristic
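The snippet stops before the network is assembled; a minimal sketch of how these sizes might be wired together (an assumed continuation, not the original code):

classifier = Sequential()
classifier.add(Dense(units=num_hidden_nodes, activation='relu',
                     input_dim=num_input_nodes))
classifier.add(Dropout(rate=0.1))
classifier.add(Dense(units=num_output_nodes, activation='sigmoid'))
classifier.compile(optimizer='adam', loss='binary_crossentropy',
                   metrics=['accuracy'])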
Example #12
Now we will apply the collinear, zero importance, low importance, and single unique methods to select which columns to remove
"""

fs.identify_collinear(correlation_threshold=0.975)

fs.identify_zero_importance(task = 'classification', eval_metric = 'auc', 
                            n_iterations = 10, early_stopping = True)

fs.identify_low_importance(cumulative_importance = 0.99)

fs.identify_single_unique()

to_remove = fs.check_removal()

feature_df = fs.remove(
    methods = ['collinear', 'zero_importance', 'low_importance', 'single_unique'],
    keep_one_hot=False
)

feature_df.head()

category_columns = [
                    'protocol_type',
                    'service',
                    'flag'
]

feature_df[category_columns] = feature_df[category_columns].astype('category')

categories = {
    'protocol_type' : feature_df.protocol_type.cat.categories,
    'service' : feature_df.service.cat.categories,
    'flag' : feature_df.flag.cat.categories
}
Example #13
# list of zero importance features
zero_importance_features = fs.ops['zero_importance']

# %%
# Find the low-contribution features: once the cumulative importance of the
# top-ranked features exceeds the threshold (0.9 here), the remaining
# features are flagged as low-importance
fs.identify_low_importance(cumulative_importance=0.9)
fs.record_low_importance

# %%
# Sort to surface the highest-contributing factors
fs.feature_importances.sort_values(by='cumulative_importance')

# %%
# methods can be customized to remove only the categories you choose
train_removed = fs.remove(methods='all')

# %%
all_to_remove = fs.check_removal()
all_to_remove


Example #14
    train_data.append(df)

# Names of the 12 feature sets
file_name = ["AtomPairs2D","AtomPairs2DCount","EState", "Extended", "Fingerprinterd", "GraphOnly",
"KlekotaRoth", "KlekotaRothCount", "MACCS", "Pubchem", "Substructure", "SubstructureCount"]
file_name = sorted(file_name)  # sort the names alphabetically

#################
# Load one training file to get the class labels
train_label = pd.read_csv("Data/DILI_data/DILI_train_MF/DILI_train_AtomPairs2D.csv")

# Run feature selection and add the labels back for each training dataset
for train, name in zip(train_data, file_name):
    feature_columns = []
    labels = train_label["class."]
    X_train = train.drop(labels = "Name", axis = 1)
    fs = FeatureSelector(data = X_train, labels = labels)
    fs.identify_all(selection_params = {'missing_threshold': 0.8, 'correlation_threshold': 0.98, 
                                        'task': 'classification', 'eval_metric': 'auc', 
                                        'cumulative_importance': 0.99,'num_threads':-1})
    train_removed_all = fs.remove(methods = 'all', keep_one_hot=False) 
    print('Original Number of Features', train.shape[1]) 
    print('Final Number of Features: ', train_removed_all.shape[1]) 
    train_removed_all.head()
    feature_columns.extend(train_removed_all.columns)
    feature_columns = pd.DataFrame(feature_columns,index=None)
    feature_columns.to_csv('Features_'+ name+'.csv',index = False, header = name)
    train_removed_all['class.']=labels
    train_removed_all.to_csv('Data/Feature_Data/Feature_Data/Feature_Train_'+ name + '.csv', index=False, header=True)

Example #15
#feature_selector
from feature_selector import FeatureSelector
fs = FeatureSelector(data=train_data, labels=train_label)

# Find features with a single unique value (zero variance)
fs.identify_single_unique()

# Features with zero importance according to a gradient boosting model
fs.identify_zero_importance(task='classification',
                            eval_metric='auc',
                            n_iterations=5,
                            early_stopping=True)
print("finish zero importance analysis")
fs.identify_low_importance(cumulative_importance=0.99)
print("finish low importance analysis")
train_data = fs.remove(methods='all')
print("finish removing train_data")

# Keep only the columns that survived selection on the training data
for col in test_data.columns:
    if col not in train_data.columns:
        test_data = test_data.drop([col], axis=1)

print("done with feature selection!")
print(f"training data: {train_data.shape}")
print(f"testing data: {test_data.shape}")

#lightgbm
lgb_train = lgb.Dataset(train_data, train_label)
lgb_eval = lgb.Dataset(test_data, test_label, reference=lgb_train)
params = {
    'boosting_type': 'gbdt',
Example #16
# To keep a given fraction of the total feature importance, we need to pass in
# a cumulative_importance that accounts for that fraction.

fs.identify_low_importance(cumulative_importance=0.99)

low_importance_features = fs.ops['low_importance']
print(low_importance_features[:5])

fs.plot_feature_importances(threshold=0.99, plot_n=12)
plt.show()

# 6   Removing Features

# Removing Features:    This method returns the resulting data, which we can then use for machine learning.
#                       The original data remains accessible in the data attribute of the Feature Selector.
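For instance, the untouched frame can be recovered directly (a minimal sketch using the data attribute described above):

original_train = fs.data  # the DataFrame originally passed to FeatureSelector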

train_no_missing = fs.remove(methods=['missing'])  # 17 features identified so far
train_no_missing_zero = fs.remove(methods=['missing',
                                           'zero_importance'])  # 66 + 17 = 83 identified so far

all_to_remove = fs.check_removal()  # check every feature slated for removal
print(all_to_remove[0:])

train_removed = fs.remove(methods='all')  # drop all of the flagged features

# 7   Handling One-Hot Features

train_removed_all = fs.remove(methods='all', keep_one_hot=False)

print('Original Number of Features', train.shape[1])
print('Final Number of Features: ', train_removed_all.shape[1])
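Example #17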
correlation_threshold is the allowed correlation between features.
task is the kind of task being run, and eval_metric is the evaluation metric to use.
cumulative_importance is the running total over the features sorted by importance; it determines how many features are needed before the cumulative importance reaches 0.95.
"""

fs = FeatureSelector(data=x, labels=y)
fs.identify_all(
    selection_params={
        'missing_threshold': 0.6,
        'correlation_threshold': 0.9,
        'task': 'regression',
        'eval_metric': 'mse',
        'cumulative_importance': 0.95
    })

choose = fs.remove(methods=['missing', 'single_unique', 'zero_importance'],
                   keep_one_hot=True)

# Use the selected feature set to build the training and prediction data
x = x[choose.columns.values]
X_predict = df_predict[choose.columns.values]
choose.columns

# Because the classes are imbalanced, 50% of the label-1 samples are set aside as the test set
label_1 = train_data_1['target']
label_0 = train_data_0['target']
train_data_1 = train_data_1[choose.columns.values]
train_data_0 = train_data_0[choose.columns.values]
X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(train_data_0,
                                                            label_0,
                                                            test_size=.2,
                                                            random_state=333)
Example #18
# Low-importance feature statistics
fs.identify_low_importance(cumulative_importance=0.99)
df_low_importance = fs.feature_importances
print(df_low_importance.sort_values('importance', ascending=False).head(20))

# Run all of the identification methods in one call
print('go')
fs.identify_all(
    selection_params={
        'missing_threshold': 0.7,
        'correlation_threshold': 0.99,
        'task': 'classification',
        'eval_metric': tpr_weight_funtion_lc,
        'cumulative_importance': 0.999
    })

# Remove the features flagged by every method
# NOTE: the stock feature_selector returns a single DataFrame from remove();
# this two-value unpacking assumes a locally modified version that also
# returns the list of removed features
left_feature, removed_feature = fs.remove(methods=[
    'missing', 'single_unique', 'collinear', 'zero_importance',
    'low_importance'
],
                                          keep_one_hot=True)
print('left_feature\n ', left_feature.columns, left_feature.shape)
print(
    'removed_feature\n',
    len(removed_feature),
    '\n',
    removed_feature,
)
Example #19
print("# identify_low_importance")
fs.identify_low_importance(cumulative_importance=0.99)
low_importance_features = fs.ops["low_importance"]
with open("low_importance.txt", "w") as f:
    for index, low_importance_feature in enumerate(low_importance_features):
        f.write("特征个数:{}  特征名称:{}\n".format(index + 1, low_importance_feature))
print("#-----------------------------------------#")
print("\n")

print("#-----------------------------------------#")
print("移除上述方法判断出来的不需要特征")
print("输出需要被移除的特征")
feature_remove = fs.check_removal()
for i in feature_remove:
    print("移除特征:{}".format(i))
data_remove_feature = fs.remove(methods="all")
print("原始特征个数:{}".format(data.shape[1]))
print("当前特征个数:{}".format(data_remove_feature.shape[1]))
print("#-----------------------------------------#")
print("\n")

print("#---------------------------------#")
print("剩下特征缺失值使用0来进行填充")
data = data_remove_feature.replace(np.NaN, 0)
if data.isnull().any().any():
    print("数据集中存在数据缺失")
    print(data.shape[0] - data.count())
else:
    print("数据集中不存在参数缺失")
print("#---------------------------------#")
print("\n")
Example #20
#-- Separate features from labels
y = df['target']
train_labels = y
df_feats = df.drop(columns = ['target'])

#-- Create an instance
fs = FeatureSelector(data = df_feats, labels = train_labels)

#-- Identify redundant features
if USE_LEARNER_FOR_FEATURE_SELECTION:
    # NOT COMPLETE
    fs.identify_all(selection_params = {'missing_threshold': 0.6, 'correlation_threshold': 0.98, 
                                    'task': 'classification', 'eval_metric': 'auc', 
                                     'cumulative_importance': 0.99})
    #-- Get valuable features   
    X = fs.remove(methods = 'all', keep_one_hot = True)

else:
    #-- Features with missing values greater than threshold 
    fs.identify_missing(missing_threshold = MISSING_VALUE_THRESHOLD)
    #-- Correlated features
    fs.identify_collinear(correlation_threshold = CORRELATION_THRESHOLD)
    #-- Single unique value
    fs.identify_single_unique()
    
    #-- To list the available keys: fs.ops.keys()
    missing_features = list(fs.ops['missing'])
    correlated_features = list(fs.ops['collinear'])
    single_value = list(fs.ops['single_unique'])
    
    r = set(flatten([missing_features, correlated_features, single_value]))
Пример #21
0
X_train_df = mapper.fit_transform(X_train.copy())
X_test_df = mapper.transform(X_test.copy())

# Feature selection
fs = FeatureSelector(data=X_train_df, labels=y_train)
fs.identify_all(
    selection_params={
        'missing_threshold': 0.6,
        'correlation_threshold': 0.98,
        'task': 'classification',
        'eval_metric': 'auc',
        'cumulative_importance': 0.99
    })
fs.feature_importances.head()

train_removed_all_once = fs.remove(methods='all', keep_one_hot=True)
test_removed_all_once = X_test_df[train_removed_all_once.columns]

# Model training pipeline
xgb_param = {
    'eta': 0.5,
    'silent': 0,
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'gamma': 0.0001,
    'min_child_weight': 20,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'auc',
    'scale_pos_weight': 1,
    'eval_train': 1