def _selected_dump_name(dump_file, feat_type, K):
    """Build the output pickle name for one (feature type, K) combination."""
    # NOTE(review): split(".")[0] cuts at the FIRST dot of the whole path, so
    # "./data/a.pkl" yields an empty prefix. Kept as-is because other code may
    # rebuild the same name to locate these files -- confirm before changing.
    return dump_file.split(".")[0] + "_" + feat_type + "_" + str(K) + ".pkl"


def _reindex_vocabulary(vocabulary, selected_columns):
    """Map every surviving word to its column position in the reduced matrix.

    ``selected_columns[j]`` is the original column that ends up as column
    ``j`` after selection, so a word's new index is the position of its old
    column inside ``selected_columns``.  Words whose column was dropped are
    omitted from the result.
    """
    new_position = {}
    for new_col, old_col in enumerate(selected_columns):
        new_position[int(old_col)] = new_col
    return dict(
        (word, new_position[col])
        for word, col in vocabulary.items()
        if col in new_position
    )


def feat_selection(dump_file, feat_type, K_list=None):
    """Select the K best features of a pickled data set.

    ``dump_file`` must contain five consecutively pickled objects, in this
    order: the vocabulary (word -> column index), ``vec_type``, the feature
    matrix ``x``, the labels ``y`` and ``idx``.  The last three are converted
    with ``.tocsr()`` (presumably scipy sparse matrices).  Feature scores are
    fitted on the first 80% of the rows only.

    Arguments:
        dump_file: path of the pickle produced by the vectorisation step.
        feat_type: ``"chi2"`` or ``"mi"``; an empty string makes the function
            return immediately, any other value only loads the data.
        K_list: iterable of feature counts to keep.  Defaults to no values.

    Side effects:
        * rebinds the module-level ``word_dic`` to the loaded vocabulary;
        * ``"chi2"``: for every K writes ``<prefix>_chi2_<K>.pkl`` (skipped
          when the file already exists) holding the reduced vocabulary,
          ``vec_type``, the reduced matrix, ``y`` and ``idx``;
        * ``"mi"``: writes the words ranked by normalised mutual information
          to ``res/feature_extraction_mi.out`` (only when ``K_list`` is not
          empty; no per-K pickle is produced for this type).

    Returns:
        None.
    """
    global word_dic
    if feat_type == "":
        return
    if K_list is None:
        K_list = []

    print("begin feature selection...")
    # The connection is not used below; it is still opened and closed so the
    # externally visible behaviour of the original function is kept.
    conn, cur = init_mysql()
    try:
        # SECURITY: unpickling executes arbitrary code -- only load dump
        # files that were produced by a trusted run of this project.
        with open(dump_file, "rb") as f:
            word_dic = cPickle.load(f)
            vec_type = cPickle.load(f)
            x = cPickle.load(f)
            y = cPickle.load(f)
            idx = cPickle.load(f)
        x = x.tocsr()
        y = y.tocsr()
        idx = idx.tocsr()
        print("load data complete")

        # Scores are fitted on the leading 80% of the rows.
        train_len = int(0.8 * y.shape[0])
        train_x = x[:train_len]
        train_labels = np.array(y[:train_len].todense()).ravel()

        if feat_type == "chi2":
            print("%s %s" % (train_x.shape, train_labels.shape))
            for K in K_list:
                fname = _selected_dump_name(dump_file, feat_type, K)
                if os.path.exists(fname):
                    continue
                model = SelectKBest(chi2, k=K).fit(train_x, train_labels)
                x1 = model.transform(x)
                # Ascending original column indices; column j of x1 is
                # original column idx_new[j].
                idx_new = model.get_support(indices=True)
                word_dic1 = _reindex_vocabulary(word_dic, idx_new)
                with open(fname, "wb") as f:
                    cPickle.dump(word_dic1, f)
                    cPickle.dump(vec_type, f)
                    cPickle.dump(x1.tolil(), f)
                    cPickle.dump(y.tolil(), f)
                    cPickle.dump(idx.tolil(), f)
        elif feat_type == "mi":
            # One mutual-information score per feature column.
            val = np.zeros(x.shape[1], dtype=float)
            for i in range(x.shape[1]):
                print(i)
                column = np.array(train_x[:, i].todense()).ravel()
                val[i] = normalized_mutual_info_score(column, train_labels)

            # The per-K pickle dump is disabled for "mi"; the only output is
            # the ranking report, which does not depend on K.  It is written
            # once, and only if at least one K was requested (the original
            # rewrote the identical file for every K).
            if K_list:
                ranked = sorted(
                    ((word, val[col]) for word, col in word_dic.items()),
                    key=operator.itemgetter(1),
                    reverse=True,
                )
                with open("res/feature_extraction_mi.out", "w") as f:
                    for word, score in ranked:
                        f.write(word + "\t" + str(score) + "\n")
    finally:
        # Release the database handles even when loading or fitting fails.
        cur.close()
        conn.close()
# Load the stored label vectors (X_train / X_test are defined earlier).
y_train = np.load('./trainingsets/edb_2018_11_08/y_train.npy')
y_test = np.load('./trainingsets/edb_2018_11_08/y_test.npy')
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# How the arrays on disk were produced (kept for reference):
# np.save('./trainingsets/edb_2018_11_08/X_train.npy', X_train)
# np.save('./trainingsets/edb_2018_11_08/X_test.npy', X_test)
# np.save('./trainingsets/edb_2018_11_08/y_train.npy', y_train)
# np.save('./trainingsets/edb_2018_11_08/y_test.npy', y_test)

# Merge the stored splits back into one data set; the labels are rescaled
# with y / 2 + 1 / 2.
X = np.concatenate((X_train, X_test))
y = np.concatenate((y_train, y_test)) / 2 + 1 / 2
print(X.shape, y.shape)

# Keep the 15 features with the highest ANOVA F-score.
skb = SelectKBest(f_classif, k=15)
X = skb.fit_transform(X, y)
print(skb.get_support(), skb.get_params(deep=True))

# Min Max scaling
mms = MinMaxScaler()
X = mms.fit_transform(X)

# ddd = np.concatenate((X, y.reshape([y.shape[0], 1])), axis=1)
# print(ddd.shape)
# normal = ddd[ddd[:, -1] == -1]
# print(normal.shape)

# New 80 / 10 / 10 split into train / test / validation by row position.
n_samples = X.shape[0]
train_end = int(n_samples * 0.8)
test_end = int(n_samples * 0.9)

X_train = X[:train_end]
X_test = X[train_end:test_end]
X_valid = X[test_end:]

# The training labels are cut using y's own length, as before.
y_train = y[:int(y.shape[0] * 0.8)]
y_test = y[train_end:test_end]
class SelectKBest(FeatureSelectionAlgorithm):
    r"""Implementation of feature selection using selection of k best features according to used score function.

    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT

    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html

    See Also:
        * :class:`niaaml.preprocessing.feature_selection.feature_selection_algorithm.FeatureSelectionAlgorithm`
    """
    Name = 'Select K Best'

    def __init__(self, **kwargs):
        r"""Initialize SelectKBest feature selection algorithm.

        Notes:
            _params['k'] is initialized to None as it is included in the optimization process later since we cannot determine a proper value range until length of the feature vector becomes known.
        """
        self._params = dict(
            score_func=ParameterDefinition(
                [chi2, f_classif, mutual_info_classif]),
            k=None)
        # Number of available features; unknown until the first call of
        # select_features.
        self.__k = None
        self.__select_k_best = SelectKB()

    def set_parameters(self, **kwargs):
        r"""Set the parameters/arguments of the algorithm.

        The keyword arguments are forwarded unchanged to the wrapped
        selector's ``set_params``.
        """
        self.__select_k_best.set_params(**kwargs)

    def select_features(self, x, y, **kwargs):
        r"""Perform the feature selection process.

        On the first call the number of columns of ``x`` becomes the upper
        bound of the ``k`` parameter range, and a random ``k`` from that
        range is applied to the wrapped selector.

        Arguments:
            x (pandas.core.frame.DataFrame): Array of original features.
            y (pandas.core.series.Series): Expected classifier results.

        Returns:
            numpy.ndarray[bool]: Mask of selected features.
        """
        if self.__k is None:
            self.__k = x.shape[1]
            # The builtin int replaces np.int, which was only an alias for
            # it and no longer exists in NumPy >= 1.24.
            self._params['k'] = ParameterDefinition(MinMax(1, self.__k), int)
            val = int(np.around(np.random.uniform(1, self.__k)))
            self.__select_k_best.set_params(k=val)

        self.__select_k_best.fit(x, y)
        return self.__select_k_best.get_support()

    def to_string(self):
        r"""User friendly representation of the object.

        Returns:
            str: User friendly representation of the object.
        """
        return FeatureSelectionAlgorithm.to_string(self).format(
            name=self.Name,
            args=self._parameters_to_string(self.__select_k_best.get_params()))
# of another relevant feature with which it is strongly correlated.

# ## Feature Scaling

# In[67]:

from sklearn.feature_selection import SelectKBest, chi2

# Keep the 4 features with the highest chi-squared score against y.
skb = SelectKBest(chi2, k=4)
X_new = skb.fit(X, y).transform(X)
X_new.shape
skb.get_params()

# In[68]:

print(skb.get_params())

# In[69]:

# Integer column indices (not a boolean mask) of the features that were kept.
feature_selection = skb.get_support(indices=True)

# In[70]:

feature_selection

# In[2]: