Exemplo n.º 1
0
def _remap_word_dic(word_dic, idx_new):
	"""Return a new vocabulary restricted to the selected feature columns.

	``word_dic`` maps a word to its column index in the full feature matrix.
	``idx_new`` lists the kept column indices in the order in which they
	appear in the reduced matrix, so the word that owned old column
	``idx_new[j]`` must be mapped to ``j``.  Words whose column was dropped
	are omitted.
	"""
	# One dict lookup per word instead of a linear scan of ``idx_new``.
	new_col = dict((int(old), new) for new, old in enumerate(idx_new))
	return dict((word, new_col[col]) for word, col in word_dic.items() if col in new_col)


def _dump_selection(fname, word_dic1, vec_type, x1, y, idx):
	"""Pickle a reduced data set to ``fname`` in the same layout as the input dump."""
	with open(fname, "wb") as f:
		for obj in (word_dic1, vec_type, x1.tolil(), y.tolil(), idx.tolil()):
			cPickle.dump(obj, f)


def feat_selection(dump_file, feat_type, K_list=None):
	"""Select the K best features of a pickled data set.

	The dump is expected to contain, in order: the vocabulary (word -> column
	index), the vectorizer type, the feature matrix ``x``, the label matrix
	``y`` and an index matrix ``idx`` (the last three are sparse matrices).
	The first 80% of the rows are used to score the features.

	Arguments:
		dump_file: path of the pickle produced by the vectorisation step.
		feat_type: ``"chi2"`` or ``"mi"``; an empty string makes the call a
			no-op and any other value only loads the data.
		K_list: iterable of feature counts to select (default: none).

	Side effects:
		* rebinds the module-level ``word_dic`` to the loaded vocabulary;
		* ``"chi2"``: for every K writes ``<prefix>_chi2_<K>.pkl`` unless the
		  file already exists;
		* ``"mi"``: writes the per-word score ranking to
		  ``res/feature_extraction_mi.out`` when ``K_list`` is non-empty.
	"""
	global word_dic

	if feat_type == "":
		return

	# Avoid a shared mutable default and allow any iterable to be passed.
	K_list = [] if K_list is None else list(K_list)

	print("begin feature selection...")
	# NOTE(review): the connection is never used below; it is still opened and
	# closed so the behaviour of init_mysql() is preserved.
	conn, cur = init_mysql()
	try:
		# SECURITY: unpickling runs arbitrary code - only load trusted dumps.
		# Binary mode matches the "wb" mode used when the dumps are written.
		with open(dump_file, "rb") as f:
			word_dic = cPickle.load(f)
			vec_type = cPickle.load(f)
			x = cPickle.load(f)
			y = cPickle.load(f)
			idx = cPickle.load(f)
			x = x.tocsr()
			y = y.tocsr()
			idx = idx.tocsr()
		print("load data complete")

		train_len = int(0.8*y.shape[0])
		train_x = x[:train_len]
		# Dense 1-D label vector, computed once instead of once per K.
		train_y1 = np.array(y[:train_len].todense()).ravel()

		# NOTE(review): split(".")[0] truncates at the FIRST dot, so a path such
		# as "./data/x.pkl" yields an empty prefix.  Kept as-is because existing
		# output file names depend on it.
		prefix = dump_file.split(".")[0] + "_" + feat_type + "_"

		if feat_type == "chi2":
			print("%s %s" % (train_x.shape, train_y1.shape))
			for K in K_list:
				fname = prefix + str(K) + ".pkl"
				if os.path.exists(fname):
					continue

				model = SelectKBest(chi2, k=K).fit(train_x, train_y1)
				# Ascending column indices, i.e. the column order of transform().
				idx_new = model.get_support(indices=True)
				_dump_selection(fname, _remap_word_dic(word_dic, idx_new),
						vec_type, model.transform(x), y, idx)

		elif feat_type == "mi":
			# CSC makes the per-column slices below cheap; values are unchanged.
			train_x_csc = train_x.tocsc()
			val = np.zeros(x.shape[1], dtype=float)
			for i in range(x.shape[1]):
				print(i)
				val[i] = normalized_mutual_info_score(
					train_x_csc[:, i].toarray().ravel(), train_y1)

			# NOTE(review): the per-K pickle dump was commented out in the
			# original, so this branch only emits the score ranking.  The
			# ranking does not depend on K and is therefore written once.
			# To re-enable the dumps, select ``(-val).argsort()[:K]`` and pass
			# the result through _remap_word_dic / _dump_selection.
			if K_list:
				ranking = sorted(((word, val[col]) for word, col in word_dic.items()),
						key=operator.itemgetter(1), reverse=True)
				with open("res/feature_extraction_mi.out", "w") as f:
					for word, score in ranking:
						f.write(word + "\t" + str(score) + "\n")
	finally:
		# Release the DB handles even when loading or selection fails.
		cur.close()
		conn.close()
Exemplo n.º 2
0
# Load the label arrays.  X_train / X_test are used below but are not defined
# in this excerpt - presumably they are loaded just above; verify.
y_train = np.load('./trainingsets/edb_2018_11_08/y_train.npy')
y_test = np.load('./trainingsets/edb_2018_11_08/y_test.npy')
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# The four arrays were previously persisted with:
# np.save('./trainingsets/edb_2018_11_08/X_train.npy', X_train)
# np.save('./trainingsets/edb_2018_11_08/X_test.npy', X_test)
# np.save('./trainingsets/edb_2018_11_08/y_train.npy', y_train)
# np.save('./trainingsets/edb_2018_11_08/y_test.npy', y_test)

# Merge both partitions so the data can be re-split 80/10/10 further down.
X = np.concatenate([X_train, X_test])
# Affine rescaling of the labels (presumably from {-1, 1} to {0, 1} under
# true division - confirm against the data).
y = np.concatenate([y_train, y_test]) / 2 + 1 / 2
print(X.shape, y.shape)

# Keep the 15 features with the highest ANOVA F-score.
skb = SelectKBest(f_classif, k=15)
X = skb.fit_transform(X, y)
print(skb.get_support(), skb.get_params(deep=True))

# Min Max scaling
mms = MinMaxScaler()
X = mms.fit_transform(X)

# Debug helper kept for reference:
# ddd = np.concatenate((X, y.reshape([y.shape[0], 1])), axis=1)
# print(ddd.shape)
# normal = ddd[ddd[:, -1] == -1]
# print(normal.shape)

# Split boundaries: first 80% train, next 10% test, last 10% validation.
# X and y have the same number of rows (fit_transform above requires it),
# so one pair of boundaries serves both.
n_samples = X.shape[0]
train_end = int(n_samples * 0.8)
test_end = int(n_samples * 0.9)

X_train, X_test, X_valid = X[:train_end], X[train_end:test_end], X[test_end:]
y_train, y_test = y[:train_end], y[train_end:test_end]
Exemplo n.º 3
0
class SelectKBest(FeatureSelectionAlgorithm):
    r"""Implementation of feature selection using selection of k best features according to used score function.
    
    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT
    
    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
    
    See Also:
        * :class:`niaaml.preprocessing.feature_selection.feature_selection_algorithm.FeatureSelectionAlgorithm`
    """
    Name = 'Select K Best'

    def __init__(self, **kwargs):
        r"""Initialize SelectKBest feature selection algorithm.

        Notes:
            _params['k'] is initialized to None as it is included in the optimization process later since we cannot determine a proper value range until length of the feature vector becomes known.
        """
        self._params = dict(score_func=ParameterDefinition(
            [chi2, f_classif, mutual_info_classif]),
                            k=None)
        # Number of input features; unknown until the first select_features call.
        self.__k = None
        self.__select_k_best = SelectKB()

    def set_parameters(self, **kwargs):
        r"""Set the parameters/arguments of the algorithm.

        Arguments:
            **kwargs: Forwarded unchanged to the wrapped selector's ``set_params``.
        """
        self.__select_k_best.set_params(**kwargs)

    def select_features(self, x, y, **kwargs):
        r"""Perform the feature selection process.

        On the first call the number of columns of ``x`` fixes the valid
        range of ``k``: the ``k`` parameter definition is created and the
        wrapped selector is given a random ``k`` from that range.

        Arguments:
            x (pandas.core.frame.DataFrame): Array of original features.
            y (pandas.core.series.Series): Expected classifier results.

        Returns:
            numpy.ndarray[bool]: Mask of selected features.
        """
        if self.__k is None:
            self.__k = x.shape[1]
            # Builtin ``int`` is the object the removed ``np.int`` alias
            # pointed to (deprecated in NumPy 1.20, removed in 1.24).
            self._params['k'] = ParameterDefinition(MinMax(1, self.__k), int)
            val = int(np.around(np.random.uniform(1, self.__k)))
            self.__select_k_best.set_params(k=val)

        self.__select_k_best.fit(x, y)
        return self.__select_k_best.get_support()

    def to_string(self):
        r"""User friendly representation of the object.

        Returns:
            str: User friendly representation of the object.
        """
        return FeatureSelectionAlgorithm.to_string(self).format(
            name=self.Name,
            args=self._parameters_to_string(self.__select_k_best.get_params()))
Exemplo n.º 4
0
# of another relevant feature with which it is strongly correlated.

# ## Feature Scaling

# In[67]:

from sklearn.feature_selection import SelectKBest, chi2

# Keep the 4 features with the highest chi-squared score.  X and y are
# defined in earlier notebook cells (not visible in this excerpt).
skb = SelectKBest(score_func=chi2, k=4)
X_new = skb.fit(X, y).transform(X)

# Bare expressions below are notebook display statements.
X_new.shape

skb.get_params()

# In[68]:

print(skb.get_params())

# In[69]:

# Integer column indices of the retained features.
feature_selection = skb.get_support(indices=True)

# In[70]:

feature_selection

# In[2]: