def estimate_mutual_info():
    """Compute per-file mutual information of local/global features vs. target.

    Reads every regular file under ``<base_path>/data`` as a whitespace
    delimited matrix whose last column is the class target, standardizes and
    L2-normalizes the feature columns in place, then writes the MI scores of
    the "local" features (columns 10..-2) to mutual_info_local/ and of the
    "global" features (columns 0..9) to mutual_info_global/.

    NOTE(review): assumes ``base_path`` is a module-level global and that the
    two output directories already exist — confirm before reuse.
    """
    read_path = os.path.join(base_path, "data")
    for file in os.listdir(read_path):
        read_file = os.path.join(read_path, file)
        if os.path.isdir(read_file):
            continue
        dataset = np.loadtxt(read_file)
        target = dataset[:, -1]
        # Feature scaling and normalization, done in place on the feature view.
        scaler = preprocessing.StandardScaler(copy=False)
        scaler.fit_transform(dataset[:, :-1])
        normalizer = preprocessing.Normalizer(norm='l2', copy=False)
        normalizer.fit_transform(dataset[:, :-1])
        # BUG FIX: the original passed 'auto' positionally (so it landed in
        # discrete_features) and copy='true' — a truthy *string*, not a bool.
        # Use explicit keywords, and fold the duplicated local/global blocks
        # into one loop.
        for data, out_dir in ((dataset[:, 10:-1], "mutual_info_local"),
                              (dataset[:, 0:10], "mutual_info_global")):
            mi = mutual_info_classif(data, target, discrete_features='auto',
                                     n_neighbors=3, copy=True)
            write_file = os.path.join(base_path, out_dir, file)
            np.savetxt(os.path.splitext(write_file)[0] + '_m.txt', mi)
def test_mutual_info_classif_mixed():
    # Discrete target; features 0 and 1 are continuous and feature 2 is
    # binarized. The label depends on features 0 and 2, so the expected MI
    # ranking is feature 2 first, then 0, then 1.
    rng = check_random_state(0)
    X = rng.rand(1000, 3)
    X[:, 1] += X[:, 0]
    y = ((0.5 * X[:, 0] + X[:, 2]) > 0.5).astype(int)
    X[:, 2] = X[:, 2] > 0.5

    mi = mutual_info_classif(X, y, discrete_features=[2], n_neighbors=3,
                             random_state=0)
    assert_array_equal(np.argsort(-mi), [2, 0, 1])

    for k in (5, 7, 9):
        mi_nn = mutual_info_classif(X, y, discrete_features=[2],
                                    n_neighbors=k, random_state=0)
        # A larger n_neighbors must yield a higher MI estimate for the
        # continuous features...
        assert mi_nn[0] > mi[0]
        assert mi_nn[1] > mi[1]
        # ...but must leave the discrete feature's MI untouched.
        assert mi_nn[2] == mi[2]
def MI(x, y, random_state=None):
    """Mutual information between two pandas Series.

    x, y: numeric or categorical Series. Categorical values are replaced by
    their integer codes (pandas encodes NaN as -1) and treated as discrete.
    Returns a single float.
    """
    x_is_cat = x.dtype.name == 'category'
    y_is_cat = y.dtype.name == 'category'

    # The estimator is chosen by the target's type, the discreteness flag by
    # the feature's type.
    if y_is_cat:
        estimator, target = mutual_info_classif, y.cat.codes
    else:
        estimator, target = mutual_info_regression, y

    if x_is_cat:
        column = x.cat.codes.values.reshape(-1, 1)
    else:
        column = x.values.reshape(-1, 1)

    return estimator(column, target, discrete_features=x_is_cat,
                     random_state=random_state)[0]
def _calculate_mi(self, df, labels, discrete_feature_mask, seed):
    """Calls the sk-learn implementation of MI and stores results in dict.

    Args:
      df: A pd.DataFrame containing feature values where each column
          corresponds to a feature and each row corresponds to an example.
      labels: A List where the ith index represents the label for the ith
          example.
      discrete_feature_mask: A boolean list where the ith element is true iff
          the ith feature column in the input df is a categorical feature.
      seed: An int value to seed the RNG used in MI computation.

    Returns:
      Dict[FeatureName, Dict[str, float]] mapping each feature name to a dict
      with MUTUAL_INFORMATION_KEY and ADJUSTED_MUTUAL_INFORMATION_KEY, where
      AMI is the MI minus the MI against shuffled labels.
    """
    # Both branches differed only in the estimator; pick it once.
    mi_fn = (mutual_info_classif if self._label_feature_is_categorical
             else mutual_info_regression)
    mi_per_feature = mi_fn(
        df.values, labels, discrete_features=discrete_feature_mask,
        copy=True, random_state=seed)
    # BUG FIX: np.random.shuffle(labels) mutated the caller's labels in
    # place; shuffle a copy instead.
    shuffled_labels = np.array(labels)
    np.random.shuffle(shuffled_labels)
    shuffled_mi_per_feature = mi_fn(
        df.values, shuffled_labels, discrete_features=discrete_feature_mask,
        copy=False, random_state=seed)
    result = {}
    for i, (mi, shuffled_mi) in enumerate(
            zip(mi_per_feature, shuffled_mi_per_feature)):
        result[df.columns[i]] = {
            MUTUAL_INFORMATION_KEY: mi,
            # Adjusted MI: subtract the chance-level score.
            ADJUSTED_MUTUAL_INFORMATION_KEY: mi - shuffled_mi
        }
    return result
def ComputeMIBtwVars(X, Y, rand):
    """Mutual information of each column of X against Y.

    Wide matrices (> 40 columns) treat the first 7 columns as continuous and
    the remainder as discrete; narrow matrices are scored as all-continuous.
    """
    if X.shape[1] <= 40:
        return mutual_info_classif(X, Y, discrete_features=False,
                                   random_state=rand)
    continuous_scores = mutual_info_classif(X[:, 0:7], Y,
                                            discrete_features=False,
                                            random_state=rand)
    discrete_scores = mutual_info_classif(X[:, 7:], Y,
                                          discrete_features=True,
                                          random_state=rand)
    return np.append(continuous_scores, discrete_scores)
def featureselection(x_positives, x_negatives, y_positives, y_negatives):
    """Rank tokens by mutual information and build a vocabulary vectorizer
    from the top positive and negative tokens.

    Returns (dense document-term matrix over pos+neg, fitted CountVectorizer).
    """
    def mi_ranking(vectorizer, texts, labels):
        # token -> mutual information with the labels
        matrix = vectorizer.fit_transform(texts)
        return dict(zip(vectorizer.get_feature_names(),
                        mutual_info_classif(matrix, labels,
                                            discrete_features=True)))

    pos_features = mi_ranking(
        CountVectorizer(stop_words='english', min_df=2, analyzer='word',
                        token_pattern=r'[a-zA-Z][a-zA-Z][a-zA-Z]*'),
        x_positives, y_positives)
    neg_features = mi_ranking(
        CountVectorizer(stop_words='english', min_df=2, analyzer='word',
                        token_pattern=r'\w\w+'),
        x_negatives, y_negatives)
    feats = mi_ranking(
        CountVectorizer(stop_words='english', min_df=2, analyzer='word',
                        token_pattern=r'[a-zA-Z][a-zA-Z][a-zA-Z]*'),
        x_positives + x_negatives, y_positives + y_negatives)

    best = sorted(pos_features, key=pos_features.get, reverse=True)[:1000]
    worst = sorted(neg_features, key=neg_features.get, reverse=True)[:1000]
    bbest = sorted(feats, key=feats.get, reverse=True)[:1000]
    print('#' * 50)
    print('Best good features')
    print(bbest)
    print('#' * 50)
    best_cv = CountVectorizer()
    best_cv.fit([*best, *worst])
    x = best_cv.transform(x_positives + x_negatives).toarray()
    print('end of feature selection')
    print('#' * 20, )
    return x, best_cv
def feature_importance_classification(features, target, n_neighbors=3,
                                      random_state=None):
    """Score continuous and discrete features against a classification target.

    Continuous (float) columns get an ANOVA F-test and mutual information;
    discrete (int/bool) columns get a chi² test, corrected Cramér's V and
    mutual information.

    Returns:
        (cont_imp, disc_imp): two DataFrames indexed by the respective
        column names.
    """
    cont = features.select_dtypes(include=[np.floating])
    # BUG FIX: np.bool was removed in NumPy 1.24 — use the builtin bool.
    disc = features.select_dtypes(include=[np.integer, bool])
    cont_imp = pd.DataFrame(index=cont.columns)
    disc_imp = pd.DataFrame(index=disc.columns)

    # Continuous features
    if cont_imp.index.size > 0:
        # F-test
        f_test = feature_selection.f_classif(cont, target)
        cont_imp['f_statistic'] = f_test[0]
        cont_imp['f_p_value'] = f_test[1]
        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(
            cont, target, discrete_features=False,
            n_neighbors=n_neighbors, random_state=random_state)
        cont_imp['mutual_information'] = mut_inf

    # Discrete features
    if disc_imp.index.size > 0:
        # Chi²-test. BUG FIX: the crosstab no longer clobbers `cont`.
        chi2_tests = defaultdict(dict)
        for feature in disc.columns:
            crosstab = pd.crosstab(disc[feature], target)
            statistic, p_value, _, _ = stats.chi2_contingency(crosstab)
            chi2_tests[feature]['chi2_statistic'] = statistic
            chi2_tests[feature]['chi2_p_value'] = p_value
        chi2_tests_df = pd.DataFrame.from_dict(chi2_tests, orient='index')
        disc_imp['chi2_statistic'] = chi2_tests_df['chi2_statistic']
        disc_imp['chi2_p_value'] = chi2_tests_df['chi2_p_value']
        # Cramér's V (corrected). BUG FIX: iteritems() was removed in
        # pandas 2.0 — use items().
        disc_imp['cramers_v'] = [
            cramers_v_corrected_stat(pd.crosstab(feature, target).values)
            for _, feature in disc.items()
        ]
        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(
            disc, target, discrete_features=True,
            n_neighbors=n_neighbors, random_state=random_state)
        disc_imp['mutual_information'] = mut_inf
    return cont_imp, disc_imp
def information_gain(self):
    """Print per-feature mutual information and plot it as a bar chart.

    BUG FIX: the original called mutual_info_classif twice without a
    random_state, so the printed dict and the plotted bars could disagree;
    compute the scores once and reuse them.
    """
    scores = mutual_info_classif(self.x_train, self.y_train)
    res = dict(zip(self.x_train.columns.values, scores))
    print(res)
    objects = self.x_train.columns.values
    y_pos = np.arange(len(objects))
    plt.bar(y_pos, scores, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Information Gain')
    plt.title('Programming language usage')
    plt.show()
def main():
    """Rank tokens in scraped positive/negative pages by mutual information.

    NOTE(review): assumes exactly 100 positive and 100 negative documents,
    and relies on module-level helpers `getpaths`, `f`, `getrelevantinfo`.
    """
    pos, neg = getpaths()
    pos = [BeautifulSoup(open(path, encoding='utf-8'), 'lxml') for path in pos]
    neg = [BeautifulSoup(open(path, encoding='utf-8'), 'lxml') for path in neg]

    def extract_docs(pages):
        # One document per page: the joined relevant info of all matched tags.
        docs = []
        for page in pages:
            tags = page.find_all(f)
            docs.append(' '.join(map(getrelevantinfo, tags)))
        return docs

    pos_docs = extract_docs(pos)
    # BUG FIX: the original iterated `pos` again here, so neg_docs duplicated
    # the positive documents.
    neg_docs = extract_docs(neg)

    X = [*pos_docs, *neg_docs]
    Y = [1 for _ in range(100)] + [0 for _ in range(100)]
    cv = CountVectorizer(stop_words='english', min_df=2, analyzer='word',
                         token_pattern=r'[a-zA-Z][a-zA-Z][a-zA-Z]*')
    _X = cv.fit_transform(X)
    selfeat = dict(zip(cv.get_feature_names(),
                       mutual_info_classif(_X, Y, discrete_features=True)))
    print(sorted(selfeat, reverse=True))
def get_scores_by_adding_selected_features(x_train, y_train, x_test, y_test,
                                           order_features, data_set_name,
                                           model_name=None, method='shap',
                                           max_features=25, plot=True):
    """Train on incrementally growing feature sets and record cumulative
    best accuracy and mutual-information scores; optionally plot them.

    Returns (predictions, features_in_group) from the iterative trainer.
    """
    plot_name = f'{plots_folder}/{method}/{data_set_name}/scores_by_adding_selected_features_on_model_{model_name}.png'
    title_name = f'{method} data={data_set_name} model={model_name}'
    predictions, features_in_group = _add_selected_features_iteratively_train_and_predict(
        x_train, y_train, x_test, order_features, model_name, max_features)

    scores = pd.DataFrame()
    # Running best accuracy as features are added.
    accuracies = [
        np.round(accuracy_score(y_test, p.idxmax(axis=1)), 3)
        for p in predictions
    ]
    scores['accuracy'] = pd.Series(accuracies).cummax()
    # Running best total mutual information of each feature group.
    mi_totals = [
        np.round(
            mutual_info_classif(x_train[features], y_train,
                                n_neighbors=len(features),
                                random_state=seed).sum(), 3)
        for features in features_in_group
    ]
    scores['mutual_info'] = pd.Series(mi_totals).cummax()
    scores.index.name = 'number of features'

    if plot:
        fig = px.line(scores, title=title_name)
        fig.update_traces(mode="markers+lines")
        fig.write_image(plot_name)
    return predictions, features_in_group
def mutual_info(data):
    """Drop columns whose mutual information with 'label' is below 0.05.

    Forward-fills missing values, converts the two date columns to integer
    day offsets, label-encodes string columns for scoring, and returns the
    filled frame minus the uninformative columns.
    """
    # Fill missing values by forward fill.
    df = data.fillna(method='ffill')
    # Convert datetime columns to integer day offsets from 2008-12-31.
    epoch = datetime.datetime(2008, 12, 31)
    one_day = np.timedelta64(1, 'D')
    df['hospitalizeddate'] = ((df['hospitalizeddate'] - epoch) / one_day).astype(int)
    df['leavedate'] = ((df['leavedate'] - epoch) / one_day).astype(int)
    df_1 = df.copy()
    # Label-encode string columns so they can be scored.
    str_cols = df_1.columns[df_1.dtypes == 'object']
    for col in str_cols:
        df_1[col] = LabelEncoder().fit_transform(df_1[col])
    # Mutual information of every feature against the label.
    y = df_1['label']
    x = df_1.drop('label', axis=1)
    info = mutual_info_classif(x, y, copy=True, random_state=5)
    # BUG FIX: the original iterated `map.iteritems()`, which raises
    # AttributeError on Python 3 (and shadowed the `map` builtin).
    uninformative = [col for col, score in zip(x.columns, info)
                     if score < 0.05]
    print(uninformative)
    # Drop the columns essentially unrelated to y.
    df_2 = df.drop(uninformative, axis=1)
    return df_2
def get_features(raw_data, raw_ids, debug, run):
    """
    Calculate the information gain of a dataset.

    Computes the average mutual information across columns and suggests every
    column scoring at least that average. Appends a summary to the *debug*
    string and returns (suggested column indices, debug).
    """
    # Per-column mutual information against the ids.
    m_info = mutual_info_classif(raw_data, raw_ids)
    # The mean MI is the selection threshold.
    avg = np.mean(m_info)
    # Keep the columns at or above the average.
    return_columns = [idx for idx, score in enumerate(m_info) if score >= avg]
    debug += "information gain with avg: " + str(avg) + "\n"
    debug += "INFORMATION GAIN: Suggesting: " + str(
        len(return_columns)) + " columns out of " + str(len(
            raw_data.columns)) + "\n"
    return return_columns, debug
def mutual_info_categorical(dataframe):
    """Mutual information between every column of *dataframe* except the
    last and the last column (treated as the class label)."""
    feature_cols = dataframe.columns[:-1]
    label_col = dataframe.columns[-1:]
    labels = np.array(dataframe[label_col]).reshape(-1)
    return mutual_info_classif(dataframe[feature_cols], labels)
def FS_IG(X_train, Y_train, X_test):
    """
    Feature selection using FS_IG (information gain).

    Keeps the N_FEATURES highest-gain columns, records their indices in the
    module-level CHOSEN_FEAUTURES list, and drops the rest from both splits.

    Args:
        X_train, X_test (numpy arrays): feature matrices
        Y_train (numpy array): training labels
    Returns:
        (X_train_fil, X_test_fil): the filtered matrices
    """
    # Per-feature information gain.
    gain_vec = mutual_info_classif(X_train, Y_train, discrete_features=True)
    # Indices of the lowest-gain features, to be dropped.
    delete_ind = gain_vec.argsort()[::-1][N_FEATURES:]
    # PERF FIX: `i not in <ndarray>` scans the whole array per test (O(n^2)
    # overall); a set gives O(1) membership.
    deleted = set(delete_ind)
    for i in range(len(gain_vec)):
        if i not in deleted:
            CHOSEN_FEAUTURES.append(i)
    # Delete the discarded feature columns.
    X_train_fil = np.delete(X_train, delete_ind, 1)
    X_test_fil = np.delete(X_test, delete_ind, 1)
    return X_train_fil, X_test_fil
def mutual_info_selection(self, X, y):
    """Sklearn mutual-information scorer configured from this instance's
    neighbor count and random state (continuous features assumed)."""
    return mutual_info_classif(
        X,
        y,
        discrete_features=False,
        n_neighbors=self.num_neighbors,
        random_state=self.random_state,
    )
def mutual_info_select(self, F, y, threshold):
    """Return the indices of features whose mutual information with *y*
    strictly exceeds *threshold*."""
    scores = mutual_info_classif(F, y)
    return [idx for idx, score in enumerate(scores) if score > threshold]
def compute_scoring_func(self, func):
    """Return (scores, p_values) for the requested feature-scoring function.

    'variance' is computed over all instances; the other scorers use only the
    annotated instances. p_values is None when the scorer has none.
    """
    if func == 'variance':
        features = self.instances.features.get_values()
        annotations = self.instances.annotations.get_labels()
        return features.var(axis=0), None
    features = self.annotated_instances.features.get_values()
    annotations = self.annotated_instances.annotations.get_labels()
    if func == 'f_classif':
        return f_classif(features, annotations)
    if func == 'mutual_info_classif':
        features_types = self.instances.features.info.types
        discrete_indexes = [
            idx for idx, feature_type in enumerate(features_types)
            if feature_type == FeatureType.binary
        ]
        # mutual_info_classif expects False, not an empty list, when no
        # feature is discrete.
        if not discrete_indexes:
            discrete_indexes = False
        return (mutual_info_classif(features, annotations,
                                    discrete_features=discrete_indexes),
                None)
    if func == 'chi2':
        return chi2(features, annotations)
    assert (False)
def GaIn(data):
    """Rank features by one-vs-rest information gain and keep a chosen subset.

    Averages mutual_info_classif over the one-hot columns of the target (last
    column), prints the ranking, asks how many features to keep (defaulting to
    a quarter of the columns when the answer is not an integer), writes the
    reduced frame to ./workspace/datax/datafeaGI.csv and returns it.
    """
    new_data = ""
    # BUG FIX: the original shadowed the builtin `len`, and `len / 4` is
    # float division on Python 3, which breaks the final index slice when
    # the user's input is not an integer. Use // and a proper name.
    n_cols = data.shape[1]
    taille = n_cols // 4
    X_train = data.iloc[:, 0:n_cols - 1]
    Y_train = data.iloc[:, n_cols - 1]
    Y_train_ohe = pd.DataFrame(OneHotEncoder().fit_transform(
        Y_train.values.reshape(-1, 1)).toarray())
    GI = np.zeros((n_cols - 1, ), dtype='f')
    # Average the information-gain coefficients over the one-hot classes.
    for c in Y_train_ohe:
        GI += mutual_info_classif(X_train, Y_train_ohe[c]) / Y_train_ohe.shape[1]
    resultat = pd.DataFrame(GI, index=X_train.columns)
    resultat = resultat.sort_values(by=0, ascending=False)
    print("\n Le classement des features en fonction du gain d'information:")
    print(resultat)
    nbre_ss_ens = input("Entrez le nombre de features à selectionner: ")
    if (IsInt(nbre_ss_ens)):
        taille = int(nbre_ss_ens)
    new_data = pd.concat([X_train[resultat.index[0:taille]], Y_train],
                         axis=1, sort=False)
    new_data.to_csv("./workspace/datax/datafeaGI.csv", index=False)
    print(new_data.head())
    return new_data
def compute_scoring_func(self, func):
    """Return (scores, p_values) for the requested feature-scoring function.

    Handles sparse feature matrices: variance uses mean_variance_axis, and
    sparse inputs are treated as all-discrete for mutual information.
    """
    if func == 'variance':
        features = self.instances.features.get_values()
        annotations = self.instances.annotations.get_labels()
        if isinstance(features, spmatrix):
            variance = mean_variance_axis(features, axis=0)[1]
        else:
            variance = features.var(axis=0)
        return variance, None
    features = self.annotated_instances.features.get_values()
    annotations = self.annotated_instances.annotations.get_supervision(
        self.multiclass)
    if func == 'f_classif':
        return f_classif(features, annotations)
    if func == 'mutual_info_classif':
        if isinstance(features, spmatrix):
            # Sparse features are assumed discrete.
            discrete_indexes = True
        else:
            features_types = self.instances.features.info.types
            discrete_indexes = [
                idx for idx, feature_type in enumerate(features_types)
                if feature_type == FeatureType.binary
            ]
            # mutual_info_classif expects False, not an empty list, when no
            # feature is discrete.
            if not discrete_indexes:
                discrete_indexes = False
        return (mutual_info_classif(features, annotations,
                                    discrete_features=discrete_indexes),
                None)
    if func == 'chi2':
        return chi2(features, annotations)
    assert (False)
def getMutualInfo(area1, area2, area3, area4, monkey):
    """Score features by MI against the 4-area labels, plot the scores and
    print the indices of the two most informative features."""
    feature_matrix = np.concatenate((getAllFeaturesVector(area1),
                                     getAllFeaturesVector(area2),
                                     getAllFeaturesVector(area3),
                                     getAllFeaturesVector(area4)))
    # One class label per sample: 0..3 by area of origin.
    areas = ([0] * len(area1) + [1] * len(area2) +
             [2] * len(area3) + [3] * len(area4))
    mi = mutual_info_classif(feature_matrix, areas, discrete_features=False)
    # Top-two feature indices. A stable sort on -mi keeps first-occurrence
    # order for ties, matching the original argmax / mask / restore dance
    # without mutating `mi`.
    index1, index2 = np.argsort(-mi, kind='stable')[:2]
    plt.scatter(range(len(mi)), mi)
    plt.xlabel("feature index")
    plt.ylabel("mutual info")
    plt.title(monkey)
    plt.show()
    print(index1)
    print(index2)
def get_feature_importances(X, y, importance_method='rf'):
    """Rank features by importance.

    importance_method: 'lda' (LDA coefficients), 'rf' (extra-trees
    importances) or 'mutual_info' (MI against y). Scores are min-max scaled
    so methods are comparable; the result is a DataFrame indexed by feature
    name, sorted descending, with the method name attached as the
    (non-persistent) ``importance_method`` attribute.
    """
    if importance_method == 'lda':
        model = LinearDiscriminantAnalysis()
        model.fit(X, y)
        importances = model.coef_[0]
    elif importance_method == 'rf':
        model = ExtraTreesClassifier(n_estimators=500)
        model.fit(X, y)
        importances = model.feature_importances_
    elif importance_method == 'mutual_info':
        importances = feature_selection.mutual_info_classif(X, y, True)

    # Min-max scale to [0, 1].
    low, high = importances.min(), importances.max()
    importances = (importances - low) / (high - low)

    result = pd.DataFrame(
        {'importance': pd.Series(importances, index=X.columns)})
    result = result.sort_values('importance', ascending=False)
    result.importance_method = importance_method
    return result
def _fit_language(self, X_unmapped: Sequence[str], X: Sequence[str], Y: np.ndarray):
    """Build the ngram vocabulary most informative about the target columns.

    Vectorizes X, scores every ngram's MI against each column of Y, keeps the
    top max_vocab tokens in self.vocab and stores the fitted analyzer.
    """
    vectorizer = CountVectorizer(
        max_df=0.95,
        min_df=2,
        lowercase=False,
        ngram_range=(1, self.hyperparams.max_ngram),
        max_features=(self.hyperparams.max_vocab * 18),
        token_pattern='[a-zA-Z0-9$&+,:;=?@_/~#\\[\\]|<>.^*()%!-]+')
    X_vec = vectorizer.fit_transform(trivial_generator(X))
    scored_tokens = set()
    for feat in Y.columns:
        scored_tokens.update(
            zip(vectorizer.get_feature_names(),
                mutual_info_classif(X_vec, Y[feat], discrete_features=True)))
    ranked = sorted(scored_tokens, key=lambda pair: pair[1], reverse=True)
    self.vocab = {token for token, _ in ranked[:self.hyperparams.max_vocab]}
    self._analyzer = vectorizer.build_analyzer()
def get_features(raw_data, raw_ids):
    """
    Calculate the information gain of a dataset.

    Vectorizes each column's text and keeps the columns whose summed mutual
    information with the person id exceeds 0.5. Returns the selected column
    names.
    """
    df = pd.DataFrame(raw_data)
    df["person"] = raw_ids
    selected_columns = []
    cv = CountVectorizer(max_df=1, min_df=1, max_features=72,
                         stop_words='english')
    person = df["person"].astype(str)
    for column in df:
        if column == "person":
            continue
        vectorized = cv.fit_transform(df[column].astype(str))
        info_gain = mutual_info_classif(vectorized, person,
                                        discrete_features=True)
        # NOTE(review): despite the original name `avg`, this threshold is
        # applied to the *sum* of the per-token MI scores.
        if sum(info_gain) > .5:
            selected_columns.append(column)
    return selected_columns
def filter_methods_classification(X, y, feat_names, rotation=False):
    """Plot max-normalized F-test and mutual-information feature rankings
    side by side; rotation=True turns the x labels vertical."""
    angle = 90 if rotation else 0

    # Compute both rankings, scaled to [0, 1] by their maxima.
    f_test, _ = f_classif(X, y)
    f_test /= np.max(f_test)
    mi = mutual_info_classif(X, y)
    mi /= np.max(mi)

    plt.figure(figsize=(20, 4))
    panels = [(f_test, '$F-test$ score'), (mi, 'Mutual information score')]
    for position, (scores, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.bar(range(X.shape[1]), scores, align="center")
        plt.xticks(range(X.shape[1]), feat_names, rotation=angle)
        plt.xlabel('features')
        plt.ylabel('Ranking')
        plt.title(title)
    plt.show()
def calculate_domain_mutual_info_scores(X_domain_matrix, y_labels):
    """Mutual information of every domain column against the labels.

    Returns a DataFrame with columns ['domainID', 'MI-score'] sorted by
    ascending score.
    """
    scores = mutual_info_classif(X_domain_matrix, y_labels, random_state=8)
    score_by_domain = dict(zip(X_domain_matrix.columns, scores))
    ranked = sorted(score_by_domain.items(), key=operator.itemgetter(1))
    result = pd.DataFrame(ranked)
    result.columns = ['domainID', 'MI-score']
    return result
def mutual_info(X, y):
    """Print the 14 (name, score) pairs with the highest mutual information
    between the columns of X and y."""
    print('mutual information')
    scores = mutual_info_classif(X, y)
    ranked = sorted(zip(X.columns.values.tolist(), scores),
                    key=lambda pair: pair[1], reverse=True)
    print(ranked[:14])
def mutual_info(matrix, window=50):
    """Pairwise windowed mutual information between columns of *matrix*.

    Splits the rows into consecutive windows of *window* samples, scores each
    column pair per window, and returns an (n, n) matrix of the 75th
    percentile across windows (diagonal scored as 0).
    """
    n = np.shape(matrix)[1]
    MImat = np.zeros((n, n))
    n_windows = int(np.floor(np.shape(matrix)[0] / window) - 1)
    periods = [[w * window, (w + 1) * window] for w in range(0, n_windows)]
    for i in range(0, n):
        for j in range(0, n):
            window_scores = []
            for start, stop in periods:
                if i == j:
                    window_scores.append(0)
                    continue
                # Scaled per-window MI of column i (as the lone feature)
                # against column j (as the target).
                score = mutual_info_classif(
                    matrix[start:stop, i].reshape(-1, 1),
                    matrix[start:stop, j],
                    n_neighbors=5,
                    discrete_features=True) * 1000
                window_scores.append(score)
            MImat[i, j] = (np.quantile(window_scores, .75))
    return (MImat)
def rmi(self, X_train, X_test, y_train, feat_names, min_rmi=None,
        retain_ratio=None, **kwargs):
    """Rank features by relative mutual information (MI / label entropy).

    Keeps features with RMI >= min_rmi when given, otherwise the top
    retain_ratio fraction. Prints the full ranking and returns a copy of
    X_train restricted to the retained features.
    """
    if y_train.dtype != int:
        # mutual_info_classif needs integer class labels.
        le = LabelEncoder()
        y_train = le.fit_transform(y_train).astype(int)
    entropy = self.discrete_entropy(y_train)
    rmi = np.array([
        score / entropy
        for score in feature_selection.mutual_info_classif(
            X_train, y_train, random_state=0)
    ])
    order = np.argsort(rmi)[::-1]
    rmi = rmi[order]
    feat_names_sorted = np.array(feat_names)[order].tolist()
    if min_rmi is not None:
        indexes_to_retain = np.argwhere(rmi >= min_rmi).flatten()
    else:
        # BUG FIX: top_n was computed unconditionally at the top of the
        # function, so calling with only min_rmi (retain_ratio=None) raised
        # a TypeError before this branch was ever reached.
        top_n = int(retain_ratio * len(feat_names))
        indexes_to_retain = range(top_n)
    which_features = [feat_names_sorted[i] for i in indexes_to_retain]
    df = X_train.copy()
    print("### RMI ###")
    for i, feat in enumerate(feat_names_sorted):
        print(feat, rmi[i])
    return df[which_features]
def fit(self, X_unmapped, X, Y, max_vocab=18000, max_features_to_test=180000,
        window=8, dims=32, max_ngram=5):
    """Select an MI-ranked ngram vocabulary, then train a Doc2Vec model.

    The vocabulary is the max_vocab tokens with the highest mutual
    information against any column of Y; the document model is trained on
    X_unmapped for 36 epochs.
    """
    vectorizer = CountVectorizer(
        max_df=0.95,
        min_df=2,
        lowercase=False,
        ngram_range=(1, max_ngram),
        max_features=max_features_to_test,
        token_pattern='[a-zA-Z0-9$&+,:;=?@_/~#\\[\\]|<>.^*()%!-]+')
    X_vec = vectorizer.fit_transform(self._smiles_to_trivial_lang(X))
    scored_tokens = set()
    for feat in Y.columns:
        scored_tokens.update(
            zip(vectorizer.get_feature_names(),
                mutual_info_classif(X_vec, Y[feat], discrete_features=True)))
    ranked = sorted(scored_tokens, key=lambda pair: pair[1], reverse=True)
    self.vocab = {token for token, _ in ranked[:max_vocab]}
    self._analyzer = vectorizer.build_analyzer()

    generator = self._make_iterator(X_unmapped, training=True)
    document_model = Doc2Vec(vector_size=dims, workers=cpu_count(),
                             window=window)
    document_model.build_vocab(generator)
    document_model.train(generator, total_examples=len(X_unmapped), epochs=36)
    self.document_model = document_model
def select_feature(x_train, x_test, y_train):
    """
    Reduce the (>10,000) feature columns to a manageable subset.

    Based on experience with feature selection in homework 1, we do not
    expect the selection to improve performance, only run time
    (e.g. GPA: 320.58s, Grit: 280.71, Hardship: 288.05, layoff: 37.22 with
    no selection). Code carried over from the homework 1 submission.
    """
    # Mutual-information criterion per feature.
    MIC = feature_selection.mutual_info_classif(x_train, y_train)
    # Keep only the descriptive features (threshold chosen empirically).
    good_features = [k for k, score in enumerate(MIC) if score > 0.1]
    # Restrict both splits to the kept columns.
    x_train = x_train[:, good_features]
    x_test = x_test[:, good_features]
    print(len(good_features))
    return x_train, x_test
def feature_selection(self,mode='F'): print 'Feature Selection...' print 'Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') X=self.train.copy() y=self.train_label['label'].values.copy() test=self.test.copy() if mode.upper()=='M': mi=mutual_info_classif(train.values,train_label['label'].values) elif mode.upper()=='F': F,pval=f_classif(train.values,train_label['label'].values) elif mode.upper()=='C': chi,pval=chi2(train.values,train_label['label'].values) features=self.train.columns.copy() fs_features=features.copy().tolist() if mode.upper()=='M': fs_V=mi.copy().tolist() elif mode.upper()=='F': fs_V=F.copy().tolist() elif mode.upper()=='C': fs_V=chi.copy().tolist() if mode.upper()=='M': selector=SelectPercentile(mutual_info_classif,percentile=80) elif mode.upper()=='F': selector=SelectPercentile(f_classif,percentile=80) elif mode.upper()=='C': selector=SelectPercentile(chi2,percentile=80) X_new=selector.fit_transform(X,y) selected=selector.get_support() for i in xrange(len(features)): if selected[i]==False: t=features[i] fs_features.remove(t) fs_V=np.array(fs_V) fs_features=np.array(fs_features) self.train=pd.DataFrame(X_new,columns=fs_features.tolist()) self.test=test[fs_features] self.fs_features=fs_features feas=pd.DataFrame() feas['feature']=fs_features print 'End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') return X_new,feas
def select_sized_features(feature_size,fearure_vecotr,feature_indecies,y,feature_selection_measure):
    """Score the features with the chosen measure and return the ids of the
    feature_size highest-scoring ones (fewer if not enough features)."""
    if feature_selection_measure == SelectionMeasure.chi_2:
        scores, p_value = chi2(fearure_vecotr, y)
    elif feature_selection_measure == SelectionMeasure.f:
        scores, p_value = f_classif(fearure_vecotr, y)
    else:
        # SelectionMeasure.mutual_info
        scores = mutual_info_classif(fearure_vecotr, y)
    # Map each feature id to its score, then rank by score descending.
    score_by_id = {feature_indecies[i]: scores[i]
                   for i in range(len(scores))}
    ranked = sorted(score_by_id.items(), key=lambda item: item[1],
                    reverse=True)
    return [feature_id for feature_id, _ in ranked[:feature_size]]
def main():
    """Grid-search the word-feature count that maximizes cross-validated F1
    (Python 2 script; classifier and selection measure come from argv)."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("feature_data_dir")
    parser.add_argument('--method','-m',type=int,default=1,choices=range(5), help= """choose methods from: 0:linear_svc 1:logistic regression 2:naive bayes 3:decision tree 4:random forest """)
    parser.add_argument('--feature_selection_measure','-fm',type=int,default=0,choices=list(map(int, SelectionMeasure)), help= """choose feature selection measure from: 0:chi2 1:f_classif 2:mutual_info_classif """)
    parser.add_argument("--use_stanford_type","-us",action='store_true', help = """When specified, the type information of Stanford NER is used as features """ )
    parser.add_argument("--use_category","-uc",action='store_true', help = """When specified, the category information from wikidata is used as features """ )
    parser.add_argument("--no_words","-nw",action='store_true', help = """When specified, no word features will be used """ )
    parser.add_argument("--set_feature_size","-sf",type=int)
    args=parser.parse_args()
    feature_data = load_data_set(args.feature_data_dir)
    # One gold label per example.
    labels = []
    for single_data in feature_data:
        labels.append(single_data["judgement"])
    # chi2_values, pval = chi2(word_vector,label)
    # # print "There are %d chi values" %(len(chi2_values))
    # feature_value_id_map = {}
    # for i in range(len(chi2_values)):
    # feature_value_id_map[ i] = chi2_values[i]
    # sorted_features = sorted(feature_value_id_map.items(),key=lambda x:x[1],reverse=True)
    # print "There are %d sorted_features" %(len(sorted_features))
    # print sorted_features
    # Candidate word-feature counts: either the one requested or 100..3900.
    if args.set_feature_size is not None:
        feature_size_vector = [args.set_feature_size]
    else:
        feature_size_vector = [i*100 for i in range(1,40)]
    best_f1 = -1
    best_size = 0
    recall_atm = 0
    precision_atm = 0
    args.feature_selection_measure = SelectionMeasure(args.feature_selection_measure)
    for feature_size in feature_size_vector:
        print "For size %d" %(feature_size)
        clf = get_classifier(args.method)
        f1_vector = []
        # 5-fold stratified CV (old sklearn API: iterable of (train, test)
        # index arrays).
        skf = StratifiedKFold(labels,n_folds=5,shuffle=True)
        for train, test in skf:
            test_X = []
            test_y = []
            #select word features
            sub_feature_data = []
            for i in train:
                sub_feature_data.append(feature_data[i])
            train_y,sub_word_indecies, sub_categories,sub_word_vector = prepare_data(sub_feature_data)
            # Score the word features on the training fold only.
            if args.feature_selection_measure == SelectionMeasure.chi_2:
                feature_values, pval = chi2(sub_word_vector,train_y)
            elif args.feature_selection_measure == SelectionMeasure.f:
                feature_values, pval = f_classif(sub_word_vector,train_y)
            else:
                feature_values = mutual_info_classif(sub_word_vector,train_y)
            feature_value_id_map = {}
            for i in range(len(feature_values)):
                feature_value_id_map[ sub_word_indecies[i] ] = feature_values[i]
            sorted_features = sorted(feature_value_id_map.items(),key=lambda x:x[1],reverse=True)
            # Keep the feature_size best-scoring words (fewer if not enough).
            chosen_words = []
            for i in range(feature_size):
                if i >= len(sorted_features):
                    continue
                chosen_words.append( sorted_features[i][0] )
            X_new = []
            # add category and type features if needed
            for k in train:
                single_x = []
                single_data = feature_data[k]
                if not args.no_words:
                    # Word-count features in chosen_words order, 0 if absent.
                    for w in chosen_words:
                        if w in single_data["word_features"]:
                            single_x.append(single_data["word_features"][w])
                        else:
                            single_x.append(0)
                if args.use_category:
                    # One indicator per training-fold category.
                    if single_data["category"]:
                        for c in sub_categories:
                            if c in single_data["category"]:
                                single_x.append(1)
                            else:
                                single_x.append(0)
                    else:
                        single_x += [0]*len(sub_categories)
                if args.use_stanford_type:
                    # Two indicators: ORGANIZATION and LOCATION NER types.
                    if "ORGANIZATION" in single_data["type"]:
                        single_x.append(1)
                    else:
                        single_x.append(0)
                    if "LOCATION" in single_data["type"]:
                        single_x.append(1)
                    else:
                        single_x.append(0)
                X_new.append(single_x)
            # Build the test-fold matrix with the identical feature layout.
            for k in test:
                single_x = []
                single_data = feature_data[k]
                if not args.no_words:
                    for w in chosen_words:
                        if w in single_data["word_features"]:
                            single_x.append(single_data["word_features"][w])
                        else:
                            single_x.append(0)
                if args.use_category:
                    if single_data["category"]:
                        for c in sub_categories:
                            if c in single_data["category"]:
                                single_x.append(1)
                            else:
                                single_x.append(0)
                    else:
                        single_x += [0]*len(sub_categories)
                if args.use_stanford_type:
                    if "ORGANIZATION" in single_data["type"]:
                        single_x.append(1)
                    else:
                        single_x.append(0)
                    if "LOCATION" in single_data["type"]:
                        single_x.append(1)
                    else:
                        single_x.append(0)
                test_X.append(single_x)
                test_y.append(labels[k])
            clf.fit(X_new,train_y)
            predicted_y = clf.predict(test_X)
            f1_vector.append(f1_score(test_y,predicted_y))
        average_f1 = sum(f1_vector)/(1.0*len(f1_vector))
        print "Average: %f" %(average_f1)
        # Track the best-performing feature count.
        if average_f1 > best_f1:
            best_f1 = average_f1
            best_size = feature_size
    print "-"*20
    print "best f1 is %f achieved by size %d" %(best_f1,best_size)
As our features in this probelm are discrete, I will use Scikit-learn’s mutual_info_classif class with the discrete_features=True flag: """ import numpy as np from sklearn.feature_selection import mutual_info_classif import pandas as pd datafile = '/model_data/training_data_newfeatures.csv' data = pd.read_csv(datafile, delimiter='\t', dtype=str) l_features = list(data.columns) discrete_dataset = np.loadtxt(datafile, dtype=str, delimiter='\t') X = discrete_dataset[:, :-1] y = discrete_dataset[:, -1] l_importance = mutual_info_classif(X, y, discrete_features=True) resdict = {} for i, res in enumerate(l_importance): # exclude data which is 1:1 with the job feature (as these will not be useful) if l_features[i] in ['... list of features ...']: continue resdict[l_features[i]] = res print 'MI:' for elt in sorted(resdict.items(), key=lambda x: x[1], reverse=True): print elt print '\n' resdict2 = {}