def rank(training_set, paradigm_lengths, category_description):
    """Print every feature headline next to its RFE ranking.

    The three arguments are forwarded unchanged to ``DataTransformer``, whose
    ``get_training_data_matrix(normalize=True)`` supplies the feature
    headlines, the feature matrix and the target labels.

    A linear SVM (``C=1``) is used as the estimator for recursive feature
    elimination.  Features are eliminated one per step until a single
    feature is left, so ``ranking_`` holds a rank for every feature rather
    than just a selected/rejected split (rank 1 is the feature RFE keeps).

    Output is one line per feature on stdout: ``<headline> <rank>``.
    Nothing is returned.
    """
    transformer = DataTransformer(
        training_set, paradigm_lengths, category_description)
    headlines, matrix, targets = transformer.get_training_data_matrix(
        normalize=True)
    # The matrix exposes ``toarray()`` -- presumably a scipy sparse matrix
    # (TODO confirm); the estimator is given the dense form.
    matrix = matrix.toarray()

    estimator = svm.SVC(C=1, kernel='linear')
    # ``n_features_to_select`` is passed by keyword: recent scikit-learn
    # releases no longer accept it positionally, while older ones accept
    # the keyword form as well.
    selector = RFE(estimator, n_features_to_select=1, step=1)
    selector = selector.fit(matrix, targets)

    for position, headline in enumerate(headlines):
        # A single pre-formatted argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("%s %s" % (headline, selector.ranking_[position]))
def get_feature_percentage(training_set, paradigm_lengths, category_description):
    """Return a mapping of feature headline -> feature importance.

    The three arguments are forwarded unchanged to ``DataTransformer``, whose
    ``get_training_data_matrix(normalize=True)`` supplies the feature
    headlines, the feature matrix and the target labels.

    Importances are the ``feature_importances_`` of an
    ``ExtraTreesClassifier`` with 10 trees fitted on that data.  No
    ``random_state`` is passed to the classifier, so the values may differ
    from one call to the next.

    Returns:
        dict: one entry per feature column, keyed by headline, inserted in
        order of decreasing importance.
    """
    transformer = DataTransformer(
        training_set, paradigm_lengths, category_description)
    headlines, matrix, targets = transformer.get_training_data_matrix(
        normalize=True)
    # The matrix exposes ``toarray()`` -- presumably a scipy sparse matrix
    # (TODO confirm); the classifier is given the dense form.
    matrix = matrix.toarray()

    forest = ExtraTreesClassifier(n_estimators=10)
    forest.fit(matrix, targets)
    importances = forest.feature_importances_

    # Most important feature first.  The order only matters on interpreters
    # whose dicts preserve insertion order, and when two columns share a
    # headline (the less important one is written last and wins).
    by_importance = np.argsort(importances)[::-1]
    return {
        headlines[column]: importances[column] for column in by_importance
    }