def select_features(X, y):
    from sklearn.feature_selection import SelectPercentile
    from sklearn.feature_selection import f_classif, chi2
    from sklearn.preprocessing import Binarizer, scale

    # First select features based on chi2 and f_classif
    p = 3

    X_bin = Binarizer().fit_transform(scale(X))
    selectChi2 = SelectPercentile(chi2, percentile=p).fit(X_bin, y)
    selectF_classif = SelectPercentile(f_classif, percentile=p).fit(X, y)

    chi2_selected = selectChi2.get_support()
    chi2_selected_features = [f for i, f in enumerate(X.columns) if chi2_selected[i]]
    print('Chi2 selected {} features {}.'.format(chi2_selected.sum(),
          chi2_selected_features))
    f_classif_selected = selectF_classif.get_support()
    f_classif_selected_features = [f for i, f in enumerate(X.columns) if f_classif_selected[i]]
    print('F_classif selected {} features {}.'.format(f_classif_selected.sum(),
          f_classif_selected_features))
    selected = chi2_selected & f_classif_selected
    print('Chi2 & F_classif selected {} features'.format(selected.sum()))
    features = [f for f, s in zip(X.columns, selected) if s]
    print(features)
    return features
def buildVectorizer(classes, examples, parameters):
    featureChoice = None
    doFeatureSelection = False
    tfidf = False
    featureSelectPerc = 10

    if "featureChoice" in parameters:
        featureChoice = parameters["featureChoice"]
    if "doFeatureSelection" in parameters and parameters["doFeatureSelection"] == "True":
        doFeatureSelection = True
    if "featureSelectPerc" in parameters:
        featureSelectPerc = int(parameters["featureSelectPerc"])
    if "tfidf" in parameters and parameters["tfidf"] == "True":
        tfidf = True

    print "Starting vectorizer..."
    vectorizer = Vectorizer(classes, examples, featureChoice, tfidf)
    vectors = vectorizer.getTrainingVectors()
    print "Vectors of size:", vectors.shape

    if doFeatureSelection:
        print "Trimming training vectors..."
        from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2
        from scipy.sparse import coo_matrix
        # featureSelector = SelectKBest(chi2, k=100)
        featureSelector = SelectPercentile(chi2, featureSelectPerc)
        vectorsTrimmed = featureSelector.fit_transform(vectors, classes)
        vectorsTrimmed = coo_matrix(vectorsTrimmed)
        print "Trimmed training vectors of size:", vectorsTrimmed.shape
    else:
        vectorsTrimmed = vectors
        featureSelector = None

    return vectorsTrimmed, vectorizer, featureSelector
def selectFeatures(features, labels, features_list):
    '''
    Select features according to the 20th percentile of the highest scores.
    Return a list of the selected features and a dataframe showing the
    ranking of each feature related to their p-values.

    features: numpy array with the features to be used to test sklearn models
    labels: numpy array with the real output
    features_list: a list of names of each feature
    '''
    # feature selection
    selector = SelectPercentile(f_classif, percentile=20)
    selector.fit(features, labels)
    features_transformed = selector.transform(features)
    # filter names to be returned
    l_rtn = [x for x, t in zip(features_list, list(selector.get_support())) if t]
    # pd.DataFrame(features_transformed, columns = l_labels2).head()
    # calculate scores
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    df_rtn = pd.DataFrame(pd.Series(dict(zip(features_list, scores))))
    df_rtn.columns = ["pValue_Max"]
    df_rtn = df_rtn.sort_values("pValue_Max", ascending=False)
    # df_rtn["different_from_zero"]=((df!=0).sum()*1./df.shape[0])
    return l_rtn, df_rtn
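# A minimal usage sketch for selectFeatures above (hypothetical arrays X_arr,
# y_arr and a matching names list; not part of the original source):
#
# selected_names, ranking_df = selectFeatures(X_arr, y_arr, ["f1", "f2", "f3"])
# print(selected_names)     # names surviving the 20th-percentile cut
# print(ranking_df.head())  # features ranked by normalized -log10(p-value)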
def preprocess(words_file = "word_data.pkl", authors_file="email_authors.pkl"): ### the words (features) and authors (labels), already largely preprocessed ### this preprocessing will be repeated in the text learning mini-project authors_file_handler = open(authors_file, "r") authors = pickle.load(authors_file_handler) authors_file_handler.close() words_file_handler = open(words_file, "r") word_data = cPickle.load(words_file_handler) words_file_handler.close() ### test_size is the percentage of events assigned to the test set ### (remainder go into training) features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42) ### text vectorization--go from strings to lists of numbers vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english') features_train_transformed = vectorizer.fit_transform(features_train) features_test_transformed = vectorizer.transform(features_test) ### feature selection, because text is super high dimensional and ### can be really computationally chewy as a result selector = SelectPercentile(f_classif, percentile=10) selector.fit(features_train_transformed, labels_train) features_train_transformed = selector.transform(features_train_transformed).toarray() features_test_transformed = selector.transform(features_test_transformed).toarray() ### info on the data print "no. of Enrique training emails:", sum(labels_train) print "no. of Juan training emails:", len(labels_train)-sum(labels_train) return features_train_transformed, features_test_transformed, labels_train, labels_test
def feature_select(self):
    # percentile comes from the task, like the label/feature keys below
    b = SelectPercentile(f_classif, percentile=self.task.percentile)
    y = np.array(self.results[self.task.label].data)
    X = np.array(self.results[self.task.features].data)
    data = pd.DataFrame(b.fit_transform(X, y))
    result = TransformResult(self.task, 1.0, data)
    self.results[self.task.uuid] = result
def select_features(X, y):
    selector = SelectPercentile(f_classif, percentile=10)
    print "fit selector"
    selector.fit(X, y)
    print "transform features"
    X = selector.transform(X)
    return X, selector
def get_user_feature(feature_type, behavior, num_feature=800):
    X_train = get_features(feature_type, behavior)
    index = X_train.index
    # reduce the dimensionality of X
    Y = pd.read_csv('data/train_Y_%d.csv' % behavior, index_col='user_id')['type']
    print 'start selectKbest...'
    # select = SelectKBest(chi2, k=min(num_feature, X_train.shape[1]))
    percent = 0
    if feature_type == 'cat_id':
        percent = 60
    elif feature_type == 'brand_id':
        percent = 15
    elif feature_type == 'seller_id':
        percent = 20
    select = SelectPercentile(f_classif, percentile=percent)
    select.fit(X_train, Y)
    X_train = select.transform(X_train)
    print 'end select...'

    print 'write %s features to train file' % feature_type
    train_feature_file_name = 'data/train_feature_%s_%d.csv' % (feature_type, behavior)
    DataFrame(X_train, index=index).to_csv(train_feature_file_name)

    # apply the same column reduction to the corresponding test set
    X_test = get_features(feature_type, behavior, is_train=False)
    index = X_test.index
    X_test = select.transform(X_test)

    # write to file
    print 'write %s features to test file' % feature_type
    test_feature_file_name = 'data/test_feature_%s_%d.csv' % (feature_type, behavior)
    DataFrame(X_test, index=index).to_csv(test_feature_file_name)
    print 'end....'
def test_select_percentile_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode="percentile", param=25).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
def final_feature_set_reduction(reduced_feature_file_name, final_file_name, train_label_file):
    sorted_train_data = pd.read_csv('data/' + reduced_feature_file_name)
    y = []
    X = sorted_train_data.iloc[:, 1:]
    fip = open('data/' + train_label_file)
    lines = fip.readlines()
    for line in lines:
        line = line.rstrip()
        y.append(int(line))

    print("Final feature reduction: {:s}".format(reduced_feature_file_name))
    print("Training labels length: {:d}".format(len(y)))
    print("X Feature set dimensionality: {:d} {:d}".format(X.shape[0], X.shape[1]))
    print("In Feature set dimensionality: {:d} {:d}".format(sorted_train_data.shape[0], sorted_train_data.shape[1]))

    # find the top 10 percent variance features, from ~1000 -> ~100 features
    fsp = SelectPercentile(chi2, 10)
    X_new_10 = fsp.fit_transform(X, y)
    print("Final 10 Percent Dimensions: {:d} {:d}".format(X_new_10.shape[0], X_new_10.shape[1]))
    selected_names = fsp.get_support(indices=True)
    selected_names = selected_names + 1

    # data_reduced = sorted_train_data.iloc[:,[0] + selected_names]  # Does not put the file_name as the first column.
    data_trimmed = sorted_train_data.iloc[:, selected_names]
    data_fnames = pd.DataFrame(sorted_train_data['file_name'])
    data_reduced = data_fnames.join(data_trimmed)

    data_reduced.to_csv('data/' + final_file_name, index=False)
    print("Completed reduction in {:s}".format(final_file_name))
    return
def test_boundary_case_ch2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))
def test_select_percentile_classif_sparse():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    X = sparse.csr_matrix(X)
    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode="percentile", param=25).fit(X, y).transform(X)
    assert_array_equal(X_r.toarray(), X_r2.toarray())
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)

    X_r2inv = univariate_filter.inverse_transform(X_r2)
    assert_true(sparse.issparse(X_r2inv))
    support_mask = safe_mask(X_r2inv, support)
    assert_equal(X_r2inv.shape, X.shape)
    assert_array_equal(X_r2inv[:, support_mask].toarray(), X_r.toarray())
    # Check other columns are empty
    assert_equal(X_r2inv.getnnz(), X_r.getnnz())
def preprocess(word_data, targets):
    print("\n### PREPROCESSING DATA ###")

    # vectorize
    print("-- Vectorization")
    vectorizer = TfidfVectorizer(sublinear_tf=True)  # , stop_words='english'
    data_transformed = vectorizer.fit_transform(word_data)

    # feature selection
    print("-- Feature Selection")
    selector = SelectPercentile(percentile=5)
    data_selected = selector.fit_transform(data_transformed, targets)
    if data_selected.shape[1] == 0:
        data_selected = data_transformed
    else:
        print("Top {} features were selected".format(data_selected.shape[1]))

        # print top features
        nr_features = 30
        i = selector.scores_.argsort()[::-1][:nr_features]
        top_features = np.column_stack((np.asarray(vectorizer.get_feature_names())[i],
                                        selector.scores_[i],
                                        selector.pvalues_[i]))
        print("\nTop %i Features:" % nr_features)
        print(pd.DataFrame(top_features, columns=["token", "score", "p-val"]), "\n")

    features_train, features_test, labels_train, labels_test = \
        train_test_split(data_selected, targets, test_size=0.2, stratify=targets)

    return features_train, features_test, labels_train, labels_test
def main():
    main_data = pd.read_csv('../data/train.csv', index_col='ID')

    output = []
    for x in main_data.columns:
        output.append({
            'variable': x,
            'variance': main_data.loc[:, x].var(),
            'corr_w_target': round(main_data.loc[:, x].corr(main_data.TARGET), 4),
            'abs_corr': abs(round(main_data.loc[:, x].corr(main_data.TARGET), 4))}
        )

    # print csv for later in the presentation docs
    variable_selector = pd.DataFrame(output)
    variable_selector = variable_selector.set_index('variable')
    variable_selector = variable_selector.drop('TARGET')
    variable_selector.sort_values('abs_corr', ascending=False).to_csv('../presentationDocs/corrs.csv')

    selector = SelectPercentile(f_classif, percentile=25)

    subset = pd.DataFrame(selector.fit_transform(main_data.drop('TARGET', axis=1), main_data['TARGET']))
    subset.to_csv('../data/main_data.csv', index=False)
    main_data[['TARGET']].to_csv('../data/target.csv', columns=['TARGET'], index=False)

    # print transformed test data to csv
    test_data = pd.read_csv('../data/test.csv', index_col='ID')
    test_data = pd.DataFrame(selector.transform(test_data), index=test_data.index)
    test_data.to_csv('../data/test_transform.csv', index=True, index_label='ID')
def test_select_percentile_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the percentile heuristic
    """
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode='percentile', param=25).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
    X_2 = X.copy()
    X_2[:, np.logical_not(support)] = 0
    assert_array_equal(X_2, univariate_filter.inverse_transform(X_r))

    # Check inverse_transform respects dtype
    assert_array_equal(X_2.astype(bool),
                       univariate_filter.inverse_transform(X_r.astype(bool)))
def preprocess(article_file, lable_file, k):

    features = pickle.load(open(article_file))
    features = np.array(features)

    # transform non-numerical labels (as long as they are hashable and comparable) to numerical labels
    lables = pickle.load(open(lable_file))
    le = preprocessing.LabelEncoder()
    le.fit(lables)
    lables = le.transform(lables)
    # print le.inverse_transform([0])

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=1,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features)

    # selector : SelectPercentile
    selector = SelectPercentile(f_classif, percentile=k)
    selector.fit(features_train_transformed, lables)

    # selector : chi2
    # selector = SelectPercentile(score_func=chi2)
    # selector.fit(features_train_transformed, lables)

    features_train_transformed = selector.transform(features_train_transformed).toarray()

    return features_train_transformed, lables, vectorizer, selector, le, features
def main():
    parser = argparse.ArgumentParser(description='Feature Selection')
    required = parser.add_argument_group('required options')

    required.add_argument('-x', '--scaledfeaturelist', required=True, help='File containing feature values')
    required.add_argument('-y', '--targetdata', required=True, help='File containing target data')
    required.add_argument('-z', '--fetpercentile', required=True, type=int, help='Percentile to select highest scoring percentage of features')

    args = parser.parse_args()

    X = np.loadtxt(args.scaledfeaturelist)
    Y = np.genfromtxt(args.targetdata, dtype='str')

    # result = SelectPercentile(f_classif, percentile=args.fetpercentile).fit_transform(X, Y)
    sel = SelectPercentile(f_classif, percentile=args.fetpercentile)
    result = sel.fit_transform(X, Y)

    # selecting features for test programs
    if os.path.isfile('variancefeatures.txt'):
        varianceFeature = np.genfromtxt("variancefeatures.txt", dtype='str')
        featureFromSelectPercentile = sel.get_support(indices=True)
        featureFileforSelectPercentile = open("featuresToTestPrograms", "w")
        for i in featureFromSelectPercentile:
            featureFileforSelectPercentile.write(varianceFeature[i])
            featureFileforSelectPercentile.write("\n")
        featureFileforSelectPercentile.close()
        # remove variancefeatures.txt as we don't need it anymore
        os.remove("variancefeatures.txt")

    np.savetxt('featurelist', result, fmt='%.2f', delimiter='\t')
def univariant_feature_selection(self, method, X, y, percentile):
    test = SelectPercentile(method, percentile=percentile).fit(X, y)
    print("The number of features selected by ", method, " is: ", (test.get_support().sum()))
    for i in range(len(self.X_train.columns)):
        if (test.get_support()[i]):
            print(self.X_train.columns[i])
    return test.get_support()
def build_linear_model(X, y, analyzerType):
    tfv = vectorizer(analyzerType)
    select = SelectPercentile(score_func=chi2, percentile=15)
    clf = SVC(C=12.0, kernel='linear')

    X = tfv.fit_transform(X)
    X = select.fit_transform(X, y)
    return (clf.fit(X, y), tfv, select)
def test(X, y):
    ###############################################################################
    # Univariate feature selection with F-test for feature scoring
    # We keep the 20% most significant features
    selector = SelectPercentile(f_regression, percentile=20)
    selector.fit(X, y)
    print [zero_based_index for zero_based_index in list(selector.get_support(indices=True))]
def train_type_model():
    globals.read_configuration('config.cfg')
    parser = globals.get_parser()
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    feature_extractor = FeatureExtractor(False, False, n_gram_types_features=True)
    features = []
    labels = []
    for dataset in datasets:
        queries = get_evaluated_queries(dataset, True, parameters)
        for index, query in enumerate(queries):
            tokens = [token.lemma for token in parser.parse(query.utterance).tokens]
            n_grams = get_grams_feats(tokens)

            answer_entities = [mid for answer in query.target_result
                               for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)]
            correct_notable_types = set(filter(lambda x: x,
                                               [KBEntity.get_notable_type(entity_mid)
                                                for entity_mid in answer_entities]))

            other_notable_types = set()
            for candidate in query.eval_candidates:
                entities = [mid for entity_name in candidate.prediction
                            for mid in KBEntity.get_entityid_by_name(entity_name, keep_most_triples=True)]
                other_notable_types.update(set([KBEntity.get_notable_type(entity_mid) for entity_mid in entities]))
            incorrect_notable_types = other_notable_types.difference(correct_notable_types)

            for type in correct_notable_types.union(incorrect_notable_types):
                if type in correct_notable_types:
                    labels.append(1)
                else:
                    labels.append(0)
                features.append(feature_extractor.extract_ngram_features(n_grams, [type, ], "type"))

    with open("type_model_data.pickle", 'wb') as out:
        pickle.dump((features, labels), out)

    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(features)

    feature_selector = SelectPercentile(chi2, percentile=5).fit(X, labels)
    vec.restrict(feature_selector.get_support())
    X = feature_selector.transform(X)

    type_scorer = SGDClassifier(loss='log', class_weight='auto',
                                n_iter=1000, alpha=1.0, random_state=999,
                                verbose=5)
    type_scorer.fit(X, labels)
    with open("type-model.pickle", 'wb') as out:
        pickle.dump((vec, type_scorer), out)
def main(path, filename):
    #batchs = ['histogramaByN','histogramaColor','patrones2x2ByN','patrones3x3ByN','patronesCirculaesByN_2_5','patronesCirculaesByN_2_9','patronesCirculaesByN_3_9','patronesCirculaesByN_5_9','patronesCirculaesByN_3_5','patronesCirculaesByN_6_12','patronesCirculaesByN_8_12','patronesCirculaesByN_10_12']
    batchs = ['patronesCirculaesByN_2_5','patronesCirculaesByN_2_9','patronesCirculaesByN_3_9','patronesCirculaesByN_5_9','patronesCirculaesByN_3_5','patronesCirculaesByN_6_12','patronesCirculaesByN_8_12','patronesCirculaesByN_10_12']
    #batchs = ['histogramaByN','histogramaColor','patrones2x2ByN','patronesCircularesByN_2_5','patronesCircularesByN_2_9','patronesCircularesByN_3_9','patronesCircularesByN_5_9','patronesCircularesByN_3_5']
    #batchs = ['patrones2x2ByN','patrones3x3ByN','patronesCirculaesByN_2_5','patronesCirculaesByN_3_5']
    percentil = 20

    X = []
    y = []
    lens = []
    load_batch(y, path, 'clases', filename)
    y = [j for i in y for j in i]
    for batch in batchs:
        load_batch(X, path, batch, filename)
        lens.append(len(X[0]))

    total = [lens[0]]
    for i in xrange(1, len(lens)):
        total.append(lens[i] - lens[i-1])
    print 'Number of attributes per batch'
    print total

    sp = SelectPercentile(chi2, percentil)
    X_new = sp.fit_transform(X, y)
    sup = sp.get_support(True)
    #print sup
    res = [0] * len(batchs)
    for i in sup:
        for j in xrange(0, len(lens)):
            if i <= lens[j]:
                res[j] += 1
                break
    porcentajes = []
    for i in xrange(0, len(lens)):
        porcentajes.append((1.0 * res[i]) / total[i])
    print 'Number of variables selected in the univariate ' + str(percentil) + ' percentile'
    print res
    print 'Percentage of variables selected in the univariate ' + str(percentil) + ' percentile'
    print porcentajes

    clf = ExtraTreesClassifier()
    clf = clf.fit(X, y)
    fi = clf.feature_importances_
    res2 = [0] * len(batchs)
    for i in xrange(0, len(fi)):
        for j in xrange(0, len(lens)):
            if i <= lens[j]:
                res2[j] += fi[i]
                break
    print 'Cumulative percentage importance of the multivariate selection'
    print res2
    porcentajes2 = []
    for i in xrange(0, len(lens)):
        porcentajes2.append((1.0 * res2[i]) / total[i])
    print 'Average percentage importance per variable of the multivariate selection'
    print porcentajes2
def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"): """ this function takes a pre-made list of email texts (by default word_data.pkl) and the corresponding authors (by default email_authors.pkl) and performs a number of preprocessing steps: -- splits into training/testing sets (10% testing) -- vectorizes into tfidf matrix -- selects/keeps most helpful features after this, the feaures and labels are put into numpy arrays, which play nice with sklearn functions 4 objects are returned: -- training/testing features -- training/testing labels """ ### the words (features) and authors (labels), already largely preprocessed ### this preprocessing will be repeated in the text learning mini-project authors_file_handler = open(authors_file, "r") authors = pickle.load(authors_file_handler) authors_file_handler.close() words_file_handler = open(words_file, "r") word_data = cPickle.load(words_file_handler) words_file_handler.close() ### test_size is the percentage of events assigned to the test set ### (remainder go into training) features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42) ### text vectorization--go from strings to lists of numbers vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english') features_train_transformed = vectorizer.fit_transform(features_train) features_test_transformed = vectorizer.transform(features_test) ### feature selection, because text is super high dimensional and ### can be really computationally chewy as a result # selector = SelectPercentile(f_classif, percentile=10) ## <Temporary hack for Lesson 3> selector = SelectPercentile(f_classif, percentile=1) selector.fit(features_train_transformed, labels_train) features_train_transformed = selector.transform(features_train_transformed).toarray() features_test_transformed = selector.transform(features_test_transformed).toarray() ### info on the data print "no. of Chris training emails:", sum(labels_train) print "no. of Sara training emails:", len(labels_train)-sum(labels_train) return features_train_transformed, features_test_transformed, labels_train, labels_test
def feature_selection(self, mode='F'):
    print 'Feature Selection...'
    print 'Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    X = self.train.copy()
    y = self.train_label['label'].values.copy()
    test = self.test.copy()

    if mode.upper() == 'M':
        mi = mutual_info_classif(X.values, y)
    elif mode.upper() == 'F':
        F, pval = f_classif(X.values, y)
    elif mode.upper() == 'C':
        chi, pval = chi2(X.values, y)

    features = self.train.columns.copy()
    fs_features = features.copy().tolist()

    if mode.upper() == 'M':
        fs_V = mi.copy().tolist()
    elif mode.upper() == 'F':
        fs_V = F.copy().tolist()
    elif mode.upper() == 'C':
        fs_V = chi.copy().tolist()

    if mode.upper() == 'M':
        selector = SelectPercentile(mutual_info_classif, percentile=80)
    elif mode.upper() == 'F':
        selector = SelectPercentile(f_classif, percentile=80)
    elif mode.upper() == 'C':
        selector = SelectPercentile(chi2, percentile=80)

    X_new = selector.fit_transform(X, y)
    selected = selector.get_support()
    for i in xrange(len(features)):
        if selected[i] == False:
            t = features[i]
            fs_features.remove(t)

    fs_V = np.array(fs_V)
    fs_features = np.array(fs_features)

    self.train = pd.DataFrame(X_new, columns=fs_features.tolist())
    self.test = test[fs_features]
    self.fs_features = fs_features

    feas = pd.DataFrame()
    feas['feature'] = fs_features

    print 'End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return X_new, feas
def selectFeatures(Model, X, y):
    model = Model()
    model.fit(X, y)  # fit the model so the predict() call below works
    fsel = SelectPercentile(score_func=f_classif, percentile=5)
    fsel.fit(X, y)
    arr = fsel.get_support()
    print "features: ", np.where(arr == True)
    plt.hist(model.predict(X))
    plt.hist(y)
    plt.show()
def getWeights(self):
    # Univariate feature selection with F-test for feature scoring
    # We use the default selection function: the 10% most significant features
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(self.X, self.y)
    scores = -np.log10(selector.pvalues_)
    scores /= float(scores.max())
    return scores
def selectFeatures(X, y):
    # feature selection with F-test for feature scoring
    # 10% most significant features
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(X, y)
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    return selector, scores
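# Worked example of the scoring used above (illustrative numbers): a feature
# with p-value 1e-4 gets -log10(1e-4) = 4; if the smallest p-value in
# selector.pvalues_ is 1e-8 (score 8), that feature's normalized score is
# 4 / 8 = 0.5, so all returned scores lie in (0, 1].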
def predict(classifier_type="tree",selection="Univariate", f="1"): if (f=="1"): kc_fn = "GS_pickles\kmeans_Genes_87_1x_v3.pkl" p = 1 BIG_C = 0.001 if (f=="2"): kc_fn = "GS_pickles\kmeans_Genes_433_50x_v2.pkl" p = 5 BIG_C = 0.1 if (f=="3"): kc_fn = "GS_pickles\kmeans_Genes_2163_20x_v1.pkl" p = 25 BIG_C = 2 dump_data = False kernel_type = "linear" (data_matrix, features, samples) = readData() x = data_matrix.data y = data_matrix.target target_names = data_matrix.target_names x_indices = np.arange(x.shape[-1]) (m,n) = x.shape test = joblib.load("GS_pickles\imputed_test_data.pkl") test_x = np.array(test) (i,j) = test_x.shape print "Training matrix shape: %s,%s" %(m,n) print "Test matrix shape: %s,%s" %(i,j) trimmed_x = [] trimmed_test_x = [] if (selection=="Univariate"): selector = SelectPercentile(f_classif, percentile=p) selector.fit(x, y) # Trimming the matrix, now should contain x% of the 8650 features trimmed_x = selector.transform(x) trimmed_test_x = selector.transform(test_x) if (selection=="kclusters"): kcluster_flist = joblib.load(kc_fn) trimmed_x = np.take(x, kcluster_flist, axis=1) trimmed_test_x = np.take(test_x, kcluster_flist, axis=1) n_samples, n_features = trimmed_x.shape # Linear SVM classifier if (classifier_type=="SVM"): clf = svm.SVC(kernel=kernel_type, degree=3, probability=True) # Gaussian Naive Bayes classifier if (classifier_type=="NB"): clf = GaussianNB() clf.fit(trimmed_x,y) result = clf.predict(trimmed_test_x) return result
def univariate_feature_selection(dataset, features):
    # load the dataset
    spreadsheet = Spreadsheet('../../Downloads/ip/project data.xlsx')
    data = Data(spreadsheet)
    targets = data.targets

    X = dataset
    y = data.targets

    ###############################################################################
    plt.figure(1)
    plt.clf()
    X_indices = np.arange(X.shape[-1])

    ###############################################################################
    # Univariate feature selection with F-test for feature scoring
    # We use the default selection function: the 10% most significant features
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(X, y)
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    plt.bar(X_indices - .45, scores, width=.2,
            label=r'Univariate score ($-Log(p_{value})$)', color='g')

    ###############################################################################
    # Compare to the weights of an SVM
    clf = svm.SVC(kernel='linear')
    clf.fit(X, y)

    svm_weights = (clf.coef_ ** 2).sum(axis=0)
    svm_weights /= svm_weights.max()
    plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight', color='r')

    clf_selected = svm.SVC(kernel='linear')
    clf_selected.fit(selector.transform(X), y)

    svm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0)
    svm_weights_selected /= svm_weights_selected.max()
    plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected,
            width=.2, label='SVM weights after selection', color='b')

    x = np.arange(0, len(features))
    plt.title("Comparing feature selection")
    plt.xlabel('Feature number')
    plt.xticks(x, features, rotation=45)
    plt.yticks(())
    #plt.axis('tight')
    plt.legend(loc='upper right')
    plt.show()
def featureSelection(reduced_features, labels, clnd_features, percentile, n_components, results=False):
    """
    Parameters:
        reduced_features = Unique feature names in python list after dropping
            non-numeric features.
        labels = ground truth labels for the data points.
        clnd_features = data point features in numpy array format corresponding
            to the labels.
        percentile = the parameter for the SelectPercentile method; a value
            between 0 and 100.
        n_components = the n_components for the pca.
        results = False returns python list of selected features. If True
            returns the metrics of the feature selectors (F-statistic, and
            p-values from f_classif) and the top 'n' pca component variance
            measurements.

    Output:
        Resulting list of features from the SelectPercentile function and the
        number of principal components used. If results = True then the
        statistics of the SelectPercentile method using f_classif will be
        printed. In addition the explained variance of the top 'x' principal
        components will also be printed.
    """
    from sklearn.feature_selection import SelectPercentile, f_classif
    from sklearn.decomposition import PCA
    from itertools import compress

    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit_transform(clnd_features, labels)

    pca = PCA(n_components=n_components)
    pca.fit_transform(clnd_features, labels)

    if results == True:
        f_stat = sorted(zip(reduced_features[1:], f_classif(clnd_features, labels)[0]),
                        key=lambda x: x[1], reverse=True)
        p_vals = sorted(zip(reduced_features[1:], f_classif(clnd_features, labels)[1]),
                        key=lambda x: x[1])
        expl_var = pca.explained_variance_ratio_
        return f_stat, p_vals, expl_var
    else:
        ## return a boolean index of the retained features
        retained_features = selector.get_support()
        ## index the original features by the boolean index of top x% features
        ## return a python list of the features to be used for training
        features_list = list(compress(reduced_features[1:], retained_features))
        ## add back in the 'poi' to the first position in the final features list
        features_list.insert(0, 'poi')
        return features_list
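# A minimal usage sketch for featureSelection above (hypothetical inputs;
# not part of the original source):
#
# feats = featureSelection(reduced_features, labels, clnd_features,
#                          percentile=20, n_components=2, results=False)
# print(feats)  # ['poi', ...] - selected feature names, 'poi' first
# f_stat, p_vals, expl_var = featureSelection(reduced_features, labels,
#                                             clnd_features, 20, 2, results=True)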
def eval(ds, testNum, p, splitProportion=0.2):
    #testNum=1
    #splitProportion=0.2
    allFeaturesF1 = []
    allFeaturesRecall = []
    allFeaturesPrecision = []
    featureSelctedF1 = []
    featureSelctedRecall = []
    featureSelctedPrecision = []
    for _ in range(testNum):
        tstdata, trndata = ds.splitWithProportion(splitProportion)
        X, Y = labanUtil.fromDStoXY(trndata)
        X_test, Y_test = labanUtil.fromDStoXY(tstdata)
        #localF1s = []
        #localRecalls = []
        #localPercisions = []
        for y, y_test in zip(Y, Y_test):
            if all(v == 0 for v in y):
                continue
            #clf = LinearSVC()#fit_intercept=True, C=p)
            #clf.sparsify()
            #clf = RandomForestClassifier()#criterion='entropy')
            #clf = tree.DecisionTreeClassifier()#max_depth=p)
            clf = AdaBoostClassifier()
            #clf = GradientBoostingClassifier()#, learning_rate=lr)
            #clf = ExtraTreesClassifier(n_estimators=p)
            #svc = LinearSVC()
            #selector = RFE(estimator=svc, n_features_to_select=p*19, step=0.2)
            selector = SelectPercentile(chooser, percentile=p)
            selector.fit(X, y)
            name = str(clf).split()[0].split('(')[0]
            clf.fit(selector.transform(X), y)
            pred = clf.predict(selector.transform(X_test))
            featureSelctedF1.append(metrics.f1_score(y_test, pred))
            featureSelctedRecall.append(metrics.recall_score(y_test, pred))
            featureSelctedPrecision.append(metrics.precision_score(y_test, pred))
            clf.fit(X, y)
            pred = clf.predict(X_test)
            allFeaturesF1.append(metrics.f1_score(y_test, pred))
            allFeaturesRecall.append(metrics.recall_score(y_test, pred))
            allFeaturesPrecision.append(metrics.precision_score(y_test, pred))
    return np.mean(allFeaturesF1), np.mean(featureSelctedF1), \
        np.mean(allFeaturesRecall), np.mean(featureSelctedRecall), \
        np.mean(allFeaturesPrecision), np.mean(featureSelctedPrecision), \
        name
df_test_X = pd.read_csv('final_X_test.txt')
df_test_y = pd.read_csv('final_y_test.txt')

df_train_X.dropna(0, inplace=True)
df_test_X.dropna(0, inplace=True)

'''
#pca
pca = PCA(n_components=100)
df_train_X = pca.fit_transform(df_train_X)
df_test_X = pca.fit_transform(df_test_X)
'''

#feature selection by variance
select = SelectPercentile(percentile=90)
df_train_X = select.fit_transform(df_train_X, df_train_y)
df_test_X = select.transform(df_test_X)

#feature scaling
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
df_train = min_max_scaler.fit_transform(df_train_X)
df_test = min_max_scaler.transform(df_test_X)  # reuse the scaler fitted on the training data

#training
svm = SVC(gamma=0.1, kernel='rbf', C=3)
svm.fit(df_train, df_train_y)

#prediction
y_train_predicted = svm.predict(df_train)
def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"): """ this function takes a pre-made list of email texts (by default word_data.pkl) and the corresponding authors (by default email_authors.pkl) and performs a number of preprocessing steps: -- splits into training/testing sets (10% testing) -- vectorizes into tfidf matrix -- selects/keeps most helpful features after this, the feaures and labels are put into numpy arrays, which play nice with sklearn functions 4 objects are returned: -- training/testing features -- training/testing labels """ ### the words (features) and authors (labels), already largely preprocessed ### this preprocessing will be repeated in the text learning mini-project # authors_file_handler = open(authors_file, "r") # authors = pickle.load(authors_file_handler) # authors_file_handler.close() authors_file_handler = open(authors_file, "rb") authors = pickle.load(authors_file_handler) authors_file_handler.close() # words_file_handler = open(words_file, "r") # word_data = cPickle.load(words_file_handler) # words_file_handler.close() words_file_handler = open(words_file, "rb") word_data = pickle.load(words_file_handler) words_file_handler.close() ### test_size is the percentage of events assigned to the test set ### (remainder go into training) # features_train, features_test, labels_train, labels_test = sklearn.cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42) features_train, features_test, labels_train, labels_test = sklearn.model_selection.train_test_split(word_data, authors, test_size=0.1, random_state=42) ### text vectorization--go from strings to lists of numbers vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english') features_train_transformed = vectorizer.fit_transform(features_train) features_test_transformed = vectorizer.transform(features_test) ### feature selection, because text is super high dimensional and ### can be really computationally chewy as a result selector = SelectPercentile(f_classif, percentile=10) selector.fit(features_train_transformed, labels_train) features_train_transformed = selector.transform(features_train_transformed).toarray() features_test_transformed = selector.transform(features_test_transformed).toarray() ### info on the data # print "no. of Chris training emails:", sum(labels_train) print("\nno. of Chris training emails: - sum(labels_train) - {}".format(sum(labels_train))) # print "no. of Sara training emails:", len(labels_train)-sum(labels_train) print("no. of Sara training emails: - sum(labels_train) - {}\n".format(len(labels_train) - sum(labels_train))) return features_train_transformed, features_test_transformed, labels_train, labels_test
def predict(self, X):
    X_transformed = X
    for step in self.steps[:-1]:
        X_transformed = step[1].transform(X_transformed)
    return self.steps[-1][1].predict(X_transformed)

##### Information leakage
# generate random data
import numpy as np
rnd = np.random.RandomState(seed=0)
X = rnd.normal(size=(100, 10000))
y = rnd.normal(size=(100,))

from sklearn.feature_selection import SelectPercentile, f_regression

select = SelectPercentile(score_func=f_regression, percentile=5).fit(X, y)
X_selected = select.transform(X)
print("X_selected shape : {}".format(X_selected.shape))

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
print("cross val score (Ridge) : {:.3f}".format(np.mean(
    cross_val_score(Ridge(), X_selected, y, cv=5))))
# the data is random, so there should be no relation, yet R^2 comes out as
# high as 0.91 > because feature selection saw the whole dataset

pipe = Pipeline([("select", SelectPercentile(score_func=f_regression, percentile=5)),
                 ("ridge", Ridge())])
print("cross val score : {:.3f}".format(np.mean(cross_val_score(pipe, X, y, cv=5))))
# with the pipeline the R^2 value is negative > information leakage prevented

## make_pipeline: generates step names automatically
from sklearn.pipeline import make_pipeline
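# A minimal sketch of the make_pipeline variant hinted at above: step names
# ("selectpercentile", "ridge") are derived from the lowercased class names
# automatically. Illustration only, not part of the original source.
pipe_short = make_pipeline(SelectPercentile(score_func=f_regression, percentile=5),
                           Ridge())
print("steps : {}".format([name for name, _ in pipe_short.steps]))
print("cross val score : {:.3f}".format(np.mean(cross_val_score(pipe_short, X, y, cv=5))))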
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator, ZeroCount
from xgboost import XGBRegressor
import pickle  # add pickle
from sklearn.metrics import r2_score

_data = open("data_BA.pkl", "rb")
X, y = pickle.load(_data)
_data.close()

# Average CV score on the training set was: -3.2532849505281343
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_regression, percentile=89),
    StackingEstimator(
        estimator=KNeighborsRegressor(n_neighbors=48, p=1, weights="uniform")),
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.001, max_depth=1,
                                             min_child_weight=3, n_estimators=50,
                                             n_jobs=1, objective="reg:squarederror",
                                             subsample=0.9500000000000001, verbosity=0)),
    MinMaxScaler(),
    StackingEstimator(estimator=SGDRegressor(alpha=0.01, eta0=0.01,
                                             fit_intercept=False, l1_ratio=0.0,
                                             learning_rate="constant",
class multiple_classifiers_with_pruned_tree(abstract_classifier):
    def __init__(self, data, labels, **kwargs):
        self.args = kwargs
        self.ada = AdaBoostClassifier(n_estimators=50)
        self.tree = tree.DecisionTreeClassifier(max_depth=8)
        self.knn = KNeighborsClassifier(n_neighbors=1)
        self.sp_knn = SelectPercentile(percentile=24)
        self.sp_tree = SelectPercentile(percentile=kwargs['tree_per'])
        self.sp_ada = SelectPercentile(percentile=85)

        data_knn = self.sp_knn.fit_transform(data, labels)
        data_tree = self.sp_tree.fit_transform(data, labels)
        data_ada = self.sp_ada.fit_transform(data, labels)

        self.knn.fit(data_knn, labels)
        self.ada.fit(data_ada, labels)

        # Fit pruned tree
        validation_size = 100
        train_data = data_tree[:validation_size]
        train_labels = labels[:validation_size]
        validation_data = data_tree[validation_size:]
        validation_labels = labels[validation_size:]
        self.tree.fit(train_data, train_labels)
        self.prune(self.tree, 0, validation_data, validation_labels)

    def prune(self, tree_obj, index, validation_data, validation_labels):
        # based on https://stackoverflow.com/a/49496027
        inner_tree = tree_obj.tree_
        left_child = inner_tree.children_left[index]
        right_child = inner_tree.children_right[index]
        if left_child != -1:
            self.prune(tree_obj, left_child, validation_data, validation_labels)
        if right_child != -1:
            self.prune(tree_obj, right_child, validation_data, validation_labels)

        predictions_no_prune = tree_obj.predict(validation_data)
        errors_no_prune = (predictions_no_prune ^ validation_labels).sum()

        inner_tree.children_left[index] = -1
        inner_tree.children_right[index] = -1

        predicitions_prune = tree_obj.predict(validation_data)
        errors_prune = (predicitions_prune ^ validation_labels).sum()

        if errors_prune > errors_no_prune:
            inner_tree.children_left[index] = left_child
            inner_tree.children_right[index] = right_child

    def classify(self, features):
        features_mat = features.reshape((1, -1))
        features_knn = self.sp_knn.transform(features_mat)
        features_tree = self.sp_tree.transform(features_mat)
        features_ada = self.sp_ada.transform(features_mat)

        w1 = self.args.get('w1', 1)
        w2 = self.args.get('w2', 1)
        w3 = self.args.get('w3', 1)

        p1 = int(self.knn.predict(features_knn)[0])
        p2 = int(self.ada.predict(features_ada)[0])
        p3 = int(self.tree.predict(features_tree)[0])

        avg = (w1*p1 + w2*p2 + w3*p3) / (w1 + w2 + w3)
        return bool(np.round(avg))
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: -0.9547903226888407
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_regression, percentile=12),
    MaxAbsScaler(),
    LassoLarsCV(normalize=False)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# imports needed by the exported pipeline below
import numpy as np
import pandas as pd
from copy import copy
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.8326392221287445
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(
            make_union(
                make_union(
                    FunctionTransformer(copy),
                    StackingEstimator(estimator=RandomForestClassifier(bootstrap=True, criterion="entropy", max_features=0.35000000000000003, min_samples_leaf=1, min_samples_split=7, n_estimators=100))
                ),
                make_union(
                    FunctionTransformer(copy),
                    FunctionTransformer(copy)
                )
            ),
            SelectPercentile(score_func=f_classif, percentile=58)
        ),
        FunctionTransformer(copy)
    ),
    MultinomialNB(alpha=0.1, fit_prior=True)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = tree.DecisionTreeClassifier(criterion='gini')
print(clf)
clf.fit(x_train, y_train)

# print clf.feature_importances_
for m in range(len(clf.feature_importances_)):
    if clf.feature_importances_[m] > 0.005:
        print "feature_importance", m, clf.feature_importances_[m]
# np.savetxt("feat_importance.txt", clf.feature_importances_)

'''write the tree to the file'''
with open("tree.dot", 'w') as f:
    f = tree.export_graphviz(clf, out_file=f)

'''the larger the number, the bigger the impact of the feature'''
from sklearn.feature_selection import SelectPercentile, f_classif
selector = SelectPercentile(f_classif, percentile=1)
selector.fit(x_train, y_train)

result_dic = {}
num = 0
# f=open("string_data.txt")
# lines=f.readlines()
# for line in lines:
#     result_dic[line]=selector.pvalues_[num]
#     num=num+1
# num2=0
# for line in lines:
#     if result_dic[line]<0.001:
#         print "rs;",line,"pvalues:",selector.pvalues_[num2],"num:",num2
#     num2=num2+1

scores = -np.log10(selector.pvalues_)
scores /= scores.max()
print("First line after cleanup from test Data: ", linesOfTrainData[0]) features_train, features_test, labels_train, labels_test = cross_validation.train_test_split( linesOfTrainData, linesOfTestData, test_size=0.1, train_size=0.9, random_state=42) vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english') linesOfTrainData_Transformed = vectorizer.fit_transform(features_train) linesOfTestData_Transformed = vectorizer.transform(features_test) selector = SelectPercentile(f_classif, percentile=10) selector.fit(linesOfTrainData_Transformed, labels_train) #labels_train linesOfTrainData_Transformed = selector.transform(linesOfTrainData_Transformed) linesOfTestData_Transformed = selector.transform(linesOfTestData_Transformed) f = open('TestData/Test/format_out.dat', 'w') for vt in linesOfTestData_Transformed: cosineSimilarityValues = [] for vS in linesOfTrainData_Transformed: dotProduct = vt.dot(np.transpose(vS)) lengtht = np.linalg.norm(vt.data) lengthS = np.linalg.norm(vS.data) #handle exceptions if lengthS != 0 and lengtht != 0:
    clf.fit(X_train, y_train)

    # y_margins = clf.decision_function(X_devel)
    '''
    y_prob = (y_margins - y_margins.min()) / (y_margins.max() - y_margins.min())
    y_prob = 1./(1 + np.exp(-y_margins))
    '''
    y_prob_devel = clf.predict_proba(X_devel)
    y_prob_test = clf.predict_proba(X_test)
    y_pred = clf.predict(X_test)

    np.save('./predictions/SD_devel_svm_baseline.npy', y_prob_devel)
    np.save('./predictions/SD_test_svm_baseline.npy', y_prob_test)

else:
    uar = []
    percentile = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    for p in percentile:
        selection = SelectPercentile(f_classif, percentile=p)
        feat_selected = selection.fit_transform(X_train, y_train)
        feat_devel = selection.transform(X_devel)

        print('\nComplexity {0:.6f}'.format(optimum_complexity))
        # clf = svm.LinearSVC(C=optimum_complexity, random_state=0)
        clf = svm.SVC(C=optimum_complexity, kernel='linear', random_state=0)
        clf.fit(feat_selected, y_train)
        y_pred = clf.predict(feat_devel)

        uar.append(recall_score(y_devel, y_pred, labels=classes, average='macro'))
        print('UAR on Devel {0:.1f}'.format(uar[-1] * 100))

        if show_confusion:
            print('Confusion matrix (Devel):')
            print(classes)
            print(confusion_matrix(y_devel, y_pred, labels=classes))

    optimum_percentile = percentile[np.argmax(uar)]
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.8395341050959029
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=KNeighborsClassifier(n_neighbors=10, p=2, weights="uniform")),
    SelectPercentile(score_func=f_classif, percentile=90),
    SelectPercentile(score_func=f_classif, percentile=87),
    StackingEstimator(estimator=GaussianNB()),
    StandardScaler(),
    BernoulliNB(alpha=0.1, fit_prior=True)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# KNeighborsClassifier
steps = [('scaler', MinMaxScaler()),
         ('red_dim', PCA()),
         ('clf', KNeighborsClassifier())]

pipeline = Pipeline(steps)

parameteres = [{
    'scaler': scalers_to_test,
    'red_dim': [PCA(random_state=42)],
    'red_dim__n_components': n_features_to_test,
    'clf__n_neighbors': k,
    'clf__weights': ['uniform', 'distance'],
    'clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}, {
    'scaler': scalers_to_test,
    'red_dim': [SelectPercentile(f_classif, percentile=10)],
    'clf__n_neighbors': k,
    'clf__weights': ['uniform', 'distance'],
    'clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}, {
    'scaler': scalers_to_test,
    'red_dim': [SelectPercentile(mutual_info_classif, percentile=10)],
    'clf__n_neighbors': k,
    'clf__weights': ['uniform', 'distance'],
    'clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}, {
    'scaler': scalers_to_test,
    'red_dim': [None],
    'clf__n_neighbors': k,
def predict_NC():
    # feature selection
    X, y, vectorizer = get_X_y()
    #selector = SelectKBest(f_classif, 10000)
    selector = SelectPercentile(f_classif, percentile=100)
    selector.fit(X, y)
    best_indices = selector.get_support(indices=True)
    best_features = np.array(vectorizer.get_feature_names())[best_indices]
    X = selector.transform(X)

    # use cross validation to choose the best parameter
    lr = LogisticRegression(penalty="l2", fit_intercept=True, class_weight='auto')
    kf = StratifiedKFold(y, n_folds=5, shuffle=True)
    parameters = {"C": [1.0, .1, .01, .001, 0.0001]}
    clf0 = GridSearchCV(lr, parameters, scoring='roc_auc', cv=kf)
    print "fitting model..."
    clf0.fit(X, y)
    print "best auc score is: ", clf0.best_score_
    print "done."

    # cross validation on the best parameter
    # get precision recall accuracy auc_score
    fs, aucs, prec, rec = [], [], [], []
    fold = 0
    complete_X = X.tocsr()
    clf = LogisticRegression(penalty="l2", fit_intercept=True, class_weight='auto', C=clf0.best_estimator_.C)
    for train, test in kf:
        clf.fit(complete_X[train, :].tocoo(), y[train])
        probs = clf.predict_proba(complete_X[test, :])[:, 1]
        average_precision_score(y[test], probs)
        precision, recall, threshold = precision_recall_curve(y[test], probs)
        accuracy = clf.score(complete_X[test, :], y[test])
        predLabel = clf.predict(X[test, :])
        rec.append(recall_score(y[test], predLabel))
        prec.append(precision_score(y[test], predLabel))
        #aucs.append(sklearn.metrics.roc_auc_score(y[test], probs))
        cur_auc = auc_score(y[test], probs)
        aucs.append(cur_auc)
        #preds = clf.predict(complete_X[test])
        #fs.append(f1_score(y[test], preds))
        '''
        if fold == 0:
            plt.clf()
            plt.plot(precision, recall, label='Precision-Recall curve for news coverage prediction')
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.ylim([0.0, 1.05])
            plt.xlim([0.0, 1.0])
            plt.title('Precision-Recall curve for news coverage prediction with vocabulary size %d' % len(best_features))
            plt.show()
        fold += 1
        '''
        if fold == 0:
            fpr, tpr, thresholds = roc_curve(y[test], probs)
            pylab.clf()
            fout = "NC/roc"
            pylab.plot(fpr, tpr, label="ROC curve (area = %0.2f)" % cur_auc)
            pylab.plot([0, 1], [0, 1], 'k--')
            pylab.xlim((-0.025, 1.025))
            pylab.ylim((-0.025, 1.025))
            pylab.xlabel("false positive rate")
            pylab.ylabel("true positive rate")
            pylab.title("ROC curve for news coverage prediction (area = %0.2f)" % cur_auc)
            pylab.tight_layout()
            pylab.savefig(fout)
        fold += 1

    #print "average auc: %s" % (sum(aucs)/float(len(aucs)))
    #print "average fs: %s" % (sum(fs)/float(len(fs)))
    print "average recall: %s" % (sum(rec) / float(len(rec)))
    print "average precision: %s" % (sum(prec) / float(len(prec)))

    #print "ABOUT TO RETURN"
    #pdb.set_trace()
    texify_most_informative_features(best_features, vectorizer, clf0)
    return clf0
    svc, fmri_masked, conditions, cv=cv, groups=session_label)[1]
print("Permutation test score: {:.3f}".format(null_cv_scores.mean()))

###########################################################################
# Decoding without a mask: Anova-SVM in scikit-learn
# --------------------------------------------------
# We can also implement feature selection before decoding as a scikit-learn
# `pipeline` (:class:`sklearn.pipeline.Pipeline`). For this, we need to import
# the :mod:`sklearn.feature_selection` module and use
# :func:`sklearn.feature_selection.f_classif`, a simple F-score
# based feature selection (a.k.a.
# `Anova <https://en.wikipedia.org/wiki/Analysis_of_variance#The_F-test>`_).
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import cross_validate
from sklearn.svm import LinearSVC

feature_selection = SelectPercentile(f_classif, percentile=10)
anova_svc = Pipeline([('anova', feature_selection), ('svc', LinearSVC())])

# We can use our ``anova_svc`` object exactly as we were using our ``svc``
# object previously.
# As we want to investigate our model, we use sklearn's `cross_validate`
# function with `return_estimator=True` instead of `cross_val_score`,
# to save the fitted estimators.
fitted_pipeline = cross_validate(anova_svc, fmri_masked, conditions,
                                 cv=cv, groups=session_label, return_estimator=True)
print("ANOVA+SVC test score: {:.3f}".format(fitted_pipeline["test_score"].mean()))

###########################################################################
# Visualize the ANOVA + SVC's discriminating weights
# ...................................................
# Create the training data and label
# We need to take the balanced data
training_data = [arr for idx_arr, arr in enumerate(data_bal)
                 if idx_arr != idx_lopo_cv]
training_label = [arr for idx_arr, arr in enumerate(label_bal)
                  if idx_arr != idx_lopo_cv]
# Concatenate the data
training_data = np.vstack(training_data)
training_label = np.ravel(label_binarize(
    np.hstack(training_label).astype(int), [0, 255]))
print 'Create the training set ...'

# Perform the classification for the current cv and the
# given configuration
# Feature selector
sel = SelectPercentile(f_classif, p)
training_data = sel.fit_transform(training_data, training_label)
testing_data = sel.transform(testing_data)
crf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
pred_prob = crf.fit(training_data, training_label).predict_proba(
    testing_data)

results_cv.append([pred_prob, crf.classes_])
feat_imp_cv.append(sel.get_support(indices=True))

results_p.append(results_cv)
feat_imp_p.append(feat_imp_cv)

# Save the information
path_store = '/data/prostate/results/mp-mri-prostate/exp-3/selection-extraction/anova/t2w'
if not os.path.exists(path_store):
train_row = dec.transform(train["device_id"]) train_sp = sparse_matrix[train_row, :] test_row = dec.transform(test["device_id"]) test_sp = sparse_matrix[test_row, :] X_train, X_val, y_train, y_val = train_test_split(train_sp, Y, train_size=.90, random_state=10) ################## # Feature Sel ################## print("# Feature Selection") selector = SelectPercentile(f_classif, percentile=23) selector.fit(X_train, y_train) X_train = selector.transform(X_train) X_val = selector.transform(X_val) train_sp = selector.transform(train_sp) test_sp = selector.transform(test_sp) print("# Num of Features: ", X_train.shape[1]) ################## # Build Model ##################
# get deterministic random numbers
rng = np.random.RandomState(42)
noise = rng.normal(size=(len(cancer.data), 50))
# add noise features to the data
# the first 30 features are from the dataset, the next 50 are noise
X_w_noise = np.hstack([cancer.data, noise])

X_train, X_test, y_train, y_test = train_test_split(
    X_w_noise, cancer.target, random_state=0, test_size=.5)
# use f_classif (the default) and SelectPercentile to select 50% of features
select = SelectPercentile(percentile=50)
select.fit(X_train, y_train)
# transform training set
X_train_selected = select.transform(X_train)

print("X_train.shape: {}".format(X_train.shape))
print("X_train_selected.shape: {}".format(X_train_selected.shape))

mask = select.get_support()
print(mask)

# transform test data
X_test_selected = select.transform(X_test)

lr = LogisticRegression()
lr.fit(X_train, y_train)
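# A plausible continuation of the comparison above, kept commented out because
# it is not in the original excerpt: score the logistic regression on all
# features versus only the selected ones.
# print("Score with all features: {:.3f}".format(lr.score(X_test, y_test)))
# lr.fit(X_train_selected, y_train)
# print("Score with only selected features: {:.3f}".format(
#     lr.score(X_test_selected, y_test)))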
        n_folds=i, shuffle=True, random_state=1)
    scores = cross_val_score(regression, wine, quality,
                             scoring="mean_squared_error",
                             cv=crossvalidation, n_jobs=1)
    print("Folds: %i, mean squared error: %.2f std: %.2f" %
          (len(scores), np.mean(np.abs(scores)), np.std(scores)))

# the mean squared error is still the same. we need feature selection to see if we can get better results
# print all f_scores for each feature
f_selector = SelectPercentile(f_regression, percentile=25)
f_selector.fit(wine, quality)
for feature, score in zip(wine.columns.values, f_selector.scores_):
    print("F-Score: %3.2f\t for feature %s" % (score, feature))

"""
we can see that some features are not important for the regression
with a greedy search we can get the optimal number of features
"""
greedy = RFECV(estimator=regression, cv=13, scoring="mean_squared_error")
greedy.fit(wine, quality)
print("Optimal number of features: %d" % greedy.n_features_)

# however, I want to test logistic regression now because the y data look like data to be classified. We might get better results
logistic = LogisticRegression()
ovr = OneVsRestClassifier(LogisticRegression()).fit(x_train, y_train)
ovo = OneVsOneClassifier(LogisticRegression()).fit(x_train, y_train)
from newspaper import Article

urls = [
    'http://www.newsmax.com/Politics/putin-tv-trump-dangerous/2017/04/17/id/784706/',
    'http://www.hollywoodreporter.com/heat-vision/star-wars-rare-archival-footage-shown-at-celebration-had-funny-new-hope-f-bomb-994552?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+thr%2Ffilm+%28The+Hollywood+Reporter+-+Movies%29&utm_content=FeedBurner',
    'http://www.espn.com/sports/endurance/story/_/id/19177433/boston-marathon-2017-devin-wang-another-year-brings-closure-tragedy'
]

prediction_data = []
for url in urls:
    article = Article(url)
    article.download()
    article.parse()
    soupText = BeautifulSoup(article.text)
    prediction_data.append(soupText.get_text())

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
features_train_transformed = vectorizer.fit_transform(features_train_vect)
features_test_transformed = vectorizer.transform(prediction_data)

selector = SelectPercentile(f_classif, percentile=1)
selector.fit(features_train_transformed, labels_train)
features_test_transformed = selector.transform(features_test_transformed).toarray()

pred = clf.predict(features_test_transformed)
print pred
pct = 80  # percent of edges kept in feature selection (SelectPercentile's percentile is on a 0-100 scale)
alphas = 10**np.linspace(10, -2, 100) * 0.5  # specify alphas to search

# %%
rg_grid = GridSearchCV(Ridge(), cv=10, param_grid={'alpha': alphas})

# using LASSO regression instead of ridge
lasso = linear_model.Lasso
lasso_grid = GridSearchCV(lasso(), cv=10, param_grid={'alpha': alphas})

reg = Pipeline([('feature_selection',
                 SelectPercentile(f_regression, percentile=pct)),
                ('regression', lasso_grid)])

cv10 = KFold(n_splits=21)  # , random_state=665)
rpcv10 = RepeatedKFold(n_splits=3, n_repeats=3, random_state=665)

# %% Run model
start = time.time()  # time the function
all_pred = cross_val_predict(reg, vecs_reshape.T, y, cv=cv10, n_jobs=4)
# all_score = cross_val_score(reg, vecs_reshape.T, y, cv=rpcv10, n_jobs=1)  # repeated k-folds
end = time.time()
print(end - start)  # print function running time

# %%
print(np.corrcoef(all_pred.T, y.T))
# %%
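# Hedged aside (my addition): to inspect which alpha the inner grid search
# settles on, refit the whole pipeline once on all the data and read the
# fitted steps back out of it:
reg.fit(vecs_reshape.T, y)
print(reg.named_steps['regression'].best_params_)                 # chosen alpha
print(reg.named_steps['feature_selection'].get_support().sum())   # edges kept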
def getBestModel(N, xDf, yDf, emptyModel, paramGrid, features, doSelection=True):
    """
    inputs:
        N - int - the number of times the model should be trained and evaluated.
        xDf - pandas dataframe - the rows represent the data points, the columns
            represent the features. These are the inputs to the model.
        yDf - pandas dataframe - the rows represent the data points; there is only
            one column, which contains the target values for the model.
        emptyModel - sklearn model - a valid scikit-learn model with a 'fit' method.
        paramGrid - dictionary - the param_grid to be used with this model in a
            grid search. Note that each parameter name in the grid must start
            with 'model__' (two underscores).
        features - int or float - if int, use SelectKBest where k='features'.
            If float, use SelectPercentile where 'features' is the percentage.
        doSelection - boolean - if True, do feature selection; otherwise skip it.
    outputs:
        modelsList - the list of all N trained models.
        metricsDict - dictionary of the form {'mae': [val1, val2, ..., valN],
            'mape': [...], ...}. The index of each model in 'modelsList' matches
            the index of the values in each list.
    NOTE: This assumes the data in xDf has been standardized or normalized
        before being used in this function.
    NOTE: It may be more efficient to do the feature selection and
        standardization before the N-fold CV. I checked, and the same features
        were chosen for every fold in my experiments, but it would be better to
        do this before the CV in case different folds choose different features.
    """
    # initialize the dictionary that will contain the evaluation results of all
    # N models. It will look like {'mae': [val1, val2, ..., valN], 'rmse': ...}
    metricsDict = {
        'mae': [],
        'mape': [],
        'rmse': [],
        'r': [],
        'rSq': [],
        'explainedVariance': []
    }

    # get the input features in the correct format
    X = xDf.values
    # put the target values in the correct format: take the single column of
    # yDf as a 1-D numpy array
    y = yDf.iloc[:, 0].values

    # make the cv settings
    cv = KFold(n_splits=N, shuffle=True)

    # standardization
    standardScaler = preprocessing.StandardScaler()
    X = standardScaler.fit_transform(X)

    if doSelection:
        # feature selection
        if type(features) == int:
            X = SelectKBest(f_regression, k=features).fit_transform(X, y)
        elif type(features) == float:
            # 'features' is already a 0-100 percentage, which is exactly what
            # SelectPercentile expects, so it is passed through unchanged
            X = SelectPercentile(f_regression,
                                 percentile=features).fit_transform(X, y)
        else:
            raise ValueError(
                "The input 'features' is not an integer or a float.")

    # initialize list of trained models
    modelsList = []

    # for every fold
    for train_index, test_index in cv.split(X):
        # get the train and test data
        xTrain, xTest = X[train_index], X[test_index]
        yTrain, yTest = y[train_index], y[test_index]

        # do a grid search with K-fold cross validation
        numFolds = 5  # 5-fold cross validation
        pipe = Pipeline(steps=[('model', emptyModel)])
        # make the model with optimized hyperparameters via a grid search
        model = GridSearchCV(estimator=pipe,
                             param_grid=paramGrid,
                             cv=KFold(n_splits=numFolds, shuffle=True),
                             scoring='r2',
                             return_train_score=False)
        # fit model
        model.fit(xTrain, yTrain)
        # add the model to the list
        modelsList.append(model)

        # get predictions
        pred = model.predict(xTest)
        trainPred = model.predict(xTrain)

        # find errors
        meanAbsoluteError = mean_absolute_error(yTest, pred)
        rootMeanSquaredError = np.sqrt(mean_squared_error(yTest, pred))
        meanAbsPercError = mean_absolute_percentage_error(yTest, pred)
        trainMeanAbsoluteError = mean_absolute_error(yTrain, trainPred)
        trainRootMeanSquaredError = np.sqrt(
            mean_squared_error(yTrain, trainPred))
        trainMeanAbsPercError = mean_absolute_percentage_error(
            yTrain, trainPred)

        # find the R^2 values (coefficient of determination)
        rSq = r2_score(yTest, pred)
        trainRSq = r2_score(yTrain, trainPred)

        # find the R values (Pearson coefficient of correlation)
        R = np.corrcoef(yTest, pred)[0][1]
        trainR = np.corrcoef(yTrain, trainPred)[0][1]

        # find explained variance (the train version uses the train data)
        explainedVar = explained_variance_score(yTest, pred)
        trainExplainedVar = explained_variance_score(yTrain, trainPred)

        # add the metrics to metricsDict
        metricsDict['mae'].append(round(meanAbsoluteError * 2000, 3))
        metricsDict['rmse'].append(round(rootMeanSquaredError * 2000, 3))
        metricsDict['mape'].append(round(meanAbsPercError, 3))
        metricsDict['rSq'].append(round(rSq, 3))
        metricsDict['r'].append(round(R, 3))
        metricsDict['explainedVariance'].append(round(explainedVar, 3))

    # return the results
    return modelsList, metricsDict
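# Hedged note (my addition): mean_absolute_percentage_error only exists in
# sklearn.metrics from version 0.24 onward; the original script may define its
# own. A minimal stand-in, assuming no zero targets:
import numpy as np

def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100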
f.readline()
csvreader = csv.reader(f, delimiter='\t')
for row in csvreader:
    ID_test.append(row[0])
    X_test.append(row[2])

# Use word and character features
words = TfidfVectorizer(analyzer="word", binary=False, use_idf=True,
                        stop_words="english", min_df=3)
char = TfidfVectorizer(analyzer="char", binary=False, use_idf=True)

# Use percentile-based feature selection
select = SelectPercentile(score_func=chi2)

# Stack the features together
feat = FeatureUnion([('words', words), ('char', char)])

# Construct transformation pipeline
text_clf = Pipeline([
    ('feat', feat),
    # ('select', select),
    # ('clf', MultinomialNB()),
    ('clf', SGDClassifier(penalty='l2'))
])

# Set the parameters to be optimized in the Grid Search
parameters = {
    'feat__words__ngram_range': [(1, 5), (1, 6)],
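# Hedged aside (my addition): if the commented-out 'select' step is re-enabled
# in the pipeline, its percentile can be tuned in the same grid search, e.g.
# parameters['select__percentile'] = [10, 25, 50]. A sketch of wiring the
# pipeline into the search; X_train_text and y_train are hypothetical names:
from sklearn.model_selection import GridSearchCV

gs = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
gs.fit(X_train_text, y_train)
print(gs.best_params_)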
import sys

import joblib
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC


def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed


def tokenize(text):
    stemmer = PorterStemmer()
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems


training_data = datasets.load_files(sys.argv[1], encoding="utf-8",
                                    decode_error='ignore')

bigram_tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                                          tokenizer=tokenize,
                                          stop_words='english')
selector = SelectPercentile(chi2, percentile=25)

print("\nSVM\n")
clf = LinearSVC(penalty="l2", dual=False, C=5.0)
pipe_clf = Pipeline([('vectorizer', bigram_tfidf_vectorizer),
                     ('selector', selector),
                     ('classifier', clf)])
pipe_clf.fit(training_data.data, training_data.target)
joblib.dump(pipe_clf, sys.argv[2])
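# Hedged usage sketch (my addition): the saved pipeline bundles the vectorizer,
# the chi2 selector, and the SVM, so prediction on raw text is a single call.
# The model path and the example document are hypothetical:
loaded = joblib.load("model.pkl")
print(loaded.predict(["some raw document text to classify"]))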
train_reader = csv.reader(train_csv)
cnt = 0
for tweet in train_reader:
    attr = tweet[CURRENT_ATTRIBUTE + 4]
    train_attrs.append(attr)
    cnt += 1
del train_attrs[0]

# get y_train from train_attrs as a flat list of floats
y_train = [float(attr) for attr in train_attrs]

# chi-2 select features
# note: chi2 treats the target values as discrete class labels
print("start feature selection")
if SELECTOR == 0:
    selector = SelectKBest(chi2, k=K_FOR_BEST)
else:
    selector = SelectPercentile(score_func=chi2,
                                percentile=SELECT_PERCENTILE)
selector.fit(x_train, y_train)
new_x_train = selector.transform(x_train)
new_x_test = selector.transform(x_test)
print("feature selection done")

# regression
print("start regression")
clf = LinearRegression()
clf = clf.fit(new_x_train, y_train)
result = clf.predict(new_x_test)
print("regression done")

for item in result:
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile

# 5.1 MI for classification
# get the MI score between each feature and the target
mutual_info_classif(X_train, y_train)  # note: this one is specifically for classification

# Use the SelectKBest method to select the TOP K variables
selector = SelectKBest(mutual_info_classif, k=10).fit(X_train, y_train)
X_train.columns[selector.get_support()]

# 5.2 MI for regression
mutual_info_regression(X_train, y_train)

# Select the top 10th percentile
# (note: this is a proportion, whereas SelectKBest above takes a count)
selector = SelectPercentile(mutual_info_regression, percentile=10).fit(X_train, y_train)
X_train.columns[selector.get_support()]

#### 6. Fisher Score | Chi Square
# 1. Measures the dependency of 2 variables
# 2. Suited for categorical variables
# 3. Target is typically categorical (often binary)
# 4. Variable values should be non-negative, and typically boolean, frequencies or counts
# 5. It compares the observed distribution of the variable across the class
#    labels against the expected distribution if the variable and the labels
#    were independent
from sklearn.feature_selection import chi2
f_score = chi2(X_train, y_train)
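# Hedged sketch (my addition): chi2 returns a (scores, p-values) pair; pairing
# the scores with column names makes the ranking readable. Assumes X_train is
# a non-negative DataFrame (e.g. counts) and y_train is categorical:
import pandas as pd

scores, pvalues = chi2(X_train, y_train)
ranking = pd.Series(scores, index=X_train.columns).sort_values(ascending=False)
print(ranking.head(10))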
for i in range(0, data_frame.shape[0]):
    new_text = ' '.join([
        stemmer.stem(word)
        for word in re.sub('[^a-zA-Z]', ' ', data_frame['Text'][i]).split()
        if word not in stopwords_set
    ]).lower()
    document.append(new_text)

for i in range(0, df_x_test.shape[0]):
    new_text_2 = ' '.join([
        stemmer.stem(word)
        for word in re.sub('[^a-zA-Z]', ' ', df_x_test['Text'][i]).split()
        if word not in stopwords_set
    ]).lower()
    document_test.append(new_text_2)

# note: this column is built but never used below; the loop above already
# produces the cleaned test documents
df_x_test['removed_test'] = df_x_test['Text'].apply(lambda x: " ".join([
    stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split()
    if i not in words
]).lower())

tfid_v = TfidfVectorizer(sublinear_tf=True, min_df=6, stop_words='english')
# select_features = SelectKBest(chi2, k=2000)
select_features = SelectPercentile(chi2, percentile=11.5)

X = tfid_v.fit_transform(document).toarray()
X_test = tfid_v.transform(document_test).toarray()
y = data_frame.iloc[:, 0].values

X = select_features.fit_transform(X, y)
X_test = select_features.transform(X_test)

classifier = LinearSVC(C=1.0, penalty='l1', max_iter=4500, dual=False)
classifier.fit(X, y)
y_pred = classifier.predict(X_test)

np.savetxt('new.txt', y_pred, delimiter=" ", fmt="%s")
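# Hedged aside (my addition): chi2, SelectPercentile and LinearSVC all accept
# scipy sparse matrices, so the .toarray() calls above can be dropped to save
# memory on large corpora, e.g.:
X_sparse = tfid_v.fit_transform(document)        # stays sparse
X_test_sparse = tfid_v.transform(document_test)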
X_test1 = numpy.concatenate((X_test_temp, X_test.iloc[:, 10:c - 1]), axis=1)

scaled_features_train_df = pd.DataFrame(X_train1, index=X_train.index,
                                        columns=X_train.columns)
scaled_features_test_df = pd.DataFrame(X_test1, index=X_test.index,
                                       columns=X_test.columns)

# --------------
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif

# Write your solution here:
skb = SelectPercentile(score_func=f_classif, percentile=20)
predictors = skb.fit_transform(X_train1, Y_train)
scores = list(skb.scores_)
print(scaled_features_train_df.columns)

# rank the feature scores and keep as many names as columns survived selection
top_k_index = sorted(range(len(scores)), key=lambda i: scores[i],
                     reverse=True)[:predictors.shape[1]]
top_k_predictors = [scaled_features_train_df.columns[i] for i in top_k_index]
print(top_k_predictors)

# --------------
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score

clf = OneVsRestClassifier(LogisticRegression())
clf1 = OneVsRestClassifier(LogisticRegression())
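# Hedged aside (my addition): SelectPercentile already knows which columns it
# kept, so the manual score ranking above can be cross-checked with
# get_support():
selected_names = scaled_features_train_df.columns[skb.get_support()]
print(list(selected_names))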
def __init__(self, data, labels, **kwargs):
    self.knn = KNeighborsClassifier(n_neighbors=1)
    # keep the fitted selector so the same feature mask can be applied at
    # prediction time
    self.sp_knn = SelectPercentile(percentile=kwargs['feature_percentile'])
    data_knn = self.sp_knn.fit_transform(data, labels)
    self.knn.fit(data_knn, labels)
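# Hedged sketch (my addition): the class this __init__ belongs to is not
# shown; a matching predict method would reuse the stored selector so test
# data goes through the same feature mask:
def predict(self, data):
    return self.knn.predict(self.sp_knn.transform(data))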
class GenreClassifier():
    def __init__(self, dataset='news'):
        '''
        dataset = {'news', 'bruk'}
        '''
        self.tfidf = None
        self.clf = None
        self.sel_perc = None
        if dataset == 'news':
            self.load('saves\\clf_news_MNB_poslex.pkl',
                      'saves\\tfidf_news_MNB_poslex.pkl',
                      'saves\\featsel_news_MNB_poslex.pkl')
        elif dataset == 'bruk':
            # the 'bruk' model was saved without a feature selector
            self.load('saves\\clf_bruk_MNB_poslex.pkl',
                      'saves\\tfidf_bruk_MNB_poslex.pkl')

    def init(self):
        freq_words = pd.read_csv('data\\freq_words.txt', sep=' ', header=None,
                                 names=['word', 'freq'])['word'].values
        lexngrams = np.loadtxt('data\\news_bigrams.txt', dtype=object,
                               encoding='utf-8')
        print('Frequent words count:', len(freq_words))
        print('Lexical ngrams count:', len(lexngrams))
        self.factory = ClfFactoryPosLex(
            None,
            PosFreqWordsAnalyzer(Morphology().getAnalyzer(),
                                 list(freq_words),
                                 lemmatize_freq=True),
            lexngrams)

    def train(self, X, y, percentile=0):
        self.tfidf = self.factory.make_vectorizer()
        self.clf = self.factory.make_classifier()
        # feature selection is skipped entirely for percentile values below 1
        self.sel_perc = SelectPercentile(
            mutual_info_classif,
            percentile=percentile) if percentile >= 1 else None
        vtrain = self.tfidf.fit_transform(X)
        if self.sel_perc is not None:
            vtrain = self.sel_perc.fit_transform(vtrain, y)
        self.clf.fit(vtrain, y)

    def save(self, clf_name, tfidf_name, sel_perc_name=None):
        joblib.dump(self.clf, clf_name)
        joblib.dump(self.tfidf, tfidf_name)
        if self.sel_perc is not None:
            joblib.dump(self.sel_perc, sel_perc_name)

    def load(self, clf_name, tfidf_name, sel_perc_name=None):
        self.clf = joblib.load(clf_name)
        self.tfidf = joblib.load(tfidf_name)
        if sel_perc_name is not None:
            self.sel_perc = joblib.load(sel_perc_name)

    def predict(self, raw_strings, predict_proba=False):
        vtest = self.tfidf.transform(raw_strings)
        if self.sel_perc is not None:
            vtest = self.sel_perc.transform(vtest)
        predictor = self.clf.predict_proba if predict_proba else self.clf.predict
        y_predicted = predictor(vtest.toarray())
        return y_predicted
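# Hedged usage sketch (my addition): assuming the pickled 'news' model files
# exist on disk, classifying raw strings is a two-liner; the example text is
# hypothetical:
gc = GenreClassifier(dataset='news')
print(gc.predict(['some raw text to classify']))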
# Here, the scaling is done properly during the grid search: instead of the
# whole training set being used, only the part of the training set used for
# training the different models is scaled during cross validation
mglearn.plots.plot_proper_processing()
plt.show()  # illustration of proper preprocessing

rnd = np.random.RandomState(seed=0)
X = rnd.normal(size=(100, 10000))
y = rnd.normal(size=(100,))

# To illustrate information leakage, we start with 100 samples of 10,000
# random features. Because the data is just noise, we should not be able
# to learn anything from it.
select = SelectPercentile(score_func=f_regression, percentile=5).fit(X, y)
X_selected = select.transform(X)
print('X_selected.shape: {}'.format(X_selected.shape))
# percentile=5 keeps the 500 seemingly most relevant of the 10,000 features

print('Cross validation accuracy (cv only on Ridge): {:.3f}'.format(
    np.mean(cross_val_score(Ridge(), X_selected, y, cv=5))))
# The R^2 score is 0.91, which cannot be right since the data is just random.
# The score is inflated because the feature selection was done on the whole
# dataset, outside the cross validation.

pipe = Pipeline([
    ('select', SelectPercentile(score_func=f_regression, percentile=5)),
    ('ridge', Ridge()),
])
print('Cross validation score accuracy (pipeline): {:.3f}'.format(
    np.mean(cross_val_score(pipe, X, y, cv=5))))
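# Hedged follow-up (my addition): once selection lives inside the pipeline,
# its percentile can even be tuned safely with a grid search, since the
# selector is refit on each training fold; the candidate values here are
# illustrative:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipe, {'select__percentile': [1, 5, 10]}, cv=5)
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)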