def svm():
    # load data
    x_train, y_train = load_svmlight_file("12trainset")
    x_train.todense()
    x_test, y_test = load_svmlight_file("12testdata")
    x_test.todense()

    sk = SelectKBest(f_classif, k=9).fit(x_train, y_train)
    x_new = sk.transform(x_train)
    x_newtest = sk.transform(x_test)
    print(sk.scores_)
    print(x_new.shape)
    print(sk.get_support())

    # classifier
    clf = SVC(C=2, gamma=2)
    ovrclf = OneVsRestClassifier(clf, n_jobs=-1)
    ovrclf.fit(x_train, y_train)
    y_pred = ovrclf.predict(x_test)

    # write result
    with open("result.txt", "w") as fw:
        for st in y_pred.tolist():
            fw.write(str(st) + '\n')
    print(np.array(y_pred).shape)

    target_names = ['0', '1', '2', '3']
    # result
    # sum_y = np.sum((np.array(y_pred) - np.array(y_test)) ** 2)
    # print(classification_report(y_test, y_pred, target_names=target_names))
    # print("sougouVal: ", float(sum_y) / y_pred.shape[0])
    print(time.time() - start_time)
def add_feature_auc(X_train, y_train, X_val, y_val, classifier, cross_val, n_features):
    """
    Purpose: add features to the model one at a time (best-to-worst) and calculate performance
    Inputs:
        X_train: training features
        y_train: training labels
        X_val: validation features
        y_val: validation labels
        classifier: sklearn classifier object
        cross_val: sklearn cross-validation object
        n_features: the maximum number of features to try
    """
    cv_aucs = pd.Series(index=range(1, n_features + 1))   # Empty series for outputs
    cv_stds = pd.Series(index=range(1, n_features + 1))
    train_aucs = pd.Series(index=range(1, n_features + 1))
    val_aucs = pd.Series(index=range(1, n_features + 1))
    for i in range(1, n_features + 1):                     # Iterate over the number of features
        selector = SelectKBest(score_func=f_classif, k=i)  # Initialize selector for the i best features
        selector = selector.fit(X_train, y_train)          # Fit selector
        X_train_new = pd.DataFrame(selector.transform(X_train))  # Take i best features for training
        X_val_new = pd.DataFrame(selector.transform(X_val))      # Take i best features for validation
        mean_cv_auc, std_cv_auc, train_auc, val_auc = train_val_auc(  # Calculate performance
            X_train_new, y_train, X_val_new, y_val, classifier, cross_val)
        cv_aucs[i] = mean_cv_auc                           # Store performance metrics
        cv_stds[i] = std_cv_auc
        train_aucs[i] = train_auc
        val_aucs[i] = val_auc
    df = pd.DataFrame(cv_aucs, columns=['cv_auc'])         # Create performance dataframe for output
    df['cv_std'] = cv_stds
    df['train_auc'] = train_aucs
    df['val_auc'] = val_aucs
    return df
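# train_val_auc is not defined in the snippet above. A minimal sketch of what
# it plausibly computes, assuming binary labels, ROC-AUC as the metric, and a
# classifier exposing predict_proba; only the name and return order are taken
# from the call site, the rest is an assumption.
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

def train_val_auc(X_train, y_train, X_val, y_val, classifier, cross_val):
    # Cross-validated AUC on the training set
    cv_scores = cross_val_score(classifier, X_train, y_train,
                                scoring='roc_auc', cv=cross_val)
    classifier.fit(X_train, y_train)
    # AUC of the fitted model on the train and validation sets
    train_auc = roc_auc_score(y_train, classifier.predict_proba(X_train)[:, 1])
    val_auc = roc_auc_score(y_val, classifier.predict_proba(X_val)[:, 1])
    return cv_scores.mean(), cv_scores.std(), train_auc, val_auc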
def choseFeature(TrainX, TrainY, TestX):
    cF = SelectKBest(chi2, k=100)
    cF.fit(TrainX, TrainY)
    check = cF.get_support()
    newTrainX = cF.transform(TrainX)
    newTestX = cF.transform(TestX)
    return (newTrainX, newTestX)
def feature_selection(X_train, X_test, y_train):
    '''
    Select the best features using a feature selection method (chi-square or PMI),
    or simply return train and test unchanged to keep all features.

    :param X_train: A dictionary with the following structure
             { instance_id: [f1_count, f2_count, ...], ... }
    :param X_test: A dictionary with the following structure
             { instance_id: [f1_count, f2_count, ...], ... }
    :param y_train: A dictionary with the following structure
             { instance_id: sense_id }
    :return: X_train_new, X_test_new
    '''
    train_keys = list(X_train.keys())
    test_keys = list(X_test.keys())
    _y_train = [y_train[key] for key in train_keys]
    train_values = [X_train[key] for key in train_keys]
    test_values = [X_test[key] for key in test_keys]

    # Keep the top 90% of features by chi-squared score
    k = int(len(train_values[0]) * 0.90)
    selectorKBest = SelectKBest(chi2, k=k).fit(train_values, _y_train)
    X_train_selected = selectorKBest.transform(train_values)
    X_test_selected = selectorKBest.transform(test_values)

    X_train_new = {key: X_train_selected[i] for i, key in enumerate(train_keys)}
    X_test_new = {key: X_test_selected[i] for i, key in enumerate(test_keys)}

    # or return all features (no feature selection): return X_train, X_test
    return X_train_new, X_test_new
def feature_selection(features_train, labels_train, features_test, K):
    fs = SelectKBest(f_classif, k=K)
    fs.fit(features_train, labels_train)
    features_train_new = fs.transform(features_train)
    features_test_new = fs.transform(features_test)
    return features_train_new, features_test_new
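# A quick usage sketch for the wrapper above; the dataset is synthetic and
# purely illustrative.
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif

X, y = make_classification(n_samples=200, n_features=20, n_informative=5,
                           random_state=0)
X_tr_new, X_te_new = feature_selection(X[:150], y[:150], X[150:], K=5)
print(X_tr_new.shape, X_te_new.shape)  # (150, 5) (50, 5)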
def chisq(train_X, train_y, test_X, kN):
    start_time = time.time()
    ch2 = SelectKBest(chi2, k=kN)
    ch2.fit(train_X, train_y)
    train_X_ch2 = ch2.transform(train_X)
    test_X_ch2 = ch2.transform(test_X)
    print("--- %s seconds ---" % (time.time() - start_time))
    return ch2, train_X_ch2, test_X_ch2
def PredictPhysiology_AllFeatures(hdf_location, power_feature_filename, coherence_feature_filename, phys_filename, num_k_best):
    # Read in behavioral data
    BlockAB_behavior = StressBehavior(hdf_location)
    # Get classification of trial types
    trial_types = np.ravel(BlockAB_behavior.stress_type[BlockAB_behavior.state_time[BlockAB_behavior.ind_check_reward_states]])
    # Load physiology data
    ibi, pupil = TrialClassificationWithPhysiology(phys_filename, trial_types, plot_results=True)
    phys_mat = np.hstack((ibi, pupil))

    # Load neural power features
    power_mat = dict()
    coherence_mat = dict()
    sp.io.loadmat(power_feature_filename, power_mat)
    power_feat_keys = [key for key in power_mat.keys() if key[0] != '_']
    num_chan, num_conditions = power_mat[power_feat_keys[0]].shape
    num_trials = len(power_feat_keys)
    power_feat_mat = np.zeros([num_trials, num_chan * num_conditions])

    # Create power feature matrix
    for i, key in enumerate(power_feat_keys):
        power_feat_mat[i, :] = power_mat[key].flatten()

    # Create coherence feature matrix
    sp.io.loadmat(coherence_feature_filename, coherence_mat)
    coherence_feat_keys = [key for key in coherence_mat.keys() if key[0] != '_']
    num_chan_pairs, num_coh_conditions = coherence_mat[coherence_feat_keys[0]].shape
    coherence_feat_mat = np.zeros([num_trials, num_chan_pairs * num_coh_conditions])
    for i, key in enumerate(coherence_feat_keys):
        coherence_feat_mat[i, :] = coherence_mat[key].flatten()

    # matrix is in (trials) x (neural_features)
    feat_mat = np.hstack((power_feat_mat, coherence_feat_mat))

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(feat_mat, phys_mat, test_size=0.9, random_state=0)

    linear_regress = linear_model.LinearRegression()
    linear_regress.fit(X_train, y_train)
    phys_pred_all = linear_regress.predict(X_test)
    # phys_pred_all_err = np.linalg.norm(phys_pred_all - y_test)
    phys_pred_all_err = np.abs(phys_pred_all - y_test)

    # Select the num_k_best best of the original neural features
    selection_k = SelectKBest(k=num_k_best)
    selection_k.fit(feat_mat, trial_types)
    X_selection_k = selection_k.transform(X_train)
    X_test_selection_k = selection_k.transform(X_test)

    linear_regress_k = linear_model.LinearRegression()
    linear_regress_k.fit(X_selection_k, y_train)
    phys_pred_k = linear_regress_k.predict(X_test_selection_k)
    # phys_pred_k_err = np.linalg.norm(phys_pred_k - y_test)
    phys_pred_k_err = np.abs(phys_pred_k - y_test)

    # plt.scatter(phys_mat[:, 0], phys_mat[:, 1], marker='o', color='k', label='original')
    # plt.scatter(phys_pred_k[:, 0], phys_pred_k[:, 1], marker='o', color='m', label='estimate with K best features')
    # plt.legend()
    # plt.show()

    return linear_regress, phys_pred_k_err, phys_pred_all_err, phys_mat
def kfold_CV(adj_matrix, folds, max_cycle_order, num_features=-1):
    num_folds = len(folds)
    accuracy_fold_data = list()
    false_positive_rate_fold_data = list()
    time_fold_data = list()
    for fold_index in range(num_folds):
        print("Fold %d:" % (fold_index + 1))

        # get data
        train_points = pipeline.join_folds(folds, fold_index)
        test_points = folds[fold_index]
        train_test_overlap = False
        train_row_indices, train_col_indices = zip(*train_points)
        test_row_indices, test_col_indices = zip(*test_points)
        train_labels = adj_matrix[train_row_indices, train_col_indices].A[0]  # array of signs of training edges
        test_labels = adj_matrix[test_row_indices, test_col_indices].A[0]  # array of signs of test edges

        # construct matrix using just training edges
        train_matrix = sp.csr_matrix((train_labels, (train_row_indices, train_col_indices)), shape=adj_matrix.shape)
        train_matrix = (train_matrix + train_matrix.transpose()).sign()  # make symmetric

        # Compute feature products
        # This dominates the training time, so report time for only this part for experiments
        before_train = time.time()
        feature_products = hoc.extract_edge_features(train_matrix, max_cycle_order)

        # get features and labels corresponding to each data point
        train_data = np.asarray([hoc.extract_features_for_edge(feature_products, tr_point) for tr_point in train_points])
        test_data = np.asarray([hoc.extract_features_for_edge(feature_products, te_point) for te_point in test_points])
        after_train = time.time()
        model_time = after_train - before_train

        # if, for experimental reasons, we don't want to train on all the features,
        # as a diagnostic for what the model is actually learning and why
        if num_features > 0:  # perform feature selection
            feat_sel = SelectKBest(f_classif, k=num_features)
            feat_sel.fit(train_data, train_labels)
            train_data = feat_sel.transform(train_data)
            test_data = feat_sel.transform(test_data)
        elif num_features == 0:  # train on random features
            print("train data: random matrix of shape ", train_data.shape)
            train_data = np.random.random(train_data.shape)

        # train logistic regression classifier
        clf = LogisticRegression()
        clf.fit(train_data, train_labels)

        # Evaluate
        test_preds = clf.predict(test_data)
        acc, fpr = pipeline.evaluate(test_preds, test_labels)
        accuracy_fold_data.append(acc)
        false_positive_rate_fold_data.append(fpr)
        print("HOC feature extraction time for one fold: ", model_time)
        time_fold_data.append(model_time)
    return accuracy_fold_data, false_positive_rate_fold_data, time_fold_data
class NaiveBayesClassifier(object):
    '''
    Multinomial naive Bayes over tf-idf features, with chi-squared
    feature selection between vectorization and classification.
    '''

    def __init__(self):
        self.classifier = MultinomialNB()
        #self.model = None

    def trainClassifier(self, trainingDocs, labels):
        self.trainingDocs = trainingDocs
        self.labels = labels
        self.count_vect = CountVectorizer(stop_words='english')
        X_train_counts = self.count_vect.fit_transform(self.trainingDocs)
        self.tf_transformer = TfidfTransformer(use_idf=True, sublinear_tf=True).fit(X_train_counts)
        X_train_tf = self.tf_transformer.transform(X_train_counts)
        self.ch2 = SelectKBest(chi2)
        X_train = self.ch2.fit_transform(X_train_tf, self.labels)
        #self.classifier.fit(X_train_tf, self.labels)
        self.classifier.fit(X_train, self.labels)

    def classify(self, docs_new):
        X_new_counts = self.count_vect.transform(docs_new)
        X_new_tfidf = self.tf_transformer.transform(X_new_counts)
        X_test = self.ch2.transform(X_new_tfidf)
        #predicted = self.model.predict(X_new_tfidf)
        #self.predicted = self.classifier.predict(X_new_tfidf)
        self.predicted = self.classifier.predict(X_test)
        #for doc, category in zip(docs_new, self.predicted):
        #    print('%r => %s' % (doc, category))
        return self.predicted

    def calculate_score(self, doc_new):
        doc_list = [doc_new]
        #doc_list.append(doc_new)
        X_new_counts = self.count_vect.transform(doc_list)
        X_new_tfidf = self.tf_transformer.transform(X_new_counts)
        X_test = self.ch2.transform(X_new_tfidf)
        self.predicted = self.classifier.predict(X_test)
        return self.predicted
        #predicted_prob_all = self.classifier.predict_proba(X_test)
        #predicted_prob = [max(pr) for pr in predicted_prob_all]
        #return predicted_prob

    def score(self, docs_test, labels):
        X_new_counts = self.count_vect.transform(docs_test)
        X_new_tfidf = self.tf_transformer.transform(X_new_counts)
        X_test = self.ch2.transform(X_new_tfidf)
        #self.predicted = self.classifier.predict(X_new_tfidf)
        self.predicted = self.classifier.predict(X_test)
        accuracy = np.mean(self.predicted == labels)
        #accuracy = self.classifier.score(X_new_tfidf, labels)
        return accuracy
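# A hedged usage sketch for the classifier above; documents and labels are
# invented for illustration. Note that SelectKBest(chi2) keeps k=10 features
# by default, so the training vocabulary must contain at least 10 terms.
docs = ["cheap meds online today", "meeting at noon tomorrow",
        "win a free prize now", "lunch with the project team",
        "discount pills limited offer", "quarterly report deadline friday"]
labels = ["spam", "ham", "spam", "ham", "spam", "ham"]

nb = NaiveBayesClassifier()
nb.trainClassifier(docs, labels)
print(nb.classify(["free discount meds"]))
print(nb.score(["team meeting tomorrow"], ["ham"]))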
def select_features(train_X, train_y, test_X, k):
    if k == 'all':
        return train_X, test_X
    selector = SelectKBest(chi2, k=k)
    selector.fit(train_X, train_y)
    train_X = selector.transform(train_X)
    test_X = selector.transform(test_X)
    return train_X, test_X
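# The fit-on-train / transform-on-test pattern above can also be packaged in
# an sklearn Pipeline, which guarantees the selector is refit on training
# folds only during cross-validation. A minimal sketch (the classifier choice
# is arbitrary, and train_X/train_y/test_X are assumed to be defined):
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB

pipe = Pipeline([
    ('select', SelectKBest(chi2, k=100)),
    ('clf', MultinomialNB()),
])
pipe.fit(train_X, train_y)    # selector is fit on training data only
preds = pipe.predict(test_X)  # the same selection is applied to test data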
def _select_features(self, n):
    '''Reduce X to the n best features that represent Y'''
    logging.info('Reducing X from %d features to %d.' % (self.X.shape[1], n))
    if n >= self.X.shape[1]:
        logging.warning('Number of features is greater than/equal to n.')
    else:
        sk = SelectKBest(k=n)
        # Keep the transformed matrices: in the original the results were
        # discarded, so the selection had no effect. Column 0 is assumed to
        # be an identifier and is carried through unchanged.
        self.X = np.hstack((self.X[:, :1], sk.fit_transform(self.X[:, 1:], self.Y[:, 1])))
        # XXX: This will look ahead to cv/test data
        self.X_submit = np.hstack((self.X_submit[:, :1], sk.transform(self.X_submit[:, 1:])))
def SelKBest1Final(X_train, X_test, y_train, y_test):
    fX_train = copy.copy(X_train)
    fX_test = copy.copy(X_test)
    fy_train = copy.copy(y_train)
    fy_test = copy.copy(y_test)

    skb = SelectKBest(f_classif, k=3)
    skb.fit(fX_train, fy_train)
    fX_train = skb.transform(fX_train)
    fX_test = skb.transform(fX_test)

    return fX_train, fX_test, fy_train, fy_test
def SelKBest_base_final(X_train, X_test, y_train, y_test, k=10):
    '''Leave the copying alone here'''
    fX_train = copy.copy(X_train)
    fX_test = copy.copy(X_test)
    fy_train = copy.copy(y_train)
    fy_test = copy.copy(y_test)

    skb = SelectKBest(f_classif, k=k)
    skb.fit(fX_train, fy_train)
    fX_train = skb.transform(fX_train)
    fX_test = skb.transform(fX_test)

    return fX_train, fX_test, fy_train, fy_test
def pre_process_pruned_tfidf(*args):
    x_train = args[0]
    y_train = args[1]
    x_test = args[2]
    y_test = args[3]
    x_train = [x.vectorized for x in x_train]
    x_test = [x.vectorized for x in x_test]
    k = int(round(K_BEST_RATE * len(x_train[0])))
    k_best = SelectKBest(chi2, k=k)
    k_best.fit(x_train, y_train)
    train_transformed = k_best.transform(x_train)
    test_transformed = k_best.transform(x_test)
    return train_transformed, y_train, test_transformed, y_test
def feature_selection(X_train, X_test, y_train, language):
    '''
    Select the best features using a feature selection method (chi-square or PMI),
    or simply return train and test unchanged to keep all features.

    :param X_train: A dictionary with the following structure
             { instance_id: [f1_count, f2_count, ...], ... }
    :param X_test: A dictionary with the following structure
             { instance_id: [f1_count, f2_count, ...], ... }
    :param y_train: A dictionary with the following structure
             { instance_id: sense_id }
    :return: X_train_final, X_test_final
    '''
    if language != 'English':
        X = []  # our list of word counts for an instance
        Y = []  # the corresponding sense id for each instance
        for key, value in X_train.items():
            if y_train[key] != 'U':
                Y.append(y_train[key])
                X.append(value)
        #print(len(X), len(X[0]))

        num_feats = int(0.9 * len(X[0]))  # k must be an integer
        feature_selector = SelectKBest(chi2, k=num_feats)
        feature_selector.fit(X, Y)

        X_train_final = {}
        X_test_final = {}
        for instance_id in X_train:
            # transform expects a 2-D array, hence the wrapping list
            X_train_final[instance_id] = feature_selector.transform([X_train[instance_id]])[0]
            #print(len(X_train[instance_id]))
        for instance_id in X_test:
            X_test_final[instance_id] = feature_selector.transform([X_test[instance_id]])[0]
        return X_train_final, X_test_final
    else:
        # given return statement (no feature selection)
        return X_train, X_test
def itemB():
    train_dataset = load_nebulosa_train()

    # remove missing values
    # print(train_dataset)
    train_dataset = train_dataset[~np.isnan(train_dataset).any(axis=1)]
    train_dataset = train_dataset[:, 2:]
    train_target = train_dataset[:, -1]
    train_dataset = train_dataset[:, :-2]
    # train_dataset = normalize(train_dataset, axis=0)

    test_dataset = load_nebulosa_test()

    # remove missing values
    test_dataset = test_dataset[~np.isnan(test_dataset).any(axis=1)]
    test_dataset = test_dataset[:, 2:]
    test_target = test_dataset[:, -1]
    test_dataset = test_dataset[:, :-2]
    # print(test_dataset)
    # test_dataset = normalize(test_dataset, axis=1)
    # print(test_dataset)

    kbest = SelectKBest(f_classif, k=3).fit(train_dataset, train_target)
    train_dataset = kbest.transform(train_dataset)
    test_dataset = kbest.transform(test_dataset)
    # print(train_dataset)

    n_train_samples = train_dataset.shape[0]
    n_train_features = train_dataset.shape[1]
    # print("Nebulosa Train dataset: %d samples (%d features)" % (n_train_samples, n_train_features))

    n_test_samples = test_dataset.shape[0]
    n_test_features = test_dataset.shape[1]
    # print("Nebulosa Test dataset: %d samples (%d features)" % (n_test_samples, n_test_features))

    nn = KNeighborsClassifier(n_neighbors=1)
    nn.fit(train_dataset, train_target)
    nn_target_pred_test = nn.predict(test_dataset)
    nn_accuracy_test = accuracy_score(test_target, nn_target_pred_test)
    print("NN: Accuracy (Test): %.2f" % (nn_accuracy_test))

    nc = NearestCentroid(metric="euclidean")
    nc.fit(train_dataset, train_target)
    nc_target_pred_test = nc.predict(test_dataset)
    nc_accuracy_test = accuracy_score(test_target, nc_target_pred_test)
    print("Rocchio: Accuracy (Test): %.2f" % (nc_accuracy_test))
def feature_selection_tech(predictors, responses, test_predictors, selectFeatTech):
    if selectFeatTech == 0:
        t = int(predictors.shape[1] * 0.40)
        t = 40  # overrides the 40% heuristic with a fixed feature count
        model = SelectKBest(chi2, k=t).fit(predictors, responses)
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)
    if selectFeatTech == 1:
        randomized_logistic = RandomizedLogisticRegression()
        model = randomized_logistic.fit(predictors, responses)
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)
    return predictors_new, predictors_test_new, indices
def _SelectKBest(self, X, y):
    print('Selecting K Best from whole image')
    from sklearn.feature_selection import SelectKBest, f_classif

    # ### Define the dimension reduction to be used.
    # Here we use a classical univariate feature selection based on F-test,
    # namely Anova. The number of features to be selected is self.k_features.
    feature_selection = SelectKBest(f_classif, k=self.k_features)
    feature_selection.fit(X, y)

    scores = f_classif(X, y)[0]
    mask_k_best = np.zeros(scores.shape, dtype=bool)
    mask_k_best[np.argsort(scores, kind="mergesort")[-self.k_features:]] = 1

    import nibabel
    mask_brain_img = nibabel.load(self.mask_non_brain).get_data()
    mask_brain = mask_brain_img.flatten().astype(bool)

    roi = np.zeros(mask_brain.flatten().shape)
    roi[mask_brain] = mask_k_best
    roi = roi.reshape(mask_brain_img.shape)

    img = nibabel.Nifti1Image(roi, np.eye(4))
    img.to_filename('/tmp/best.nii.gz')

    print('SelectKBest data reduction from: %s' % str(X.shape))
    X = feature_selection.transform(X)
    print('SelectKBest data reduction to: %s' % str(X.shape))
    self.feature_reduction_method = feature_selection
    return X
def classify(clf, chapter_contents_train, y_train, chapter_contents_test, k=20):
    # convert the training data text to features using TF-IDF vectorization
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    X_train = vectorizer.fit_transform(chapter_contents_train)
    # X_train_array = X_train.toarray()
    # print("tfidf vector length: ", len(X_train_array))  # dbg
    # print("X_train_array[0] length: ", len(X_train_array[0]))  # dbg

    # use only the best k features according to chi-sq selection
    ch2 = SelectKBest(chi2, k=k)
    X_train = ch2.fit_transform(X_train, y_train)

    # determine the actual features used after best-k selection
    feature_names = np.asarray(vectorizer.get_feature_names())
    chisq_mask = ch2.get_support()
    features_masks = zip(feature_names, chisq_mask)
    selected_features = [z[0] for z in features_masks if z[1]]

    # train the classifier
    clf.fit(X_train, y_train)

    # convert the test data text into features using the same vectorizer as for training
    X_test = vectorizer.transform(chapter_contents_test)
    X_test = ch2.transform(X_test)

    # obtain binary class predictions for the test set
    preds = clf.predict(X_test)
    return preds, selected_features, clf
class BagOfWords(Feature):

    def name(self):
        return ("BagOfWords with mn=" + str(self._mn) + ", mx=" + str(self._mx)
                + ", analyzertype=" + self._analyzertype
                + ", numFeatures=" + str(self._numFeatures))

    def __init__(self, numFeatures, mn=1, mx=2, analyzertype='word'):
        self._tokenizer = Tokenizer()
        # the original if/else built the same vectorizer in both branches,
        # so it is collapsed into a single call here
        self._vectorizer = TfidfVectorizer(ngram_range=(mn, mx), analyzer=analyzertype)
        self._initialized = False
        self._mn = mn
        self._mx = mx
        self._analyzertype = analyzertype
        self._numFeatures = numFeatures
        self._ch2 = SelectKBest(chi2, k=numFeatures)

    def extract_all(self, sentences, train, labels):
        sentences = self.preprocess_all(sentences)
        if not self._initialized:
            matrix = self._vectorizer.fit_transform(sentences)
            self._initialized = True
        else:
            matrix = self._vectorizer.transform(sentences)
        #print(matrix.todense())
        if self._numFeatures < matrix.shape[1]:
            if train:
                matrix = self._ch2.fit_transform(matrix, labels)
            else:
                matrix = self._ch2.transform(matrix)
        return matrix
def getTfidfData(dataTrain, dataTest, dataHold):
    print(dataTrain.target_names)

    count_vect = CountVectorizer(strip_accents='ascii', stop_words='english',
                                 max_features=len(dataTrain.target) * 2)
    tfidf_transformer = TfidfTransformer(sublinear_tf=True)
    X_counts = count_vect.fit_transform(dataTrain.data)
    X_tfidf = tfidf_transformer.fit_transform(X_counts)
    print(X_tfidf.shape)

    Y_counts = count_vect.transform(dataTest.data)
    Y_tfidf = tfidf_transformer.transform(Y_counts)
    print(Y_tfidf.shape)

    H_counts = count_vect.transform(dataHold.data)
    H_tfidf = tfidf_transformer.transform(H_counts)

    print('feature selection using chi square test', len(dataTrain.target))
    feature_names = count_vect.get_feature_names()

    ch2 = SelectKBest(chi2, k='all')
    X_tfidf = ch2.fit_transform(X_tfidf, dataTrain.target)
    Y_tfidf = ch2.transform(Y_tfidf)
    H_tfidf = ch2.transform(H_tfidf)
    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]

    if feature_names:
        feature_names = numpy.asarray(feature_names)
        print('important features')
        print(feature_names[:10])
    return X_tfidf, Y_tfidf, H_tfidf
def crossvalidate(nrep, nfold, sparseArrayRowNorm, y_all, clf, accuMeasure, selection):
    nsample = sparseArrayRowNorm[0].shape[0]
    scaler = StandardScaler(with_mean=False)
    #scaler = MinMaxScaler()
    testsize = int(nsample / nfold)
    cvIdx = [1] * (nsample - testsize) + [2] * testsize
    random.seed(100)
    aucRes = []
    for nn in range(nrep):
        #print(nn)
        random.shuffle(cvIdx)
        Y_train = y_all[np.where(np.array(cvIdx) == 1)[0]]
        Y_test = y_all[np.where(np.array(cvIdx) == 2)[0]]
        X_train_all = []
        X_test_all = []
        for ii in range(len(sparseArrayRowNorm)):
            varSelector = SelectKBest(f_classif, k=min(int(nsample * 0.7), sparseArrayRowNorm[ii].shape[1]))
            X_train = sparseArrayRowNorm[ii][np.where(np.array(cvIdx) == 1)[0], :]
            X_train = varSelector.fit_transform(X_train, Y_train)
            X_train_all = X_train_all + [X_train]
            X_test = sparseArrayRowNorm[ii][np.where(np.array(cvIdx) == 2)[0], :]
            X_test = varSelector.transform(X_test)
            X_test_all = X_test_all + [X_test]
        X_train = hstack(X_train_all, format='csr')
        X_test = hstack(X_test_all, format='csr')
        del X_train_all
        del X_test_all
        aucRes.append(sigle_fit(clf, X_train, Y_train, X_test, Y_test, accuMeasure))
    print(np.array(aucRes).mean())
    return np.array(aucRes).mean()
def preprocess(article_file, label_file, k):

    features = pickle.load(open(article_file, 'rb'))
    features = np.array(features)

    # transform non-numerical labels (as long as they are hashable and comparable) to numerical labels
    labels = pickle.load(open(label_file, 'rb'))
    le = preprocessing.LabelEncoder()
    le.fit(labels)
    labels = le.transform(labels)
    # print(le.inverse_transform([0]))

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=1,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features)

    # selector : SelectPercentile
    # selector = SelectPercentile(f_classif, percentile=30)
    # selector.fit(features_train_transformed, labels)

    # selector : SelectKBest
    selector = SelectKBest(k=k)
    selector.fit(features_train_transformed, labels)

    # selector : chi2
    # selector = SelectPercentile(score_func=chi2)
    # selector.fit(features_train_transformed, labels)

    features_train_transformed = selector.transform(features_train_transformed).toarray()

    return features_train_transformed, labels, vectorizer, selector, le, features
def string_selection():
    vectorizer = CountVectorizer(decode_error='ignore')
    ch2 = SelectKBest(chi2, k=100)

    # get data
    train_data, permission_list = db_tool.get_new_train_data()
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        train_data['string-data'], train_data['target'], test_size=0.2, random_state=1)

    # feature extraction
    x_train = vectorizer.fit_transform(x_train)
    feature_names = vectorizer.get_feature_names()

    x_train = ch2.fit_transform(x_train, y_train)
    feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
    print(ch2.scores_)
    print(ch2.get_support(indices=True))
    print(feature_names)

    x_test = vectorizer.transform(x_test)
    x_test = ch2.transform(x_test)

    # build the model
    model = MultinomialNB().fit(x_train, y_train)

    # validate the model
    predicted = model.predict(x_test)
    print(metrics.accuracy_score(y_test, predicted))
def univariate_features_selection(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
    selector = SelectKBest(chi2, k=10)
    selector = selector.fit(x, y)
    selected_features = self.features[selector.get_support()]
    print(selected_features)
    x = selector.transform(x)
    return x
def do_training():
    global X_train, X_test, feature_names, ch2
    print("Extracting features from the training data using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train_data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.25,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train_data)
    duration = time() - t0
    #print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test data using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test_data)
    duration = time() - t0
    #print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None
    else:
        feature_names = vectorizer.get_feature_names()

    if True:  # opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" % 20000)
        t0 = time()
        ch2 = SelectKBest(chi2, k=20000)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        if feature_names:
            # keep selected feature names
            feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
        print("done in %fs" % (time() - t0))
        print()

    if feature_names:
        feature_names = np.asarray(feature_names)

    results = []
    #for penalty in ["l2", "l1"]:
    penalty = 'l2'
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    clf = LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)
    results.append(benchmark(clf))

    joblib.dump(vectorizer, 'vectorizer.pkl', compress=9)
    joblib.dump(ch2, 'feature_selector.pkl', compress=9)
    joblib.dump(clf, 'linearsvc_classifier.pkl', compress=9)
def gridSearchCV_test():
    ch2 = SelectKBest(chi2, k=20)

    # get data
    train_data = db_tool.get_new_train_data()
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        train_data['permission-data'], train_data['target'], test_size=0.2, random_state=1)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)

    param_grid = [
        {'alpha': [1, 0.4, 10], 'fit_prior': [True, False]},
        {'alpha': [0, 9, 0.4], 'fit_prior': [True]}
    ]
    clf = grid_search.GridSearchCV(MultinomialNB(), param_grid)

    # build the model
    clf.fit(X_train, y_train)
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))

    predicted = clf.predict(X_test)
    print(metrics.accuracy_score(y_test, predicted))
    print(metrics.classification_report(y_test, predicted))
def try_all_k_best(max=13):
    for k in range(1, max + 1):
        data = featureFormat(my_dataset, features_list, sort_keys=True)
        labels, features = targetFeatureSplit(data)
        features_train, features_test, labels_train, labels_test = \
            train_test_split(features, labels, test_size=0.3, random_state=42)

        selector = SelectKBest(k=k)
        features_train = selector.fit_transform(features_train, labels_train)
        features_test = selector.transform(features_test)
        choices.append(selector.transform(np.array(features_list[1:]).reshape(1, -1)))

        lr_cv = LogisticRegressionCV()
        lr_cv.fit(features_train, labels_train)
        pred.append(lr_cv.predict(features_test))
        acc.append(accuracy_score(labels_test, pred[k-1]))
        prec.append(precision_score(labels_test, pred[k-1]))
        reca.append(recall_score(labels_test, pred[k-1]))
def inner(*args, **kwargs):
    X, y = func(*args, **kwargs)
    global q4_slct
    if q4_slct is None:
        q4_slct = SelectKBest(k=200).fit(X, y)
    X = q4_slct.transform(X)
    return X, y
def tfidf_classify(user):
    train_set, y, src, test_set = extract_data(user.id)
    if not train_set:
        return []

    # Analyse using tf-idf
    # vector = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
    vector = HashingVectorizer(n_features=1000, non_negative=True, stop_words='english')

    # List of topics extracted from text
    # feature_names = vector.get_feature_names()
    # print(feature_names)

    xtrain = vector.transform(train_set)
    xtest = vector.transform(test_set)

    # Select features using chi-square
    ch2 = SelectKBest(chi2)
    xtrain = ch2.fit_transform(xtrain, y)
    xtest = ch2.transform(xtest)

    # Predict testing set
    # classifier = DecisionTreeClassifier()
    classifier = KNeighborsClassifier(n_neighbors=4)
    classifier = classifier.fit(xtrain, y)
    result = classifier.predict(xtest)

    final = []
    for i in range(len(result)):
        if result[i]:
            final.append(src[i])
    print(len(final))
    return final
def select_features(X_train, y_train, X_test):
    fs = SelectKBest(score_func=chi2, k='all')
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
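# Because the function above fits the selector with k='all', the returned fs
# object can be used to rank every feature by its chi-squared score. A short
# sketch, assuming X_train/y_train/X_test are already defined:
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
ranked = sorted(enumerate(fs.scores_), key=lambda p: p[1], reverse=True)
for idx, score in ranked[:10]:
    print("feature %d: chi2 = %.2f" % (idx, score))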
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

chi2_selector = SelectKBest(chi2, k=3000000)
X_kbest_train = chi2_selector.fit_transform(x_train_normalized, y_train)

from sklearn.svm import SVC
clf = SVC(C=1000, kernel='linear', random_state=0)
clf.fit(X_kbest_train, np.ravel(y_train))

######################## validation
x_val_counts = count_vect.transform(X_val)
x_val_tfidf = tfidf_transformer.transform(x_val_counts)
x_val_normalized = normalizer_tranformer.transform(x_val_tfidf)
X_kbest_val = chi2_selector.transform(x_val_normalized)
val_predict_sck_svm = clf.predict(X_kbest_val)
np_y_val_predict_sck_svm = np.array(val_predict_sck_svm).reshape(len(val_predict_sck_svm), 1)

from sklearn.metrics import accuracy_score
accuracy_score(y_val, np_y_val_predict_sck_svm)

####################### final train before submission
count_vect_final = CountVectorizer(ngram_range=(1, 3)).fit(training_x)
x_train_counts_final = count_vect_final.transform(training_x)
tfidf_transformer_final = TfidfTransformer().fit(x_train_counts_final)
x_train_tfidf_final = tfidf_transformer_final.transform(x_train_counts_final)
def main():
    corpus, labels = get_data()  # Get data
    print("data number:", len(labels))
    train_corpus, development_corpus, test_corpus, train_labels, development_labels, test_labels = prepare_datasets(
        corpus, labels)  # Divided into training set, development set and test set
    print('train_corpus number:', len(train_corpus))
    print('development_corpus number:', len(development_corpus))
    print('test_corpus number:', len(test_corpus))

    # FeatureUnion is used for feature extraction.
    features_select = FeatureUnion([
        ('tf', CountVectorizer(ngram_range=(1, 1))),    # term frequency
        ('bow', CountVectorizer(ngram_range=(2, 2))),   # Bag-of-words model (every two words)
        ('tfidf', TfidfVectorizer())                    # TF-IDF
    ])

    # Feature extraction of all data sets
    train_feature = features_select.fit_transform(train_corpus)
    development_feature = features_select.transform(development_corpus)
    test_feature = features_select.transform(test_corpus)

    # SVM Model
    svm = SGDClassifier(loss='hinge', n_iter_no_change=100)

    # Feature selection using development set
    k_list = [500, 1000, 2000, 5000]
    for k in k_list:
        fs_sentanalysis = SelectKBest(chi2, k=k).fit(train_feature, train_labels)
        new_train_feature = fs_sentanalysis.transform(train_feature)
        new_development_feature = fs_sentanalysis.transform(development_feature)
        new_test_feature = fs_sentanalysis.transform(test_feature)

        print('k=', k, "SVM model")
        svm_model = svm.fit(new_train_feature, train_labels)
        label_pred = svm_model.predict(new_development_feature)
        score = classification_report(development_labels, label_pred,
                                      target_names=['business', 'entertainment',
                                                    'politics', 'sprot', 'tech'])
        print(score)

    # test model
    test_k = int(input("Please input the K value with the best training effect:"))
    print('The score of the model on the test set:')
    fs_sentanalysis = SelectKBest(chi2, k=test_k).fit(train_feature, train_labels)
    new_train_feature = fs_sentanalysis.transform(train_feature)
    new_test_feature = fs_sentanalysis.transform(test_feature)
    svm_model = svm.fit(new_train_feature, train_labels)
    label_pred = svm_model.predict(new_test_feature)
    score_1 = classification_report(test_labels, label_pred,
                                    target_names=['business', 'entertainment',
                                                  'politics', 'sprot', 'tech'])
    print(score_1)
class Tfidf_transform(pl.Feature_transform):
    """create TF-IDF (term frequency - inverse document frequency) features.

    Can use a chi-squared test to limit features. Assumes a string based input
    feature that can be split. Uses scikit-learn based transformers internally.

    Args:
        min_df (int): min document frequency (for sklearn vectorizer)
        max_df (float): max document frequency (for sklearn vectorizer)
        select_features (bool): use chi-squared test to select features
        topn_features (int): keep top features from chi-squared test
        stop_words (str): stop words (for sklearn vectorizer)
        target_feature (str): target feature for chi-squared test
    """

    def __init__(self, min_df=10, max_df=0.7, select_features=False,
                 topn_features=50000, stop_words=None, target_feature=None):
        super(Tfidf_transform, self).__init__()
        self.min_df = min_df
        self.max_df = max_df
        self.select_features = select_features
        self.topn_features = topn_features
        self.stop_words = stop_words
        self.target_feature = target_feature
        self.ch2 = ""
        self.feature_names_support = []

    def getTokens(self, j):
        """basic method to get "document" string from feature"""
        if self.input_feature in j:
            if isinstance(j[self.input_feature], list):
                return " ".join([i if isinstance(i, str) else str(i)
                                 for i in j[self.input_feature]])
            elif isinstance(j[self.input_feature], str):
                return j[self.input_feature]
            else:
                return str(j[self.input_feature])
        else:
            return ""

    def get_models(self):
        return super(Tfidf_transform, self).get_models() + [
            (self.min_df, self.max_df, self.select_features,
             self.topn_features, self.stop_words, self.target_feature),
            self.vectorizer, self.tfidf_transformer, self.ch2,
            self.fnames, self.feature_names_support]

    def set_models(self, models):
        models = super(Tfidf_transform, self).set_models(models)
        (self.min_df, self.max_df, self.select_features, self.topn_features,
         self.stop_words, self.target_feature) = models[0]
        self.vectorizer = models[1]
        self.tfidf_transformer = models[2]
        self.ch2 = models[3]
        self.fnames = models[4]
        self.feature_names_support = models[5]

    def fit(self, objs):
        """fit using sklearn transforms vectorizer->tfidf->(optional) chi-squared test"""
        docs = []
        target = []
        self.vectorizer = CountVectorizer(min_df=self.min_df, max_df=self.max_df,
                                          stop_words=self.stop_words)
        self.tfidf_transformer = TfidfTransformer()
        for j in objs:
            docs.append(self.getTokens(j))
            if self.target_feature:
                target.append(int(j[self.target_feature]))
        counts = self.vectorizer.fit_transform(docs)
        self.tfidf = self.tfidf_transformer.fit_transform(counts)
        self.fnames = self.vectorizer.get_feature_names()
        self.logger.info("%s base tfidf features %d", self.get_log_prefix(), len(self.fnames))
        if self.select_features:
            self.ch2 = SelectKBest(chi2, k=self.topn_features)
            self.ch2.fit_transform(self.tfidf, target)
            self.feature_names_support = set(
                [self.fnames[i] for i in self.ch2.get_support(indices=True)])
            self.logger.info("%s selected tfidf features %d", self.get_log_prefix(),
                             len(self.feature_names_support))

    def transform(self, j):
        """transform features into final tfidf features"""
        docs = []
        docs.append(self.getTokens(j))
        counts = self.vectorizer.transform(docs)
        self.tfidf = self.tfidf_transformer.transform(counts)
        # feature selection is applied via feature_names_support below
        doc_tfidf = {}
        for (col, val) in zip(self.tfidf[0].indices, self.tfidf[0].data):
            fname = self.fnames[col]
            if self.select_features:
                if fname in self.feature_names_support:
                    doc_tfidf[fname] = val
            else:
                doc_tfidf[fname] = val
        j[self.output_feature] = doc_tfidf
        return j
import numpy as np
import pandas as pd
import scipy.stats as ss
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectKBest, RFE, SelectFromModel

df = pd.DataFrame({
    'A': ss.norm.rvs(size=10),
    'B': ss.norm.rvs(size=10),
    'C': ss.norm.rvs(size=10),
    'D': np.random.randint(low=0, high=2, size=10)
})
print(df)

x = df.loc[:, ['A', 'B', 'C']]
y = df.loc[:, 'D']

# Filter approach: a scoring function can be specified; the default is the
# ANOVA F-test
skb = SelectKBest(k=2)
skb.fit(x, y)
print(skb.transform(x))

# Wrapper approach: specify the regression model, the number of features to
# keep, and how many features to drop per iteration
rfe = RFE(estimator=SVR(kernel='linear'), n_features_to_select=2, step=1)
print(rfe.fit_transform(x, y))

# Embedded approach: threshold is the feature-importance value below which a
# feature is dropped
sfm = SelectFromModel(estimator=DecisionTreeRegressor(), threshold=0.1)
print(sfm.fit_transform(x, y))
normalization = "minmax" bal = "smote" df = datapp.preprocess(data, to_clf, normalization=normalization, ignore_classes=categoric, as_df=True) df = data y: np.ndarray = df[to_clf].values X: np.ndarray = df.drop(to_clf, axis=1).values #%% select = SelectKBest(f_classif, k=10).fit(X, y) ind = select.get_support(indices=True) col = df.columns[ind].tolist() X_new = select.transform(X) dfk = pd.DataFrame(X_new, columns=col) #%% bins = list(range(3, 12)) qdfs = [] cdfs = [] for b in bins: qdfs.append(eval.cut(dfk, b, ['class', 'id', 'gender'], cut="qcut")) cdfs.append(eval.cut(dfk, b, ['class', 'id', 'gender'], cut="cut")) #%% dummy_qdfs = [] dummy_cdfs = [] for i in range(len(bins)): dummy_qdfs.append(eval.dummy(qdfs[i], ['class', 'id', 'gender'])) dummy_cdfs.append(eval.dummy(cdfs[i], ['class', 'id', 'gender'])) #%%
print("n_samples: %d, n_features: %d" % X_test.shape) print() # mapping from integer feature name to original token string if opts.use_hashing: feature_names = None else: feature_names = vectorizer.get_feature_names() if opts.select_chi2: print("Extracting %d best features by a chi-squared test" % opts.select_chi2) t0 = time() ch2 = SelectKBest(chi2, k=opts.select_chi2) X_train = ch2.fit_transform(X_train, y_train) X_test = ch2.transform(X_test) if feature_names: # keep selected feature names feature_names = [ feature_names[i] for i in ch2.get_support(indices=True) ] print("done in %fs" % (time() - t0)) print() if feature_names: feature_names = np.asarray(feature_names) def trim(s): """Trim string to fit on terminal (assuming 80-column display)""" return s if len(s) <= 80 else s[:77] + "..."
# continuation of a LogisticRegression(...) constructor call from the
# preceding (omitted) lines
    intercept_scaling=1, class_weight=None, random_state=None,
    solver='liblinear', max_iter=5000, multi_class='ovr', verbose=1,
    warm_start=False, n_jobs=256)

# create lr stacking features
# gender
from sklearn.feature_selection import SelectKBest, chi2, f_classif
ch2 = SelectKBest(chi2, k=100000)
x_train = ch2.fit_transform(X_train_tf, train.gender)
x_test = ch2.transform(X_test_tf)

print('create gender prob features')
random_seed = 2016
x = x_train
y = [1 if i == 1 else 0 for i in train.gender]
skf = StratifiedKFold(y, n_folds=5, shuffle=True)
new_train = np.zeros((100000, 1))
new_test = np.zeros((100000, 1))

for i, (trainid, valid) in enumerate(skf):
    print('fold' + str(i))
    train_x = x_train[trainid]
    train_y = train.gender[trainid]
    val_x = x_train[valid]
# tail of a row-level helper (e.g. def search(row):) that maps the one-hot
# card-level columns to an ordinal code
    if row['MemberCardLevel_Medium'] == 1:
        return 1
    elif row['MemberCardLevel_High'] == 1:
        return 2
    else:
        return 0

# read data
data = []
with open('dataWithLable.pkl', 'rb') as f:
    data = pickle.load(f)

# merge the 3 card-level types
X = data
X["cardlevel"] = X.apply(search, axis=1)
X = data.drop(['is_Churn', 'UUID', 'OnlineMemberId', 'MemberCardLevel_Low',
               'MemberCardLevel_Medium', 'MemberCardLevel_High'], axis=1)
y = data['is_Churn']

# the k highest-scoring features
selector = SelectKBest(k=5)
selector.fit(X, y)
GetSupport = selector.get_support(True)
TransX = selector.transform(X)
Scores = selector.scores_
print(Scores)      # scores for all features, in the same order as the columns
print(GetSupport)  # indices of the selected features

# the selected features
for i in range(0, 27):
    if selector.get_support()[i]:
        print(X.columns.values.tolist()[i])
    train_attrs.append(attr)
    cnt += 1
del train_attrs[0]

# get y_train from train_attrs
y_train = [[float(attr)] for attr in train_attrs]

# chi-2 select features
print("start feature selection")
if SELECTOR == 0:
    selector = SelectKBest(chi2, k=K_FOR_BEST)
else:
    selector = SelectPercentile(score_func=chi2, percentile=SELECT_PERCENTILE)
selector.fit(x_train, y_train)
new_x_train = selector.transform(x_train)
print("feature selection done")

# convert y_train to svm-fit shape
y_train = [attr[0] for attr in y_train]

new_x_train, new_x_test, new_y_train, new_y_test = cross_validation.train_test_split(
    new_x_train, y_train, test_size=0.4, random_state=0)

# regression
# clf = svm.SVR(kernel='rbf', degree=3, gamma=1.9, coef0=0.0, tol=0.001, \
#     C=0.13, epsilon=0.1, shrinking=True, probability=False, cache_size=700, \
#     verbose=False, max_iter=-1, random_state=None)
clf = LinearRegression()
clf = clf.fit(new_x_train, new_y_train)

# cross validation
#'ONS', 'year', 'admitted', 'All_Under_16', 'All_16_24', 'All_25_34',
#'All_35_44', 'All_45_54', 'All_55_64', 'All_65_74', 'All_75_Over',
#'income_m', 'income_f', 'ft', 'pt', 'Food', 'all_ethnic',

non_food_features = [
    'income', 'cycling', 'fp_rate', 'White',
    'Gypsy / Traveller / Irish Traveller', 'Mixed / Multiple Ethnic Groups',
    'Asian / Asian British: Indian', 'Asian / Asian British: Pakistani',
    'Asian / Asian British: Bangladeshi', 'Asian / Asian British: Chinese',
    'Asian / Asian British: Other Asian',
    'Black / African / Caribbean / Black British', 'Other Ethnic Group'
]

selector = SelectKBest(f_regression, k=7)  # Regression selector
selector.fit(df[non_food_features], target)
selector.transform(df[non_food_features])

weight = -np.log10(selector.pvalues_)

plt.bar(range(len(non_food_features)), weight)
plt.xticks(range(len(non_food_features)), non_food_features, rotation="vertical")
plt.show()

food_feats1 = [
    'Bread, rice and cereals', 'Pasta products', 'Buns, cakes, biscuits etc',
    'Pastry (savoury)', 'Beef (fresh, chilled or frozen)',
    'Pork (fresh, chilled or frozen)', 'Lamb (fresh, chilled or frozen)',
    'Poultry (fresh, chilled or frozen)', 'Bacon and ham',
    'Other meat and meat preparations', 'Fish and fish products', 'Milk',
    'Cheese and curd', 'Eggs', 'Other milk products', 'Butter',
    'Margarine, other vegetable fats and peanut butter',
import pandas as pd
from sklearn.feature_selection import SelectKBest, SelectPercentile

# Candidate feature counts we could try for selection
dev = [6, 9, 12, 15, 18, 20, 25, 30]

test = pd.read_csv("../input/test_data_processed.csv")
train = pd.read_csv("../input/train_data_processed.csv")
target = pd.read_csv("../input/target_data_processed.csv")

# sp_25 = SelectPercentile(percentile=25)  # Can be used
# sp_50 = SelectPercentile(percentile=50)  # Can be used
kbest = SelectKBest(k=100)
# RFE can also be used but is generally very time consuming when the feature size is high

kbest.fit(train, target)
train_k = kbest.transform(train)
test_k = kbest.transform(test)
print(train_k.shape)

filename_train_k = "../input/train_k100" + ".csv"
filename_test_k = "../input/test_k100" + ".csv"  # was train_k100, which overwrote the train file
pd.DataFrame(train_k).to_csv(filename_train_k, index=False)
pd.DataFrame(test_k).to_csv(filename_test_k, index=False)
trans_features_test = vectorizer.transform(features_test)

myprint('Vectorized features')
myprint('> ' + str(trans_features_train.shape[0]) + ' abstracts in train set')
myprint('> ' + str(trans_features_train.shape[1]) + ' words per abstract in train set')
myprint('> ' + str(trans_features_test.shape[0]) + ' abstracts in test set')
myprint('> ' + str(trans_features_test.shape[1]) + ' words per abstract in test set')

### Reduce feature dimensionality
# Set a sensible upper bound on the number of words that is
# required to capture the difference between the two categories
feature_dim = 1000

selector = SelectKBest(chi2, k=feature_dim)
selector.fit(trans_features_train, labels_train)
trans_features_train = selector.transform(trans_features_train).toarray()
trans_features_test = selector.transform(trans_features_test).toarray()

myprint('Reduced dimensionality')
myprint('> ' + str(trans_features_train.shape[1]) + ' words per abstract')

### Classification with support vector machine
# Pro: effective in high-dimensional feature spaces (i.e. large dictionary)
#
# Idea:
#   Given training vectors x_i in R^p for i=1,...,n in two classes
#   and a vector y in {-1,1}^n, the goal of SVM is to find w in R^p
#   and b in R, such that the prediction sign(w^T.phi(x)+b)
#   is correct for most samples
#
# Problem:
#   min_{w,b,z} 1/2 w^T w + C sum_i z_i
# get words selected as features
feature_names = vectorizer.get_feature_names()

n_train = len(X_train)
n_test = len(X_test)
X_train = X_train[:n_train]
y_train = y_train[:n_train]
X_test = X_test[:n_test]
y_test = y_test[:n_test]

from sklearn.feature_selection import SelectKBest, chi2

# feature selection with chi2 statistics
ch2 = SelectKBest(chi2, k=7000)
X_train_new = ch2.fit_transform(X_train, y_train)
X_test_new = ch2.transform(X_test)
X_train_new.shape
X_test_new.shape
feature_names_ch2 = [feature_names[i] for i in ch2.get_support(indices=True)]

# first we do a grid search over the hyperparameters of the SVM classifier
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# the grids of parameters
parameters = {'kernel': ['rbf'], 'C': [10, 100], 'gamma': [0.5, 0.75, 1]}
svc = svm.SVC(gamma="scale")  # define classifier
clf = GridSearchCV(svc, parameters, cv=5, n_jobs=4, verbose=True)
clf.fit(X_train, y_train)
    y_cur = train_finalMerged[['Activity']].values.ravel()
    X_cur = train_finalMerged[['MeanSM', 'StDevSM', 'MdnSM', 'belowPer25SM', 'belowPer75SM',
                               'TotPower_0.3_15', 'FirsDomFre_0.3_15', 'PowFirsDomFre_0.3_15',
                               'SecDomFre_0.3_15', 'PowSecDomFre_0.3_15', 'FirsDomFre_0.6_2.5',
                               'PowFirsDomFre_0.6_2.5', 'FirsDomFre_per_TotPower_0.3_15',
                               'MeanSM_s2', 'StDevSM_s2', 'MdnSM_s2', 'belowPer25SM_s2',
                               'belowPer75SM_s2', 'TotPower_0.3_15_s2', 'FirsDomFre_0.3_15_s2',
                               'PowFirsDomFre_0.3_15_s2', 'SecDomFre_0.3_15_s2',
                               'PowSecDomFre_0.3_15_s2', 'FirsDomFre_0.6_2.5_s2',
                               'PowFirsDomFre_0.6_2.5_s2', 'FirsDomFre_per_TotPower_0.3_15_s2']]
    if First:
        X_train = X_cur
        y_train = y_cur
        First = False
    else:
        X_train = np.concatenate((X_train, X_cur), axis=0)
        y_train = np.concatenate((y_train, y_cur), axis=0)

# print(X_test.shape, y_test.shape, X_train.shape, y_train.shape)
# print(siteStr, X_test)

# feature selection
X_train_red = selector.fit_transform(X_train, y_train)
X_test_red = selector.transform(X_test)

# rfc = RandomForestClassifier(n_jobs=-1, n_estimators=35, criterion='gini', oob_score=False)
# rfc = OneVsRestClassifier(SVC(kernel='rbf', C=100, gamma=0.1))
# rfc = SVC(kernel='rbf', gamma=0.7, C=1, random_state=10)
rfc = KNeighborsClassifier(n_neighbors=11, algorithm='auto')
rfc.fit(X_train_red, y_train)
y_pred = rfc.predict(X_test_red)

cm = confusion_matrix(y_test, y_pred, labels=activities)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
cm_comb[count, :, :] = cm_normalized

pre_rec_fscore = precision_recall_fscore_support(y_test, y_pred, average=None, labels=activities)
tmp[count, 0, :], tmp[count, 1, :], tmp[count, 2, :] = pre_rec_fscore[0], pre_rec_fscore[1], pre_rec_fscore[2]
count = count + 1
class FBCSP():
    '''Two- or multi-class FBCSP (filter-bank common spatial patterns)'''

    def __init__(self, bands, smp_freq, num_class=2, order=5, n_components=4, n_features=8):
        self.bands = bands
        self.smp_freq = smp_freq
        self.n_components = n_components
        self.n_features = n_features
        self.cov_type = 'epoch'
        self.order = order
        self.n_classes = num_class
        self.classes = None
        if self.n_classes == 2:
            self.component_order = 'alternate'
        else:
            self.component_order = 'mutual_info'
        self.norm_trace = False
        self.spatial_transform = [None for i in range(len(bands))]
        self.selector = SelectKBest(score_func=mutual_info_classif, k=self.n_features)

    def __covariance(self, X):
        # covariance = np.dot(X, X.T) / X.shape[0]
        # return covariance
        return np.dot(X, X.T) / np.trace(np.dot(X, X.T))

    # spatialFilter returns the spatial filter for mean covariance matrices of two classes
    def __get_spatial_filter(self, covs, sample_weights):
        eigen_vectors, eigen_values = self.__decompose_covs(covs, sample_weights)
        # CSP requires the eigenvalues and eigenvectors be sorted in descending order
        sorted_index = self.__order_components(covs, sample_weights, eigen_vectors, eigen_values)
        eigen_vectors = eigen_vectors[:, sorted_index]
        sp_filters = eigen_vectors.T
        pick_filters = sp_filters[:self.n_components]
        return pick_filters, sp_filters

    def __decompose_covs(self, covs, sample_weights):
        n_classes = len(covs)
        if n_classes == 2:
            eigen_values, eigen_vectors = linalg.eigh(covs[0], covs.sum(0))
        else:
            # The multiclass case is adapted from
            # http://github.com/alexandrebarachant/pyRiemann
            eigen_vectors, D = self.__ajd_pham(covs)
            eigen_vectors = self.__normalize_eigenvectors(eigen_vectors.T, covs, sample_weights)
            eigen_values = None
        return eigen_vectors, eigen_values

    def __calculate_covariance_matrices(self, data, y_class):
        '''The data is in the form of samples x channels x sampled_time_points'''
        if len(data.shape) != 3:
            raise Exception('Dimension does not match!')
        n_samples, n_channels, n_points = data.shape
        if self.cov_type == 'concat':
            cov_estimator = self.__concat_cov
        elif self.cov_type == 'epoch':
            cov_estimator = self.__epoch_cov
        covs = []
        sample_weights = []
        self.classes = np.unique(y_class)
        n_classes = len(self.classes)
        for id_class in self.classes:
            cov, weight = cov_estimator(data[y_class == id_class])
            if self.norm_trace:
                cov /= np.trace(cov)
            covs.append(cov)
            sample_weights.append(weight)
        return np.stack(covs), np.array(sample_weights)

    def __concat_cov(self, X_class):
        '''Concatenate epochs before computing the covariance.
        The data is in the form of samples x channels x sampled_time_points.'''
        n_samples, n_channels, n_points = X_class.shape
        X_class = np.transpose(X_class, [1, 0, 2])
        X_class = X_class.reshape(n_channels, -1)
        # The covariance with norm trace is used to provide better results
        cov = self.__covariance(X_class)
        # cov = np.cov(X_class)
        weight = X_class.shape[0]
        return cov, weight

    def __epoch_cov(self, X_class):
        '''Mean of per-epoch covariances.
        The data is in the form of samples x channels x sampled_time_points.'''
        cov = sum(self.__covariance(data) for data in X_class)
        cov /= len(X_class)
        weight = len(X_class)
        return cov, weight

    def __compute_mutual_info(self, covs, sample_weights, eigen_vectors):
        class_probas = sample_weights / sample_weights.sum()
        mutual_info = []
        for jj in range(eigen_vectors.shape[1]):
            aa, bb = 0, 0
            for (cov, prob) in zip(covs, class_probas):
                tmp = np.dot(np.dot(eigen_vectors[:, jj].T, cov), eigen_vectors[:, jj])
                aa += prob * np.log(np.sqrt(tmp))
                bb += prob * (tmp ** 2 - 1)
            mi = - (aa + (3.0 / 16) * (bb ** 2))
            mutual_info.append(mi)
        return mutual_info

    def __normalize_eigenvectors(self, eigen_vectors, covs, sample_weights):
        # Here we apply a euclidean mean. See pyRiemann for other metrics
        mean_cov = np.average(covs, axis=0, weights=sample_weights)
        for ii in range(eigen_vectors.shape[1]):
            tmp = np.dot(np.dot(eigen_vectors[:, ii].T, mean_cov), eigen_vectors[:, ii])
            eigen_vectors[:, ii] /= np.sqrt(tmp)
        return eigen_vectors

    def __order_components(self, covs, sample_weights, eigen_vectors, eigen_values):
        n_classes = len(self.classes)
        ix = []
        if self.component_order == 'mutual_info' and n_classes > 2:
            mutual_info = self.__compute_mutual_info(covs, sample_weights, eigen_vectors)
            ix = np.argsort(mutual_info)[::-1]
        elif self.component_order == 'mutual_info' and n_classes == 2:
            ix = np.argsort(np.abs(eigen_values - 0.5))[::-1]
        elif self.component_order == 'alternate' and n_classes == 2:
            i = np.argsort(eigen_values)
            ix = np.empty_like(i)
            ix[1::2] = i[:len(i) // 2]
            ix[0::2] = i[len(i) // 2:][::-1]
        return ix

    def __ajd_pham(self, X, eps=1e-6, max_iter=15):
        '''Approximate joint diagonalization based on Pham's algorithm.

        This is a direct implementation of Pham's AJD algorithm [1].

        Parameters
        ----------
        X : ndarray, shape (n_epochs, n_channels, n_channels)
            A set of covariance matrices to diagonalize.
        eps : float, default 1e-6
            The tolerance for the stopping criterion.
        max_iter : int, default 15
            The maximum number of iterations to reach convergence.

        Returns
        -------
        V : ndarray, shape (n_channels, n_channels)
            The diagonalizer.
        D : ndarray, shape (n_epochs, n_channels, n_channels)
            The set of quasi-diagonal matrices.

        References
        ----------
        .. [1] Pham, Dinh Tuan. 'Joint approximate diagonalization of positive
           definite Hermitian matrices.' SIAM Journal on Matrix Analysis and
           Applications 22, no. 4 (2001): 1136-1152.
        '''
        # Adapted from http://github.com/alexandrebarachant/pyRiemann
        n_epochs = X.shape[0]

        # Reshape input matrix
        A = np.concatenate(X, axis=0).T

        # Init variables
        n_times, n_m = A.shape
        V = np.eye(n_times)
        epsilon = n_times * (n_times - 1) * eps

        for it in range(max_iter):
            decr = 0
            for ii in range(1, n_times):
                for jj in range(ii):
                    Ii = np.arange(ii, n_m, n_times)
                    Ij = np.arange(jj, n_m, n_times)

                    c1 = A[ii, Ii]
                    c2 = A[jj, Ij]

                    g12 = np.mean(A[ii, Ij] / c1)
                    g21 = np.mean(A[ii, Ij] / c2)

                    omega21 = np.mean(c1 / c2)
                    omega12 = np.mean(c2 / c1)
                    omega = np.sqrt(omega12 * omega21)

                    tmp = np.sqrt(omega21 / omega12)
                    tmp1 = (tmp * g12 + g21) / (omega + 1)
                    tmp2 = (tmp * g12 - g21) / max(omega - 1, 1e-9)

                    h12 = tmp1 + tmp2
                    h21 = np.conj((tmp1 - tmp2) / tmp)

                    decr += n_epochs * (g12 * np.conj(h12) + g21 * h21) / 2.0

                    tmp = 1 + 1.j * 0.5 * np.imag(h12 * h21)
                    tmp = np.real(tmp + np.sqrt(tmp ** 2 - h12 * h21))
                    tau = np.array([[1, -h12 / tmp], [-h21 / tmp, 1]])

                    A[[ii, jj], :] = np.dot(tau, A[[ii, jj], :])
                    tmp = np.c_[A[:, Ii], A[:, Ij]]
                    tmp = np.reshape(tmp, (n_times * n_epochs, 2), order='F')
                    tmp = np.dot(tmp, tau.T)

                    tmp = np.reshape(tmp, (n_times, n_epochs * 2), order='F')
                    A[:, Ii] = tmp[:, :n_epochs]
                    A[:, Ij] = tmp[:, n_epochs:]
                    V[[ii, jj], :] = np.dot(tau, V[[ii, jj], :])
            if decr < epsilon:
                break
        D = np.reshape(A, (n_times, -1, n_times)).transpose(1, 0, 2)
        return V, D

    def __get_log_var_feats(self, spatial_filt, data):
        data_dot = np.dot(spatial_filt, data)  # spatially filtered signals
        data_var = np.var(data_dot, axis=1)
        # data_var = (data_dot ** 2).mean(axis=1)
        # We use log(var) instead of averaging
        data_log = np.log(data_var)
        return data_log

    def fit_transform(self, X, y):
        if len(X.shape) != 3:
            raise Exception('Dimension does not match!')
        n_samples = X.shape[0]
        X_transformed_var = np.zeros((len(self.bands), n_samples, self.n_components))
        for id_band, freq_band in enumerate(self.bands):
            # Compute band-pass filter of EEG signals
            X_filtered = butter_bandpass_filter(X, freq_band[0], freq_band[1], self.smp_freq, self.order)
            # Calculating covariance only on the training set
            covs, sample_weights = self.__calculate_covariance_matrices(X_filtered, y)
            spf_sel, spf_org = self.__get_spatial_filter(covs, sample_weights)
            self.spatial_transform[id_band] = spf_sel
            # Calculate the variance of spatially filtered signals and then compute the logarithm
            for sample in range(X_filtered.shape[0]):
                X_transformed_var[id_band, sample] = self.__get_log_var_feats(
                    self.spatial_transform[id_band], X_filtered[sample, :, :])
        X_transformed_var = np.swapaxes(X_transformed_var, 0, 1)
        X_transformed_var = X_transformed_var.reshape(n_samples, -1)
        # select k best
        X_fbcsp = self.selector.fit_transform(X_transformed_var, y)
        return X_fbcsp

    def transform(self, X):
        if len(X.shape) != 3:
            raise Exception('Dimension does not match!')
        n_samples = X.shape[0]
        X_transformed_var = np.zeros((len(self.bands), n_samples, self.n_components))
        for id_band, freq_band in enumerate(self.bands):
            X_filtered = butter_bandpass_filter(X, freq_band[0], freq_band[1], self.smp_freq, self.order)
            # Calculate the variance of spatially filtered signals and then compute the logarithm
            for sample_te in range(X_filtered.shape[0]):
                X_transformed_var[id_band, sample_te] = self.__get_log_var_feats(
                    self.spatial_transform[id_band], X_filtered[sample_te, :, :])
        X_transformed_var = np.swapaxes(X_transformed_var, 0, 1)
        X_transformed_var = X_transformed_var.reshape(n_samples, -1)
        # select k best
        X_fbcsp = self.selector.transform(X_transformed_var)
        return X_fbcsp
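# A usage sketch for the FBCSP class above on synthetic data. The class
# depends on an external butter_bandpass_filter helper; the shapes, bands,
# and random data below are invented for illustration.
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(40, 8, 250)       # 40 trials x 8 channels x 250 time points
y = np.repeat([0, 1], 20)       # two balanced classes

bands = [(4, 8), (8, 12), (12, 30)]  # theta/alpha/beta bands, for example
fbcsp = FBCSP(bands, smp_freq=250, num_class=2, n_components=4, n_features=8)
X_train_feats = fbcsp.fit_transform(X, y)  # (40, 8) selected log-variance features
X_new_feats = fbcsp.transform(X)           # reuses fitted filters and selector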
y = lrdf_condensed['draft_overall'].astype(float)

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
import matplotlib.pyplot as plt

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

fs = SelectKBest(score_func=f_regression, k='all')
fs.fit(X_train, y_train)
X_train_fs = fs.transform(X_train)
X_test_fs = fs.transform(X_test)

scores = pd.DataFrame(columns=['col', 'score'])
for i in range(len(fs.scores_)):
    col = X_train.columns[i]
    score = fs.scores_[i]
    scores = scores.append({'col': col, 'score': score}, ignore_index=True)

plt.bar(scores['col'], scores['score'])
plt.show()

scores = scores.sort_values(by='score', ascending=False)

# The information yielded by this cell gives us feature selection
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingClassifier
from itertools import product
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB

iris = load_iris()
X = iris.data
y = iris.target
feature_names = iris.feature_names
# XSepalLength = X[:,0]
# YSepalWidth = X[:,1]

skb = SelectKBest(k='all')
skb.fit(X, y)
X_vec = skb.transform(X)

from sklearn.feature_selection import mutual_info_classif
feature_scores = mutual_info_classif(X_vec, y)

print('The two best attributes are {0}, {1}'.format(
    *sorted(zip(feature_scores, feature_names), reverse=True)))
# /\ Answer to task one | task two \/

"""
+ Load the __iris__ dataset using the [load_iris](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html) function
+ Using [SelectKBest](http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html)
  with the [mutual_info_classif](http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html#sklearn.feature_selection.mutual_info_classif)
  criterion, select the best __two__ attributes
"""
X = iris.data[:, [0, 2]]
y = iris.target
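# The same selection can be done in one step by handing the criterion to
# SelectKBest directly -- a minimal sketch of the task described above.
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, mutual_info_classif

iris = load_iris()
skb2 = SelectKBest(mutual_info_classif, k=2)
X_best2 = skb2.fit_transform(iris.data, iris.target)

# indices and names of the two retained attributes
idx = skb2.get_support(indices=True)
print([iris.feature_names[i] for i in idx])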
X, X_t, y, y_t = train_test_split(X, y, test_size=0.4, random_state=42)

###############################################################################
#                                                                             #
#                               SELECT K BEST                                 #
#                                                                             #
###############################################################################
from sklearn.feature_selection import SelectKBest, f_classif

# ### Define the dimension reduction to be used.
# Here we use a classical univariate feature selection based on F-test,
# namely Anova. We set the number of features to be selected to 784
feature_selection = SelectKBest(f_classif, k=784)

# transform datasets from high-dimensional to k-dimensional
X = feature_selection.fit_transform(X, y)
X_t = feature_selection.transform(X_t)

# save output to csv files
import csv

with open('train.csv', 'w', newline='') as fp:
    a = csv.writer(fp, delimiter=',')
    for i in range(len(X)):
        a.writerow([y[i]] + list(X[i]))

with open('test.csv', 'w', newline='') as fp:
    a = csv.writer(fp, delimiter=',')
    for i in range(len(X_t)):
        a.writerow([y_t[i]] + list(X_t[i]))  # test labels must come from y_t, not y
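# For purely numeric arrays, the row-by-row csv loop above can be replaced by a
# single call per file -- a sketch, assuming X/X_t are dense NumPy arrays as above.
import numpy as np

np.savetxt('train.csv', np.column_stack([y, X]), delimiter=',')
np.savetxt('test.csv', np.column_stack([y_t, X_t]), delimiter=',')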
def train_model(config, feature_vectors, labels, classifier_model='random_forest',
                scale=True, normalize=False, kBest=False):
    if config.has("model"):
        classifier_model = config.get("model")
        scale = config.get("scale")
        normalize = config.get("normalize")
        kBest = config.get("k_best")

    print(classifier_model)
    print('Scale:', scale)
    print('Normalize:', normalize)
    print('K-Best:', kBest)

    classifier = dict()

    if scale:
        scaler = StandardScaler()
        scaler.fit(feature_vectors)
        feature_vectors = scaler.transform(feature_vectors)
        classifier['scaler'] = scaler

    if normalize:
        normalizer = Normalizer()
        normalizer.fit(feature_vectors)
        feature_vectors = normalizer.transform(feature_vectors)
        classifier['normalizer'] = normalizer

    if kBest:
        kBest = SelectKBest(f_classif, k=20)  # rebinds the flag to the fitted selector
        kBest = kBest.fit(feature_vectors, labels)
        feature_vectors = kBest.transform(feature_vectors)
        classifier['k_best'] = kBest

    # print(feature_vectors.shape)

    if classifier_model == 'random_forest':
        model = RandomForestClassifier()
    elif classifier_model == 'knn':
        k = 3
        model = neighbors.KNeighborsClassifier(n_neighbors=k, weights='uniform')
    elif classifier_model == 'logistic_regression':
        model = LogisticRegression()
    elif classifier_model == 'svm':
        model = svm.LinearSVC()
    elif classifier_model == 'sgd':
        model = SGDClassifier(loss="modified_huber", penalty="l1")
    elif classifier_model == 'nn':
        model = MLPClassifier(solver='lbfgs', alpha=1e-5,
                              hidden_layer_sizes=(5, 2), random_state=1)
    elif classifier_model == 'dtree':
        model = tree.DecisionTreeClassifier()
    elif classifier_model == 'gaussianNB':
        model = GaussianNB()

    model.fit(feature_vectors, labels)
    classifier['model'] = model
    return classifier
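# Minimal calling sketch for train_model. The config object's has()/get()
# interface is inferred from the usage above; DictConfig is a stand-in, not the
# project's real config class.
class DictConfig:
    def __init__(self, d):
        self._d = d
    def has(self, key):
        return key in self._d
    def get(self, key):
        return self._d.get(key)

cfg = DictConfig({'model': 'knn', 'scale': True, 'normalize': False, 'k_best': True})
clf = train_model(cfg, feature_vectors, labels)
# apply the stored preprocessing in training order before predicting
X_val = clf['k_best'].transform(clf['scaler'].transform(feature_vectors))
print(clf['model'].predict(X_val))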
div = Div(text="""Your <a href="https://en.wikipedia.org/wiki/HTML">HTML</a>-supported text is initialized with the <b>text</b> argument.  The
remaining div arguments are <b>width</b> and <b>height</b>. For this example, those values
are <i>200</i> and <i>100</i> respectively.""",
          width=200, height=100)
'''

stats = Paragraph(text='', width=800, height=200, name='Selected Features:')

#columns =['avg_dist', 'avg_rating_by_driver','avg_rating_of_driver','avg_surge','surge_pct','trips_in_first_30_days','luxury_car_user','weekday_pct','city_Astapor',"city_KingsLanding",'city_Winterfell','phone_Android','phone_no_phone']
#columns = ['luxury_car_user','avg_dist','city_Astapor',"city_KingsLanding",'phone_Android','phone_iPhone']
#df1 = pd.DataFrame(df, columns=columns)
#y = df['churn']
y = df[df.columns[:1]].values.ravel()
df1 = df.drop(df.columns[:1], axis=1)

selector = SelectKBest(chi2, k=5).fit(df1, y)
X_new = selector.transform(df1)
mask = selector.get_support()  # list of booleans
new_features = []  # The list of your K best features

for selected, feature in zip(mask, df.columns[1:].tolist()):  # avoid shadowing built-in bool
    if selected:
        new_features.append(feature)
#print(new_features)
stats.text = str(new_features)

x_train_original, x_test_original, y_train_original, y_test_original = train_test_split(
    X_new, y, test_size=0.25)

#For standardizing data
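# The mask loop above collapses to a single indexing expression, since df1
# preserves the original column order:
new_features = df1.columns[selector.get_support()].tolist()
stats.text = str(new_features)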
def feature_analysis(X_train, X_test, y_train, y_test, i, X_1k, y_1k):
    '''
    Select the K best features from the corresponding model

    Parameters:
       X_train: NumPy array, with the selected training features
       X_test: NumPy array, with the selected testing features
       y_train: NumPy array, with the selected training classes
       y_test: NumPy array, with the selected testing classes
       i: int, the index of the supposed best classifier (from task 3.1)
       X_1k: NumPy array, just 1K rows of X_train (from task 3.2)
       y_1k: NumPy array, just 1K rows of y_train (from task 3.2)
    '''
    k_list = [5, 10, 20, 30, 40, 50]
    best_k_1 = []
    best_k_32 = []
    X_new_1k = []
    X_new_32k = []
    acc_list = []

    if i == 1:
        classifier = SVC(kernel='linear', max_iter=1000)
    elif i == 2:
        classifier = SVC(kernel='rbf', gamma=2.0, max_iter=1000)
    elif i == 3:
        classifier = RandomForestClassifier(n_estimators=10, max_depth=5)
    elif i == 4:
        classifier = MLPClassifier(alpha=0.05)
    else:
        classifier = AdaBoostClassifier()

    for k in k_list:
        curr = [k]
        selector = SelectKBest(f_classif, k=k)
        Xk_1k = selector.fit_transform(X_1k, y_1k)
        pp = selector.pvalues_.argsort()[:k]  # indices of the k smallest p-values
        curr.extend(pp)
        best_k_1.append(curr)
        if k == 5:
            classifier.fit(Xk_1k, y_1k)
            predictor = classifier.predict(selector.transform(X_test))
            confusion = confusion_matrix(y_test, predictor)
            acc_list.append(accuracy(confusion))

    for k in k_list:
        curr = [k]
        selector = SelectKBest(f_classif, k=k)
        Xk_32k = selector.fit_transform(X_train, y_train)
        pp = selector.pvalues_.argsort()[:k]
        curr.extend(pp)
        best_k_32.append(curr)
        if k == 5:
            classifier.fit(Xk_32k, y_train)
            predictor = classifier.predict(selector.transform(X_test))
            confusion = confusion_matrix(y_test, predictor)
            acc_list.append(accuracy(confusion))

    with open('a1_3.3.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        for row in best_k_32:  # avoid reusing the parameter name i
            writer.writerow(row)
        writer.writerow(acc_list)
        # the with-block closes csvfile; no explicit close() needed
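# The helper accuracy() is defined elsewhere in the assignment; a plausible
# implementation, given that it receives a confusion matrix (an assumption):
import numpy as np

def accuracy(C):
    # overall accuracy = correctly classified / all samples
    total = np.sum(C)
    return np.trace(C) / total if total > 0 else 0.0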
for N in range(1, MAX_NGRAM + 1):
    grams = [terms[j:j + N] for j in range(len(terms) - N + 1)]
    for gram in grams:
        phrase = " ".join(gram)
        if phrase in phrase_bit_nr:
            X_test[i, phrase_bit_nr[phrase]] = 1
Y_test[i] = test_y[i]

print("Selecting features...")

SKB = SelectKBest(chi2, k=FEATURES)
SKB.fit(X_train, Y_train)
selected_features = SKB.get_support(indices=True)
X_train = SKB.transform(X_train)
X_test = SKB.transform(X_test)

tm1 = MultiClassTsetlinMachine(c, T * 100, s, clause_drop_p=drop_clause,
                               number_of_state_bits=number_of_state_bits,
                               number_of_gpus=n_gpus)

f = open("imdb_weighted_%.1f_%d_%d_%.2f_%d_aug.txt" % (s, clauses, T, drop_clause, number_of_state_bits), "w+")

r_25 = 0
max = 0.0  # note: shadows the built-in max()

for i in range(config.stop_train):
    start_training = time()
    tm1.fit(X_train, Y_train, epochs=1, incremental=True)
    stop_training = time()

    start_testing = time()
import random
from multiprocessing import cpu_count
from multiprocessing.pool import ThreadPool

from sklearn import svm
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import RandomizedSearchCV


class EnsembleSVM:
    def __init__(self, n_estimators=50, max_samples=1000, max_features=2000,
                 n_randomized_search_iter=20, random_state=123):
        random.seed(random_state)
        self.random_state = random_state
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.n_randomized_search_iter = n_randomized_search_iter

    def _prepare_classifier(self, params, n_jobs=1):
        X_train, y_train = params

        tuned_parameters = [{
            'kernel': ['rbf'],
            'gamma': [1e-4, 1e-3, 1e-2, 1e-1, 1e+0, 1e+1, 1e+2, 1e+3, 1e+4],
            'C': [1e+0, 1e+1, 1e+2, 1e+3, 1e+4, 1e+5, 1e+6, 1e+7, 1e+8, 1e+9]
        }]

        clf = RandomizedSearchCV(svm.SVC(random_state=self.random_state),
                                 tuned_parameters[0],
                                 n_iter=self.n_randomized_search_iter,
                                 n_jobs=n_jobs, random_state=self.random_state)
        clf.fit(X_train, y_train)
        params = clf.best_params_

        clf = svm.SVC(kernel=params['kernel'], C=params['C'],
                      gamma=params['gamma'], probability=True,
                      random_state=self.random_state)
        clf.fit(X_train, y_train)
        return clf

    def fit(self, X, y):
        self.selector = SelectKBest(f_classif, k=self.max_features)
        self.selector.fit(X, y)

        X_train = self.selector.transform(X)
        y_train = y

        param_list = []
        idx = list(range(len(y_train)))  # list() so shuffle works on Python 3
        for i in range(self.n_estimators):
            random.shuffle(idx)
            param_list.append((X_train[idx[:self.max_samples]],
                               y_train[idx[:self.max_samples]]))

        pool = ThreadPool(cpu_count())
        self.clf_list = pool.map(self._prepare_classifier, param_list)
        pool.close()
        pool.join()

        """
        X2=[]
        for clf in self.clf_list:
            P=clf.predict_proba(X_train)
            if len(X2)==0:
                X2=P[:, 0]
            else:
                X2=numpy.vstack((X2, P[:, 0]))
        X2=numpy.swapaxes(X2, 0, 1)
        print "X2:", X2.shape
        from sklearn.ensemble import RandomForestClassifier
        self.clf2=RandomForestClassifier(n_estimators=100)
        self.clf2.fit(X2, y_train)
        """

    def predict_proba(self, X):
        y_pred = self._predict_cover_proba(X)
        return [[float(x) / 100, 1 - float(x) / 100] for x in y_pred]

    def _predict_cover_proba(self, X):
        X_val = self.selector.transform(X)
        y_val_pred = [0] * len(X_val)
        for clf in self.clf_list:
            P = clf.predict_proba(X_val)
            for i in range(len(P)):
                y_val_pred[i] += P[i][0]
        return y_val_pred

        """
        X2=[]
        Xt=self.selector.transform(X)
        for clf in self.clf_list:
            P=clf.predict_proba(Xt)
            if len(X2)==0:
                X2=P[:, 0]
            else:
                X2=numpy.vstack((X2, P[:, 0]))
        X2=numpy.swapaxes(X2, 0, 1)
        print "X2 predict:", X2.shape
        return self.clf2.predict_proba(X2)[:,0]
        """

    def score(self, X, y):
        y_pred = self._predict_cover_proba(X)
        ok = 0
        for i in range(len(y)):
            p = float(y_pred[i]) / len(self.clf_list)
            if p > 0.5 and y[i] == 0:
                ok += 1
            elif p <= 0.5 and y[i] == 1:
                ok += 1
        return float(ok) / len(y)
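# Minimal usage sketch on synthetic data; the dimensions are arbitrary, and
# max_features must not exceed the number of input columns.
import numpy as np

X_demo = np.random.randn(500, 100)
y_demo = np.random.randint(0, 2, 500)

ens = EnsembleSVM(n_estimators=5, max_samples=200, max_features=50,
                  n_randomized_search_iter=5)
ens.fit(X_demo[:400], y_demo[:400])
print('accuracy:', ens.score(X_demo[400:], y_demo[400:]))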
    # Create a scores array to get the individual categorical column.
    # Example:
    #  data = [39, 'State-gov', 77516, 'Bachelors', 13, 'Never-married', 'Adm-clerical',
    #          'Not-in-family', 'White', 'Male', 2174, 0, 40, 'United-States']
    #  scores = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    #
    #  Returns: [['State-gov']]
    # Build the scores array.
    scores = [0] * len(COLUMNS[:-1])
    # This column is the categorical column we want to extract.
    scores[i] = 1

    skb = SelectKBest(k=1)
    skb.scores_ = scores
    # Convert the categorical column to a numerical value
    lbn = LabelBinarizer()
    r = skb.transform(train_features)
    lbn.fit(r)
    # Create the pipeline to extract the categorical feature
    categorical_pipelines.append(
        ('categorical-{}'.format(i),
         Pipeline([('SKB-{}'.format(i), skb),
                   ('LBN-{}'.format(i), lbn)])))
# [END categorical-feature-conversion]

# [START create-pipeline]
# Create pipeline to extract the numerical features
skb = SelectKBest(k=6)
# From COLUMNS use the features that are numerical
skb.scores_ = [1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0]
categorical_pipelines.append(('numerical', skb))
# Combine all the features using FeatureUnion
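# Why manually setting scores_ works: SelectKBest.transform only consults the
# fitted scores, so a one-hot scores vector slices out exactly one column. A toy
# sketch of the same trick, with made-up numbers:
import numpy as np
from sklearn.feature_selection import SelectKBest

data = np.array([[39, 1, 77516],
                 [50, 2, 83311]])
skb_demo = SelectKBest(k=1)
skb_demo.scores_ = [0, 1, 0]        # flag only the middle column
print(skb_demo.transform(data))     # [[1], [2]]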
                      max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1)
trn_term_doc = vec.fit_transform(train[column])
test_term_doc = vec.transform(test[column])

train_x = trn_term_doc.tocsr()
test_x = test_term_doc.tocsr()
train_y = (train["class"] - 1).astype(int)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

model1 = SelectKBest(chi2, k=10000)
train_x = model1.fit_transform(train_x, train_y)
test_x = model1.transform(test_x)

#################################

def stacking(clf, train_x, train_y, test_x, clf_name, class_num=1):
    train = np.zeros((train_x.shape[0], class_num))
    test = np.zeros((test_x.shape[0], class_num))
    test_pre = np.zeros((folds, test_x.shape[0], class_num))
    cv_scores = []
    for i, (train_index, test_index) in enumerate(kf):
        tr_x = train_x[train_index]
        tr_y = train_y[train_index]
        te_x = train_x[test_index]
        te_y = train_y[test_index]
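# The stacking fragment assumes `folds` and an index generator `kf` defined
# earlier; a plausible setup under the modern scikit-learn API (an assumption,
# since the original definition is not shown):
from sklearn.model_selection import StratifiedKFold

folds = 5
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=2018)
kf = skf.split(train_x, train_y)  # yields (train_index, test_index) pairs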
pipeline = sklearn.pipeline.Pipeline(steps)
cv = GridSearchCV(pipeline, param_grid=parameters)
cv.fit(X_train, y_train)
y_predictions = cv.predict(X_test)
report = classification_report(y_test, y_predictions)

# Print out the best value for the Decision tree classifier
print(report)
print(cv.best_params_)

# select features
k = 10
feature_selector = SelectKBest(k=k)
X_train = feature_selector.fit_transform(X_train, y_train)
X_test = feature_selector.transform(X_test)
#feature_names = [ feature_names[i] for i in feature_selector.get_support(indices=True) ]
print("features selected: %d" % X_train.shape[1])

# train decision tree
dt = DTC(min_samples_split=2)
dt.fit(X_train, y_train)

# report accuracy
print("the decision tree has %d nodes" % dt.tree_.node_count)
print("train accuracy: %f" % dt.score(X_train, y_train))
print("test accuracy: %f" % dt.score(X_test, y_test))

############################# k nearest neighbor #############################
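# Since a Pipeline and GridSearchCV are already in play, k itself can be tuned
# instead of hard-coded -- a sketch on the same X_train/y_train; the step names
# and grid values here are illustrative.
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

pipe = Pipeline([('kbest', SelectKBest(f_classif)),
                 ('dt', DecisionTreeClassifier(min_samples_split=2))])
search = GridSearchCV(pipe, param_grid={'kbest__k': [5, 10, 20]})
search.fit(X_train, y_train)
print(search.best_params_)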
# Chi-square test
# The classical chi-square test measures the dependence between a categorical
# feature and a categorical target.
# How the statistic is computed:
# Suppose two categorical variables X and Y take values in {x1, x2} and {y1, y2};
# their observed-frequency contingency table (the classic 2x2 table for
# independent samples) is:
#
#            y1     y2     total
#   x1       a      b      a+b
#   x2       c      d      c+d
#   total    a+c    b+d    a+b+c+d
#
# (In the original illustration, X is age group -- youth vs. adult -- and Y is
# spending level -- high vs. low.)
# To test the hypothesis H1 "X and Y are related", an independence test both
# checks the relationship and quantifies how reliable that judgment is.
# Concretely, compute the statistic K^2 (K squared) from the table:
#   K^2 = n * (a*d - b*c)^2 / [(a+b)(c+d)(a+c)(b+d)],  where n = a+b+c+d is the sample size
# The larger K^2 is, the more likely it is that "X and Y are related" holds.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Select the K best features and return the reduced data
from sklearn.datasets import load_iris
iris = load_iris()

# k is the number of top features to select. The "all" option bypasses
# selection, for use in a parameter search.
selector = SelectKBest(chi2, k=4).fit(iris.data, iris.target)
data = selector.transform(iris.data)
print(data)
print(selector.scores_)
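# Quick numeric check of the K^2 formula on a made-up 2x2 table;
# scipy.stats.chi2_contingency with correction=False computes the same Pearson
# statistic.
import numpy as np
from scipy.stats import chi2_contingency

a, b, c, d = 30, 10, 20, 40                  # arbitrary counts
n = a + b + c + d
K2 = n * (a * d - b * c) ** 2 / ((a + b) * (c + d) * (a + c) * (b + d))

stat, p, dof, expected = chi2_contingency(np.array([[a, b], [c, d]]),
                                          correction=False)
print(K2, stat)  # both print 16.666...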
print("n_samples: %d, n_features: %d" % X_test_counts.shape) print() # mapping from integer feature name to original token string if opts.use_hashing: feature_names = None else: feature_names = vectorizer.get_feature_names() if opts.select_chi2: print("Extracting %d best features by a chi-squared test" % opts.select_chi2) t0 = time() ch2 = SelectKBest(chi2, k=opts.select_chi2) X_train = ch2.fit_transform(X_train_counts, y_train) X_test = ch2.transform(X_test_counts) if feature_names: # keep selected feature names feature_names = [ feature_names[i] for i in ch2.get_support(indices=True) ] print("done in %fs" % (time() - t0)) print(feature_names) print(ch2.scores_) print() if feature_names: feature_names = np.asarray(feature_names) def trim(s):
l_svm_score = []
l_nb_score = []
l_blend_score = []
alpha = 1.
beta = 20.

#sss = StratifiedShuffleSplit(Y, 5, test_size=0.2, random_state=0)
# modern sklearn API: KFold(...).split(...) replaces the old KFold(n, n_folds=...)
sss = KFold(n_splits=5, shuffle=True).split(Y)

kbest = SelectKBest(chi2, k=300000)

for train_idx, val_idx in sss:
    x_train, y_train, x_val, y_val = X[train_idx], Y[train_idx], X[val_idx], Y[val_idx]
    x_train = kbest.fit_transform(x_train, y_train)
    x_val = kbest.transform(x_val)

    #
    clf_svm.fit(x_train, y_train)
    svm_predict_proba = getProbaSVM(clf_svm.decision_function(x_val))
    score_svm = accuracy_score(clf_svm.predict(x_val), y_val)
    l_svm_score.append(score_svm)
    print("l_svm_score")

    #
    clf_nb.fit(x_train, y_train)
    nb_predict_proba = clf_nb.predict_proba(x_val)
    score_nb = accuracy_score(clf_nb.predict(x_val), y_val)
    l_nb_score.append(score_nb)
    print("l_nb_score")

    #
    blend_mat = alpha * nb_predict_proba + beta * svm_predict_proba
    y_pred_blend = clf_svm.classes_[np.argmax(blend_mat, 1)]
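# getProbaSVM is defined elsewhere; a plausible sketch that maps multi-class
# decision_function margins to pseudo-probabilities with a softmax (an
# assumption, not the original helper):
import numpy as np

def getProbaSVM(decision_values):
    d = np.atleast_2d(decision_values)
    e = np.exp(d - d.max(axis=1, keepdims=True))  # numerically stable softmax
    return e / e.sum(axis=1, keepdims=True)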