train_reader = csv.reader(train_csv)
cnt = 0
for tweet in train_reader:
    attr = tweet[CURRENT_ATTRIBUTE + 4]
    train_attrs.append(attr)
    cnt += 1
del train_attrs[0]  # drop the header row

# get y_train from train_attrs
y_train = [[float(attr)] for attr in train_attrs]

# chi-2 select features
print("start feature selection")
if SELECTOR == 0:
    selector = SelectKBest(chi2, k=K_FOR_BEST)
else:
    selector = SelectPercentile(score_func=chi2, percentile=SELECT_PERCENTILE)
selector.fit(x_train, y_train)
new_x_train = selector.transform(x_train)
new_x_test = selector.transform(x_test)
print("feature selection done")

# convert y_train to the right dimension (flat list of floats)
y_train = [attr[0] for attr in y_train]

# regression
print("start regression")
clf = LinearRegression()
clf = clf.fit(new_x_train, y_train)
result = clf.predict(new_x_test)
print("regression done")

for item in result:
    ...  # loop body truncated in the original snippet
# Some noisy data not correlated
E = np.random.uniform(0, 0.1, size=(len(iris.data), 20))
# Add the noisy data to the informative features
X = np.hstack((iris.data, E))
y = iris.target

plt.figure(1)
plt.clf()
X_indices = np.arange(X.shape[-1])

# #############################################################################
# Univariate feature selection with F-test for feature scoring
# We use the default selection function: the 10% most significant features
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(X, y)
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
plt.bar(X_indices - .45, scores, width=.2,
        label=r'Univariate score ($-Log(p_{value})$)',
        color='darkorange', edgecolor='black')

# #############################################################################
# Compare to the weights of an SVM
clf = svm.SVC(kernel='linear')
clf.fit(X, y)
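# The scikit-learn example this snippet is based on continues by normalizing
# the squared SVM coefficients and plotting them next to the univariate
# scores. A sketch of that continuation, assuming `clf` is the fitted linear
# SVC and `plt` / `X_indices` are the objects defined above:
svm_weights = (clf.coef_ ** 2).sum(axis=0)
svm_weights /= svm_weights.max()
plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight',
        color='navy', edgecolor='black')
plt.title("Comparing feature selection")
plt.xlabel('Feature number')
plt.legend(loc='upper right')
plt.show()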
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.4429522712567806
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_regression, percentile=92),
    StackingEstimator(
        estimator=KNeighborsRegressor(n_neighbors=48, p=1, weights="uniform")),
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.001, max_depth=1,
                                             min_child_weight=3,
                                             n_estimators=100, n_jobs=1,
                                             objective="reg:squarederror",
                                             subsample=0.9500000000000001,
                                             verbosity=0)),
    MinMaxScaler(),
    StackingEstimator(estimator=SGDRegressor(alpha=0.01, eta0=0.01,
                                             fit_intercept=False, l1_ratio=0.0,
                                             learning_rate="constant")))
# NOTE: the original export was truncated at the SGDRegressor arguments; its
# remaining hyperparameters, any later pipeline steps, and the final
# fit/predict calls are missing from the snippet.
def multilabel2(weather, weatherTest):
    X = weather.frase
    y = weather[["clase1", "clase2"]]
    y = y.replace(np.nan, '', regex=True)

    X_train = weather.iloc[:, [0]]
    y_train = weather.iloc[:, [1, 2]]
    y_train = y_train.replace(np.nan, '', regex=True)
    X_Test = weatherTest.iloc[:, [0]]
    y_test = weatherTest.iloc[:, [1, 2]]
    y_test = y_test.replace(np.nan, '', regex=True)

    X_train = np.array(X_train)
    y_train = np.array(y_train)
    X_test = np.array(X_Test)
    y_test = np.array(y_test)

    pipeline = Pipeline([
        ('vectorize', CountVectorizer()),
        ('tf_idf', TfidfTransformer(norm='l2')),
        # play with the parameters and check the model size
        ('select', SelectPercentile(chi2, percentile=50)),
        ('clf', OneVsRestClassifier(SGDClassifier(loss='modified_huber')))
    ])

    '''
    scores = []
    kf = KFold(n_splits=10, random_state=0, shuffle=True)
    for train, test in kf.split(total):
        X_train = X[train]
        X_test = X[test]
        y_train = y[train]
        y_test = y[test]
        print(X_test)
        pipeline.fit(X_train, y_train)
        predicted = pipeline.predict(X_test)
        scores.append(evaluacion(y_test, predicted))
    '''

    mlb = MultiLabelBinarizer()
    y_train = mlb.fit_transform(y_train)
    y_test = mlb.transform(y_test)

    # flatten the single-column arrays into plain lists of strings
    # so the CountVectorizer can consume them
    X_test = [test[0] for test in X_test]
    X_train = [train[0] for train in X_train]

    # Name: frase, dtype: object
    print(len(X))
    print(y_train.shape)
    # print(X)
    print(X_train)

    pipeline.fit(X_train, y_train)
    # print(X_test)
    predicted = pipeline.predict(X_test)
    print("predicted")
    # print(predicted)

    recall = metrics.recall_score(y_test, predicted, average='macro')
    print("Recall: %f" % recall)
    precision = metrics.precision_score(y_test, predicted, average='macro')
    print("Precision: %f" % precision)
    f1_score = metrics.f1_score(y_test, predicted, average='macro')
    print("F1-score: %f" % f1_score)
    accuracy = metrics.accuracy_score(y_test, predicted)
    print("accuracy: %f" % accuracy)

    return recall, precision, f1_score, accuracy
##svc = SVC(kernel='linear')
#feature_selection = SelectPercentile(f_classif, percentile=10)
#parameters_for_svm = {'kernel': ('linear', 'rbf', 'sigmoid'), 'C': [1, 10, 100]}
#
#svr = SVC(cache_size=800)
#clf = GridSearchCV(svr, parameters_for_svm)
#pipe_lr = Pipeline([("normalization", normalization),
#                    ("feature_selection", feature_selection),
#                    ("clf", clf)])
#pipe_lr.fit(X_train, y_train)
#score = pipe_lr.score(X_test, y_test)

"""
kfold test
"""
normalization = StandardScaler()
feature_selection = SelectPercentile(f_classif, percentile=10)
classify = SVC(C=1, cache_size=800, kernel='linear')
pipeline = Pipeline([("normalization", normalization),
                     ("feature_selection", feature_selection),
                     ("classify", classify)])

# StratifiedKFold(y=..., n_folds=...) is the pre-0.18 API; the modern
# equivalent takes n_splits and an explicit split(X, y) call
# (random_state requires shuffle=True)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
scores = []
fpr = dict()
tpr = dict()
feature_selected = dict()
support_vector = dict()
roc_auc = dict()
for k, (train, test) in enumerate(kfold.split(X_train, y_train)):
    pipeline.fit(X_train[train], y_train[train])
    score = pipeline.score(X_train[test], y_train[test])
def predictNConPR():
    # feature selection
    X, y, vectorizer = get_X_y()
    #selector = SelectKBest(f_classif, 500)
    selector = SelectPercentile(f_classif, percentile=100)
    selector.fit(X, y)
    best_indices = selector.get_support(indices=True)
    best_features = np.array(vectorizer.get_feature_names())[best_indices]
    X = selector.transform(X)

    # use cross validation to choose the best parameter
    # (class_weight='auto' was renamed to 'balanced' in scikit-learn 0.17)
    lr = LogisticRegression(penalty="l2", fit_intercept=True,
                            class_weight='balanced')
    kf = StratifiedKFold(n_splits=5, shuffle=True)
    parameters = {"C": [1.0, .1, .01, .001, 0.0001]}
    clf0 = GridSearchCV(lr, parameters, scoring='roc_auc', cv=kf)
    print("fitting model...")
    clf0.fit(X, y)
    print("best auc score is: ", clf0.best_score_)
    print("done.")

    fs, aucs, prec, rec = [], [], [], []
    fold = 0
    complete_X = X.tocsr()
    clf = LogisticRegression(penalty="l2", fit_intercept=True,
                             class_weight='balanced',
                             C=clf0.best_estimator_.C)
    for train, test in kf.split(X, y):
        clf.fit(complete_X[train, :].tocoo(), y[train])
        probs = clf.predict_proba(complete_X[test, :])[:, 1]
        #average_precision_score(y[test], probs)
        precision, recall, threshold = precision_recall_curve(y[test], probs)
        accuracy = clf.score(complete_X[test, :], y[test])
        predLabel = clf.predict(X[test, :])
        rec.append(recall_score(y[test], predLabel))
        prec.append(precision_score(y[test], predLabel))
        # auc_score was removed from sklearn; roc_auc_score is the replacement
        cur_auc = roc_auc_score(y[test], probs)
        aucs.append(cur_auc)
        #preds = clf.predict(complete_X[test])
        #fs.append(f1_score(y[test], preds))
        '''
        if fold == 0:
            plt.clf()
            plt.plot(precision, recall)
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.ylim([0.0, 1.05])
            plt.xlim([0.0, 1.0])
            plt.title('Precision-Recall curve for news coverage prediction '
                      'conditioned on press release with vocabulary size %d'
                      % len(best_features))
            plt.show()
            fold += 1
        '''
        if fold == 0:
            fpr, tpr, thresholds = roc_curve(y[test], probs)
            pylab.clf()
            fout = "NConPR/roc"
            pylab.plot(fpr, tpr,
                       label="ROC curve for news coverage prediction "
                             "conditioned on press release (area = %0.2f)"
                             % cur_auc)
            pylab.plot([0, 1], [0, 1], 'k--')
            pylab.xlim((-0.025, 1.025))
            pylab.ylim((-0.025, 1.025))
            pylab.xlabel("false positive rate")
            pylab.ylabel("true positive rate")
            pylab.title("ROC curve for news coverage prediction conditioned "
                        "on press release (area = %0.2f)" % cur_auc)
            pylab.tight_layout()
            pylab.show()
            pylab.savefig(fout)
        fold += 1

    #print("average auc: %s" % (sum(aucs) / float(len(aucs))))
    #print("average fs: %s" % (sum(fs) / float(len(fs))))
    print("average recall: %s" % (sum(rec) / float(len(rec))))
    print("average precision: %s" % (sum(prec) / float(len(prec))))
    texify_most_informative_features(best_features, vectorizer, clf0)
    return clf0
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.8833333333333334
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=36),
    # note: the l1 penalty requires solver='liblinear' or 'saga'
    # on recent scikit-learn versions
    LogisticRegression(C=5.0, dual=False, penalty="l1"))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def newcolname_DoS(columns):
    # The selector must be fitted before get_support() can be queried; the
    # original called get_support without parentheses and without fitting.
    # X_DoS / Y_DoS are assumed to be in scope, matching the companion
    # X_newDoS function below.
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(X_DoS, Y_DoS)
    mask = selector.get_support()
    newcolindex_DoS = [i for i, x in enumerate(mask) if x]
    newcolnames_DoS = list(columns[i] for i in newcolindex_DoS)
    return newcolnames_DoS
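# Hypothetical usage sketch (df_DoS is an assumed DataFrame holding the DoS
# features, not a name from the original): the returned list identifies the
# columns that survive the 10th-percentile ANOVA filter.
kept_names = newcolname_DoS(df_DoS.columns)
print(kept_names)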
def selectKBestUsingData(X_train, y_train, x_test):
    # Despite the name, this uses SelectPercentile. The original called
    # select.fit() with no arguments; the selector needs the training data
    # and labels to score features, so a y_train parameter is added here.
    select = SelectPercentile(percentile=50)
    select.fit(X_train, y_train)
    X_train_selected = select.transform(X_train)
    x_test_selected = select.transform(x_test)
    return X_train_selected, x_test_selected
imp = SimpleImputer(strategy="most_frequent")
data_frame = pandas.DataFrame(imp.fit_transform(data_frame))
# standard_scale = StandardScaler()
# data_frame = pandas.DataFrame(standard_scale.fit_transform(data_frame.to_numpy()))
# print(data_frame)

# Train/Test Split
x_train, x_test, y_train, y_test = train_test_split(data_frame, target,
                                                    random_state=0)
print(f'\nTrain data shape: {x_train.shape}')
print(f'Test data shape: {x_test.shape}')
print(f'Target shape: {y_test.shape}')

# Feature Selection
selection = SelectPercentile(percentile=25)
selection.fit(x_train, y_train)
x_train_compressed = selection.transform(x_train)
print(f'\nTrain shape after selection: {x_train_compressed.shape}')
selection_status = list(selection.get_support())
print(f'Selection Status: {selection_status} Length: {len(selection_status)}')
x_test_compressed = selection.transform(x_test)

# Printing Selected Column Names
i = 0
selected_columns = []
for status in selection_status:
    if status:
        selected_columns.append(data_column_names[i])
    i += 1
print(f'Columns After Feature Selection: {selected_columns} '
      f'Length: {len(selected_columns)}')
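# The index-counting loop above can be written more compactly with zip; a
# sketch, assuming data_column_names lines up with the selector's input
# columns as in the snippet above:
selected_columns = [name for name, keep
                    in zip(data_column_names, selection.get_support())
                    if keep]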
def X_newDoS(X_DoS, Y_DoS):
    np.seterr(divide='ignore', invalid='ignore')
    selector = SelectPercentile(f_classif, percentile=10)
    X_newDoS = selector.fit_transform(X_DoS, Y_DoS)
    # the original had no return; also return the fitted selector so the
    # same feature mask can be reused on held-out data
    return X_newDoS, selector
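# Usage sketch, assuming X_DoS / Y_DoS numpy arrays are already loaded and
# the two-value return added above: reduce the DoS features and list which
# column indices survived.
X_reduced, sel = X_newDoS(X_DoS, Y_DoS)
print(X_reduced.shape)
print(sel.get_support(indices=True))  # indices of the retained features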
svc_tuned_params = [
    {
        'kernel': ['rbf'],
        'C': [1, 10, 100, 1000],
        'gamma': [0.001, 0.0001]
    },
]

if __name__ == '__main__':
    rd = OldHamshahriReader(root=config.CORPORA_ROOT)
    docs, labels = rd.sklearn_docs(config.TOT_DOCS)
    #vectorizer = CountVectorizer(docs)
    vectorizer = TfidfVectorizer(lowercase=False, max_df=0.8)
    fs = vectorizer.fit_transform(docs)
    #vectorizer.build_preprocessor()
    # note: the selector is fit on the full corpus before the split below,
    # which leaks information from the future test documents
    selector = SelectPercentile(chi2, percentile=10)
    selector.fit(fs, labels)
    fs = selector.transform(fs)

    fs_train, fs_test, labels_train, labels_test = train_test_split(
        fs, labels, test_size=0.4, random_state=0)
    clf = None
    pred = None
    grid_search = False
    if config.CLASSIFIER == 'NaiveBayes':
        clf = BernoulliNB()
    elif config.CLASSIFIER == 'LinearSVC':
        if config.SELF_TRAINING:
            clf = LinearSVC(C=1)
        else:
            clf = GridSearchCV(LinearSVC(),
X = pd.DataFrame(X)

# calculate mutual information between each feature and the target
mi = mutual_info_classif(X, y)
mi = pd.Series(mi)  # one-dimensional ndarray with axis labels
mi.index = X.columns
mi.sort_values(ascending=False, inplace=True)
mi.plot.bar(figsize=(16, 5))

# keep the top 50% of features by mutual information
sel = SelectPercentile(mutual_info_classif, percentile=50).fit(X, y)
X.columns[sel.get_support()]
X.shape
X_mi = sel.transform(X)  # reduce X to the selected features
X_mi.shape

# Build the model to compare the performance
def run_randomForest(X, y):
    # meta estimator that fits a number of decision trees
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(X, y)
    y_pred = clf.predict(X)
    # note: this predicts on the training data itself,
    # so it reports training accuracy
    print('Accuracy on mi set: ')
    print(accuracy_score(y, y_pred))
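# The helper above is defined but never called in the snippet; a sketch of
# the intended comparison, assuming X / X_mi / y from above: training the
# forest on the full feature set and on the mutual-information subset lets
# the two (training) accuracies be compared side by side.
run_randomForest(X, y)     # all features
run_randomForest(X_mi, y)  # top 50% by mutual information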
# Designate distributions to sample hyperparameters from
C_range = np.power(2, np.arange(-10, 11, dtype=float))
gamma_range = np.power(2, np.arange(-10, 11, dtype=float))
n_features_to_test = [0.85, 0.9, 0.95]

# SVM
steps = [('scaler', StandardScaler()),
         ('red_dim', PCA()),
         ('clf', SVC(kernel='rbf', probability=True))]
pipeline = Pipeline(steps)

parameteres = [{'scaler': scalers_to_test,
                'red_dim': [PCA(random_state=42)],
                'red_dim__n_components': list(n_features_to_test),
                'clf__C': list(C_range),
                'clf__gamma': ['auto', 'scale'] + list(gamma_range)},
               {'scaler': scalers_to_test,
                'red_dim': [SelectPercentile(f_classif, percentile=10)],
                'clf__C': list(C_range),
                'clf__gamma': ['auto', 'scale'] + list(gamma_range)},
               {'scaler': scalers_to_test,
                'red_dim': [SelectPercentile(mutual_info_classif, percentile=10)],
                'clf__C': list(C_range),
                'clf__gamma': ['auto', 'scale'] + list(gamma_range)},
               {'scaler': scalers_to_test,
                'red_dim': [None],
                'clf__C': list(C_range),
                'clf__gamma': ['auto', 'scale'] + list(gamma_range)}]

for j in range(1, 2):
    results, best_estimators_dict = nested_cv.function_nested_cv(
        public_data, public_labels, pipeline, parameteres, j * 2)
    # create folder and save
    save_output.function_save_output(results, name, j * 2)
print('Base model mean squared error: ' + str(mean_squared_error(y_test, preds)))
# the original mislabeled this as mean squared error
print('Base model explained variance: ' + str(explained_variance_score(y_test, preds)))

# Create Lasso linear models with different alpha values
alpha = [0.1, 0.2, 0.25, 0.5, 1.0, 2.5, 5.0]
for a in alpha:
    # note: the `normalize` argument was removed from scikit-learn in 1.2;
    # on recent versions, scale the data beforehand instead
    lasso_mod = linear_model.Lasso(alpha=a, normalize=True, fit_intercept=True)
    lasso_mod.fit(x_train, y_train)
    preds = lasso_mod.predict(x_test)
    print('R2 Lasso model with alpha = ' + str(a) + ': '
          + str(r2_score(y_test, preds)))

# Create linear model based on F-scores, percentile-based selection
selector_f = SelectPercentile(f_regression, percentile=50)
selector_f.fit(x_train, y_train)
xt_train, xt_test = selector_f.transform(x_train), selector_f.transform(x_test)
f_model = linear_model.LinearRegression()
f_model.fit(xt_train, y_train)
preds = f_model.predict(xt_test)
print('R2 Score Percentile Based model: ' + str(r2_score(y_test, preds)))

# My own model that I created
custom_mod = linear_model.LinearRegression()
x_train, x_test, y_train, y_test = train_test_split(data_x_custom, data_y,
                                                    test_size=0.2,
                                                    random_state=4)
custom_mod.fit(x_train, y_train)
preds = custom_mod.predict(x_test)
# SVM
steps = [('scaler', MinMaxScaler()),
         ('red_dim', PCA()),
         ('clf', SVC(kernel='linear', probability=True))]
pipeline = Pipeline(steps)

parameteres = [{'scaler': [MinMaxScaler()],
                'red_dim': [PCA(random_state=42)],
                'red_dim__n_components': list(n_features_to_test),
                'clf__C': list(C_range),
                'clf__class_weight': [None, 'balanced']},
               {'scaler': [MinMaxScaler()],
                'red_dim': [SelectPercentile(f_classif, percentile=10)],
                'clf__C': list(C_range),
                'clf__class_weight': [None, 'balanced']},
               {'scaler': [MinMaxScaler()],
                'red_dim': [SelectPercentile(mutual_info_classif, percentile=10)],
                'clf__C': list(C_range),
                'clf__class_weight': [None, 'balanced']},
               {'scaler': [MinMaxScaler()],
                'red_dim': [None],
                'clf__C': list(C_range),
                'clf__class_weight': [None, 'balanced']}]
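# A hedged sketch of how a grid like this can be searched directly; the
# original presumably hands `pipeline` and `parameteres` to a nested-CV
# helper instead, and X_train / y_train here are assumed names, not from
# the original snippet.
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipeline, parameteres, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)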
# Apply the scaling only to the continuous columns, then re-attach the
# categorical columns
X_train1 = numpy.concatenate((X_train_temp, X_train.iloc[:, size:]), axis=1)
X_test1 = numpy.concatenate((X_test_temp, X_test.iloc[:, size:]), axis=1)
scaled_features_test_df = pd.DataFrame(data=X_test1, index=X_test.index,
                                       columns=X_test.columns)
scaled_features_train_df = pd.DataFrame(data=X_train1, index=X_train.index,
                                        columns=X_train.columns)

# --------------
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif

# Write your solution here:
skb = SelectPercentile(score_func=f_classif, percentile=90)
predictors = skb.fit_transform(X_train1, Y_train)
scores = skb.scores_
Features = scaled_features_train_df.columns
dataframe = pd.DataFrame({'Features': Features, 'scores': scores})
dataframe = dataframe.sort_values(by='scores', ascending=False)
top_k_predictors = list(dataframe['Features'][:predictors.shape[1]])
print(top_k_predictors)

# --------------
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score

clf = LogisticRegression()
clf1 = OneVsRestClassifier(clf)
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler, PolynomialFeatures
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.6306355316962473
exported_pipeline = make_pipeline(
    make_union(FunctionTransformer(copy), FunctionTransformer(copy)),
    SelectPercentile(score_func=f_regression, percentile=89),
    MaxAbsScaler(),
    StackingEstimator(estimator=LinearSVR(C=0.0001, dual=False, epsilon=0.1,
                                          loss="squared_epsilon_insensitive",
                                          tol=1e-05)),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    LinearSVR(C=1.0, dual=True, epsilon=1.0, loss="epsilon_insensitive",
              tol=1e-05))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def RegressionScoring(data, target):
    selector = SelectPercentile(f_regression, percentile=25)
    selector.fit(data, target)
    headers = data.dtypes.index
    for n, s in zip(headers, selector.scores_):
        print("F score:", s, "for feature", n)
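# Hypothetical call, assuming a small pandas DataFrame whose last column is
# the regression target; the toy data here is purely illustrative, not from
# the original.
import pandas as pd
df = pd.DataFrame({'a': [1., 2., 3., 4.],
                   'b': [0., 1., 0., 1.],
                   'y': [1.1, 2.0, 3.2, 3.9]})
RegressionScoring(df[['a', 'b']], df['y'])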
# get deterministic random numbers
rng = np.random.RandomState(42)
noise = rng.normal(size=(len(cancer.data), 50))
# add the noise features to the data; this line was commented out in the
# original even though X_w_noise is used below
X_w_noise = np.hstack([cancer.data, noise])

X_train, X_test, y_train, y_test = train_test_split(X_w_noise, cancer.target,
                                                    random_state=0,
                                                    test_size=.5)
# use SelectPercentile to keep 50% of the features
select = SelectPercentile(percentile=50)
select.fit(X_train, y_train)
# transform the training set
X_train_selected = select.transform(X_train)

print('X_train.shape: {}'.format(X_train.shape))
print('X_train_selected.shape: {}'.format(X_train_selected.shape))

mask = select.get_support()
print(mask)
# plt.matshow(mask.reshape(1, -1), cmap='gray_r')
# plt.xlabel('Sample index')
# plt.show()
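# The textbook example this follows typically continues by comparing a
# classifier with and without the selection step; a sketch of that
# comparison, assuming the variables above (max_iter is bumped so the
# solver converges on the unscaled data):
from sklearn.linear_model import LogisticRegression
X_test_selected = select.transform(X_test)
lr = LogisticRegression(max_iter=5000)
lr.fit(X_train, y_train)
print('Score with all features: {:.3f}'.format(lr.score(X_test, y_test)))
lr.fit(X_train_selected, y_train)
print('Score with selected features: {:.3f}'.format(
    lr.score(X_test_selected, y_test)))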
def feature_select(x, y):
    # percentile must be passed by keyword on recent scikit-learn versions
    ch2 = SelectPercentile(chi2, percentile=90)
    ch2.fit(x, y)
    train_x = ch2.transform(x)
    return train_x, ch2
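# Usage sketch (x_train / y_train / x_test are assumed names): returning the
# fitted selector lets the same 90th-percentile chi2 mask be applied to
# held-out data.
train_x, ch2 = feature_select(x_train, y_train)
test_x = ch2.transform(x_test)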
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import PolynomialFeatures
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=12345)

# Average CV score on the training set was: -6648.872981389077
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_regression, percentile=17),
    SelectPercentile(score_func=f_regression, percentile=2),
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.001, max_depth=9,
                                             min_child_weight=17,
                                             n_estimators=100, nthread=1,
                                             subsample=0.5)),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    ElasticNetCV(l1_ratio=1.0, tol=0.1))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.3546954457321903
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_regression, percentile=92),
    StackingEstimator(
        estimator=KNeighborsRegressor(n_neighbors=48, p=1, weights="uniform")),
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.001, max_depth=1,
                                             min_child_weight=3,
                                             n_estimators=100, n_jobs=1,
                                             objective="reg:squarederror",
                                             subsample=0.9500000000000001,
                                             verbosity=0)),
    MinMaxScaler(),
    StackingEstimator(estimator=SGDRegressor(alpha=0.01, eta0=0.01,
                                             fit_intercept=False, l1_ratio=0.0,
                                             learning_rate="constant")))
# NOTE: the original export was truncated at the SGDRegressor arguments; its
# remaining hyperparameters, any later pipeline steps, and the final
# fit/predict calls are missing from the snippet.
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.816716439759918
exported_pipeline = make_pipeline(
    make_union(SelectPercentile(score_func=f_classif, percentile=46),
               FunctionTransformer(copy)),
    PCA(iterated_power=8, svd_solver="randomized"),
    PCA(iterated_power=8, svd_solver="randomized"),
    LinearSVC(C=0.001, dual=False, loss="squared_hinge", penalty="l2",
              tol=1e-05))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# For reproducibility
np.random.seed(1000)

if __name__ == '__main__':
    # Load Boston data
    # (load_boston was deprecated in scikit-learn 1.0 and removed in 1.2)
    regr_data = load_boston()
    print('Boston data shape')
    print(regr_data.data.shape)

    # Select the best k features with the regression test
    kb_regr = SelectKBest(f_regression)
    X_b = kb_regr.fit_transform(regr_data.data, regr_data.target)
    print('K-Best-filtered Boston dataset shape')
    print(X_b.shape)
    print('K-Best scores')
    print(kb_regr.scores_)

    # Load iris data
    class_data = load_iris()
    print('Iris dataset shape')
    print(class_data.data.shape)

    # Select the top percentile of features using the chi^2 classification test
    perc_class = SelectPercentile(chi2, percentile=15)
    X_p = perc_class.fit_transform(class_data.data, class_data.target)
    print('Chi2-filtered Iris dataset shape')
    print(X_p.shape)
    print('Chi2 scores')
    print(perc_class.scores_)
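# Both selectors share the same transformer API; a small sketch, assuming
# perc_class and X_p from above, of mapping a reduced matrix back to the
# original feature space (dropped columns come back as zeros):
X_back = perc_class.inverse_transform(X_p)
print(X_back.shape)  # (150, 4) for iris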
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, VarianceThreshold, f_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler, PolynomialFeatures
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.60771084700637
exported_pipeline = make_pipeline(
    make_union(
        make_union(
            FunctionTransformer(copy),
            FunctionTransformer(copy)
        ),
        FunctionTransformer(copy)
    ),
    SelectPercentile(score_func=f_regression, percentile=89),
    VarianceThreshold(threshold=0.1),
    MaxAbsScaler(),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    MaxAbsScaler(),
    MaxAbsScaler(),
    LinearSVR(C=0.5, dual=True, epsilon=1.0, loss="epsilon_insensitive",
              tol=1e-05)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
print('Time elapsed = ', (fim - inicio))

# ## Feature selection

# In[22]:

from sklearn.feature_selection import SelectPercentile, f_regression

for i in range(2, 12, 2):
    print(str(i * 10) + '%')
    print('-------------')
    x_new = SelectPercentile(f_regression, percentile=i * 10).fit_transform(x, y)
    x_train, x_test, y_train, y_test = train_test_split(x_new, y,
                                                        test_size=0.3,
                                                        random_state=42)

    from lightgbm import LGBMRegressor
    lgbm = LGBMRegressor(random_state=42)
    lgbm_model = lgbm.fit(x_train, y_train)
    lgbm_pred = lgbm_model.predict(x_test)
    lgbm_pred_train = lgbm_model.predict(x_train)

    # Checking model performance
    print('train')
    print('Model score on the training set: ',
          lgbm_model.score(x_train, y_train))
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
from sklearn.svm import LinearSVR
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.7519859512210725
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        make_union(
            make_union(
                make_union(
                    FunctionTransformer(copy),
                    make_union(
                        FunctionTransformer(copy),
                        make_union(
                            make_union(FunctionTransformer(copy),
                                       FunctionTransformer(copy)),
                            FunctionTransformer(copy)))),
                FunctionTransformer(copy)),
            make_union(FunctionTransformer(copy), FunctionTransformer(copy)))),
    MinMaxScaler(),
    SelectPercentile(score_func=f_regression, percentile=87),
    LinearSVR(C=1.0, dual=True, epsilon=1.0, loss="epsilon_insensitive",
              tol=0.001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
X_test1 = numpy.concatenate((X_test_temp, X_test.iloc[:, 10:col - 1]), axis=1)
scaled_features_train_df = pd.DataFrame(X_train1, index=X_train.index,
                                        columns=X_train.columns)
scaled_features_test_df = pd.DataFrame(X_test1, index=X_test.index,
                                       columns=X_test.columns)

# --------------
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif
import numpy as np

# Write your solution here:
sbk = SelectPercentile(score_func=f_classif, percentile=20)
predictors = sbk.fit_transform(X_train1, Y_train)
scores = sbk.scores_
Features = X_train.columns
data = {"Features": Features, "scores": scores}
dataframe = pd.DataFrame(data)
dataframe = dataframe.sort_values(ascending=False, by='scores')
ranges = np.percentile(dataframe['scores'], 80)
top_k_predictors = list(dataframe[dataframe['scores'] >= ranges]['Features'])
print(top_k_predictors)

# --------------
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
df = pd.DataFrame()

# Designate distributions to sample hyperparameters from
n_features_to_test = [0.85, 0.9, 0.95]
k = np.arange(1, 11)

# KNeighborsClassifier
steps = [('scaler', MinMaxScaler()),
         ('red_dim', PCA()),
         ('clf', KNeighborsClassifier())]
pipeline = Pipeline(steps)

parameteres = [{'scaler': scalers_to_test,
                'red_dim': [PCA(random_state=42)],
                'red_dim__n_components': n_features_to_test,
                'clf__n_neighbors': k,
                'clf__weights': ['uniform', 'distance'],
                'clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']},
               {'scaler': scalers_to_test,
                'red_dim': [SelectPercentile(f_classif, percentile=10)],
                'clf__n_neighbors': k,
                'clf__weights': ['uniform', 'distance'],
                'clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']},
               {'scaler': scalers_to_test,
                'red_dim': [SelectPercentile(mutual_info_classif, percentile=10)],
                'clf__n_neighbors': k,
                'clf__weights': ['uniform', 'distance'],
                'clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']},
               {'scaler': scalers_to_test,
                'red_dim': [None],
                'clf__n_neighbors': k,
                'clf__weights': ['uniform', 'distance'],
                'clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}]

results = nested_cv_3_classes.function_nested_cv_3_classes(data, labels,
                                                           pipeline,
                                                           parameteres)
# create folder and save
save_output.function_save_output(results, name_clf)