'C': [1, 10, 50, 600] }, { 'kernel': ['poly'], 'degree': [2, 3] }, { 'kernel': ['rbf'], 'gamma': [0.01, 0.001], 'C': [1, 10, 50, 600] }] metrics = ['precision', 'recall_weighted'] for metric in metrics: print "\n### Searching optimal hyperparameters for ", metric classifier = grid_search.GridSearchCV(svm.SVC(C=1), parameter_grid, cv=5, scoring=metric) classifier.fit(X_train, y_train) print '----- measure scores-------' print "\nScores across the parameter grid:" for params, avg_score, _ in classifier.grid_scores_: print params, '-->', round(avg_score, 3) print "\n Higtest scoring parameter set: ", classifier.best_params_ y_pred = classifier.predict(X_test) print "\nFull performance report:\n" print classification_report(y_test, y_pred)
# Fill the CSP filter matrix from the projection matrix W: the first
# `num_pair` rows, then the matching `num_pair` trailing rows.
csp[0:num_pair, :] = W[0:num_pair, :]  # first rows of the projection matrix
# NOTE(review): the row offset is computed from np.shape(W)[1] (the column
# count); that only equals the row count when W is square -- confirm.
csp[num_pair:, :] = W[np.shape(W)[1] - num_pair:, :]  # matching last rows
feat_train = feat_Generator(eegwin_0_train, eegwin_1_train)

# In[Train the classifier on the training-set features]
parameter_grid = [
    {'kernel': ['linear'], 'C': [10 ** x for x in range(-1, 4)]},
    {'kernel': ['poly'], 'degree': [2, 3]},
    {'kernel': ['rbf'], 'gamma': [0.01, 0.001], 'C': [10 ** x for x in range(-1, 4)]},
]

# The last column of the feature matrix is used as the label, the rest as
# the feature vector.
feat_train_X = feat_train[:, :-1]
feat_train_y = feat_train[:, -1]

# The search below is scored on accuracy, so the banner names accuracy
# (it previously announced "precision").
print("\n#### Searching optimal hyperparameters for accuracy")
classifier = grid_search.GridSearchCV(svm.SVC(), parameter_grid, cv=5, scoring="accuracy")
classifier.fit(feat_train_X, feat_train_y)

print("\nScores across the parameter grid:")
for params, avg_score, _ in classifier.grid_scores_:
    print(params, '-->', round(avg_score, 4))

print("\nHighest scoring parameter set:", classifier.best_params_)
print("\nHighest performance in training set:", classifier.best_score_)
# Running sum of the best cross-validation score; the accumulator is
# initialised outside the visible code.
train_avgacc = train_avgacc + classifier.best_score_

# In[Evaluate the classifier on the test set]
eegwin_0_test, eegwin_1_test = task_Generator(X_test, y_test)
feat_test = feat_Generator(eegwin_0_test, eegwin_1_test)
feat_test_X = feat_test[:, :-1]
feat_test_y = feat_test[:, -1]
m = X.shape[0] rand_index = np.random.permutation(m) X_train = X[rand_index[:0.9 * m], :] X_test = X[rand_index[0.9 * m:], :] y_train = y[rand_index[:0.9 * m]] y_test = y[rand_index[0.9 * m:]] clf = SVC_1(C=1000, kernel='rbf') cv = cross_validation.KFold(X_train.shape[0], n_folds=6) C_vec = np.logspace(-1, 1, 10) param_grid = dict() param_grid['C'] = C_vec gs = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid, cv=cv) gs.fit(X_train, np.ravel(y_train)) print gs.best_params_ print gs.best_estimator_.C print gs.best_score_ scores = np.zeros(C_vec.shape) for i in range(len(C_vec)): for train_indices, test_indices in cv: print train_indices print test_indices print clf.set_params(C=C_vec[i]) scores[i] = clf.fit(X_train[train_indices, :], np.ravel(y_train[train_indices])).score( X_train[test_indices, :],
# Split the prepared frames into features and target.
X_train = titanic_df.drop("Survived", axis=1)
Y_train = titanic_df["Survived"]
X_test = test_df.drop("PassengerId", axis=1).copy()
print(X_train)

# Support Vector Machines
from sklearn import grid_search

# NOTE(review): param_range is not used anywhere in the visible code; it is
# kept in case a later section reads it.
param_range = [0.0001, 0.0005, 0.001, 0.01, 0.1, 1.0]
parameters = {
    'C': [1e4, 1e5, 1e6],
    'gamma': [0.00001, 0.0001, 0.0005, 0.001]
}
clf = SVC()
model = grid_search.GridSearchCV(estimator=clf, param_grid=parameters, cv=5, scoring='accuracy')
model = model.fit(X_train, Y_train)
print(model.best_score_)
print(model.best_params_)

# GridSearchCV (refit=True by default) has already retrained an SVC with the
# best C/gamma on the whole training set, so reuse it instead of fitting an
# identical model a second time.
svc = model.best_estimator_
Y_pred = svc.predict(X_test)
print(Y_pred)
# Accuracy on the training data itself (not a held-out estimate).
print(svc.score(X_train, Y_train))

# Random Forests
#Lets try manually selecting payment features features_manual = ["poi", "salary", "bonus", 'deferral_payments', 'total_payments', 'loan_advances', 'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 'expenses', 'exercised_stock_options', 'long_term_incentive', 'restricted_stock', 'director_fees'] features_train, features_test, labels_train, labels_test = train_test_data(features_manual) clf4 = DecisionTreeClassifier() print '*** Decesiontree Algorithm with only payments features ***' print test_algorithm(clf4,features_train,features_test), '\n' ## Parameter tuning # Lets repeat 50%classifier with different classifier parameters to see # if we can achieve better result with any other parameter in algorithm features_list = ["poi", "salary", "bonus", "fraction_from_poi_email", "fraction_to_poi_email", 'total_payments', 'total_stock_value', 'expenses', 'exercised_stock_options', 'shared_receipt_with_poi', 'restricted_stock'] features_train, features_test, labels_train, labels_test = train_test_data(features_list) parameters = {'criterion':('gini', 'entropy')} dtc = DecisionTreeClassifier() clf5 = grid_search.GridSearchCV(dtc, parameters) print 'Run Decesion Tree classifier with GridSearchCV' print test_algorithm(clf5,features_train,features_test), '\n' ### Task 6: Dump your classifier, dataset, and features_list so anyone can ### check your results. You do not need to change anything below, but make sure ### that the version of poi_id.py that you submit can be run on its own and ### generates the necessary .pkl files for validating your results. dump_classifier_and_data(clf0, my_dataset, features_li)
# In[] #eeg_data = sio.loadmat('CutedEEG.mat')['CutedEEG'] #gait_data = gait_mat_data['FilteredMotion'][0] # 每个元素是受试者走的一次trail;每个trail记录双膝角度轨迹,依次是右膝和左膝 feats_all = sio.loadmat('features.mat')['features'] parameter_grid = [ {'kernel': ['linear'], 'C': [10 ** x for x in range(-1, 4)]}, {'kernel': ['poly'], 'degree': [2, 3]}, {'kernel': ['rbf'], 'gamma': [0.01, 0.001], 'C': [10 ** x for x in range(-1, 4)]}, ] X = feats_all[:,:-1] y = feats_all[:,-1] print("\n#### Searching optimal hyperparameters for precision") classifier = grid_search.GridSearchCV(svm.SVC(), parameter_grid, cv=5, scoring='precision_weighted') classifier.fit(X, y) # 直接用实时收集到的数据进行训练,不把数据分出测试集了,直接用在线数据进行测试 print("\nScores across the parameter grid:") for params, avg_score, _ in classifier.grid_scores_: print(params, '-->', round(avg_score, 3)) print("\nHighest scoring parameter set:", classifier.best_params_) #joblib.dump(classifier, time.strftime('%Y_%m_%d_%H_%M_%S',time.localtime(time.time()))+"_SVM.m") # 按当前时间命名保存训练好的分类器 joblib.dump(classifier, "SVM.m") # 保存训练好的分类器 # In[] #max_accuracy = 0 #count = 10.0 # 随机计算准确率的次数 #num_feats = len(feats_all) #ave_accuracy, ave_f1, ave_precision, ave_recall = [],[],[],[]
# Define the parameter grid parameter_grid = [{ 'n_estimators': [100], 'max_depth': [2, 4, 7, 12, 16] }, { 'max_depth': [4], 'n_estimators': [25, 50, 100, 250] }] metrics = ['precision_weighted', 'recall_weighted'] for metric in metrics: print("\n##### Searching optimal parameters for", metric) classifier = grid_search.GridSearchCV(ExtraTreesClassifier(random_state=0), parameter_grid, cv=5, scoring=metric) classifier.fit(X_train, y_train) print("\nGrid scores for the parameter grid:") for params, avg_score, _ in classifier.grid_scores_: print(params, '-->', round(avg_score, 3)) print("\nBest parameters:", classifier.best_params_) y_pred = classifier.predict(X_test) print("\nPerformance report:\n") print(classification_report(y_test, y_pred))
y_train, cv=5, scoring='roc_auc') print('score_cross:', round(np.mean(scores_cross), 5), 'std:', round(np.std(scores_cross), 5)) # grid search on max_depth and min_child_weight param_test1 = {'max_depth': [3, 5, 7, 9], 'min_child_weight': [1, 3, 5]} gsearch1 = grid_search.GridSearchCV(estimator=XGBClassifier( learning_rate=0.1, n_estimators=424, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=314), param_grid=param_test1, scoring='roc_auc', iid=False, cv=5) gsearch1.fit(X_train, y_train) gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_ param_test2 = {'max_depth': [6, 7, 8], 'min_child_weight': [4, 5, 6]} gsearch2 = grid_search.GridSearchCV(estimator=XGBClassifier( learning_rate=0.1, n_estimators=424, max_depth=7,
#features = df_results.drop('result', axis=1).fillna(value=np.finfo(np.float32).min + 1).values #features = df_results.drop('result', axis=1).fillna(method='ffill').fillna(method='bfill').values features = df_results.drop('result', axis=1).fillna(value=0).values scaled_features = scale(features, axis=0) target = df_results['result'].values kf = KFold(scaled_features.shape[0], n_folds=5, shuffle=True, random_state=42) Cs = [10**x for x in range(-5, 6)] lr = LogisticRegression() clf = grid_search.GridSearchCV(estimator=lr, param_grid=dict(C=Cs), n_jobs=4, cv=kf, scoring='roc_auc') clf.fit(scaled_features, target) df_lines_test = pd.read_sql('''select match_ref, l.house_ref , MAX(CASE WHEN l.is_it_starting = 1 THEN l.line_value END) start_value , MAX(CASE WHEN l.is_it_starting = 0 THEN l.line_value END) next_value --, MAX(CASE WHEN l.is_it_starting = 0 THEN l.line_increment END) line_increment from Lines l where 1=1 --and match_ref = 1754 and TS_ref = 1880 and RTV_Ref = 1 GROUP BY match_ref, l.house_ref order by match_ref, l.house_ref''', conn)
], transformer_weights={ 'cst': 1.0, 'txt1': 0.5, 'txt2': 0.25, 'txt3': 0.0, 'txt4': 0.5 }, #n_jobs = -1 )), ('rfr', rfr) ]) param_grid = {'rfr__max_features': [10], 'rfr__max_depth': [20]} model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=1, cv=2, verbose=20, scoring=RMSE) t0 = time() print "Begin training" model.fit(X_train, y_train) t1 = time() print "Training complete: ", t1 - t0, "s" print("Best parameters found by grid search:") print(model.best_params_) print("Best CV score:") print(model.best_score_) print(model.best_score_ + 0.47003199274)
#print(score2) del pred del y_test_array del score #pred_test=clf.predict_proba(test.drop(['id'], axis=1)) #return pred_test #grid search from sklearn import grid_search paramRF = {'n_estimators':[100], 'criterion':('gini', 'entropy'), 'max_depth':[3,4,5,10,15,20]} paramET = {'n_estimators':[100], 'criterion':('gini', 'entropy'), 'max_depth':[3,4,5,10,15,20]} paramXG = {'n_estimators':[100], 'learning_rate':[0.1], 'reg_alpha':[0],'colsample_bytree':[0.1],'colsample_bylevel':[0.1],'max_depth':[5]} param_DT = {'criterion':('gini', 'entropy'), 'max_depth':[3,4,5,10,15,20], 'max_features':[None,'auto','sqrt']} # 'reg_alpha':[0.2,0.3,0.5,0.7], 'reg_lambda':[0,1,5,10] clfRF = grid_search.GridSearchCV(RandomForestClassifier() , paramRF, cv=2, scoring='log_loss') clfRF.fit( x_train , y_train ) clfRF.best_estimator_ log_loss(y_test, clfRF.predict_proba(x_test)) clfET = grid_search.GridSearchCV(ExtraTreesClassifier() , paramET, cv=2) clfET.fit( x_train , y_train ) clfET.best_estimator_ log_loss(y_test, clfET.predict_proba(x_test)) clfXG = grid_search.GridSearchCV(xgb.XGBClassifier() , paramXG, cv=2, scoring='log_loss') clfXG.fit( x_train , y_train ) clfXG.best_estimator_ log_loss(y_test, clfXG.predict_proba(x_test)) clfDT = grid_search.GridSearchCV(DecisionTreeClassifier() , param_DT, cv=2)
### Extract features and labels from dataset for local testing data = featureFormat(my_dataset, features_list, sort_keys = True) labels, features = targetFeatureSplit(data) from sklearn.cross_validation import train_test_split features_train, features_test, labels_train, labels_test = \ train_test_split(features, labels, test_size=0.3, random_state=42) # from sklearn.pipeline import Pipeline from sklearn import svm, grid_search svr = svm.SVC() # change default gamma to 1/n_features parameters = {'kernel':('linear', 'rbf', 'poly'), 'C':[1, 10, 100], 'gamma':[0.0625, 1, 10], 'degree':[4, 5, 6, 7, 8]} clf_GridSearch = grid_search.GridSearchCV(svr, parameters, scoring='f1') clf_GridSearch.fit(features_train, labels_train) clf_GridSearch.best_estimator_ clf = svm.SVC(C=100, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape=None, degree=4, gamma=0.0625, kernel='linear', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False) ### Task 6: Dump your classifier, dataset, and features_list so anyone can ### check your results. You do not need to change anything below, but make sure ### that the version of poi_id.py that you submit can be run on its own and ### generates the necessary .pkl files for validating your results.
clf.fit(X_train, y_train) preds = clf.predict(X_test) print("Accuracy score on test data on first run") print(accuracy_score(y_test, preds)) print("F-score") print(fbeta_score(y_test, preds, beta=0.5)) # In[250]: # trying optimization with grid_search from sklearn import tree, grid_search from sklearn.metrics import fbeta_score, make_scorer, accuracy_score scorer = make_scorer(fbeta_score, beta=0.5) parameters = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']} grid_obj = grid_search.GridSearchCV(clf, parameters, verbose=1, scoring=scorer) #grid_fit = grid_obj.fit(X_train, y_train) #best_clf = grid_fit.best_estimator_ #best_predictions = best_clf.predict(X_test) #print("Best Clf's Accuracy score on test data on first run") #print(accuracy_score(y_test, best_predictions)) #print("Best Clf's F-score") #print(fbeta_score(y_test, best_predictions, beta=0.5)) # In[251]: Y_pred = clf.predict(submission_samples) submission = pd.DataFrame({ "PassengerId": test_df["PassengerId"], "Survived": Y_pred
# Two sub-grids: a linear kernel over C, and an RBF kernel over C x gamma.
param_grid = [
    {
        'C': [0.01, 0.1, 1, 10, 100, 1000],
        'kernel': ['linear']
    },
    {
        'C': [0.01, 0.1, 1, 10, 100, 1000],
        'gamma': [0.001, 0.0001],
        'kernel': ['rbf']
    },
]

# Switch: True runs the (slow) grid search and pickles the best estimator;
# False performs a single shuffled 80/20 split instead.
GRIDSEARCH = False

if GRIDSEARCH:
    clf = grid_search.GridSearchCV(svm.SVC(), param_grid, verbose=10)
    clf.fit(X, y)
    # Persist only the refit best estimator, not the whole search object.
    with open('best_estimator', 'wb') as f:
        cPickle.dump(clf.best_estimator_, f)
else:
    cv = cross_validation.ShuffleSplit(len(y), n_iter=1, test_size=0.2)
    # n_iter=1, so this loop body runs exactly once.
    for train, test in cv:
        train_X = X[train]
        train_y = y[train]
        test_X = X[test]
        test_y = y[test]
        test_image = images[test]
train = train.astype(float)
test = test.astype(float)

#0.614773724081
#tune parameters
#'max_features': 'sqrt', 'min_samples_split': 5, 'learning_rate': 0.2, 'n_estimators': 100, 'max_depth': 6}
gbm = ensemble.GradientBoostingClassifier(random_state=42)
params = [{
    'n_estimators': [75, 100, 125],
    'min_samples_split': [5, 10],
    'max_depth': [6, 8],
    'max_features': ['sqrt'],
    'learning_rate': [0.2]
}]
clf = grid_search.GridSearchCV(gbm, params, verbose=1, n_jobs=-1)

# cross validation
# The scorer is F1, so the output is labelled F1 (it used to say "RMSLE").
# Because `clf` is itself a grid search, every outer fold runs a full inner
# search (nested cross-validation).
print("k-Fold F1:")
# The variable keeps its historical name in case later code reads it; it
# holds the per-fold F1 scores.
cv_rmsle = cross_validation.cross_val_score(clf, train, y, scoring='f1')
print(cv_rmsle)
print("Mean: " + str(cv_rmsle.mean()))

# get predictions on test
clf.fit(train, y)

# get predictions from the model, convert them and dump them!
preds = clf.predict(test)
# NOTE(review): `id` is presumably a column of identifiers defined earlier in
# the file (it shadows the builtin) -- confirm.
preds = pd.DataFrame({"'Search ID'": id, "cost": preds})
import numpy as np
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.cross_validation import KFold
from sklearn import grid_search
from sklearn.feature_extraction.text import TfidfVectorizer

# Two-category subset of the 20 newsgroups corpus.
newsgroups = datasets.fetch_20newsgroups(
    subset='all', categories=['alt.atheism', 'sci.space'])

# TF-IDF features computed over the whole subset.
vectorizer = TfidfVectorizer()
data = vectorizer.fit_transform(newsgroups.data)
features = data
true = newsgroups.target

# Candidate C values: powers of ten from 1e-5 to 1e5.
grid = {'C': np.power(10.0, np.arange(-5, 6))}
cv = KFold(true.size, n_folds=5, shuffle=True, random_state=241)
clf = SVC(kernel='linear', random_state=241)
gs = grid_search.GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
gs.fit(features, true)

# Despite the names: `C` holds the best cross-validation accuracy and `est`
# holds the best value of the C hyperparameter.
C = gs.best_score_
est = gs.best_estimator_.C
print(C)
print(est)

# Train a fresh linear SVM on all the data with the selected C.
model = SVC(C=est, kernel='linear', random_state=241)
model.fit(features, true)

# Indices of the ten coefficients with the largest absolute value
# (argsort is ascending, so the last ten are the largest).
coef0 = model.coef_.toarray()[0]
values = abs(coef0)
top10 = np.argsort(values)[-10:]
#coefabs = abs(model.coef_.data)
#print(coefabs)
#coefabssort = np.argsort(coefabs)[-10:]
feature_mapping = vectorizer.get_feature_names()
wr = []
for i, val in _data.iteritems(): label = val['label'] data = val['data'] if label not in _label_data: _label_data[label] = [] _label_data[label].append(data) NUM_EXAMPLES = len(_data) print NUM_EXAMPLES, len(_label_data) print 'done' # ============= Perform SVM classification ============= parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]} svr = svm.SVC() clf = grid_search.GridSearchCV(svr, parameters) X = [] Y = [] for k, v in _data.iteritems(): X.append(np.array(v['data']).flatten()) Y.append(v['label']) print 'fitting...' clf.fit(X, Y) # ============= Make predictions ============= test_vectors = [] with open('competition_2/test.data', 'r') as test_data: for i, line in enumerate(test_data):
fore_rf.fit(X_train, y_train) if fold > 0: print('cv: ' + str(np.mean(cross_val_score(fore_rf, X_train, y_train, cv=fold)))) else: print('no cv scores') """ param_grid = { 'n_estimators': [10**1, 10**4], 'max_depth': [2, 4, 10**5], 'min_samples_leaf': [1, 100, 1000] } clf = RandomForestClassifier() grid_clf = grid_search.GridSearchCV(clf, param_grid, cv=fold, verbose=5) grid_clf.fit(X, y) print('best params:' + str(grid_clf.best_params_)) print('best params:' + str(grid_clf.best_score_)) # Part 3 - Making the predictions and evaluating the model # Predicting the Test set results totalpred = grid_clf.predict(X) dataset3['expected'] = totalpred #del X, y,totalpred, X_train, X_test, y_train, y_test from sklearn.metrics import confusion_matrix dataset3['motion_expected'] = dataset3[
# NOTE(review): `file`, `parametros` and `total_score` are defined outside the
# visible code; this excerpt presumably runs once per input file -- confirm.
data = scipy.io.loadmat(file)
# Flatten the 'Ytrain' matrix into a 1-D label vector whose length is the
# matrix's second dimension.
ytrain = data['Ytrain'].T.reshape(data['Ytrain'].shape[1])
# Hold out 20% of the samples for validation (fixed seed).
x_train, x_val, y_train, y_val = cross_validation.train_test_split(
    data['Xtrain'], ytrain, test_size=0.2, random_state=0)

tuned_parameters = [
    #{'alpha': [0.15]}
    {
        'alpha': [0.2]
    }
]

# The banner now matches cv=10 below (it used to announce 5 folds).
# Prints use the single-argument parenthesised form, which produces the same
# output under Python 2 and is also valid Python 3.
print("-- TRAINNING: grid search with 10 fold cross-validation")
clf = grid_search.GridSearchCV(BernoulliNB(),
                               tuned_parameters,
                               cv=10,
                               scoring='accuracy')
clf.fit(x_train, y_train)

print("score : " + str(clf.best_score_))
print("params : " + str(clf.best_params_))
parametros.append(clf.best_params_)

# One line per parameter combination: mean score, per-fold scores, params.
for params, mean_score, scores in clf.grid_scores_:
    print(str(mean_score) + " " + str(scores) + " " + str(params))

# Validation accuracy of the refit best model, added to the running total.
y_true, y_pred = y_val, clf.predict(x_val)
score = accuracy_score(y_true, y_pred)
total_score += score

cm = confusion_matrix(y_true, y_pred)
# Row sums of the confusion matrix.
total = numpy.sum(cm, axis=1)
#!/usr/bin/python3
import json

import numpy as np
from sklearn import cross_validation, svm, grid_search

# Load the search configuration; the context manager closes the file handle
# that the previous open(...).read() left dangling.
with open('settings/grid-search_.json', 'r') as settings_file:
    settings = json.load(settings_file)

data = np.load('bin/train_data.npy')
labels = np.load('bin/train_labels.npy')
# Train on the samples labelled 1 only.
data = data[labels == 1]

# Optional sub-sampling: keep only the "train" part of a random split.
# 'test_size' is popped so it is not forwarded to GridSearchCV below.
if 'test_size' in settings:
    data, _ = cross_validation.train_test_split(
        data, test_size=settings.pop('test_size'))

print('Training sample shape: {}'.format(data.shape))

# 'params' is the parameter grid; every remaining settings key is passed to
# GridSearchCV as a keyword argument.
kernel_params = settings.pop('params')

estimator = svm.OneClassSVM()
clasificator = grid_search.GridSearchCV(estimator, kernel_params, **settings)
model = clasificator.fit(data)

print('Best params: {}'.format(str(model.best_params_)))
def train(training_path_a,
          training_path_b,
          training_path_c,
          training_path_d,
          training_path_e,
          print_metrics=True):
    '''Trains a five-class SVM classifier.

    Each training path is processed by process_directory(). The samples of
    the five directories are labelled 4, 3, 2, 1 and 0 respectively (class A
    gets 4, class E gets 0). 20% of the samples are held out as a test set;
    the rest are used for a grid search over SVC hyperparameters.

    Side effect: the fitted GridSearchCV object is written to
    'sport_classification.pkl' in the current working directory.

    Args:
      training_path_a (str): directory containing samples of class A.
      training_path_b (str): directory containing samples of class B.
      training_path_c (str): directory containing samples of class C.
      training_path_d (str): directory containing samples of class D.
      training_path_e (str): directory containing samples of class E.
      print_metrics  (boolean, optional): if True, print the best parameters
        and a classification report computed on the held-out test set.

    Returns:
      The best classifier found by the search (sklearn.svm.SVC).

    Raises:
      IOError: if any of the paths is not a directory. Paths are checked in
        argument order, before any directory is processed.
    '''
    training_paths = (training_path_a, training_path_b, training_path_c,
                      training_path_d, training_path_e)

    # Validate every path up front so a bad argument fails before any
    # directory processing starts.
    for path in training_paths:
        if not os.path.isdir(path):
            raise IOError('%s is not a directory' % path)

    training_a, training_b, training_c, training_d, training_e = [
        process_directory(path) for path in training_paths
    ]

    # data contains all the training data (a list of feature vectors)
    data = training_a + training_b + training_c + training_d + training_e
    # target is the list of target classes for each feature vector:
    # 4 for class A down to 0 for class E
    target = ([4] * len(training_a) + [3] * len(training_b) +
              [2] * len(training_c) + [1] * len(training_d) +
              [0] * len(training_e))

    # split training data in a train set and a test set. The test set will
    # contain 20% of the total
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        data, target, test_size=0.20)

    # define the parameter search space
    parameters = {
        'kernel': ['linear', 'rbf'],
        'C': [1, 10, 100, 1000],
        'gamma': [0.01, 0.001, 0.0001]
    }
    # search for the best classifier within the search space and return it
    clf = grid_search.GridSearchCV(svm.SVC(probability=True),
                                   parameters).fit(x_train, y_train)

    ### save to model local dir ###
    joblib.dump(clf, 'sport_classification.pkl')

    classifier = clf.best_estimator_
    if print_metrics:
        print()
        print('Parameters:', clf.best_params_)
        print()
        print('Best classifier score')
        print(metrics.classification_report(y_test,
                                            classifier.predict(x_test)))
    return classifier
# Every column except the identifier and the target is used as a feature.
featurelist = list(dataclean.columns.values)
featurelist.remove('Id')
featurelist.remove('Response')

# Hold out 10% of the rows for testing (fixed seed).
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    dataclean[featurelist],
    dataclean['Response'],
    test_size=0.1,
    random_state=42)

param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_split": [2, 4],
    "max_depth": [None, 2, 4],
    "min_samples_leaf": [1, 3, 5],
    "class_weight": ["balanced", "balanced_subsample"]
}

# 5-fold search scored on F1.
# NOTE(review): the 'f1' scorer is presumably meant for a binary target --
# confirm that 'Response' has exactly two classes.
modeloptimal = grid_search.GridSearchCV(estimator=RandomForestClassifier(),
                                        param_grid=param_grid,
                                        scoring='f1',
                                        cv=5)
modeloptimal.fit(features_train, labels_train)

# Evaluate the refit best model on the held-out rows; note that the reported
# number is accuracy while the search optimised F1.
clf = modeloptimal.best_estimator_
pred = clf.predict(features_test)
accuracy = accuracy_score(labels_test, pred)
print accuracy
corpusts = testdf['ingredients_string']
# NOTE(review): vectorizerts is never fitted or used in the visible code; the
# test corpus is transformed with the vectorizer fitted on the training
# corpus (vectorizertr). Kept only in case later code refers to it.
vectorizerts = TfidfVectorizer(stop_words='english')
tfidfts = vectorizertr.transform(corpusts)

predictors_tr = tfidftr
targets_tr = traindf['cuisine']
predictors_ts = tfidfts

# LR, SVC
# The LinearSVC instances that used to be assigned to `classifier` and `clf`
# here were overwritten before ever being used, so they have been removed:
# the model actually trained is a grid search over C for logistic regression.
parameters = {'C': [1, 10]}
clf = LogisticRegression()
classifier = grid_search.GridSearchCV(clf, parameters)
classifier = classifier.fit(predictors_tr, targets_tr)

#decision trees
#clf = tree.DecisionTreeClassifier()
#parameters = {'max_depth':[100]}
#classifier=clf.fit(predictors_tr,targets_tr)

predictions_train = classifier.predict(predictors_tr)
predictions = classifier.predict(predictors_ts)

# Write each prediction back as its string form, element by element.
for i in range(0, predictions.size):
    predictions[i] = str(predictions[i])
for i in range(0, predictions_train.size):
    predictions_train[i] = str(predictions_train[i])
def serialize_stem_silk(self):
    """Sweep Silk's confidence threshold, then fit and save an RBF SVM.

    Reads ``self.file_name`` (Silk XML configuration),
    ``self.gold_standard_name`` (path containing a ``/gs/`` component),
    ``self.N`` (number of thresholds) and ``self.a`` (width of the threshold
    interval).

    For each of the N thresholds evenly spaced in
    ``[central - a/2, central + a/2]`` (``central`` being the
    ``minConfidence`` found in the configuration) a temporary ``silk.xml`` is
    written next to the configuration, Silk is run through ``java`` and its
    output file is appended to ``ensemble_silk_output_raw_n<N>.txt``,
    followed by an ``End of run`` marker. A training CSV is then produced via
    ``stacking_create_training_set``, an RBF-kernel SVC is grid-searched over
    ``gamma`` and ``C`` with 4-fold cross-validation, and the best estimator
    is dumped to ``svm_model_silk_N<N>_a<a>.pkl`` in the working directory.

    Raises:
        ValueError: if the configuration has no ``Output`` element or no
            ``Param`` named ``file`` (previously an unbound-variable error).
    """
    # Function-scope import so the block does not depend on a module-level
    # import of subprocess.
    import subprocess

    start_time = time.time()
    file_name = self.file_name

    # define the variables
    gold_standard_name = self.gold_standard_name
    N = int(self.N)
    a = float(self.a)

    # data/your_experiment/gs/gs.csv -> data/your_experiment/
    path_to_file = gold_standard_name.split('/gs/')[0] + '/'
    # config/your_experiment/config.xml -> config/your_experiment/
    # (every path component except the file name, each followed by '/')
    path_to_config = ''.join(
        part + '/' for part in file_name.split('/')[:-1])

    raw_output_path = path_to_file + 'ensemble_silk_output_raw_n%d.txt' % N
    training_set_path = path_to_file + 'training_set_silk_n%d.csv' % N

    # The gold standard used to be opened here and never read or closed.
    # Opening and closing it keeps the early failure on a missing file
    # without leaking the handle.
    open(gold_standard_name, 'rU').close()

    # read actual threshold
    tree = ET.parse(file_name)
    root = tree.getroot()
    central_thresh = None
    for thresh in root.iter('Output'):
        # central value of the threshold (the last Output element wins)
        central_thresh = float(thresh.attrib['minConfidence'])
    if central_thresh is None:
        raise ValueError('no Output element found in %s' % file_name)

    # parsing the silk xml config file to find the name of the output file
    output_file_name = None
    for k in root.iter('Output'):
        for b in k.iter('Param'):
            if b.attrib['name'] == 'file':
                output_file_name = b.attrib['value']
    if output_file_name is None:
        raise ValueError('no output file Param found in %s' % file_name)

    # list of thresholds
    thresholds = np.linspace(central_thresh - a / 2, central_thresh + a / 2, N)

    # config/your_experiment/silk.xml
    path_to_config_and_name = path_to_config + 'silk.xml'
    # config/your_experiment/links.nt
    silk_output_name = path_to_config + output_file_name

    # iterate for each tweaked configuration; the `with` blocks guarantee the
    # files are closed even if a run fails.
    with open(raw_output_path, 'w') as output_file_raw:
        for threshold in thresholds:
            for thresh in root.iter('Output'):
                thresh.attrib['minConfidence'] = str(threshold)
                print(thresh.attrib['minConfidence'])

            # write the modified xml to file
            tree.write(path_to_config_and_name)

            # Argument list instead of a shell string: the path is passed
            # verbatim, with no shell quoting issues. As before, the exit
            # status is not checked.
            subprocess.call([
                "java", "-Xmx5000m",
                "-DconfigFile=%s" % path_to_config_and_name,
                "-Dthreads=4", "-jar", "../lib/Silk/silk.jar"
            ])

            # append this run's Silk output to the raw ensemble file
            with open(silk_output_name, 'rU') as silk_output:
                for line in silk_output:
                    output_file_raw.write(line)

            output_file_raw.write('End of run\n')
            print("End of run\n")

            # remove the new modified configuration file
            os.remove(path_to_config_and_name)

    # create the training set (presumably this writes the CSV that is read
    # right below -- the helper is defined outside this file excerpt)
    crt_training = stacking_create_training_set.stacking_create_training_set(
        raw_output_path, training_set_path, N)
    crt_training.stacking_create_training_set_silk(gold_standard_name)

    # read it and make machine learning on it
    data = pd.read_csv(training_set_path)

    X = data.values[:, 2:(N + 2)]  # x variables: N columns starting at index 2
    y = np.array(data['y'])  # class variables

    # fit an SVM with rbf kernel
    clf = SVC(kernel='rbf', cache_size=1000)
    parameters = {
        'gamma': np.logspace(-9, 3, 30),
        'C': np.logspace(-2, 10, 30)
    }

    gs_rbf = grid_search.GridSearchCV(clf, param_grid=parameters, cv=4)
    gs_rbf.fit(X, y)

    clf = gs_rbf.best_estimator_

    joblib.dump(clf, 'svm_model_silk_N%d_a%f.pkl' % (N, a))

    print("--- %s seconds ---" % (time.time() - start_time))
from starterPaulDuan import *

if __name__ == '__main__':
    #%% load data
    x_train, y_train, x_test, id_test = load_data()

    # Drop the same column from both frames, in place.
    cols_drop = ['ROLE_CODE']
    x_train.drop(cols_drop, axis=1, inplace=True)
    x_test.drop(cols_drop, axis=1, inplace=True)

    # Append the extra features returned by create_feat_ben() as additional
    # columns.
    x_trainb, x_testb = create_feat_ben(x_train, x_test)
    # NOTE(review): the training matrix is converted to a dense array while
    # the test matrix stays sparse (CSR) -- confirm this asymmetry is wanted.
    x_train = sparse.hstack((x_train, x_trainb.as_matrix())).toarray()
    x_test = sparse.hstack((x_test, x_testb.as_matrix())).tocsr()

    SEED = 0
    model_rf = ensemble.RandomForestClassifier(n_estimators=2000,
                                               max_features='sqrt',
                                               max_depth=None,
                                               min_samples_split=9,
                                               random_state=SEED,
                                               verbose=10,
                                               n_jobs=-1)

    # The grid values replace the n_estimators / max_depth /
    # min_samples_split given to the constructor above for each candidate.
    params = {
        'n_estimators': [2500, 3000, 3500],
        'max_depth': [20],
        'min_samples_split': [3]
    }
    # Recorded result (presumably the best parameters and score of an
    # earlier run):
    # {'max_depth': 20, 'min_samples_split': 3, 'n_estimators': 2500}, 0.8910

    # 7-fold search scored on ROC AUC.
    gridcv = grid_search.GridSearchCV(model_rf,
                                      params,
                                      scoring='roc_auc',
                                      cv=7)
    gridcv.fit(x_train, y_train)
def serialize_stem_duke(self):
    """Sweep Duke's match threshold, then fit and save an RBF SVM.

    Reads ``self.file_name`` (Duke XML configuration),
    ``self.gold_standard_name`` (path containing a ``/gs/`` component),
    ``self.N`` (number of thresholds) and ``self.a`` (width of the threshold
    interval).

    For each of the N thresholds evenly spaced in
    ``[central - a/2, central + a/2]`` (``central`` being the text of the
    ``threshold`` element in the configuration) a temporary ``duke.xml`` is
    written next to the configuration, Duke is run through ``java`` and its
    standard output is appended to ``ensemble_duke_output_raw_n<N>.txt``,
    followed by an ``End of run`` marker. A training CSV is then produced via
    ``stacking_create_training_set``, an RBF-kernel SVC is grid-searched over
    ``gamma`` and ``C`` with 4-fold cross-validation, and the best estimator
    is dumped to ``../models/<project>/svm_model_duke_N<N>_a<a>.pkl`` where
    ``<project>`` is the directory that holds the configuration file.

    Raises:
        ValueError: if the configuration has no ``threshold`` element
            (previously an unbound-variable error).
        subprocess.CalledProcessError: if a Duke run exits with a non-zero
            status (raised by ``subprocess.check_output``).
    """
    start_time = time.time()
    print('Starting the entity matching process')

    file_name = self.file_name

    # define the variables
    gold_standard_name = self.gold_standard_name
    N = int(self.N)
    a = float(self.a)

    # data/your_experiment/gs/gs.csv -> data/your_experiment/
    path_to_file = gold_standard_name.split('/gs/')[0] + '/'
    raw_output_path = path_to_file + 'ensemble_duke_output_raw_n%d.txt' % N
    training_set_path = path_to_file + 'training_set_n%d.csv' % N

    # the last element is the name of the file, I just want the path
    path_to_config_list = file_name.split('/')[0:-1]
    # every path component followed by '/'
    path_to_config = ''.join(part + '/' for part in path_to_config_list)

    # The gold standard used to be opened here and never read or closed.
    # Opening and closing it keeps the early failure on a missing file
    # without leaking the handle.
    open(gold_standard_name, 'rU').close()

    # read actual threshold
    tree = ET.parse(file_name)
    root = tree.getroot()
    central_thresh = None
    for thresh in root.iter('threshold'):
        # central value of the threshold (the last threshold element wins)
        central_thresh = float(thresh.text)
    if central_thresh is None:
        raise ValueError('no threshold element found in %s' % file_name)

    thresholds = np.linspace(central_thresh - a / 2, central_thresh + a / 2, N)

    path_to_config_and_name = path_to_config + 'duke.xml'
    java_command = [
        "java", "-Xmx5000m", "-cp",
        "../lib/Duke/duke-core/target/*:../lib/Duke/duke-dist/target/*:../lib/Duke/duke-es/target/*:../lib/Duke/duke-json/target/*:../lib/Duke/duke-lucene/target/*:../lib/Duke/duke-mapdb/target/*:../lib/Duke/duke-mongodb/target/*:../lib/Duke/duke-server/target/*:../lib/Duke/lucene_jar/*",
        "no.priv.garshol.duke.Duke", "--showmatches", "--batchsize=100000",
        "--threads=4",
        "%s" % path_to_config_and_name
    ]

    # iterate for each tweaked configuration; the `with` block guarantees the
    # raw output file is closed even if a Duke run fails.
    with open(raw_output_path, 'w') as output_file_raw:
        for threshold in thresholds:
            for thresh in root.iter('threshold'):
                thresh.text = str(threshold)
                thresh.set('updated', 'yes')

            # generate a new modified configuration file
            tree.write(path_to_config_and_name)

            # call duke on the modified configuration and write the raw
            # output on file
            output_file_raw.write(subprocess.check_output(java_command))
            output_file_raw.write('\n')
            output_file_raw.write('End of run\n')

            print('End of run\n')

            # remove the new modified configuration file (direct call
            # instead of shelling out to `rm`)
            os.remove(path_to_config_and_name)

    # create the training set (presumably this writes the CSV that is read
    # right below -- the helper is defined outside this file excerpt)
    crt_training = stacking_create_training_set.stacking_create_training_set(
        raw_output_path, training_set_path, N)
    crt_training.stacking_create_training_set_duke(gold_standard_name)

    # read it and make machine learning on it
    data = pd.read_csv(training_set_path)

    X = data.values[:, 2:(N + 2)]  # x variables: N columns starting at index 2
    y = np.array(data['y'])  # class variables

    # fit an SVM with rbf kernel
    clf = SVC(kernel='rbf', cache_size=1000)
    #parameters = [{'kernel' : ['rbf'],'gamma' : np.logspace(-9,3,30),'C': np.logspace(-2,10,30)}, {'kernel' : ['linear'], 'C': np.logspace(-2,10,30)}]
    parameters = {
        'gamma': np.logspace(-9, 3, 30),
        'C': np.logspace(-2, 10, 30)
    }

    gs_rbf = grid_search.GridSearchCV(clf, param_grid=parameters, cv=4)
    gs_rbf.fit(X, y)

    clf = gs_rbf.best_estimator_

    # The model is stored under the name of the directory that holds the
    # configuration file.
    project_name = path_to_config_list[-1]
    joblib.dump(
        clf, '../models/%s/svm_model_duke_N%d_a%.1f.pkl' %
        (project_name, N, a))

    print("--- %s seconds ---" % (time.time() - start_time))
def _visible_files(directory):
    """Return the entries of *directory*, skipping macOS '.DS_Store'."""
    return [name for name in os.listdir(directory) if name != ".DS_Store"]


def _extract_features(filenames, sample_size, featureRepresentation,
                      glcm_distance, glcm_isMultidirectional):
    """Load each image and return one feature row per file."""
    data = np.zeros((len(filenames), sample_size))
    for row, filename in enumerate(filenames):
        img = io.imread(filename)
        if featureRepresentation == 'image':
            data[row] = img.flatten()
        elif featureRepresentation == 'pca':
            # NOTE(review): PCA is fitted on a single flattened image and
            # asked for 8 components while the row has sample_size slots --
            # confirm this branch behaves as intended.
            data[row] = decomposition.PCA(n_components=8).fit_transform(
                img.flatten())
        elif featureRepresentation == 'glcm':
            data[row] = Helper.get_textural_features(
                img, glcm_distance, glcm_isMultidirectional)
    return data


def _count_correct(expected, predictions):
    """Return the sum of the two diagonal cells of the confusion matrix."""
    confusion_matrix = metrics.confusion_matrix(expected, predictions)
    return confusion_matrix[0][0] + confusion_matrix[1][1]


def build_model(featureRepresentation='image',
                dataset_file=None,
                iters=10,
                glcm_distance=1,
                glcm_isMultidirectional=False):
    '''
    Creates, trains and serialises an MLP classifier.

    Args:
        featureRepresentation: Type of features to be used in classification.
            Can take one of the values 'image', 'pca' or 'glcm'.
        dataset_file: Filename of a serialized data set upon which to build
            the MLP. If None, images are read from "../train/positive",
            "../train/negative" and "../test".
        iters: Number of build iterations. Each iteration runs a grid search
            and keeps the result only if it classifies more test samples
            correctly than the classifier currently stored in MLP_FILE.
        glcm_distance: Distance between pixels for co-occurrence. Only used
            if featureRepresentation='glcm'.
        glcm_isMultidirectional: Controls whether co-occurrence should be
            calculated in other directions (ie 45 degrees, 90 degrees and
            135 degrees). Only used if featureRepresentation='glcm'.

    Returns:
        The classifier deserialized from MLP_FILE after the last iteration.
    '''
    if dataset_file is None:
        # Load train data. Labels are derived from the files actually kept,
        # so they stay aligned whether or not '.DS_Store' is present.
        positive = _visible_files("../train/positive")
        negative = _visible_files("../train/negative")
        train_filenames = (
            ["../train/positive/" + name for name in positive] +
            ["../train/negative/" + name for name in negative])
        train_targets = [1] * len(positive) + [0] * len(negative)

        if featureRepresentation == 'glcm':
            sample_size = 16 if glcm_isMultidirectional else 4
        else:
            sample_size = 20 * 20
        train_data = _extract_features(train_filenames, sample_size,
                                       featureRepresentation, glcm_distance,
                                       glcm_isMultidirectional)

        # Load test data. The directory listed must be the one the images
        # are read from ("../test"); the label is the integer between the
        # first '_' and the following '.' of the file name.
        test_names = _visible_files("../test")
        test_filenames = ["../test/" + name for name in test_names]
        expected = [
            int(name.split('_')[1].split('.')[0]) for name in test_names
        ]
        test_data = _extract_features(test_filenames, sample_size,
                                      featureRepresentation, glcm_distance,
                                      glcm_isMultidirectional)
    else:
        train_data, train_targets, test_data, expected = Helper.unserialize(
            dataset_file)

    # The grid is the same for every iteration, so it is built once.
    param_grid = {
        "algorithm": ["l-bfgs", "sgd", "adam"],
        "activation": ["logistic", "relu", "tanh"],
        "hidden_layer_sizes": [(5, 2), 5, 100, 150, 200]
    }

    # Perform build iterations
    for _ in tqdm.tqdm(range(0, iters)):
        classifier = grid_search.GridSearchCV(MLPClassifier(), param_grid)
        classifier.fit(train_data, train_targets)

        # Compare against the previously stored classifier, if there is one.
        serialized_classifier = Helper.unserialize(MLP_FILE)
        if serialized_classifier:
            serialized_n_correct = _count_correct(
                expected, serialized_classifier.predict(test_data))
            n_correct = _count_correct(expected,
                                       classifier.predict(test_data))
            if n_correct > serialized_n_correct:
                Helper.serialize(MLP_FILE, classifier)
        else:
            Helper.serialize(MLP_FILE, classifier)

    # Display final model performance
    serialized_classifier = Helper.unserialize(MLP_FILE)
    predictions = serialized_classifier.predict(test_data)
    confusion_matrix = metrics.confusion_matrix(expected, predictions)
    print("Confusion matrix:\n%s" % confusion_matrix)
    print("Accuracy: %f" % metrics.accuracy_score(expected, predictions))
    return serialized_classifier
#    'random_state'      : [0],
#    'n_jobs'            : [4],
#    'min_samples_split' : [3],
#    'max_depth'         : [3]
#}

# Hyper-parameter grid explored for the gradient boosting regressor.
parameters = dict(
    n_estimators=[100, 500, 1000, 1500],
    learning_rate=[0.1, 0.05, 0.01, 0.005],
    max_depth=[4, 6, 8, 10],
    min_samples_leaf=[3, 5, 9, 17, 20],
    max_features=[1.0, 0.3, 0.1],
)

# 4-fold grid search scored by negated mean absolute error.
clf_cv = grid_search.GridSearchCV(
    GradientBoostingRegressor(),
    parameters,
    scoring='neg_mean_absolute_error',
    cv=4,
)
clf_cv.fit(data_train_s, label_train_s)

print("Best Model Parameter: ", clf_cv.best_params_)
print("Best Model Score: ", clf_cv.best_score_)

print("# PREDICT..")
pre = clf_cv.predict(data_test_s)

# NOTE(review): the printed label below reads "accuracy rate" in Japanese,
# but the value is the mean absolute error on the held-out split.
ac_score = metrics.mean_absolute_error(label_test_s, pre)
print("正解率=", ac_score)

# Predictions for the data set held in data_t.
result = clf_cv.predict(data_t)
feat = [] for i in range(0, len(parts) - 1): feat.append(float(parts[i])) classes.append(str(int(parts[-1]) - 1)) features.append(np.array(feat)) fo.close() return (features, np.array(classes)) data_set_filepath = 'seeds_dataset.txt' (features, classes) = read_features(data_set_filepath) parameters = {'kernel': ['linear'], 'C': [1.4], 'gamma': [0]} clf = svm.SVC() clf = grid_search.GridSearchCV(clf, parameters, refit=True) clf1 = clf.fit(features, classes) print "\n\tMean Accuracy\tMean Error" for n in range(3, 12): score = cross_validation.cross_val_score(clf, features, classes, cv=n, scoring='accuracy') print str(n) + '\t' + str(score.mean() * 100) + '\t' + str( (1 - score.mean()) * 100) random_state = np.random.RandomState(0) features, classes = shuffle(features, classes, random_state=random_state) half = int(len(features) / 2)
def main():
    """Train a TF-IDF/SVD + random forest pipeline and write a submission.

    Loads the train and test frames (from the preprocessed CSV files when
    ``use_preprocessed_data`` is truthy, otherwise via ``load_data()``),
    builds a feature union of the custom regression values and four text
    columns, tunes the random forest with a 2-fold grid search scored by
    ``fmean_squared_error`` (lower is better), and saves the predicted
    relevance per test id to ``submission.csv``.
    """
    random.seed(240480)

    if not use_preprocessed_data:
        df_train, df_test = load_data()
    else:
        print('load preprocessed data')
        df_train = pd.read_csv('data/train_processed.csv')
        df_test = pd.read_csv('data/test_processed.csv')

    print('configure data for training')
    id_test = df_test['id']
    y_train = df_train['relevance'].values
    X_train = df_train[:]
    X_test = df_test[:]

    print('construct model')
    # TF-IDF turns documents into a tf-idf feature matrix; truncated SVD then
    # reduces that matrix to 10 components.
    tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english')
    tsvd = TruncatedSVD(n_components=10, random_state=240480)
    rfr = RandomForestRegressor(n_estimators=500,
                                n_jobs=-1,
                                random_state=240480,
                                verbose=1)

    # TODO: get these features to include some cosine similarity measure
    # between search term and other fields! Probably fit a tf-idf vectoriser
    # on each of title, description and brand first, then score the search
    # term against each vocabulary -- or just add similarity scores as
    # features. Maybe the random forest handles this on its own...

    # Each text column goes through: column extractor -> tfidf -> tsvd.
    # NOTE(review): every branch receives the same tfidf and tsvd objects;
    # confirm they are cloned per branch before fitting.
    text_columns = ('search_term', 'product_title', 'product_description',
                    'brand')
    branches = [('cst', cust_regression_vals())]
    for n, column in enumerate(text_columns, 1):
        text_steps = [
            ('s%d' % n, cust_txt_col(key=column)),
            ('tfidf%d' % n, tfidf),
            ('tsvd%d' % n, tsvd),
        ]
        branches.append(('txt%d' % n, Pipeline(text_steps)))

    branch_weights = {
        'cst': 1.0,
        'txt1': 0.5,
        'txt2': 0.25,
        'txt3': 0.0,
        'txt4': 0.5
    }
    union = FeatureUnion(transformer_list=branches,
                         transformer_weights=branch_weights,
                         n_jobs=-1)
    # The combined features feed the random forest.
    clf = Pipeline([('union', union), ('rfr', rfr)])

    print('run grid search')
    # TODO: search over relative weightings of transformer features?
    param_grid = {'rfr__max_features': [10], 'rfr__max_depth': [20]}
    RMSE = make_scorer(fmean_squared_error, greater_is_better=False)
    model = grid_search.GridSearchCV(estimator=clf,
                                     param_grid=param_grid,
                                     cv=2,
                                     scoring=RMSE)
    model.fit(X_train, y_train)

    print("Best parameters found by grid search:")
    print(model.best_params_)
    print("Best CV score:")
    print(model.best_score_)

    print('run predictions')
    y_pred = model.predict(X_test)

    print('save submission file')
    submission = pd.DataFrame({"id": id_test, "relevance": y_pred})
    submission.to_csv('submission.csv', index=False)