#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random

from sklearn import datasets, metrics, cross_validation

import skflow

random.seed(42)

# Load dataset.
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42)

# Build 3 layer DNN with 10, 20, 10 units respectively.
classifier = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
                                            n_classes=3, steps=200)

# Fit and predict.
classifier.fit(X_train, y_train)
score = metrics.accuracy_score(y_test, classifier.predict(X_test))
print('Accuracy: {0:f}'.format(score))
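# Optional extra check (not in the original snippet): sklearn can also print
# a per-class precision/recall breakdown for the same predictions.
print(metrics.classification_report(y_test, classifier.predict(X_test)))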
print "--------------------------------------------------" print "GradientBoostingClassifier" print "--------------------------------------------------" print 'Accuracy:', accuracy_score(y_test.flatten(), pred.flatten()) print 'F1 score:', f1_score(y_test.flatten(), pred.flatten()) print 'Recall:', recall_score(y_test.flatten(), pred.flatten()) print 'Precision:', precision_score(y_test.flatten(), pred.flatten()) print '\n clasification report:\n', classification_report( y_test.flatten(), pred.flatten()) #print 'Gradient boosting score: %f' % accuracy_score(y_test.flatten(), pred.flatten()) # Deep Learning - DNN used the library skflow which is based on tensorflow import skflow model = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10], n_classes=17, batch_size=100, steps=3000, optimizer="SGD", learning_rate=0.01) model.fit(X_train, y_train.values) #y_test = test_data['Y'] y_prediction = model.predict(X_test) print "prediction accuracy:", np.sum( y_test.flatten() == y_prediction) * 1. / len(y_test.flatten())
def do_system_training(dataset, model_path, feature_normalizer_path,
                       feature_path, classifier_params,
                       dataset_evaluation_mode='folds',
                       classifier_method='gmm', overwrite=False):
    """System training

    Model container format:

    {
        'normalizer': normalizer class
        'models': {
            'office': mixture.GMM class
            'home': mixture.GMM class
            ...
        }
    }

    Parameters
    ----------
    dataset : class
        dataset class

    model_path : str
        path where the models are saved.

    feature_normalizer_path : str
        path where the feature normalizers are saved.

    feature_path : str
        path where the features are saved.

    classifier_params : dict
        parameter dict

    dataset_evaluation_mode : str ['folds', 'full']
        evaluation mode, 'full' all material available is considered to
        belong to one fold. (Default value='folds')

    classifier_method : str ['gmm', 'dnn']
        classifier method (Default value='gmm')

    overwrite : bool
        overwrite existing models (Default value=False)

    Returns
    -------
    nothing

    Raises
    ------
    ValueError
        classifier_method is unknown.

    IOError
        Feature normalizer not found.
        Feature file not found.

    """

    if classifier_method not in ('gmm', 'dnn'):
        raise ValueError("Unknown classifier method [" + classifier_method + "]")

    # Check that target path exists, create if not
    check_path(model_path)

    for fold in dataset.folds(mode=dataset_evaluation_mode):
        current_model_file = get_model_filename(fold=fold, path=model_path)
        if not os.path.isfile(current_model_file) or overwrite:
            # Load normalizer
            feature_normalizer_filename = get_feature_normalizer_filename(
                fold=fold, path=feature_normalizer_path)
            if os.path.isfile(feature_normalizer_filename):
                normalizer = load_data(feature_normalizer_filename)
            else:
                raise IOError("Feature normalizer not found [%s]"
                              % feature_normalizer_filename)

            # Initialize model container
            model_container = {'normalizer': normalizer, 'models': {}}

            # Collect training examples
            file_count = len(dataset.train(fold))
            data = {}
            for item_id, item in enumerate(dataset.train(fold)):
                progress(title_text='Collecting data',
                         fold=fold,
                         percentage=(float(item_id) / file_count),
                         note=os.path.split(item['file'])[1])

                # Load features
                feature_filename = get_feature_filename(
                    audio_file=item['file'], path=feature_path)
                if os.path.isfile(feature_filename):
                    feature_data = load_data(feature_filename)['feat']
                else:
                    raise IOError("Features not found [%s]" % item['file'])

                # Scale features
                feature_data = model_container['normalizer'].normalize(
                    feature_data)

                # Store features per class label
                if item['scene_label'] not in data:
                    data[item['scene_label']] = feature_data
                else:
                    data[item['scene_label']] = numpy.vstack(
                        (data[item['scene_label']], feature_data))

            le = pp.LabelEncoder()
            tot_data = {}

            # Train models for each class
            for label in data:
                progress(title_text='Train models', fold=fold, note=label)
                if classifier_method == 'gmm':
                    model_container['models'][label] = mixture.GMM(
                        **classifier_params).fit(data[label])
                elif classifier_method == 'dnn':
                    # Accumulate all classes into one training set;
                    # the DNN is trained on every label at once.
                    if 'x' not in tot_data:
                        tot_data['x'] = data[label]
                        tot_data['y'] = numpy.repeat(label, len(data[label]),
                                                     axis=0)
                    else:
                        tot_data['x'] = numpy.vstack((tot_data['x'],
                                                      data[label]))
                        tot_data['y'] = numpy.hstack(
                            (tot_data['y'],
                             numpy.repeat(label, len(data[label]), axis=0)))

            # Construct the DNN only on the 'dnn' path; classifier_params for
            # the GMM case are not valid TensorFlowDNNClassifier arguments.
            if classifier_method == 'dnn':
                clf = skflow.TensorFlowDNNClassifier(**classifier_params)
                tot_data['y'] = le.fit_transform(tot_data['y'])
                clf.fit(tot_data['x'], tot_data['y'])
                clf.save('dnn/dnnmodel1')

            # Save models
            save_data(current_model_file, model_container)
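# Hedged counterpart for the DNN branch above (not part of the original
# function): skflow estimators saved with clf.save(path) can be reloaded via
# TensorFlowEstimator.restore. The function name and label_encoder argument
# are hypothetical; 'dnn/dnnmodel1' mirrors the save call above.
def classify_with_saved_dnn(feature_data, label_encoder):
    clf = skflow.TensorFlowEstimator.restore('dnn/dnnmodel1')
    frame_predictions = clf.predict(feature_data)
    # Map integer predictions back to scene labels
    return label_encoder.inverse_transform(frame_predictions)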
# limitations under the License.

from sklearn import datasets, metrics
from sklearn.cross_validation import train_test_split

import skflow
import tensorflow as tf

iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    test_size=0.2,
                                                    random_state=42)

# Setup exponential decay function
def exp_decay(global_step):
    return tf.train.exponential_decay(learning_rate=0.1,
                                      global_step=global_step,
                                      decay_steps=100,
                                      decay_rate=0.001)

# Use customized decay function in learning_rate
classifier = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
                                            n_classes=3,
                                            steps=800,
                                            learning_rate=exp_decay)
classifier.fit(X_train, y_train)
score = metrics.accuracy_score(y_test, classifier.predict(X_test))
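# For reference (not in the original snippet): tf.train.exponential_decay
# computes
#   decayed_lr = learning_rate * decay_rate ** (global_step / decay_steps)
# so with learning_rate=0.1, decay_rate=0.001 and decay_steps=100 the rate
# drops by three orders of magnitude every 100 steps.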
def main():
    """Run experiment with multiple classifiers."""
    data = get_data()
    print("Got %i training samples and %i test samples." %
          (len(data['train']['X']), len(data['test']['X'])))

    # Get classifiers
    classifiers = [
        ('Logistic Regression (C=1)', LogisticRegression(C=1)),
        ('Logistic Regression (C=10000)', LogisticRegression(C=10000)),
        ('RBM 200, n_iter=40, LR=0.01, Reg: C=1',
         Pipeline(steps=[('rbm', BernoulliRBM(n_components=200,
                                              n_iter=40,
                                              learning_rate=0.01,
                                              verbose=True)),
                         ('logistic', LogisticRegression(C=1))])),
        ('RBM 200, n_iter=40, LR=0.01, Reg: C=10000',
         Pipeline(steps=[('rbm', BernoulliRBM(n_components=200,
                                              n_iter=40,
                                              learning_rate=0.01,
                                              verbose=True)),
                         ('logistic', LogisticRegression(C=10000))])),
        ('RBM 100',
         Pipeline(steps=[('rbm', BernoulliRBM(n_components=100)),
                         ('logistic', LogisticRegression(C=1))])),
        ('RBM 100, n_iter=20',
         Pipeline(steps=[('rbm', BernoulliRBM(n_components=100, n_iter=20)),
                         ('logistic', LogisticRegression(C=1))])),
        ('RBM 256',
         Pipeline(steps=[('rbm', BernoulliRBM(n_components=256)),
                         ('logistic', LogisticRegression(C=1))])),
        ('RBM 512, n_iter=10',
         Pipeline(steps=[('rbm', BernoulliRBM(n_components=512, n_iter=10)),
                         ('logistic', LogisticRegression(C=1))])),
        ('NN 20:5',
         skflow.TensorFlowDNNClassifier(hidden_units=[20, 5],
                                        n_classes=data['n_classes'],
                                        steps=500)),
        # ('NN 500:200 dropout',
        #  skflow.TensorFlowEstimator(model_fn=dropout_model,
        #                             n_classes=10,
        #                             steps=20000)),
        # ('CNN',
        #  skflow.TensorFlowEstimator(model_fn=conv_model,
        #                             n_classes=10,
        #                             batch_size=100,
        #                             steps=20000,
        #                             learning_rate=0.001)),
        ('SVM, adj.',
         SVC(probability=False, kernel="rbf", C=2.8, gamma=.0073,
             cache_size=200)),
        ('SVM, linear', SVC(kernel="linear", C=0.025, cache_size=200)),
        ('k nn', KNeighborsClassifier(3)),
        ('Decision Tree', DecisionTreeClassifier(max_depth=5)),
        ('Random Forest', RandomForestClassifier(n_estimators=50, n_jobs=10)),
        ('Random Forest 2',
         RandomForestClassifier(max_depth=5, n_estimators=10,
                                max_features=1, n_jobs=10)),
        ('AdaBoost', AdaBoostClassifier()),
        ('Naive Bayes', GaussianNB()),
        ('Gradient Boosting', GradientBoostingClassifier()),
        ('LDA', LinearDiscriminantAnalysis()),
        ('QDA', QuadraticDiscriminantAnalysis()),
    ]

    # Fit them all
    classifier_data = {}
    for clf_name, clf in classifiers:
        print("#" * 80)
        print("Start fitting '%s' classifier." % clf_name)
        examples = 100000  # Reduce data to make training faster
        t0 = time.time()
        clf.fit(data['train']['X'][:examples], data['train']['y'][:examples])
        t1 = time.time()
        an_data = analyze(clf, data, t1 - t0, clf_name=clf_name)
        classifier_data[clf_name] = {'training_time': t1 - t0,
                                     'testing_time': an_data['testing_time'],
                                     'accuracy': an_data['accuracy']}

    print_website(classifier_data)
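# Entry-point guard (assumed; not present in the original snippet) so the
# experiment runs when the file is executed directly.
if __name__ == '__main__':
    main()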
# Calculate accuracy
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
print("Accuracy:", accuracy.eval({x: X_test, y: y_test}))

# Skflow
y, X = train['Survived'], train[['Age', 'SibSp', 'Fare']].fillna(0)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42)
X_train = numpy.array(X_train)
X_test = numpy.array(X_test)
y_train = numpy.array(y_train).reshape(y_train.shape[-1], 1)
y_test = numpy.array(y_test).reshape(y_test.shape[-1], 1)

classifier = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
                                            n_classes=2,
                                            batch_size=128,
                                            steps=500,
                                            optimizer='Adam',
                                            learning_rate=0.05)
classifier.fit(X_train, y_train)
print(accuracy_score(classifier.predict(X_test), y_test))
# TensorFlowDNNClassifier(batch_size=128, class_weight=None,
#                         continue_training=False, early_stopping_rounds=None,
#                         hidden_units=[10, 20, 10],
#                         keep_checkpoint_every_n_hours=10000,
#                         learning_rate=0.05, max_to_keep=5, n_classes=2,
#                         num_cores=4, optimizer='SGD', steps=500,
#                         tf_master='', tf_random_seed=42, verbose=1)
classifier = skflow.TensorFlowLinearClassifier(n_classes=10,
                                               batch_size=100,
                                               steps=1000,
                                               learning_rate=0.01)
classifier.fit(X_train, y_train)
linear_y_predict = classifier.predict(X_test)

linear_submission = pd.DataFrame({'ImageId': range(1, 28001),
                                  'Label': linear_y_predict})
linear_submission.to_csv('../Datasets/MNIST/linear_submission.csv',
                         index=False)

classifier = skflow.TensorFlowDNNClassifier(hidden_units=[200, 50, 10],
                                            n_classes=10,
                                            steps=5000,
                                            learning_rate=0.01,
                                            batch_size=50)
classifier.fit(X_train, y_train)
dnn_y_predict = classifier.predict(X_test)

dnn_submission = pd.DataFrame({'ImageId': range(1, 28001),
                               'Label': dnn_y_predict})
dnn_submission.to_csv('../Datasets/MNIST/dnn_submission.csv', index=False)


def max_pool_2x2(tensor_in):
    return tf.nn.max_pool(tensor_in,
                          ksize=[1, 2, 2, 1],
                          strides=[1, 2, 2, 1],
                          padding='SAME')
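# Hedged sketch (not in the original file): a conv_model along the lines of
# skflow's MNIST example, showing what max_pool_2x2 above is typically used
# for. The layer sizes and the skflow.ops/skflow.models calls are assumptions
# based on that example, not taken from this file.
def conv_model(X, y):
    # Reshape flat 784-pixel rows into 28x28 single-channel images
    X = tf.reshape(X, [-1, 28, 28, 1])
    with tf.variable_scope('conv_layer1'):
        h_conv1 = skflow.ops.conv2d(X, n_filters=32, filter_shape=[5, 5],
                                    bias=True, activation=tf.nn.relu)
        h_pool1 = max_pool_2x2(h_conv1)
    with tf.variable_scope('conv_layer2'):
        h_conv2 = skflow.ops.conv2d(h_pool1, n_filters=64, filter_shape=[5, 5],
                                    bias=True, activation=tf.nn.relu)
        h_pool2 = max_pool_2x2(h_conv2)
    # Two rounds of 2x2 pooling shrink 28x28 to 7x7; flatten for dense layers
    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
    h_fc1 = skflow.ops.dnn(h_pool2_flat, [1024], activation=tf.nn.relu)
    return skflow.models.logistic_regression(h_fc1, y)

# Such a model_fn would be wrapped in a generic estimator, mirroring the
# commented-out CNN entry elsewhere in this collection:
# classifier = skflow.TensorFlowEstimator(model_fn=conv_model, n_classes=10,
#                                         batch_size=100, steps=20000,
#                                         learning_rate=0.001)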
import random

from sklearn import datasets, metrics
from sklearn.cross_validation import train_test_split

import skflow

random.seed(42)

iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    test_size=0.2,
                                                    random_state=42)

# Classifier without early stopping - overfitting
classifier1 = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
                                             n_classes=3, steps=800)
classifier1.fit(X_train, y_train)
score1 = metrics.accuracy_score(y_test, classifier1.predict(X_test))

# Classifier with early stopping - improved accuracy on the test set
classifier2 = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
                                             n_classes=3, steps=1000,
                                             early_stopping_rounds=200)
classifier2.fit(X_train, y_train)
score2 = metrics.accuracy_score(y_test, classifier2.predict(X_test))

# You can expect the score to improve when early stopping is used
print(score2 > score1)
    if data[f].dtype == 'object':
        catagorical_features.append(f)
    else:
        numeric_features.append(f)

data_num = whiten(data[numeric_features])
data_cat = pd.get_dummies(data[catagorical_features],
                          columns=catagorical_features)

trlen = train_data.shape[0]
train = np.hstack((data_num[:trlen], data_cat[:trlen]))
test = np.hstack((data_num[trlen:], data_cat[trlen:]))
labels = label_data.astype(int)

xtrain, xtest, ytrain, ytest = train_test_split(train, labels,
                                                train_size=0.7)

model = skflow.TensorFlowDNNClassifier(hidden_units=[128, 128, 128],
                                       learning_rate=0.01,
                                       n_classes=2,
                                       batch_size=128,
                                       steps=10000)
model.fit(xtrain, ytrain)
p = model.predict_proba(xtest)[:, 1]
print("TensorFlowDNNClassifier log_loss: %0.5f" % log_loss(ytest, p))

model.fit(train, labels)
preds = model.predict_proba(test)[:, 1]
sample = pd.read_csv("results/sample_submission.csv")
sample.PredictedProb = preds
sample.to_csv("results/simple_skflow_results.csv", index=False)
X_train = read_wiki_content("doc2vec_train_content.txt")
X_test = read_wiki_content("doc2vec_test_content.txt")
print("Dimension of input: ", len(X_train[0]))

print('Using DNN')
hidden_units = [2000, 1000, 500, 200]
steps = 50000
early_stopping_rounds = 5000
print("Parameters: ", hidden_units,
      " steps = ", steps,
      " early_stopping_rounds = ", early_stopping_rounds)

classifier = skflow.TensorFlowDNNClassifier(
    hidden_units=hidden_units,
    n_classes=6,
    steps=steps,
    early_stopping_rounds=early_stopping_rounds)

print('Fit model')
classifier.fit(X_train, Y_train, logdir="./logdir/doc2vec_dnn")

print('Predicting')
prediction = classifier.predict(X_test)
score2 = metrics.accuracy_score(Y_test, prediction)
confusion_matrix = metrics.confusion_matrix(Y_test, prediction)
print(confusion_matrix)
print(score2)