def q20(): def cross_validate(X, y, lamda): Xs = X[:40], X[40:80], X[80:120], X[120:160], X[160:] ys = y[:40], y[40:80], y[80:120], y[120:160], y[160:] reg = ridge.RidgeRegression(lamda) e_cv = 0.0 for i in range(5): X_val, y_val = Xs[i], ys[i] X_train = np.concatenate([Xs[j] for j in range(5) if i != j]) y_train = np.concatenate([ys[j] for j in range(5) if i != j]) reg.fit(X_train, y_train) e_cv += reg.evaluate(X_val, y_val, sign) return e_cv / 5 X_train, y_train = load_train() X_test, y_test = load_test() lamdas = np.logspace(-10, 2, 13) best = (1.0, None) for lamda in lamdas: e_cv = cross_validate(X_train, y_train, lamda) if e_cv <= best[0]: best = (e_cv, lamda) best_reg = ridge.RidgeRegression(best[1]) best_reg.fit(X_train, y_train) print "lamda: %e, E_in: %.3f, E_out: %.3f" % (best[1], best_reg.evaluate(X_train, y_train, sign), best_reg.evaluate(X_test, y_test, sign))
def predict_test(self, path: str, batch_size: int, fout: TextIO):
    """
    predict from test dataset
    Args:
        path: test dataset path
        batch_size: number of examples per batch
        fout: output file
    """
    data = dataset.load_test(path)
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')    # pylint: disable=no-member
    # Sort within each batch by descending comment length so padding is minimal.
    tst_itr = BucketIterator(data, device=device, batch_size=batch_size, shuffle=False,
                             train=False, sort_within_batch=True,
                             sort_key=lambda exam: -len(exam.comment_text))
    print('id,prediction', file=fout)
    for step, batch in enumerate(tqdm(tst_itr, mininterval=1, ncols=100), start=1):
        if step % 1000 == 0:
            # BUG FIX: the format string had a %d placeholder but no argument,
            # so the literal '%dk-th step..' was logged. Supply the count.
            logging.info('%dk-th step..', step // 1000)
        with torch.no_grad():
            outputs = self.model(batch.comment_text)
            # Column 1 of the softmax output is the positive-class probability.
            for id_, output in zip(batch.id, torch.softmax(outputs, dim=1)):    # pylint: disable=no-member
                print(f'{id_},{output[1].item()}', file=fout)
def compare_manual_vs_model():
    # Compare the trained model's predictions against the manual classifications
    # (and a topology-rule-improved version of the manual ones) on the subset of
    # test files that were labelled by hand.
    with open(DATA_FOLDER + "labels_int.p", "r") as f:
        y_dict = pickle.load(f)  # NOTE(review): y_dict is never used below -- confirm it is still needed
    print "Loading test data"
    X_test, y_test, filenames_test = dataset.load_test()
    y_pred = joblib.load("../models/pred_ml_improved.pkl")
    # Keep only the test files that also have a manual classification.
    relevant = []
    for pred, correct, filename in zip(y_pred, y_test, filenames_test):
        if filename in FILES:
            relevant.append((pred, correct, filename, CLASSIFICATIONS[filename]))
    model_predictions, correct, filename, manual_predictions = zip(*relevant)
    # Binarize label sets so the sklearn multilabel metrics can consume them.
    manual_predictions = learn.multilabel_binary_y(manual_predictions)
    model_predictions = np.array(model_predictions)
    correct = learn.multilabel_binary_y(correct)
    # Apply inferred topology rules to the manual labels to get an "improved" set.
    rules = infer_topology.infer_topology_rules()
    improved_manual = infer_topology.apply_topology_rules(rules, manual_predictions)
    prediction_names = ["MODEL", "MANUAL", "IMPROVED_MANUAL"]
    predictions = [model_predictions, manual_predictions, improved_manual]
    # Report the same multilabel metrics for each prediction source.
    for name, pred in zip(prediction_names, predictions):
        print "\n{}\n--".format(name)
        print "Zero-one classification loss", zero_one_loss(correct, pred)
        print "Hamming loss", hamming_loss(correct, pred)
        print "Precision:", precision_score(correct, pred, average="weighted", labels=label_list)
        print "Recall :", recall_score(correct, pred, average="weighted", labels=label_list)
        print "F1 score :", f1_score(correct, pred, average="weighted", labels=label_list)
def lr_with_scale3():
    """
    Check the performance of normalizing TEST SET.

    Submission: lr_with_scale3_0707_04.csv
    E_val:
    E_in: 0.879233
    E_out: 0.8770121701777971

    Submission: lr_with_scale3_0712_01.csv
    E_val:
    E_in:
    E_out:
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.cross_validation import cross_val_score
    from sklearn.pipeline import Pipeline
    import numpy as np

    X, y = dataset.load_train()

    # Fit the scaler on train AND test features stacked together, so both sets
    # are normalized with the same statistics.
    raw_scaler = StandardScaler()
    raw_scaler.fit(np.r_[X, dataset.load_test()])
    X_scaled = raw_scaler.transform(X)

    clf = LogisticRegression(C=0.03, class_weight='auto')
    clf.fit(X_scaled, y)
    logger.debug('E_in: %f', Util.auc_score(clf, X_scaled, y))
    # NOTE: the submission is dumped before cross-validation, so it exists even
    # if the (slow) CV step is interrupted.
    IO.dump_submission(Pipeline([('scale_raw', raw_scaler), ('lr', clf)]), 'lr_with_scale3_0712_01')

    scores = cross_val_score(clf, X_scaled, y, scoring='roc_auc', n_jobs=-1)
    logger.debug('E_val: %f <- %s', np.average(scores), scores)
def test(): X_train, y_train = load_train() X_test, y_test = load_test() lamda = 0 reg = ridge.RidgeRegression(lamda) reg.fit(X_train, y_train) e_in = reg.evaluate(X_train, y_train, sign) e_out = reg.evaluate(X_test, y_test, sign) print "E_in: %.3f, E_out: %.3f" % (e_in, e_out)
def rf(): """ Submission: rf_0708_01.csv 3000 trees E_val: 0.871837 E_in: 0.999998 E_out: 0.882316801296279 15000 trees E_val: 0.872011 E_in: 0.999998 E_out: 0.8824869811781106 30000 trees E_val: 0.871928 E_in: E_out: depth=4; 12000 trees E_val: 0.969158 E_in: E_out: """ from sklearn.preprocessing import StandardScaler from sklearn.pipeline import Pipeline from sklearn.ensemble import RandomForestClassifier import numpy as np X, y = dataset.load_train(depth=1) raw_scaler = StandardScaler() raw_scaler.fit(np.r_[X, dataset.load_test()]) X_scaled = raw_scaler.transform(X) del X import gc gc.collect() rf = RandomForestClassifier(n_estimators=12000, oob_score=True, n_jobs=-1, class_weight='auto') rf.fit(X_scaled, y) logger.debug('RandomForestClassifier fitted') logger.debug('E_val(oob): %f', rf.oob_score_) logger.debug('E_in(full): %f', Util.auc_score(rf, X_scaled, y)) X, y = dataset.load_train() X_scaled = raw_scaler.transform(X) logger.debug('E_in (depth=0): %f', Util.auc_score(rf, X_scaled, y)) del X gc.collect() IO.dump_submission(Pipeline([('scale_raw', raw_scaler), ('rf', rf)]), 'rf_0708_01') logger.debug('caching fitted RandomForestClassifier') IO.cache(rf, Path.of_cache('rf.RandomForestClassifier.12000.pkl')) logger.debug('cached fitted RandomForestClassifier')
def to_submission(clf, filename):
    """Write clf's positive-class test-set probabilities to a submission CSV."""
    # Normalize the output path: force the submission/ prefix, and tag files
    # without a .csv suffix as not-submitted.
    path = filename
    if not path.startswith('submission/'):
        path = 'submission/' + path
    if not path.endswith('.csv'):
        path += '.not-submitted.csv'
    enroll_ids = util.load_enrollment_test()['enrollment_id']
    probs = clf.predict_proba(dataset.load_test())[:, 1]
    with open(path, 'w') as f:
        f.writelines('%d,%f\n' % pair for pair in zip(enroll_ids, probs))
def gbdt_search(): """ Grid search for best n_estimators. Best params: {'loss': 'deviance', 'n_estimators': 100} Submission: gbdt_search_0707_01.csv E_val: 0.883786743214 E_in: 0.887785 E_out: 0.8848760405053878 """ from sklearn.ensemble import GradientBoostingClassifier from sklearn.preprocessing import StandardScaler from sklearn.grid_search import GridSearchCV from sklearn.cross_validation import StratifiedKFold from sklearn.pipeline import Pipeline import numpy as np X, y = dataset.load_train() raw_scaler = StandardScaler() X_scaled = raw_scaler.fit_transform(X) param_grid = { 'loss': ['deviance', 'exponential'], 'n_estimators': np.arange(100, 1001, 100) } params = {'learning_rate': 0.1, 'subsample': 0.5} gb = GradientBoostingClassifier(**params) grid = GridSearchCV(gb, param_grid, scoring='roc_auc', n_jobs=-1, cv=StratifiedKFold(y, 5), refit=True, verbose=1) grid.fit(X_scaled, y) logger.debug('Got best GBDT.') logger.debug('Grid scores: ') for i, grid_score in enumerate(grid.grid_scores_): print('\t%d00: %s' % (i + 1, grid_score)) logger.debug('Best score (E_val): %s', grid.best_score_) logger.debug('Best params: %s', grid.best_params_) IO.cache(grid, Path.of_cache('gbdt_search.GridSearchCV.pkl')) X_test = dataset.load_test() raw_scaler.fit(np.r_[X, X_test]) X_scaled = raw_scaler.transform(X) params.update(grid.best_params_) clf = GradientBoostingClassifier(**params) clf.fit(X_scaled, y) logger.debug('E_in: %f', Util.auc_score(grid, X_scaled, y)) IO.dump_submission(Pipeline([('scaler', raw_scaler), ('gbdt', grid)]), 'gbdt_search_0707_01')
def q15(): X_train, y_train = load_train() X_test, y_test = load_test() lamdas = np.logspace(-10, 2, 13) best = (1.0, None) for lamda in lamdas: reg = ridge.RidgeRegression(lamda) reg.fit(X_train, y_train) e_out = reg.evaluate(X_test, y_test, sign) if e_out <= best[0]: best = (e_out, reg) best_reg = best[1] print "lamda: %e, E_in: %.3f, E_out: %.3f" % ( best_reg.lamda, best_reg.evaluate(X_train, y_train, sign), best_reg.evaluate(X_test, y_test, sign))
def gbdt_grid(): """ Grid search for best params. Best params: {'learning_rate': 0.05, 'subsample': 0.3} Submission: gbdt_grid_0706_03.csv E_val: 0.860118290628 E_in: 0.882949 E_out: 0.8809314555068068 """ from sklearn.ensemble import GradientBoostingClassifier from sklearn.preprocessing import StandardScaler from sklearn.grid_search import GridSearchCV from sklearn.cross_validation import StratifiedKFold from sklearn.pipeline import Pipeline import numpy as np X, y = dataset.load_train() raw_scaler = StandardScaler() X_scaled = raw_scaler.fit_transform(X) param_grid = { 'learning_rate': [0.05, 0.1], 'subsample': [0.3, 0.5, 0.7] } grid = GridSearchCV(GradientBoostingClassifier(n_estimators=3000), param_grid, scoring='roc_auc', n_jobs=-1, cv=StratifiedKFold(y, 5), refit=False, verbose=1) grid.fit(X_scaled, y) logger.debug('Got best GBDT.') logger.debug('Grid scores: %s', grid.grid_scores_) logger.debug('Best score (E_val): %s', grid.best_score_) logger.debug('Best params: %s', grid.best_params_) X_test = dataset.load_test() raw_scaler.fit_transform(np.r_[X, X_test]) X_scaled = raw_scaler.transform(X) clf = GradientBoostingClassifier(**grid.best_params_) clf.fit(X_scaled, y) IO.cache(grid, Path.of_cache('gbdt_grid.GridSearchCV.pkl')) logger.debug('E_in: %f', Util.auc_score(clf, X_scaled, y)) IO.dump_submission(Pipeline([('scaler', raw_scaler), ('gbdt', clf)]), 'gbdt_grid_0706_03')
def gbdt(): """ Submission: gbdt_0708_02.csv n_estimators: 1000, learning_rate: 0.1, subsample: 0.5 E_val: 0.858235 E_in: 0.908622 E_out: 0.8873906795559863 n_estimators: 500, learning_rate: 0.1, subsample: 0.5 E_val: 0.870976 E_in: 0.899593 E_out: 0.88711101837711 n_estimators: 3000, learning_rate: 0.1, subsample: 0.5 E_val: 0.836049 E_in: 0.936056 E_out: 0.8833930861722906 depth=4; n_estimators: 1000, learning_rate: 0.1, subsample: 0.5 E_val: 0.947301 E_in: 0.983812 (on depth=4) // 0.85089646325496504 (on depth=0) E_out: 0.8855316272153549 """ from sklearn.ensemble import GradientBoostingClassifier from sklearn.preprocessing import StandardScaler from sklearn.pipeline import Pipeline import numpy as np gb = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1, subsample=0.5) d = 0 X, y = dataset.load_train(depth=d) raw_scaler = StandardScaler() raw_scaler.fit(np.r_[X, dataset.load_test()]) X_scaled = raw_scaler.transform(X) gb.fit(X_scaled, y) IO.cache(gb, Path.of_cache('gbdt.GradientBoostingClassifier.d%d.pkl' % d)) IO.dump_submission(Pipeline([('scaler', raw_scaler), ('gbdt', gb)]), 'gbdt_0708_02.1000.d%d' % d) logger.debug('E_in(full): %f', Util.auc_score(gb, X_scaled, y)) X, y = dataset.load_train() X_scaled = raw_scaler.transform(X) logger.debug('E_in(depth=0): %f', Util.auc_score(gb, X_scaled, y))
def compare_manual_vs_model():
    # Compare the trained model's predictions against the manual classifications
    # (and a topology-rule-improved version of the manual ones) on the subset of
    # test files that were labelled by hand.
    with open(DATA_FOLDER + 'labels_int.p', 'r') as f:
        y_dict = pickle.load(f)  # NOTE(review): y_dict is never used below -- confirm it is still needed
    print "Loading test data"
    X_test, y_test, filenames_test = dataset.load_test()
    y_pred = joblib.load('../models/pred_ml_improved.pkl')
    # Keep only the test files that also have a manual classification.
    relevant = []
    for pred, correct, filename in zip(y_pred, y_test, filenames_test):
        if filename in FILES:
            relevant.append(
                (pred, correct, filename, CLASSIFICATIONS[filename]))
    model_predictions, correct, filename, manual_predictions = zip(*relevant)
    # Binarize label sets so the sklearn multilabel metrics can consume them.
    manual_predictions = learn.multilabel_binary_y(manual_predictions)
    model_predictions = np.array(model_predictions)
    correct = learn.multilabel_binary_y(correct)
    # Apply inferred topology rules to the manual labels to get an "improved" set.
    rules = infer_topology.infer_topology_rules()
    improved_manual = infer_topology.apply_topology_rules(
        rules, manual_predictions)
    prediction_names = ["MODEL", "MANUAL", "IMPROVED_MANUAL"]
    predictions = [model_predictions, manual_predictions, improved_manual]
    # Report the same multilabel metrics for each prediction source.
    for name, pred in zip(prediction_names, predictions):
        print "\n{}\n--".format(name)
        print "Zero-one classification loss", zero_one_loss(correct, pred)
        print "Hamming loss", hamming_loss(correct, pred)
        print "Precision:", precision_score(correct, pred, average='weighted', labels=label_list)
        print "Recall :", recall_score(correct, pred, average='weighted', labels=label_list)
        print "F1 score :", f1_score(correct, pred, average='weighted', labels=label_list)
def q17(): X, y = load_train() X_train, X_val = X[:120], X[120:] y_train, y_val = y[:120], y[120:] X_test, y_test = load_test() lamdas = np.logspace(-10, 2, 13) best = (1.0, None) for lamda in lamdas: reg = ridge.RidgeRegression(lamda) reg.fit(X_train, y_train) e_val = reg.evaluate(X_val, y_val, sign) if e_val <= best[0]: best = (e_val, reg) best_reg = best[1] print "lamda: %e, E_train: %.3f, E_val: %.3f, E_out: %.3f" % ( best_reg.lamda, best_reg.evaluate(X_train, y_train, sign), best_reg.evaluate(X_val, y_val, sign), best_reg.evaluate(X_test, y_test, sign))
print "Zero-one classification loss", zero_one_loss(y_test_mlb, y_pred) print "Hamming loss", hamming_loss(y_test_mlb, y_pred) im = y_test_mlb + y_pred * 2 scipy.misc.imsave('predictions.png', im) if __name__ == '__main__': #Load data print "Loading labels" label_list = dataset.load_labels() print "Loading train set" X_train, y_train, filenames_train = dataset.load_train() print "Size of train set", len(X_train) multilabel_classifier(X_train, y_train) #Unload train set from memory del X_train, y_train, filenames_train print "Loading test set" X_test, y_test, filenames_test = dataset.load_test() print "Size of test set", len(X_test) predict(X_test, y_test) improve_predictions(use_infer_topology=True) #evaluate_multilabel(y_test, label_list, '../models/pred_ml_improved.pkl') evaluate_multilabel(y_test, label_list, '../models/pred_ml.pkl')
print "Zero-one classification loss", zero_one_loss(y_test_mlb, y_pred) print "Hamming loss", hamming_loss(y_test_mlb, y_pred) im = y_test_mlb + y_pred * 2 scipy.misc.imsave("predictions.png", im) if __name__ == "__main__": # Load data print "Loading labels" label_list = dataset.load_labels() print "Loading train set" X_train, y_train, filenames_train = dataset.load_train() print "Size of train set", len(X_train) multilabel_classifier(X_train, y_train) # Unload train set from memory del X_train, y_train, filenames_train print "Loading test set" X_test, y_test, filenames_test = dataset.load_test() print "Size of test set", len(X_test) predict(X_test, y_test) improve_predictions(use_infer_topology=True) # evaluate_multilabel(y_test, label_list, '../models/pred_ml_improved.pkl') evaluate_multilabel(y_test, label_list, "../models/pred_ml.pkl")
def gbdt2(): """ Submission: gbdt2_0708_03.csv n_estimators: 1000, learning_rate: 0.1, subsample: 0.5 E_val: 0.852035 E_in: 0.910251 E_out: 0.8874428893001793 n_estimators: 3000, learning_rate: 0.1, subsample: 0.5 E_val: 0.827988 E_in: 0.938593 E_out: 0.8844206314551558 depth=4; n_estimators: 1000, learning_rate: 0.1, subsample: 0.5 E_val: 0.941602 E_in: 0.983938 (on depth=4) // 0.87209181108731892 (on depth=0) E_out: 0.8872206627768779 depth=0: E_val: E_in: 0.909368 // 0.909368 E_out: 0.8864839071529611 depth=1: E_val: E_in: 0.956676 // 0.903537 E_out: 0.8851856544683128 depth=2: E_val: E_in: 0.971240 // 0.899843 E_out: depth=3: E_val: E_in: 0.978190 // 0.896956 E_out: """ from sklearn.ensemble import GradientBoostingClassifier from sklearn.preprocessing import StandardScaler from sklearn.pipeline import Pipeline import numpy as np gb = GradientBoostingClassifier(loss='exponential', n_estimators=1000, learning_rate=0.1, subsample=0.5) d = 3 X, y = dataset.load_train(depth=d) raw_scaler = StandardScaler() raw_scaler.fit(np.r_[X, dataset.load_test()]) X_scaled = raw_scaler.transform(X) gb.fit(X_scaled, y) IO.cache(gb, Path.of_cache('gbdt2.GradientBoostingClassifier.d%d.pkl' % d)) IO.dump_submission(Pipeline([('scaler', raw_scaler), ('gbdt', gb)]), 'gbdt2_0708_03.1000.d%d' % d) logger.debug('E_in(full): %f', Util.auc_score(gb, X_scaled, y)) X, y = dataset.load_train() X_scaled = raw_scaler.transform(X) logger.debug('E_in(depth=0): %f', Util.auc_score(gb, X_scaled, y))
# Train path has the class structure classes = os.listdir(train_path) num_classes = len(classes) print("num of classes:", num_classes) #Keeping image size as 128 img_size = 128 num_channels = 3 # We shall load all the training and validation images and labels into memory using openCV and use that during training train_data = dataset.read_train_sets(train_path, img_size, classes, validation_size=0.2) print("Complete reading input data. Will Now print a snippet of it") print("Number of files in Training-set:\t{}".format(len(train_data.train.labels))) print("Number of files in Validation-set:\t{}".format(len(train_data.valid.labels))) test_data = dataset.load_test(test_path, img_size) print("Number of files in test-set:\t{}".format(len(test_data))) batch_size = 8 num_iterations = 3 #session, y_pred_cls = tensorflow_model.load_tensorflow(train_data, batch_size, num_iterations, img_size) #x, y_true = tensorflow_model.input_placeholders(img_size, num_channels, num_classes) #y_true_cls = tf.argmax(y_true, dimension=1) #prediction = session.run(y_pred_cls, feed_dict={'x': test_data}) #print(prediction) # save the model #saver = tf.train.Saver()