# Training script: builds (or loads cached) training data for one volume
# and fits a random forest merge-priority policy.
import os

from gala import agglo, classify, imio

# `fman` is assumed to be a gala feature manager defined earlier in the
# session (see the sketch after this function for one plausible definition).

def train(index):
    out_fn = 'training-data-%i.h5' % index
    if os.path.exists(out_fn):
        data, labels = classify.load_training_data_from_disk(
            out_fn, names=['data', 'labels'])
    else:
        ws_tr = imio.read_image_stack('watershed-%i.lzf.h5' % index)
        pr_tr = imio.read_image_stack('probabilities-%i.lzf.h5' % index) / 255
        gt_tr = imio.read_image_stack('ground-truth-%i.lzf.h5' % index)
        g = agglo.Rag(ws_tr, pr_tr, feature_manager=fman)
        # learn_agglomerate returns a list of (features, labels, weights,
        # merge history) tuples; keep the features and labels of the first.
        data, labels = g.learn_agglomerate(gt_tr, fman, min_num_epochs=4)[0][:2]
        classify.save_training_data_to_disk(
            [data, labels], fn=out_fn, names=['data', 'labels'])
    print('total training data:', data.shape)
    print('size in MB:', data.size * data.itemsize / 1e6)
    rf = classify.DefaultRandomForest()
    rf.fit(data, labels[:, 0])
    policy = agglo.classifier_probability(fman, rf)
    return policy
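# A minimal usage sketch, not from the original log: one plausible way to
# build `fman` and apply the learned policy to a held-out volume. The test
# filenames are hypothetical; the moments/histogram feature managers and the
# Rag/agglomerate calls follow gala's documented API.
from gala import features

fman = features.base.Composite(children=[features.moments.Manager(),
                                         features.histogram.Manager()])

policy = train(0)
ws_ts = imio.read_image_stack('watershed-test.lzf.h5')             # hypothetical file
pr_ts = imio.read_image_stack('probabilities-test.lzf.h5') / 255   # hypothetical file
g_ts = agglo.Rag(ws_ts, pr_ts, policy, feature_manager=fman)
g_ts.agglomerate(0.5)  # merge until the classifier merge probability exceeds 0.5
seg = g_ts.get_segmentation()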
# IPython log file
# Pool the training data from all four volumes and run a grid search over
# random forest hyperparameters.
import numpy as np
from gala import classify

datas = []
labels = []
for i in range(4):
    data, label = classify.load_training_data_from_disk(
        'training-data-%i.h5' % i, names=['data', 'labels'])
    datas.append(data)
    labels.append(label[:, 0])
X0 = np.concatenate(datas, axis=0)
y0 = np.concatenate(labels)

# runtime was 5min for 3,000 samples, so expect ~2h for 72,000;
# for 280,000, expect ~8h (it actually took 10h)
idx = np.random.choice(len(y0), size=280000, replace=False)
X, y = X0[idx], y0[idx]

param_dist = {'n_estimators': [20, 100, 200, 500],
              'max_depth': [3, 5, 20, None],
              'max_features': ['auto', 5, 10, 20],
              'bootstrap': [True, False],
              'criterion': ['gini', 'entropy']}

from time import time
from sklearn import ensemble
from sklearn import grid_search as gs  # removed in scikit-learn 0.20; see below

rf = ensemble.RandomForestClassifier()
# exhaustive grid search (despite the `random_search` name), parallel over 12 cores
random_search = gs.GridSearchCV(rf, param_grid=param_dist, refit=False,
                                verbose=2, n_jobs=12)
start = time(); random_search.fit(X, y); stop = time()
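# A sketch of the modern equivalent: sklearn.grid_search moved to
# sklearn.model_selection in 0.18 and was removed in 0.20, and
# max_features='auto' is no longer accepted for classifiers ('sqrt' is the
# equivalent setting).
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = dict(param_dist, max_features=['sqrt', 5, 10, 20])
search = GridSearchCV(RandomForestClassifier(), param_grid=param_grid,
                      refit=False, verbose=2, n_jobs=12)
t0 = time(); search.fit(X, y); elapsed = time() - t0
print('elapsed: %.1f s' % elapsed)
print(search.best_params_, search.best_score_)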
# IPython log file
# Run this in the NewEM data folder: compare random forest vs logistic
# regression on training speed, prediction speed, and accuracy.
import numpy as np
from gala import classify

X, y = classify.load_training_data_from_disk('training-data-0.h5',
                                             names=['data', 'labels'])
y = y[:, 0]
train_idxs = np.random.randint(0, X.shape[0], size=10_000)
Xtr, ytr = X[train_idxs], y[train_idxs]
test_idxs = np.random.randint(0, X.shape[0], size=1000)
test_idxs = np.setdiff1d(test_idxs, train_idxs)  # drop overlap with the training set
Xts, yts = X[test_idxs], y[test_idxs]

rf = classify.DefaultRandomForest()
rf.fit(Xtr, ytr)   # %timeit -n 1 -r 1
lg = classify.get_classifier('logist')
lg.fit(Xtr, ytr)   # %timeit -n 1 -r 1: ~20x faster training than the RF

lgacc = 1 - np.sum(lg.predict(Xts) != yts) / len(yts)  # 73%
rfacc = 1 - np.sum(rf.predict(Xts) != yts) / len(yts)  # 79.2%

# %timeit -r 1 -n 1 lg.predict(Xts)
# %timeit -r 1 -n 1 rf.predict(Xts)   # logistic is ~20x faster at batch prediction
# %timeit rf.predict(Xts[0:1])
# %timeit lg.predict(Xts[0:1])        # ~30x faster at single-row prediction

from sklearn.preprocessing import StandardScaler
s = StandardScaler()  # set up for a scaling experiment; see the sketch below
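# A plausible continuation of the scaling experiment, not from the original
# log: logistic regression is scale-sensitive, so the assumption is that the
# StandardScaler was meant to test whether standardized features close the
# accuracy gap. This uses plain scikit-learn LogisticRegression rather than
# guessing at gala's classifier internals.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

scaled_lg = make_pipeline(StandardScaler(), LogisticRegression())
scaled_lg.fit(Xtr, ytr)
scaled_lgacc = np.mean(scaled_lg.predict(Xts) == yts)
print('scaled logistic accuracy:', scaled_lgacc)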