def process_one_file(f): print f csv_r = csv.reader(open(f)) csv_r.next() # jump header x,y=[],[] for r in csv_r: tmp_t = datetime.datetime.strptime(r[1],'%Y-%m-%d %H:%M:%S') hour = r[2] minutes = r[3] v_occ_min = float(r[4]) w_occ_min = float(r[5]) v_occ = float(r[6]) win = float(r[7]) wout = float(r[8]) raw_v_occ = float(r[9]) _y = float(r[-1]) x.append([hour,minutes,v_occ_min,w_occ_min,v_occ,win,wout,raw_v_occ]) y.append(_y) x,y=np.array(x),np.array(y) x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=233) t = GradientBoostingRegressor() t.fit(x_train,y_train) predict = t.predict(x_test) predict_all = t.predict(x) print 'gbrt',f_mae(predict,y_test),f_rms(predict,y_test),f_mae(predict_all,y),f_rms(predict_all,y) # define base models base_models = [GradientBoostingRegressor(n_estimators=100), RandomForestRegressor(n_estimators=100, n_jobs=-1), ExtraTreesRegressor(n_estimators=100, n_jobs=-1)] # define blending model blending_model = LinearRegression() # initialize multi-stage model sg = StackedGeneralizer(base_models, blending_model, n_folds=N_FOLDS, verbose=VERBOSE) # fit model sg.fit(x_train,y_train) predict = sg.predict(x_test) predict_all = sg.predict(x) print 'stack', f_mae(predict, y_test), f_rms(predict, y_test), f_mae(predict_all, y), f_rms(predict_all, y) print ''
t1 = times[j] relative_size_change = (s0 - s1) / (s0 + s1) relative_velocity = (x0 - x1) / (t1 - t0) candidate_samples[i * n_test + j][0:2] = relative_size_change candidate_samples[i * n_test + j][2:4] = relative_velocity net.blobs['data'].data[0, 0:5, ...] = isolated_test_data_img[i] net.blobs['data'].data[0, 5:10, ...] = isolated_test_data_img[j] tmp_out = net.forward() cnn_prediction = tmp_out['fc6'].copy() print cnn_prediction candidate_samples[i * n_test + j][4:516] = cnn_prediction candidate_samples[np.isfinite(candidate_samples) == False] = 0 candidate_samples[np.isnan(candidate_samples) == True] = 0 pred = sg.predict(candidate_samples) pred_labels = np.zeros(pred.shape[0]) for i in range(0, pred.shape[0]): if pred[i, 0] > pred[i, 1]: pred_labels[i] = pred[i, 0] else: pred_labels[i] = pred[i, 1] def save_data_as_hdf5_results(hdf5_data_filename, data, predict_labels): ''' HDF5 is one of the data formats Caffe accepts ''' with h5py.File(hdf5_data_filename, 'w') as f: f['data'] = data.astype(np.float32) f['label'] = predict_labels.astype(np.float32)
# NOTE(review): this fragment sits inside an enclosing pairwise loop over
# indices i, j whose headers are outside this chunk; the flat layout below is
# a reconstruction -- confirm nesting against the full file.
# Pairwise features: normalized size difference and velocity between
# detections i and j.
relative_size_change = (s0 - s1) / (s0 + s1)
relative_velocity = (x0 - x1) / (t1 - t0)
candidate_samples[i * n_test + j][0:2] = relative_size_change
candidate_samples[i * n_test + j][2:4] = relative_velocity
# Run both 5-channel image stacks through the network and keep a copy of the
# 'fc6' output (512 values, filling feature columns 4:516).
net.blobs['data'].data[0, 0:5, ...] = isolated_test_data_img[i]
net.blobs['data'].data[0, 5:10, ...] = isolated_test_data_img[j]
tmp_out = net.forward()
cnn_prediction = tmp_out['fc6'].copy()
candidate_samples[i * n_test + j][4:516] = cnn_prediction

# Zero out every non-finite entry. One pass suffices: ~isfinite already
# covers NaN, so the original's separate isnan sweep was redundant.
candidate_samples[~np.isfinite(candidate_samples)] = 0

pred = sg.predict(candidate_samples)
# Per-row score = the larger of the two class scores (columns 0 and 1);
# astype keeps the original float64 result dtype.
pred_labels = np.maximum(pred[:, 0], pred[:, 1]).astype(np.float64)

s_array = isolated_test_scores
# Pairwise scores laid out as an n_test x n_test matrix.
s_matrix = np.array(pred_labels).reshape((n_test, n_test))
# Header row for the printed matrix: one digit per column index.
index = ' ' + ''.join(str(i) for i in range(n_test))
# NOTE(review): script fragment -- 'train_sample', 'y', 'StackedGeneralizer',
# 'N_FOLDS' and 'VERBOSE' are defined outside this chunk.
# Shuffle samples and targets with the same random permutation.
shuffle_idx = np.random.permutation(y.shape[0])
X = train_sample[shuffle_idx]
y = y[shuffle_idx]

# hold out 20 percent of data for testing accuracy
train_prct = 0.8
n_train = int(round(X.shape[0]*train_prct))

# define base models
# NOTE(review): all three level-0 models are identical
# GradientBoostingClassifiers; stacking usually mixes different learners --
# possibly intentional, confirm with the author.
base_models = [GradientBoostingClassifier(n_estimators=100),
               GradientBoostingClassifier(n_estimators=100),
               GradientBoostingClassifier(n_estimators=100)]

# define blending model
blending_model = LogisticRegression()

# initialize multi-stage model
sg = StackedGeneralizer(base_models, blending_model,
                        n_folds=N_FOLDS, verbose=VERBOSE)

# fit model
sg.fit(X[:n_train], y[:n_train])

# test accuracy
pred = sg.predict(X[n_train:])
# each row of 'pred' holds per-class scores; argmax picks the predicted class
pred_classes = [np.argmax(p) for p in pred]
_ = sg.evaluate(y[n_train:], pred_classes)
# NOTE(review): script fragment -- 'data', 'X', 'StackedGeneralizer',
# 'N_FOLDS' and 'VERBOSE' are defined outside this chunk.
y = data.target
# Shuffle samples and targets with the same random permutation.
shuffle_idx = np.random.permutation(y.size)
X = X[shuffle_idx]
y = y[shuffle_idx]

# hold out 20 percent of data for testing accuracy
train_prct = 0.8
n_train = int(round(X.shape[0]*train_prct))

# define base models
# Two random forests (gini / entropy splits) plus an extra-trees ensemble
# give diverse level-0 learners for the stack.
base_models = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
               RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
               ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini')]

# define blending model
blending_model = LogisticRegression()

# initialize multi-stage model
sg = StackedGeneralizer(base_models, blending_model,
                        n_folds=N_FOLDS, verbose=VERBOSE)

# fit model
sg.fit(X[:n_train], y[:n_train])

# test accuracy
pred = sg.predict(X[n_train:])
# each row of 'pred' holds per-class scores; argmax picks the predicted class
pred_classes = [np.argmax(p) for p in pred]
_ = sg.evaluate(y[n_train:], pred_classes)