Example #1
import csv
import datetime

import numpy as np
from sklearn.ensemble import (ExtraTreesRegressor, GradientBoostingRegressor,
                              RandomForestRegressor)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# f_mae, f_rms, N_FOLDS, VERBOSE and StackedGeneralizer are assumed to be
# defined at module level (see the sketch after this example).

def process_one_file(f):
    print(f)
    x, y = [], []
    with open(f) as fh:
        csv_r = csv.reader(fh)
        next(csv_r)  # skip the header row

        for r in csv_r:
            # the timestamp is parsed but not used as a feature
            tmp_t = datetime.datetime.strptime(r[1], '%Y-%m-%d %H:%M:%S')
            hour = float(r[2])     # cast from str so the feature matrix is numeric
            minutes = float(r[3])
            v_occ_min = float(r[4])
            w_occ_min = float(r[5])
            v_occ = float(r[6])
            win = float(r[7])
            wout = float(r[8])
            raw_v_occ = float(r[9])
            _y = float(r[-1])
            x.append([hour, minutes, v_occ_min, w_occ_min,
                      v_occ, win, wout, raw_v_occ])
            y.append(_y)

    x, y = np.array(x), np.array(y)
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, random_state=233)

    # baseline: a single gradient-boosted regression tree model
    t = GradientBoostingRegressor()
    t.fit(x_train, y_train)
    predict = t.predict(x_test)
    predict_all = t.predict(x)
    print('gbrt', f_mae(predict, y_test), f_rms(predict, y_test),
          f_mae(predict_all, y), f_rms(predict_all, y))

    # define base models
    base_models = [GradientBoostingRegressor(n_estimators=100),
                   RandomForestRegressor(n_estimators=100, n_jobs=-1),
                   ExtraTreesRegressor(n_estimators=100, n_jobs=-1)]

    # define blending model
    blending_model = LinearRegression()

    # initialize multi-stage model
    sg = StackedGeneralizer(base_models, blending_model,
                            n_folds=N_FOLDS, verbose=VERBOSE)

    # fit model
    sg.fit(x_train, y_train)
    predict = sg.predict(x_test)
    predict_all = sg.predict(x)
    print('stack', f_mae(predict, y_test), f_rms(predict, y_test),
          f_mae(predict_all, y), f_rms(predict_all, y))

    print()
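The helpers f_mae and f_rms and the constants N_FOLDS and VERBOSE are not shown in the example. A minimal sketch of what they presumably are: mean-absolute-error and root-mean-square-error functions plus two module-level constants. The names match the calls above, but the bodies and values here are assumptions, not code from the original source.

import numpy as np

N_FOLDS = 5     # assumed value, not shown in the original
VERBOSE = True  # assumed value, not shown in the original

def f_mae(pred, truth):
    # mean absolute error between predictions and ground truth
    return np.mean(np.abs(np.asarray(pred) - np.asarray(truth)))

def f_rms(pred, truth):
    # root-mean-square error between predictions and ground truth
    return np.sqrt(np.mean((np.asarray(pred) - np.asarray(truth)) ** 2))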
Example #2

        # Body of a nested loop over detection pairs (i, j); the enclosing
        # loops and the definitions of times, s0/s1, x0/x1, t0, n_test,
        # candidate_samples, net and isolated_test_data_img are missing
        # from the snippet (see the reconstruction sketch below).
        t1 = times[j]

        # pairwise features: relative size change (2 values), relative
        # velocity (2 values) and a 512-dim CNN feature from the 'fc6' blob
        relative_size_change = (s0 - s1) / (s0 + s1)
        relative_velocity = (x0 - x1) / (t1 - t0)
        candidate_samples[i * n_test + j][0:2] = relative_size_change
        candidate_samples[i * n_test + j][2:4] = relative_velocity
        # place the two detection image stacks into channels 0-4 and 5-9
        # of the network's input blob
        net.blobs['data'].data[0, 0:5, ...] = isolated_test_data_img[i]
        net.blobs['data'].data[0, 5:10, ...] = isolated_test_data_img[j]
        tmp_out = net.forward()
        cnn_prediction = tmp_out['fc6'].copy()
        print(cnn_prediction)
        candidate_samples[i * n_test + j][4:516] = cnn_prediction
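For context, here is a plausible reconstruction of the loop the fragment sits in. Everything in it (n_test, sizes, positions, times, the 516-column layout) is inferred from the indexing above, not taken from the original source. Note that the diagonal pairs (i == j) divide by t1 - t0 == 0, which produces exactly the non-finite entries the cleanup below zeroes out.

import numpy as np

# hypothetical stand-ins for the per-detection attributes used above
n_test = 4
sizes = np.random.rand(n_test, 2)       # (width, height) per detection
positions = np.random.rand(n_test, 2)   # (x, y) centre per detection
times = np.arange(n_test, dtype=float)  # timestamp per detection

candidate_samples = np.zeros((n_test * n_test, 516))
for i in range(n_test):
    s0, x0, t0 = sizes[i], positions[i], times[i]
    for j in range(n_test):
        s1, x1 = sizes[j], positions[j]
        t1 = times[j]
        candidate_samples[i * n_test + j][0:2] = (s0 - s1) / (s0 + s1)
        candidate_samples[i * n_test + j][2:4] = (x0 - x1) / (t1 - t0)
        # columns 4:516 hold the CNN feature; omitted in this sketch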

# zero out non-finite entries (NaN is already non-finite, so a separate
# isnan pass is redundant)
candidate_samples[~np.isfinite(candidate_samples)] = 0
pred = sg.predict(candidate_samples)
# keep the larger of the two class scores for each pair
pred_labels = np.maximum(pred[:, 0], pred[:, 1])


Example #3

def save_data_as_hdf5_results(hdf5_data_filename, data, predict_labels):
    '''
    HDF5 is one of the data formats Caffe accepts
    '''
    with h5py.File(hdf5_data_filename, 'w') as f:
        f['data'] = data.astype(np.float32)
        f['label'] = predict_labels.astype(np.float32)
                # body of a nested loop over detection pairs (i, j), as in
                # Example #2; the enclosing loop headers are missing from
                # the snippet
                relative_size_change = (s0 - s1) / (s0 + s1)
                relative_velocity = (x0 - x1) / (t1 - t0)
                candidate_samples[i * n_test + j][0:2] = relative_size_change
                candidate_samples[i * n_test + j][2:4] = relative_velocity
                net.blobs['data'].data[0, 0:5, ...] = isolated_test_data_img[i]
                net.blobs['data'].data[0, 5:10, ...] = isolated_test_data_img[j]
                tmp_out = net.forward()
                cnn_prediction = tmp_out['fc6'].copy()
                # print(cnn_prediction)
                candidate_samples[i * n_test + j][4:516] = cnn_prediction

        # zero out non-finite entries (NaN is already non-finite, so a
        # separate isnan pass is redundant)
        candidate_samples[~np.isfinite(candidate_samples)] = 0
        pred = sg.predict(candidate_samples)
        # keep the larger of the two class scores for each pair
        pred_labels = np.maximum(pred[:, 0], pred[:, 1])
        s_array = isolated_test_scores
        # print(s_array)
        # reshape the pairwise scores into an n_test x n_test matrix
        s_matrix = np.array(pred_labels).reshape((n_test, n_test))
        # print(s_matrix)

        # build a header string of column indices for display
        index = ' '
        for i in range(n_test):
            index += str(i)
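The HDF5 helper above can be exercised on its own. A minimal usage sketch with made-up shapes; the filename and dimensions are illustrative only:

import h5py
import numpy as np

data = np.random.rand(10, 516)             # 10 hypothetical feature rows
labels = np.random.randint(0, 2, size=10)  # 10 hypothetical labels
save_data_as_hdf5_results('results.h5', data, labels)

# read the file back to confirm the layout Caffe expects
with h5py.File('results.h5', 'r') as f:
    print(f['data'].shape, f['data'].dtype)    # (10, 516) float32
    print(f['label'].shape, f['label'].dtype)  # (10,) float32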
Example #4

import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# train_sample and y are assumed to be loaded earlier; N_FOLDS, VERBOSE and
# StackedGeneralizer are assumed to be module-level names, as in Example #1.

# shuffle the rows so the train/test split below is random
shuffle_idx = np.random.permutation(y.shape[0])

X = train_sample[shuffle_idx]
y = y[shuffle_idx]

# hold out 20 percent of data for testing accuracy
train_prct = 0.8
n_train = int(round(X.shape[0]*train_prct))

# define base models
base_models = [GradientBoostingClassifier(n_estimators=100),
               GradientBoostingClassifier(n_estimators=100),
               GradientBoostingClassifier(n_estimators=100)]

# define blending model
blending_model = LogisticRegression()

# initialize multi-stage model
sg = StackedGeneralizer(base_models, blending_model,
                        n_folds=N_FOLDS, verbose=VERBOSE)

# fit model
sg.fit(X[:n_train], y[:n_train])

# test accuracy
pred = sg.predict(X[n_train:])
pred_classes = [np.argmax(p) for p in pred]

_ = sg.evaluate(y[n_train:], pred_classes)
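For comparison, the same three-model stack can be built with scikit-learn's own StackingClassifier (available since scikit-learn 0.22). This is a sketch of the equivalent setup, not the StackedGeneralizer implementation the example uses; it reuses X, y and n_train from above:

from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

stack = StackingClassifier(
    estimators=[('gb1', GradientBoostingClassifier(n_estimators=100)),
                ('gb2', GradientBoostingClassifier(n_estimators=100)),
                ('gb3', GradientBoostingClassifier(n_estimators=100))],
    final_estimator=LogisticRegression(),
    cv=5)  # out-of-fold predictions from 5 folds feed the blender

stack.fit(X[:n_train], y[:n_train])
print(stack.score(X[n_train:], y[n_train:]))  # held-out accuracy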
Example #5
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# `data` is assumed to be a dataset loaded earlier (e.g. a scikit-learn
# Bunch with .data and .target)
X = data.data  # assumed: the feature matrix matching data.target
y = data.target

shuffle_idx = np.random.permutation(y.size)

X = X[shuffle_idx]
y = y[shuffle_idx]

# hold out 20 percent of data for testing accuracy
train_prct = 0.8
n_train = int(round(X.shape[0]*train_prct))

# define base models
base_models = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
               RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
               ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini')]

# define blending model
blending_model = LogisticRegression()

# initialize multi-stage model
sg = StackedGeneralizer(base_models, blending_model,
                        n_folds=N_FOLDS, verbose=VERBOSE)

# fit model
sg.fit(X[:n_train], y[:n_train])

# test accuracy
pred = sg.predict(X[n_train:])
pred_classes = [np.argmax(p) for p in pred]

_ = sg.evaluate(y[n_train:], pred_classes)
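For reference, the held-out accuracy can also be computed directly from pred_classes without going through sg.evaluate:

accuracy = np.mean(np.array(pred_classes) == y[n_train:])
print('held-out accuracy: %.3f' % accuracy)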