Example #1
from sklearn.naive_bayes import MultinomialNB


def simple_naive_bayes(X, y):
    n, _ = X.shape
    nTrain = int(0.5 * n)  # train on the first 50% of the data
    Xtrain = X[:nTrain, :]
    ytrain = y[:nTrain]
    Xtest = X[nTrain:, :]
    ytest = y[nTrain:]

    clf = MultinomialNB().fit(Xtrain, ytrain)
    predict_y = clf.predict(Xtest)
    print(ytest)
    print(predict_y)
    print(rmsle(ytest, predict_y))  # rmsle is not defined in this snippet; see the sketch below
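Every example below scores its predictions with rmsle, which none of the snippets define. A minimal sketch, assuming the project's version matches the usual Kaggle root-mean-squared-logarithmic-error definition:

import numpy as np


def rmsle(y_true, y_pred):
    # clip negative predictions so log1p stays defined
    y_pred = np.clip(np.asarray(y_pred, dtype=float), 0, None)
    y_true = np.asarray(y_true, dtype=float)
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))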
Example #2
from sklearn.naive_bayes import MultinomialNB


def simple_naive_bayes(X, y):
    n, _ = X.shape
    nTrain = int(0.5 * n)  # train on the first 50% of the data
    Xtrain = X[:nTrain, :]
    ytrain = y[:nTrain]
    Xtest = X[nTrain:, :]
    ytest = y[nTrain:]

    clf = MultinomialNB().fit(Xtrain, ytrain)
    predict_y = clf.predict(Xtest)
    print(ytest)
    print(predict_y)
    print(rmsle(ytest, predict_y))
Example #3
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression


def logistic_regression(X, y):
    n, _ = X.shape
    nTrain = int(0.5 * n)  # train on the first 50% of the data
    Xtrain = X[:nTrain, :]
    ytrain = y[:nTrain]
    Xtest = X[nTrain:, :]
    ytest = y[nTrain:]

    for C in 10.0 ** np.arange(1, 6):
        # the liblinear solver supports both L1 and L2 penalties
        clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.01, solver='liblinear')
        clf_l2_LR = LogisticRegression(C=C, penalty='l2', tol=0.01, solver='liblinear')
        clf_l1_LR.fit(Xtrain, ytrain)
        clf_l2_LR.fit(Xtrain, ytrain)

        y1 = clf_l1_LR.predict(Xtest)
        y2 = clf_l2_LR.predict(Xtest)

        # L1 penalty
        print("L1 Penalty with C=" + str(C))
        print(rmsle(ytest, y1))
        # L2 penalty
        print("L2 Penalty with C=" + str(C))
        print(rmsle(ytest, y2))

    # baseline: ordinary linear regression
    logreg = LinearRegression()
    logreg.fit(Xtrain, ytrain)
    y3 = logreg.predict(Xtest)
    print("Linear Regression")
    print(y3)
    print(rmsle(ytest, y3))
Example #4

import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression


def logistic_regression(X, y):
    n, _ = X.shape
    nTrain = int(0.5 * n)  # train on the first 50% of the data
    Xtrain = X[:nTrain, :]
    ytrain = y[:nTrain]
    Xtest = X[nTrain:, :]
    ytest = y[nTrain:]

    for C in 10.0 ** np.arange(1, 6):
        # the liblinear solver supports both L1 and L2 penalties
        clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.01, solver='liblinear')
        clf_l2_LR = LogisticRegression(C=C, penalty='l2', tol=0.01, solver='liblinear')
        clf_l1_LR.fit(Xtrain, ytrain)
        clf_l2_LR.fit(Xtrain, ytrain)

        y1 = clf_l1_LR.predict(Xtest)
        y2 = clf_l2_LR.predict(Xtest)

        # L1 penalty
        print("L1 Penalty with C=" + str(C))
        print(rmsle(ytest, y1))
        # L2 penalty
        print("L2 Penalty with C=" + str(C))
        print(rmsle(ytest, y2))

    # baseline: ordinary linear regression
    logreg = LinearRegression()
    logreg.fit(Xtrain, ytrain)
    y3 = logreg.predict(Xtest)
    print("Linear Regression")
    print(y3)
    print(rmsle(ytest, y3))
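None of these snippets show where X and y come from. A hypothetical driver for the Kaggle bike-sharing data (the file name and feature columns are assumptions, not from the original code):

import pandas as pd

# hypothetical path to the Kaggle "Bike Sharing Demand" training file
data = pd.read_csv('train.csv')
feature_cols = ['season', 'holiday', 'workingday', 'weather',
                'temp', 'atemp', 'humidity', 'windspeed']
X = data[feature_cols].to_numpy()
y = data['count'].to_numpy()

simple_naive_bayes(X, y)
logistic_regression(X, y)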
Example #5
from sklearn.svm import SVR


def svm_poly(X, y_casual, y_regis, y_total):
    # function name and split boilerplate reconstructed from the other
    # examples in this file; the original snippet begins at Xtrain
    n, _ = X.shape
    nTrain = int(0.5 * n)
    Xtrain = X[:nTrain, :]
    y_casual_train = y_casual[:nTrain]
    y_regis_train = y_regis[:nTrain]
    y_total_train = y_total[:nTrain]
    Xtest = X[nTrain:, :]
    y_casual_test = y_casual[nTrain:]
    y_regis_test = y_regis[nTrain:]
    y_total_test = y_total[nTrain:]

    # earlier classification experiment, kept for reference:
    #param_grid = {'C': [1, 5, 10, 100]}
    #clf = GridSearchCV(SVC(kernel='linear'), param_grid, n_jobs=-1)
    #clf = SVC(kernel='poly')
    #clf.fit(Xtrain, ytrain)
    #pred = clf.predict(Xtest)
    #print("best estimator =", clf.best_estimator_)
    #print("RMSE poly =", rmsle(ytest, pred))

    # fit separate regressors for registered and casual riders,
    # then sum the two predictions to estimate the total count
    clf_regis = SVR(kernel='poly')
    clf_regis.fit(Xtrain, y_regis_train)
    pred_regis = clf_regis.predict(Xtest)

    clf_casual = SVR(kernel='poly')
    clf_casual.fit(Xtrain, y_casual_train)
    pred_casual = clf_casual.predict(Xtest)

    pred_total = pred_casual + pred_regis
    print("RMSLE poly total =", rmsle(y_total_test, pred_total))
	
Example #6
from sklearn.ensemble import (AdaBoostRegressor, ExtraTreesRegressor,
                              GradientBoostingRegressor, RandomForestRegressor)
from sklearn.tree import DecisionTreeRegressor


def decision_tree(X, y1, y2, y3):
    n, _ = X.shape
    nTrain = int(0.5 * n)  # train on the first 50% of the data
    Xtrain = X[:nTrain, :]
    ytrain = y1[:nTrain]
    ytrain_registered = y2[:nTrain]
    ytest_registered = y2[nTrain:]
    ytrain_casual = y3[:nTrain]
    ytest_casual = y3[nTrain:]
    Xtest = X[nTrain:, :]
    ytest = y1[nTrain:]

    # registered riders

    clf_1 = DecisionTreeRegressor(max_depth=None)
    clf_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=None),
                              n_estimators=500)
    clf_4 = RandomForestRegressor(n_estimators=500,
                                  max_depth=None,
                                  min_samples_split=2,  # scikit-learn requires >= 2
                                  random_state=0)
    clf_5 = ExtraTreesRegressor(n_estimators=500,
                                max_depth=None,
                                min_samples_split=2,  # scikit-learn requires >= 2
                                random_state=0)
    clf_3 = GradientBoostingRegressor(n_estimators=500,
                                      max_depth=None,
                                      random_state=0)

    print("finished generating trees")

    clf_1.fit(Xtrain, ytrain_registered)
    clf_2.fit(Xtrain, ytrain_registered)
    clf_3.fit(Xtrain, ytrain_registered)
    clf_4.fit(Xtrain, ytrain_registered)
    clf_5.fit(Xtrain, ytrain_registered)

    print('Finished fitting')

    dt_regular = clf_1.predict(Xtest)
    ada_regular = clf_2.predict(Xtest)
    grad_regular = clf_3.predict(Xtest)
    rf_regular = clf_4.predict(Xtest)
    et_regular = clf_5.predict(Xtest)

    # casual riders: refit the same estimators on the casual counts
    clf_1.fit(Xtrain, ytrain_casual)
    clf_2.fit(Xtrain, ytrain_casual)
    clf_3.fit(Xtrain, ytrain_casual)
    clf_4.fit(Xtrain, ytrain_casual)
    clf_5.fit(Xtrain, ytrain_casual)

    print('Finished fitting')

    dt_casual = clf_1.predict(Xtest)
    ada_casual = clf_2.predict(Xtest)
    grad_casual = clf_3.predict(Xtest)
    rf_casual = clf_4.predict(Xtest)
    et_casual = clf_5.predict(Xtest)
    feature_imps = clf_4.feature_importances_  # importances from the casual fit

    print("regular decision tree")
    print(rmsle(ytest, dt_regular + dt_casual))
    print("boosted decision tree")
    print(rmsle(ytest, ada_regular + ada_casual))
    print("gradient tree boosting")
    print(rmsle(ytest, grad_regular + grad_casual))
    print("random forest regressor")
    print(rmsle(ytest, rf_regular + rf_casual))
    print("extra trees regressor")
    print(rmsle(ytest, et_casual + et_regular))

    print("feature importances")
    print(feature_imps)
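Both decision-tree examples repeat the same pattern: fit one copy of a model on the registered counts, another on the casual counts, and sum the two predictions. A sketch of that pattern as a reusable helper (the helper name is mine, not from the original):

from sklearn.base import clone


def fit_predict_sum(model, Xtrain, Xtest, ytrain_registered, ytrain_casual):
    # fit independent copies on each rider type, then add the predictions
    reg = clone(model).fit(Xtrain, ytrain_registered)
    cas = clone(model).fit(Xtrain, ytrain_casual)
    return reg.predict(Xtest) + cas.predict(Xtest)

Any regressor above can then be scored with print(rmsle(ytest, fit_predict_sum(clf_4, Xtrain, Xtest, ytrain_registered, ytrain_casual))).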
Example #7
from sklearn.svm import SVR


def svm_linear(X, y_casual, y_regis, y_total):
    # function name and split boilerplate reconstructed from the other
    # examples in this file; the original snippet begins at Xtest
    n, _ = X.shape
    nTrain = int(0.5 * n)
    Xtrain = X[:nTrain, :]
    y_casual_train = y_casual[:nTrain]
    y_regis_train = y_regis[:nTrain]
    y_total_train = y_total[:nTrain]
    Xtest = X[nTrain:, :]
    y_casual_test = y_casual[nTrain:]
    y_regis_test = y_regis[nTrain:]
    y_total_test = y_total[nTrain:]

    # earlier grid-search experiment, kept for reference:
    #param_grid = {'C': [1, 5, 10, 100]}
    #clf = GridSearchCV(SVC(kernel='linear'), param_grid, n_jobs=-1)

    clf_regis = SVR(kernel='linear')
    clf_regis.fit(Xtrain, y_regis_train)
    pred_regis = clf_regis.predict(Xtest)
    #print("best estimator =", clf.best_estimator_)
    #print("RMSLE linear registered =", rmsle(y_regis_test, pred_regis))

    clf_casual = SVR(kernel='linear')
    clf_casual.fit(Xtrain, y_casual_train)
    pred_casual = clf_casual.predict(Xtest)

    pred_total = pred_casual + pred_regis

    # leftover sanity checks from debugging
    print(len(y_total_test))
    print(len(pred_total))
    print("RMSLE linear total =", rmsle(y_total_test, pred_total))
Example #8

from sklearn.neighbors import KNeighborsRegressor


def knn(X, y_casual, y_regis, y_total):
    # function name and the n/nTrain boilerplate reconstructed from the
    # other examples in this file
    n, _ = X.shape
    nTrain = int(0.5 * n)

    # optional shuffling, kept for reference:
    #np.random.seed(42)
    #np.random.shuffle(idx)
    #X = X[idx]
    #y = y[idx]

    Xtrain = X[:nTrain, :]
    y_casual_train = y_casual[:nTrain]
    y_regis_train = y_regis[:nTrain]
    y_total_train = y_total[:nTrain]
    Xtest = X[nTrain:, :]
    y_casual_test = y_casual[nTrain:]
    y_regis_test = y_regis[nTrain:]
    y_total_test = y_total[nTrain:]

    neighbors = 4

    # p=1 selects Manhattan distance
    clf_regis = KNeighborsRegressor(n_neighbors=neighbors,
                                    algorithm='kd_tree',
                                    leaf_size=70,
                                    p=1)
    clf_regis.fit(Xtrain, y_regis_train)
    pred_regis = clf_regis.predict(Xtest)

    clf_casual = KNeighborsRegressor(n_neighbors=neighbors,
                                     algorithm='kd_tree',
                                     leaf_size=70,
                                     p=1)
    clf_casual.fit(Xtrain, y_casual_train)
    pred_casual = clf_casual.predict(Xtest)

    pred_total = pred_casual + pred_regis
    print("RMSLE knn total =", rmsle(y_total_test, pred_total))
Example #9
from sklearn.ensemble import RandomForestRegressor


def decision_tree(X, y1, y2, y3):
    n, _ = X.shape
    nTrain = int(0.5 * n)  # train on the first 50% of the data
    Xtrain = X[:nTrain, :]
    ytrain = y1[:nTrain]
    ytrain_registered = y2[:nTrain]
    ytest_registered = y2[nTrain:]
    ytrain_casual = y3[:nTrain]
    ytest_casual = y3[nTrain:]
    Xtest = X[nTrain:, :]
    ytest = y1[nTrain:]

    # the compute_importances and min_density arguments were removed from
    # scikit-learn; criterion 'mse' is now 'squared_error', and
    # max_features='auto' (all features) is now spelled 1.0
    clf_4 = RandomForestRegressor(bootstrap=True,
                                  criterion='squared_error',
                                  max_depth=None,
                                  max_features=1.0,
                                  min_samples_leaf=2,
                                  min_samples_split=2,
                                  n_estimators=2000,
                                  n_jobs=1,
                                  oob_score=True,
                                  random_state=None,
                                  verbose=0)

    # earlier grid search over the forest, kept for reference:
    #rmsle_scorer = make_scorer(rmsle, greater_is_better=False)
    #tuned_parameters = [{'max_features': ['sqrt', 'log2', 'auto'],
    #                     'max_depth': [5, 8, 12],
    #                     'min_samples_leaf': [2, 5, 10]}]
    #rf_registered = GridSearchCV(RandomForestRegressor(n_jobs=1, n_estimators=1000),
    #                             tuned_parameters, cv=3, verbose=2,
    #                             scoring=rmsle_scorer).fit(Xtrain, ytrain_registered)
    #rf_casual = GridSearchCV(RandomForestRegressor(n_jobs=1, n_estimators=1000),
    #                         tuned_parameters, cv=3, verbose=2,
    #                         scoring=rmsle_scorer).fit(Xtrain, ytrain_casual)
    #print("Best parameters")
    #print(rf_registered.best_estimator_)
    #print(rf_casual.best_estimator_)

    # one forest fit directly on the total count
    clf_4.fit(Xtrain, ytrain)
    rf_total = clf_4.predict(Xtest)
    rf_ytrain = clf_4.predict(Xtrain)
    print("finished generating regressor")

    # the same forest refit on registered riders
    clf_4.fit(Xtrain, ytrain_registered)
    print('Finished fitting')
    rf_regular = clf_4.predict(Xtest)

    # and refit on casual riders
    clf_4.fit(Xtrain, ytrain_casual)
    print('Finished fitting')
    rf_casual = clf_4.predict(Xtest)

    print("random forest regressor")
    print(rmsle(ytest, rf_regular + rf_casual))  # registered + casual decomposition
    print(rmsle(ytest, rf_total))                # direct total model, test error
    print(rmsle(ytrain, rf_ytrain))              # direct total model, training error

    print("feature importances")
    print(clf_4.feature_importances_)  # importances from the casual fit
Example #10
from sklearn.svm import SVR


def svm_rbf(X, y_casual, y_regis, y_total):
    # function name and split boilerplate reconstructed from the other
    # examples in this file; the original snippet begins at y_casual_train
    n, _ = X.shape
    nTrain = int(0.5 * n)
    Xtrain = X[:nTrain, :]
    y_casual_train = y_casual[:nTrain]
    y_regis_train = y_regis[:nTrain]
    y_total_train = y_total[:nTrain]
    Xtest = X[nTrain:, :]
    y_casual_test = y_casual[nTrain:]
    y_regis_test = y_regis[nTrain:]
    y_total_test = y_total[nTrain:]

    # earlier rbf classification experiment, kept for reference:
    #param_grid = {'C': [1, 5, 10, 100],
    #              'gamma': [0.00001, 0.0001, 0.001, 0.01, 0.1]}
    #clf = GridSearchCV(SVC(kernel='rbf'), param_grid, n_jobs=-1)
    #clf = SVC(kernel='rbf', C=5.0, gamma=0.0001)
    #clf.fit(Xtrain, ytrain)
    #pred = clf.predict(Xtest)
    #print("best estimator =", clf.best_estimator_)
    #print("RMSE rbf =", rmsle(ytest, pred))

    clf_regis = SVR(kernel='rbf')
    clf_regis.fit(Xtrain, y_regis_train)
    pred_regis = clf_regis.predict(Xtest)

    clf_casual = SVR(kernel='rbf')
    clf_casual.fit(Xtrain, y_casual_train)
    pred_casual = clf_casual.predict(Xtest)

    pred_total = pred_casual + pred_regis
    print("RMSLE rbf total =", rmsle(y_total_test, pred_total))
Example #11
from sklearn.ensemble import RandomForestRegressor


def decision_tree(X, y1, y2, y3):
    n, _ = X.shape
    nTrain = int(0.5 * n)  # train on the first 50% of the data
    Xtrain = X[:nTrain, :]
    ytrain = y1[:nTrain]
    ytrain_registered = y2[:nTrain]
    ytest_registered = y2[nTrain:]
    ytrain_casual = y3[:nTrain]
    ytest_casual = y3[nTrain:]
    Xtest = X[nTrain:, :]
    ytest = y1[nTrain:]

    # the compute_importances and min_density arguments were removed from
    # scikit-learn; criterion 'mse' is now 'squared_error', and
    # max_features='auto' (all features) is now spelled 1.0
    clf_4 = RandomForestRegressor(bootstrap=True,
                                  criterion='squared_error',
                                  max_depth=None,
                                  max_features=1.0,
                                  min_samples_leaf=2,
                                  min_samples_split=2,
                                  n_estimators=2000,
                                  n_jobs=1,
                                  oob_score=True,
                                  random_state=None,
                                  verbose=0)

    # earlier grid search over the forest, kept for reference:
    #rmsle_scorer = make_scorer(rmsle, greater_is_better=False)
    #tuned_parameters = [{'max_features': ['sqrt', 'log2', 'auto'],
    #                     'max_depth': [5, 8, 12],
    #                     'min_samples_leaf': [2, 5, 10]}]
    #rf_registered = GridSearchCV(RandomForestRegressor(n_jobs=1, n_estimators=1000),
    #                             tuned_parameters, cv=3, verbose=2,
    #                             scoring=rmsle_scorer).fit(Xtrain, ytrain_registered)
    #rf_casual = GridSearchCV(RandomForestRegressor(n_jobs=1, n_estimators=1000),
    #                         tuned_parameters, cv=3, verbose=2,
    #                         scoring=rmsle_scorer).fit(Xtrain, ytrain_casual)
    #print("Best parameters")
    #print(rf_registered.best_estimator_)
    #print(rf_casual.best_estimator_)

    # one forest fit directly on the total count
    clf_4.fit(Xtrain, ytrain)
    rf_total = clf_4.predict(Xtest)
    rf_ytrain = clf_4.predict(Xtrain)
    print("finished generating regressor")

    # the same forest refit on registered riders
    clf_4.fit(Xtrain, ytrain_registered)
    print('Finished fitting')
    rf_regular = clf_4.predict(Xtest)

    # and refit on casual riders
    clf_4.fit(Xtrain, ytrain_casual)
    print('Finished fitting')
    rf_casual = clf_4.predict(Xtest)

    print("random forest regressor")
    print(rmsle(ytest, rf_regular + rf_casual))  # registered + casual decomposition
    print(rmsle(ytest, rf_total))                # direct total model, test error
    print(rmsle(ytrain, rf_ytrain))              # direct total model, training error

    print("feature importances")
    print(clf_4.feature_importances_)  # importances from the casual fit
Example #12
from sklearn.svm import SVR


def svm_poly(X, y_casual, y_regis, y_total):
    # function name and split boilerplate reconstructed from the other
    # examples in this file; the original snippet begins at Xtrain
    n, _ = X.shape
    nTrain = int(0.5 * n)
    Xtrain = X[:nTrain, :]
    y_casual_train = y_casual[:nTrain]
    y_regis_train = y_regis[:nTrain]
    y_total_train = y_total[:nTrain]
    Xtest = X[nTrain:, :]
    y_casual_test = y_casual[nTrain:]
    y_regis_test = y_regis[nTrain:]
    y_total_test = y_total[nTrain:]

    # earlier classification experiment, kept for reference:
    #param_grid = {'C': [1, 5, 10, 100]}
    #clf = GridSearchCV(SVC(kernel='linear'), param_grid, n_jobs=-1)
    #clf = SVC(kernel='poly')
    #clf.fit(Xtrain, ytrain)
    #pred = clf.predict(Xtest)
    #print("best estimator =", clf.best_estimator_)
    #print("RMSE poly =", rmsle(ytest, pred))

    clf_regis = SVR(kernel='poly')
    clf_regis.fit(Xtrain, y_regis_train)
    pred_regis = clf_regis.predict(Xtest)

    clf_casual = SVR(kernel='poly')
    clf_casual.fit(Xtrain, y_casual_train)
    pred_casual = clf_casual.predict(Xtest)

    pred_total = pred_casual + pred_regis
    print("RMSLE poly total =", rmsle(y_total_test, pred_total))
Example #13
from sklearn.svm import SVR


def svm_rbf(X, y_casual, y_regis, y_total):
    # function name and split boilerplate reconstructed from the other
    # examples in this file; the original snippet begins at y_regis_train
    n, _ = X.shape
    nTrain = int(0.5 * n)
    Xtrain = X[:nTrain, :]
    y_casual_train = y_casual[:nTrain]
    y_regis_train = y_regis[:nTrain]
    y_total_train = y_total[:nTrain]
    Xtest = X[nTrain:, :]
    y_casual_test = y_casual[nTrain:]
    y_regis_test = y_regis[nTrain:]
    y_total_test = y_total[nTrain:]

    # earlier rbf classification experiment, kept for reference:
    #param_grid = {'C': [1, 5, 10, 100],
    #              'gamma': [0.00001, 0.0001, 0.001, 0.01, 0.1]}
    #clf = GridSearchCV(SVC(kernel='rbf'), param_grid, n_jobs=-1)
    #clf = SVC(kernel='rbf', C=5.0, gamma=0.0001)
    #clf.fit(Xtrain, ytrain)
    #pred = clf.predict(Xtest)
    #print("best estimator =", clf.best_estimator_)
    #print("RMSE rbf =", rmsle(ytest, pred))

    clf_regis = SVR(kernel='rbf')
    clf_regis.fit(Xtrain, y_regis_train)
    pred_regis = clf_regis.predict(Xtest)

    clf_casual = SVR(kernel='rbf')
    clf_casual.fit(Xtrain, y_casual_train)
    pred_casual = clf_casual.predict(Xtest)

    pred_total = pred_casual + pred_regis
    print("RMSLE rbf total =", rmsle(y_total_test, pred_total))
Example #14
from sklearn.neighbors import KNeighborsRegressor


def knn(X, y_casual, y_regis, y_total):
    # function name and split boilerplate reconstructed from the other
    # examples in this file; the original snippet begins at Xtrain
    n, _ = X.shape
    nTrain = int(0.5 * n)
    Xtrain = X[:nTrain, :]
    y_casual_train = y_casual[:nTrain]
    y_regis_train = y_regis[:nTrain]
    y_total_train = y_total[:nTrain]
    Xtest = X[nTrain:, :]
    y_casual_test = y_casual[nTrain:]
    y_regis_test = y_regis[nTrain:]
    y_total_test = y_total[nTrain:]

    neighbors = 4

    # p=1 selects Manhattan distance
    clf_regis = KNeighborsRegressor(n_neighbors=neighbors,
                                    algorithm='kd_tree',
                                    leaf_size=70,
                                    p=1)
    clf_regis.fit(Xtrain, y_regis_train)
    pred_regis = clf_regis.predict(Xtest)

    clf_casual = KNeighborsRegressor(n_neighbors=neighbors,
                                     algorithm='kd_tree',
                                     leaf_size=70,
                                     p=1)
    clf_casual.fit(Xtrain, y_casual_train)
    pred_casual = clf_casual.predict(Xtest)

    pred_total = pred_casual + pred_regis
    print("RMSLE knn total =", rmsle(y_total_test, pred_total))
Example #15
from sklearn.svm import SVR


def svm_linear(X, y_casual, y_regis, y_total):
    # function name and split boilerplate reconstructed from the other
    # examples in this file; the original snippet begins at y_total_train
    n, _ = X.shape
    nTrain = int(0.5 * n)
    Xtrain = X[:nTrain, :]
    y_casual_train = y_casual[:nTrain]
    y_regis_train = y_regis[:nTrain]
    y_total_train = y_total[:nTrain]
    Xtest = X[nTrain:, :]
    y_casual_test = y_casual[nTrain:]
    y_regis_test = y_regis[nTrain:]
    y_total_test = y_total[nTrain:]

    # earlier grid-search experiment, kept for reference:
    #param_grid = {'C': [1, 5, 10, 100]}
    #clf = GridSearchCV(SVC(kernel='linear'), param_grid, n_jobs=-1)

    clf_regis = SVR(kernel='linear')
    clf_regis.fit(Xtrain, y_regis_train)
    pred_regis = clf_regis.predict(Xtest)
    #print("best estimator =", clf.best_estimator_)
    #print("RMSLE linear registered =", rmsle(y_regis_test, pred_regis))

    clf_casual = SVR(kernel='linear')
    clf_casual.fit(Xtrain, y_casual_train)
    pred_casual = clf_casual.predict(Xtest)

    pred_total = pred_casual + pred_regis

    # leftover sanity checks from debugging
    print(len(y_total_test))
    print(len(pred_total))
    print("RMSLE linear total =", rmsle(y_total_test, pred_total))
Example #16
from sklearn.ensemble import (AdaBoostRegressor, ExtraTreesRegressor,
                              GradientBoostingRegressor, RandomForestRegressor)
from sklearn.tree import DecisionTreeRegressor


def decision_tree(X, y1, y2, y3):
    n, _ = X.shape
    nTrain = int(0.5 * n)  # train on the first 50% of the data
    Xtrain = X[:nTrain, :]
    ytrain = y1[:nTrain]
    ytrain_registered = y2[:nTrain]
    ytest_registered = y2[nTrain:]
    ytrain_casual = y3[:nTrain]
    ytest_casual = y3[nTrain:]
    Xtest = X[nTrain:, :]
    ytest = y1[nTrain:]

    # registered riders

    clf_1 = DecisionTreeRegressor(max_depth=None)
    clf_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=None),
                              n_estimators=500)
    clf_4 = RandomForestRegressor(n_estimators=500,
                                  max_depth=None,
                                  min_samples_split=2,  # scikit-learn requires >= 2
                                  random_state=0)
    clf_5 = ExtraTreesRegressor(n_estimators=500,
                                max_depth=None,
                                min_samples_split=2,  # scikit-learn requires >= 2
                                random_state=0)
    clf_3 = GradientBoostingRegressor(n_estimators=500,
                                      max_depth=None,
                                      random_state=0)

    print("finished generating trees")

    clf_1.fit(Xtrain, ytrain_registered)
    clf_2.fit(Xtrain, ytrain_registered)
    clf_3.fit(Xtrain, ytrain_registered)
    clf_4.fit(Xtrain, ytrain_registered)
    clf_5.fit(Xtrain, ytrain_registered)

    print('Finished fitting')

    dt_regular = clf_1.predict(Xtest)
    ada_regular = clf_2.predict(Xtest)
    grad_regular = clf_3.predict(Xtest)
    rf_regular = clf_4.predict(Xtest)
    et_regular = clf_5.predict(Xtest)

    # casual riders: refit the same estimators on the casual counts
    clf_1.fit(Xtrain, ytrain_casual)
    clf_2.fit(Xtrain, ytrain_casual)
    clf_3.fit(Xtrain, ytrain_casual)
    clf_4.fit(Xtrain, ytrain_casual)
    clf_5.fit(Xtrain, ytrain_casual)

    print('Finished fitting')

    dt_casual = clf_1.predict(Xtest)
    ada_casual = clf_2.predict(Xtest)
    grad_casual = clf_3.predict(Xtest)
    rf_casual = clf_4.predict(Xtest)
    et_casual = clf_5.predict(Xtest)
    feature_imps = clf_4.feature_importances_  # importances from the casual fit

    print("regular decision tree")
    print(rmsle(ytest, dt_regular + dt_casual))
    print("boosted decision tree")
    print(rmsle(ytest, ada_regular + ada_casual))
    print("gradient tree boosting")
    print(rmsle(ytest, grad_regular + grad_casual))
    print("random forest regressor")
    print(rmsle(ytest, rf_regular + rf_casual))
    print("extra trees regressor")
    print(rmsle(ytest, et_casual + et_regular))

    print("feature importances")
    print(feature_imps)
Example #17
from sklearn.svm import SVC


def svm_kernels(X, y):
    # function name and the n/nTrain boilerplate reconstructed from the
    # other examples in this file; the original snippet was cut off after
    # creating the sigmoid classifier, so its last three lines follow the
    # pattern of the three kernels above it
    n, _ = X.shape
    nTrain = int(0.5 * n)

    # optional shuffling, kept for reference:
    #np.random.seed(42)
    #np.random.shuffle(idx)
    #y = y[idx]
    #X = X[idx]

    # split the data
    Xtrain = X[:nTrain, :]
    ytrain = y[:nTrain]
    Xtest = X[nTrain:, :]
    ytest = y[nTrain:]

    # linear
    clf = SVC(kernel='linear')
    clf.fit(Xtrain, ytrain)
    pred = clf.predict(Xtest)
    print("RMSLE linear =", rmsle(ytest, pred))

    # polynomial
    clf = SVC(kernel='poly')
    clf.fit(Xtrain, ytrain)
    pred = clf.predict(Xtest)
    print("RMSLE poly =", rmsle(ytest, pred))

    # rbf
    clf = SVC(kernel='rbf')
    clf.fit(Xtrain, ytrain)
    pred = clf.predict(Xtest)
    print("RMSLE rbf =", rmsle(ytest, pred))

    # sigmoid
    clf = SVC(kernel='sigmoid')
    clf.fit(Xtrain, ytrain)
    pred = clf.predict(Xtest)
    print("RMSLE sigmoid =", rmsle(ytest, pred))
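SVC treats each distinct rental count as a separate class, which is presumably why the later examples switch to SVR. The regression drop-in is a one-line change per kernel; a sketch using the same split as above:

from sklearn.svm import SVR

# regression counterpart of the classifiers above
clf = SVR(kernel='rbf')
clf.fit(Xtrain, ytrain)
pred = clf.predict(Xtest)
print("RMSLE rbf (SVR) =", rmsle(ytest, pred))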