Example #1
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = AdaBoostRegressor(RandomForestRegressor(n_estimators=100, max_depth=40, max_features=25), n_estimators=100)
        #self.clf_Boost = GradientBoostingRegressor( n_estimators = 500 , max_features = 20 )
        #self.clf_Regression = LinearRegression()
        

    def fit(self, X, y):
        self.clf.fit(X,y)

    def predict(self, X):
        return self.clf.predict(X)
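A quick usage sketch for the wrapper above (my addition, not part of the submission): it assumes the scikit-learn imports the snippet omits (BaseEstimator, AdaBoostRegressor, RandomForestRegressor) and synthetic data from make_regression.

# Hypothetical usage of the Regressor wrapper on synthetic data
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

X, y = make_regression(n_samples=200, n_features=25, random_state=0)
reg = Regressor()                 # AdaBoost over 100-tree random forests
reg.fit(X, y)
print(mean_squared_error(y, reg.predict(X)))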
Example #2
File: x.py  Project: shenbai/tradesafe
def backTest(trainEndDate, code, testDate, predictDate):
    conn = db.get_history_data_db('D')
    df = None
    # train more date
    # model = pickle.load(open('%s/%s.pkl' % (config.model_dir, code), 'r'))
    rng = np.random.RandomState(1)
    model = AdaBoostRegressor(DecisionTreeRegressor(
        max_depth=4), n_estimators=1000, random_state=rng, loss='square')
    df = pd.read_sql_query(
        "select * from history_data where date([date])<='%s' and code='%s' order by code, date([date]) asc" % (
            trainEndDate, code), conn)
    shift_1 = df['close'].shift(-2)
    df['target'] = shift_1
    data = df[df['target'] > -1000]

    X_train = data.ix[:, 'code':'turnover']
    y_train = data.ix[:, 'target']
    if len(X_train) < 500:
        return
    print len(X_train)
    # print data
    # for i in range(0, 10):
    #     model.fit(X_train, y_train)
    model.fit(X_train, y_train)
    # predict tomorrow
    try:
        df = pd.read_sql_query(config.sql_history_data_by_code_date % (code, testDate), conn)
        # print df
    except Exception, e:
        print e
Example #3
def round2(X_df, featurelist):
    # Set parameters
    model = AdaBoostRegressor()
    y_df = X_df['target']
    n = len(y_df)

    # Perform 5-fold cross validation
    scores = []
    kf = KFold(n, n_folds=5, shuffle=True)

    # Calculate mean absolute deviation for train/test for each fold
    for train_idx, test_idx in kf:
        X_train, X_test = X_df.iloc[train_idx, :], X_df.iloc[test_idx, :]
        # y_train, y_test = y_df[train_idx], y_df[test_idx]

        X_train, X_test = applyFeatures(X_train, X_test, featurelist)
        Xtrain_array, ytrain_array, Xtest_array, ytest_array = dfToArray(X_train, X_test)
        model.fit(Xtrain_array, ytrain_array)
        prediction = model.predict(Xtest_array)
        rmse = np.sqrt(mean_squared_error(ytest_array, prediction))
        scores.append(rmse)
        print rmse
        print "Finish fold"

    return scores
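round2 above is written against the pre-0.18 cross_validation.KFold signature (KFold(n, n_folds=5)) and project-specific helpers (applyFeatures, dfToArray). A self-contained sketch of the same per-fold RMSE loop with the current model_selection.KFold API, on toy data:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

X, y = make_regression(n_samples=300, n_features=10, noise=5.0, random_state=0)
model = AdaBoostRegressor()
scores = []
for train_idx, test_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(X):
    model.fit(X[train_idx], y[train_idx])
    rmse = np.sqrt(mean_squared_error(y[test_idx], model.predict(X[test_idx])))
    scores.append(rmse)
    print(rmse)
print(np.mean(scores))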
Example #4
File: predictor.py  Project: atremblay/MLND
    def fit(self, start_date, end_date):

        for ticker in self.tickers:
            self.stocks[ticker] = Stock(ticker)

        params_ada = [{
            'n_estimators': [25, 50, 100],
            'learning_rate': [0.01, 0.1, 1, 10],
            'loss': ['linear', 'square', 'exponential']
            }]

        params = ParameterGrid(params_ada)

        # Find the split for training and CV
        mid_date = train_test_split(start_date, end_date)
        for ticker, stock in self.stocks.items():

            X_train, y_train = stock.get_data(start_date, mid_date, fit=True)
            # X_train = self.pca.fit_transform(X_train.values)
            X_train = X_train.values
            # pdb.set_trace()
            X_cv, y_cv = stock.get_data(mid_date, end_date)
            # X_cv = self.pca.transform(X_cv.values)
            X_cv = X_cv.values

            lowest_mse = np.inf
            for i, param in enumerate(params):
                ada = AdaBoostRegressor(**param)
                ada.fit(X_train, y_train.values)
                mse = mean_squared_error(
                    y_cv, ada.predict(X_cv))
                if mse <= lowest_mse:
                    lowest_mse = mse  # track the best score, not just the best model
                    self.models[ticker] = ada

        return self
Example #5
    def predict(tour_data):

        vec = DictVectorizer()

        tour_data = get_tour_data()

        transformed = vec.fit_transform(tour_data).toarray()
        categories = vec.get_feature_names()

        y = transformed[:,[categories.index('rating')]]
        X = transformed[:,np.arange(transformed.shape[1])!=categories.index('rating')]

        reg_tree = DecisionTreeRegressor()

        rng = np.random.RandomState(1)
        addboost_tree = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                              n_estimators=300, random_state=rng)

        reg_tree.fit(X, y)
        addboost_tree.fit(X, y)

        # Predict
        y_1 = reg_tree.predict(X)
        y_2 = addboost_tree.predict(X)

        return y_1, y_2
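The function above relies on the project-specific get_tour_data(); the interesting part is how DictVectorizer turns dict records into a matrix and how the 'rating' column is split out as the target. A toy, self-contained illustration of that split (hand-made records; get_feature_names_out replaces the older get_feature_names):

import numpy as np
from sklearn.feature_extraction import DictVectorizer

records = [{'rating': 4.0, 'price': 100, 'days': 3},
           {'rating': 3.5, 'price': 80, 'days': 2}]
vec = DictVectorizer()
transformed = vec.fit_transform(records).toarray()
categories = list(vec.get_feature_names_out())

y = transformed[:, categories.index('rating')]
X = transformed[:, np.arange(transformed.shape[1]) != categories.index('rating')]
print(X.shape, y.shape)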
Example #6
def ada_boost_regressor(train_x, train_y, pred_x, review_id, v_curve=False, l_curve=False, get_model=True):
    """
    :param train_x: train
    :param train_y: text
    :param pred_x: test set to predict
    :param review_id: takes in a review id
    :param v_curve: run the model for validation curve
    :param l_curve: run the model for learning curve
    :param get_model: run the model
    :return: the predicted values,learning curve, validation curve
    """
    ada = AdaBoostRegressor(n_estimators=5)
    if get_model:
        print "Fitting Ada..."
        ada.fit(train_x, np.log(train_y+1))
        ada_pred = np.exp(ada.predict(pred_x))-1
        Votes = ada_pred[:,np.newaxis]
        Id = np.array(review_id)[:,np.newaxis]
        # create submission csv for Kaggle
        submission_ada= np.concatenate((Id,Votes),axis=1)
        np.savetxt("submission_ada.csv", submission_ada,header="Id,Votes", delimiter=',',fmt="%s, %0.2f", comments='')
    # plot validation and learning curves
    if l_curve:
        print "Working on Learning Curves"
        plot_learning_curve(AdaBoostRegressor(), "Learning curve: Adaboost", train_x, np.log(train_y+1.0))
    if v_curve:
        print "Working on Validation Curves"
        plot_validation_curve(AdaBoostRegressor(), "Validation Curve: Adaboost", train_x, np.log(train_y+1.0),
                              param_name="n_estimators", param_range=[2, 5, 10, 15, 20, 25, 30])
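The submission step above stacks ids next to the back-transformed predictions and writes a headed CSV with np.savetxt. A tiny self-contained sketch of that pattern with toy numeric ids and a hypothetical output file name:

import numpy as np

review_id = [101, 102, 103]              # toy ids
ada_pred = np.array([1.7, 0.2, 3.4])     # toy predictions
Id = np.array(review_id)[:, np.newaxis]
Votes = ada_pred[:, np.newaxis]
submission = np.concatenate((Id, Votes), axis=1)
np.savetxt("submission_toy.csv", submission, header="Id,Votes",
           delimiter=',', fmt="%s, %0.2f", comments='')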
Example #7
def Round2(X, y):
    # Set parameters
    min_score = {}
    for loss in ['linear', 'square', 'exponential']:
        model = AdaBoostRegressor(loss=loss)
        n = len(y)

        # Perform 5-fold cross validation
        scores = []
        kf = KFold(n, n_folds=5, shuffle=True)

        # Calculate mean absolute deviation for train/test for each fold
        for train_idx, test_idx in kf:
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            model.fit(X_train, y_train)
            prediction = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, prediction))
            # score = model.score(X_test, y_test)
            scores.append(rmse)
        if len(min_score) == 0:
            min_score['loss'] = loss
            min_score['scores'] = scores
        else:
            if np.mean(scores) < np.mean(min_score['scores']):
                min_score['loss'] = loss
                min_score['scores'] = scores

        print "Loss:", loss
        print scores
        print np.mean(scores)
    return min_score
Example #8
def main():



    ab = AdaBoostRegressor(base_estimator=None, n_estimators=50, 
                            learning_rate=1.0, loss='exponential', 
                            random_state=None)  

    ab.fit(X_train, y_train)

    #Evaluation in train set
    pred_proba_train = ab.predict(X_train)
        
    mse_train = mean_squared_error(y_train, pred_proba_train)
    rmse_train = np.sqrt(mse_train)
    logloss_train = log_loss(y_train, pred_proba_train)
    
    #Evaluation in validation set
    pred_proba_val = ab.predict(X_val)
        
    mse_val = mean_squared_error(y_val, pred_proba_val)
    rmse_val = np.sqrt(mse_val)
    logloss_val = log_loss(y_val, pred_proba_val)
    
    # Report metrics. Note: log_loss expects class probabilities, so its value on raw
    # regression outputs is questionable; kept here to mirror the original intent.
    print(rmse_train)
    print(rmse_val)
    print(logloss_train)
    print(logloss_val)
Example #9
def train_learning_model_decision_tree_ada_boost(df):
    #code taken from sklearn
    X_all, y_all = preprocess_data(df)
    X_train, X_test, y_train, y_test = split_data(X_all, y_all)

    tree_regressor = DecisionTreeRegressor(max_depth = 6)
    ada_regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6), n_estimators = 500, learning_rate = 0.01, random_state = 1)

    tree_regressor.fit(X_train, y_train)
    ada_regressor.fit(X_train, y_train)

    y_pred_tree = tree_regressor.predict(X_test)
    y_pred_ada = ada_regressor.predict(X_test)
    
    mse_tree = mean_squared_error(y_test, y_pred_tree)
    mse_ada = mean_squared_error(y_test, y_pred_ada)

    mse_tree_train = mean_squared_error(y_train, tree_regressor.predict(X_train))
    mse_ada_train = mean_squared_error(y_train, ada_regressor.predict(X_train))
    
    print ("MSE tree: %.4f " %mse_tree)
    print ("MSE ada: %.4f " %mse_ada)

    print ("MSE tree train: %.4f " %mse_tree_train)
    print ("MSE ada train: %.4f " %mse_ada_train)
def predict_volatility_1year_ahead(rows, day, num_days):
    """
    SUMMARY: Predict volatility 1 year into the future
    ALGORITHM:
      a) The predictor will train on all data up to exactly 1 year (252 trading days) before `day`
      b) The newest 10 days up to and including `day` will be used as the feature vector for the prediction
         i.e. if day = 0, then the feature vector for prediction will consist of days (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
              if day = 10, then the feature vector for predictor input will be days (10, 11, 12, 13, 14, 15, 16, 17, 18, 19)
    INPUT: minimum of (1 year + 10 days) of data before `day` (newest data is day=0)
  
    """

    '''enforce that `day` is in the required range'''
    assert len(rows) >= 252+num_days + day, 'You need to have AT LEAST 252+%d rows AFTER the day index. See predict_volatility_1year_ahead() for details.' % num_days
    assert day >= 0

    '''Compile features for fitting'''
    feature_sets = []
    value_sets = [];
    for ii in range(day+num_days+252, len(rows) - num_days):
        features = []
        for jj in range(num_days):
            day_index = ii + jj
            features += [
        	float(rows[day_index][7]), 
        	float(rows[day_index][8]),
        	float(rows[day_index][9]), 
        	float(rows[day_index][10]),
        	float(rows[day_index][11]),
        	float(rows[day_index][12]),
        	float(rows[day_index][13]),
            ]
            #print("issue here: " + str(rows[day_index][0]))
        feature_sets += [features]
        value_sets += [float(rows[ii-252][9])]

    '''Create Regressor and fit'''
    num_features = 16
    rng = np.random.RandomState(1)
    regr = AdaBoostRegressor(CustomClassifier(), n_estimators=3, random_state=rng)
    regr.fit(feature_sets, value_sets)

    '''Get prediction features'''
    ii = day
    features = []
    for jj in range( num_days ):
        day_index = ii + jj   
        features += [
        float(rows[day_index][7]), 
        float(rows[day_index][8]),
        float(rows[day_index][9]), 
        float(rows[day_index][10]),
        float(rows[day_index][11]),
        float(rows[day_index][12]),
        float(rows[day_index][13]),
        ]
        
    return float(regr.predict([features]))
Example #11
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = AdaBoostRegressor(RandomForestRegressor(n_estimators=500, max_depth=78, max_features=10), n_estimators=40)

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        return self.clf.predict(X)
Example #12
def ada_learning(labels, train, test):
    label_log=np.log1p(labels)
    # try 50 / 1.0
    #boost GradientBoostingRegressor(n_estimators=200, max_depth=8, learning_rate=0.1)
    clf=AdaBoostRegressor(GradientBoostingRegressor(n_estimators=200, max_depth=8, learning_rate=0.1),n_estimators=50, learning_rate=1.0)
    model=clf.fit(train, label_log)
    preds1=model.predict(test)
    preds=np.expm1(preds1)
    return  preds
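ada_learning applies log1p to the labels before fitting and expm1 to the predictions afterwards. The same pattern can be expressed declaratively with sklearn.compose.TransformedTargetRegressor; the sketch below is an alternative formulation on toy non-negative targets, not code from the original project:

import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor

train, labels = make_regression(n_samples=200, n_features=5, noise=1.0, random_state=0)
labels = np.abs(labels)   # log1p expects non-negative targets, as with the cost data above

clf = TransformedTargetRegressor(
    regressor=AdaBoostRegressor(
        GradientBoostingRegressor(n_estimators=200, max_depth=8, learning_rate=0.1),
        n_estimators=50, learning_rate=1.0),
    func=np.log1p, inverse_func=np.expm1)
preds = clf.fit(train, labels).predict(train)   # predictions come back on the original scale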
Example #13
File: ada_reg.py  Project: HouJP/tianyi-16
def train_predict(train_id, test_id):
	# load libsvm files for training dataset
	Xs_train = []
	ys_train = []
	n_train = load_libsvm_files(train_id, Xs_train, ys_train)
	# load libsvm files for testing dataset
	Xs_test = []
	ys_test = []
	n_test = load_libsvm_files(test_id, Xs_test, ys_test)

	# models
	model = []

	# ans
	ans_train = []
	ans_test = []

	# generate predictions for training dataset
	ps_train = []
	for i in range(0, n_train):
		ps_train.append([0.0 for j in range(10)])

	# generate predictions for testing dataset
	ps_test = []
	for i in range(0, n_test):
		ps_test.append([0.0 for j in range(10)])

	# fit models
	for i in range(10):
		l = np.array([ys_train[j][i] for j in range(n_train)])
		clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth=params['max_depth']), n_estimators=params['n_estimators'], learning_rate=params['learning_rate'])
		clf.fit(Xs_train[i].toarray(), l)
		print "[%s] [INFO] %d model training done" % (t_now(), i)
		preds_train = clf.staged_predict(Xs_train[i].toarray())
		ans_train.append([item for item in preds_train])
		# print "len(ans_train[%d]) = %d" % (i, len(ans_train[i]))
		print "[%s] [INFO] %d model predict for training data set done" % (t_now(), i)
		preds_test = clf.staged_predict(Xs_test[i].toarray())
		ans_test.append([item for item in preds_test])
		print "[%s] [INFO] %d model predict for testing data set done" % (t_now(), i)

	#print "len_ans_train=%d" % len(ans_train[0])

	# predict for testing data set
	for i in range(params['n_estimators']):
		for j in range(10):
			tmp = min(i, len(ans_train[j]) - 1)
			for k in range(n_train):
				ps_train[k][j] = ans_train[j][tmp][k]
			tmp = min(i, len(ans_test[j]) - 1)
			for k in range(n_test):
				ps_test[k][j] = ans_test[j][tmp][k]
		print "%s,%d,%f,%f" % (t_now(), i + 1, mean_cos_similarity(ys_train, ps_train, n_train), mean_cos_similarity(ys_test, ps_test, n_test))

	return 0
def AdaBoost(xTrain, yTrain, xTest, yTest, treeNum):
	rms = dict()
	for trees in treeNum:
		ab = AdaBoostRegressor(n_estimators = trees)
		ab.fit(xTrain, yTrain)
		yPred = ab.predict(xTest)
		rms[trees] = sqrt(mean_squared_error(yTest, yPred))

	(bestRegressor, rmse) = sorted(rms.iteritems(), key = operator.itemgetter(1))[0]

	return bestRegressor, rmse
Example #15
def ada_boost(data,classifier,sample):
    from sklearn.ensemble import AdaBoostRegressor
    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
    from sklearn.cluster import KMeans
    from sklearn.naive_bayes import GaussianNB
    func = GaussianNB()
    func = DecisionTreeRegressor()
    func = KMeans(n_clusters=2)
    clf = AdaBoostRegressor(func,n_estimators=300,random_state=random.RandomState(1))
    clf.fit(data,classifier)
    print_result(clf,[sample])
Example #16
class Regressor(BaseEstimator):
    def __init__(self):
        cl = RandomForestRegressor(n_estimators=10, max_depth=10, max_features=10)
        self.clf = AdaBoostRegressor(base_estimator = cl, n_estimators=100)

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        return self.clf.predict(X)
#RandomForestClassifier
def test_boston():
    # Check consistency on dataset boston house prices.
    reg = AdaBoostRegressor(random_state=0)
    reg.fit(boston.data, boston.target)
    score = reg.score(boston.data, boston.target)
    assert score > 0.85

    # Check we used multiple estimators
    assert len(reg.estimators_) > 1
    # Check for distinct random states (see issue #7408)
    assert_equal(len(set(est.random_state for est in reg.estimators_)),
                 len(reg.estimators_))
Example #18
def test_sparse_regression():
    """Check regression with sparse input."""

    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification on fit caries data type for later verification."""
            super(CustomSVR, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_regression(n_samples=100, n_features=50, n_targets=1,
                                    random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                          dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = AdaBoostRegressor(
            base_estimator=CustomSVR(probability=True),
            random_state=1
        ).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = dense_results = AdaBoostRegressor(
            base_estimator=CustomSVR(probability=True),
            random_state=1
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        sparse_type = type(X_train_sparse)
        types = [i.data_type_ for i in sparse_classifier.estimators_]

        assert all([(t == csc_matrix or t == csr_matrix)
                   for t in types])
Example #19
def performAdaBoostReg(train, test, features, output):
    """
    Ada Boost Regression
    """

    clf = AdaBoostRegressor()
    clf.fit(train[features], train[output])
    Predicted = clf.predict(test[features])
    
    plt.plot(test[output])
    plt.plot(Predicted, color='red')
    plt.show()        
    
    return mean_squared_error(test[output],Predicted), r2_score(test[output], Predicted)
Example #20
def do_adaboost(filename):
    df, Y = create_merged_dataset(filename)
    # Ideas:
    # Create a feature for accelerations e deacceleration.

    # Leave default base regressor for AdaBoost(decision tree). Extra trees were tried with catastrophic results.
    #ada = AdaBoostRegressor(n_estimators=350, learning_rate=0.05)
    ada = AdaBoostRegressor(n_estimators=500, learning_rate=1)
    
    #X = df.drop(['driver', 'trip', 'prob_points', 'prob_speed', 'prob_distance', 'prob_acceleration'], 1)
    X = df.drop(['driver', 'trip'], 1)
    ada.fit(X, Y)
    probs = ada.predict(X[:200])
    return pd.DataFrame({'driver': df['driver'][:200], 'trip': df['trip'][:200], 'probs': probs})
def predict_volatility_1year_ahead(rows, day):
    """
    SUMMARY: Predict volatility 1 year into the future
    ALGORITHM:
      a) The predictor will train on all data up to exactly 1 year (252 trading days) before `day`
      b) The newest 10 days up to and including `day` will be used as the feature vector for the prediction
         i.e. if day = 0, then the feature vector for prediction will consist of days (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
              if day = 10, then the feature vector for predictor input will be days (10, 11, 12, 13, 14, 15, 16, 17, 18, 19)
    INPUT: minimum of (1 year + 10 days) of data before `day` (newest data is day=0)
  
    """

    #num_days = 10
    num_days = 10

    # enforce that `day` is in the required range
    assert len(rows) >= 252+num_days + day, 'You need to have AT LEAST 252+%d rows AFTER the day index. See predict_volatility_1year_ahead() for details.' % num_days
    assert day >= 0

    # compile features (X) and values (Y) 
    feature_sets = []
    value_sets = []; value_sets_index = []
    for ii in range(day+252, len(rows) - num_days):
        features = []
        for jj in range(num_days):
            day_index = ii + jj
            features += [float(rows[day_index][1]), float(rows[day_index][2]), float(rows[day_index][3]), float(rows[day_index][5]), float(rows[day_index][7]), float(rows[day_index][8]), float(rows[day_index][9])]
        feature_sets += [features]
        value_sets += [float(rows[ii-252][9])]
        value_sets_index.append([ii-252])
             
    # fit
    #regr = linear_model.Lasso(alpha=0.01,fit_intercept=False,normalize=False,max_iter=10000000)   # they call lambda alpha
    rng = np.random.RandomState(1)
    regr = AdaBoostRegressor(CustomClassifier(), n_estimators=4, random_state=rng)
    #regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=2, random_state=rng)
    #regr =  DecisionTreeRegressor(max_depth=4)
    regr.fit(feature_sets, value_sets)
    

    #print "Adaboost weights:", regr.estimator_weights_

    ii = day
    features = []
    for jj in range( num_days ):
        day_index = ii + jj    +252    
        features += [float(rows[day_index][1]), float(rows[day_index][2]), float(rows[day_index][3]), float(rows[day_index][5]), float(rows[day_index][7]), float(rows[day_index][8]), float(rows[day_index][9])]

    return float(regr.predict([features]))
def test_sample_weight_adaboost_regressor():
    """
    AdaBoostRegressor should work without sample_weights in the base estimator
    The random weighted sampling is done internally in the _boost method in
    AdaBoostRegressor.
    """
    class DummyEstimator(BaseEstimator):

        def fit(self, X, y):
            pass

        def predict(self, X):
            return np.zeros(X.shape[0])

    boost = AdaBoostRegressor(DummyEstimator(), n_estimators=3)
    boost.fit(X, y_regr)
    assert_equal(len(boost.estimator_weights_), len(boost.estimator_errors_))
Example #23
    def initGrid(X,y):
        min_samples_split = [2,4,6,8]
        max_depth = [2,4,6,8]
        n_estimators=[50,100,150]
        bootstrap=[False, True]
        min_samples_leaf=[2,4,6,8]

        grid = {
            'min_samples_split':min_samples_split,
            'max_depth': max_depth,
            'min_samples_leaf':min_samples_leaf
        }
        model = DecisionTreeRegressor();
        gs = GridSearchCV(estimator=model, param_grid=grid,  verbose=10, n_jobs=-1)
        gs.fit(X,y)
        print(gs.best_params_)
        search = AdaBoostRegressor(gs)
        search.fit(X,y)
        return search
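Note that boosting the fitted GridSearchCV object itself means every boosting round clones it and re-runs the whole grid search on that round's sample. A common, cheaper variant is to boost a fresh tree configured with the best parameters found; the sketch below shows that variant on toy data (an assumption about intent, not the original code):

from sklearn.datasets import make_regression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=200, n_features=8, noise=10.0, random_state=0)
grid = {'min_samples_split': [2, 4, 6, 8],
        'max_depth': [2, 4, 6, 8],
        'min_samples_leaf': [2, 4, 6, 8]}
gs = GridSearchCV(DecisionTreeRegressor(), param_grid=grid, n_jobs=-1)
gs.fit(X, y)
# boost a fresh tree built from the best hyperparameters found above
boosted = AdaBoostRegressor(DecisionTreeRegressor(**gs.best_params_), n_estimators=50)
boosted.fit(X, y)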
Example #24
File: code.py  Project: nilichen/ML4Trading
def train_model(training, testing, window=5, n=5):
	X_train, y_train = prepare_data(training)
	X_test, y_test = prepare_data(testing)
	rf = RandomForestRegressor()
	rf.fit(X_train, y_train)
	predrf = rf.predict(X_test)
	print "mse for random forest regressor: ", mean_squared_error(predrf, y_test)

	gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.025)
	gb.fit(X_train, y_train)
	predgb = gb.predict(X_test)
	print "mse for gradient boosting regressor: ", mean_squared_error(predgb, y_test)
	## plot feature importance using GBR results
	fx_imp = pd.Series(gb.feature_importances_, index=['bb', 'momentum', 'sma', 'volatility'])
	fx_imp /= fx_imp.max()  # normalize
	fx_imp.sort()
	ax = fx_imp.plot(kind='barh')
	fig = ax.get_figure()
	fig.savefig("output/feature_importance.png")

	adb = AdaBoostRegressor(DecisionTreeRegressor())
	adb.fit(X_train, y_train)
	predadb = adb.predict(X_test)
	print "mse for adaboosting decision tree regressor: ", mean_squared_error(predadb, y_test)

	scale = StandardScaler()
	scale.fit(X_train)
	X_trainscale = scale.transform(X_train)
	X_testscale = scale.transform(X_test)

	knn = BaggingRegressor(KNeighborsRegressor(n_neighbors=10), max_samples=0.5, max_features=0.5)
	knn.fit(X_trainscale, y_train)
	predknn = knn.predict(X_testscale)
	print "mse for bagging knn regressor: ", mean_squared_error(predknn, y_test)

	pred_test = 0.1*predrf+0.2*predgb+0.1*predadb+0.6*predknn
	print "mse for ensemble all the regressors: ", mean_squared_error(pred_test, y_test)
	result = testing.copy()
	result.ix[5:-5, 'trend'] = pred_test
	result.ix[10:, 'pred'] = pred_test * result.ix[5:-5, 'IBM'].values
	result.ix[:-5, 'pred_date'] = result.index[5:]

	return result
Example #25
File: calc_svm_imf.py  Project: fndjjx/emd
def svm_smooth(data, residual_imf, period):
    train_data = []
    lable = []
    for i in range(period,len(residual_imf)-20):
        tmp = data[i-period:i+1]
        train_data.append(tmp)
        lable.append(residual_imf[i])

    rng = np.random.RandomState(1)
    clf = AdaBoostRegressor(svm.SVR(),n_estimators=1, random_state=rng)
    clf.fit(train_data, lable) 
    smooth_data = []
    for i in range(len(data)):
        if i<=period:
            smooth_data.append(data[i])
        else:
            smooth_data.append(clf.predict([data[i-period:i+1]])[0])

    return smooth_data
def run_tree_regressor():
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.cross_validation import cross_val_score
    from sklearn.cross_validation import train_test_split
    import numpy as np
    from sklearn.ensemble import AdaBoostRegressor
 
    print "running me"
    X = np.genfromtxt("/home/john/Downloads/kaggle.X1.train.txt",delimiter=",") # load the text file
    Y = np.genfromtxt("/home/john/Downloads/kaggle.Y.train.txt",delimiter=",") 
    x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2)
     
    rng = np.random.RandomState(1)
 
    depth = 35 # current lowest
    for estimators in [130,235,300,345,450]:
        treeAdaBoost =  AdaBoostRegressor(DecisionTreeRegressor(max_depth=depth),n_estimators=estimators, random_state=rng)
        treeAdaBoost.fit(x_train, y_train)
        print "adabost estimators @ " + str(estimators) + ":", treeAdaBoost.score(x_test, y_test)
Example #27
def round1(X, y):
    # Set parameters
    model = AdaBoostRegressor()
    n = len(y)

    # Perform 5-fold cross validation
    scores = []
    kf = KFold(n, n_folds=5, shuffle=True)

    # Calculate mean absolute deviation for train/test for each fold
    for train_idx, test_idx in kf:
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        model.fit(X_train, y_train)
        prediction = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, prediction))
        scores.append(rmse)

    return scores
    def __init__(self, isTrain):
        super(RegressionUniformBlending, self).__init__(isTrain)
        # data preprocessing
        #self.dataPreprocessing()

        self.net1 = NeuralNet(
                        layers=[  # three layers: one hidden layer
                            ('input', layers.InputLayer),
                            ('hidden', layers.DenseLayer),
                            #('hidden2', layers.DenseLayer),
                            #('hidden3', layers.DenseLayer),
                            ('output', layers.DenseLayer),
                            ],
                        # layer parameters:
                        input_shape=(None, 13),  # input dimension is 13
                        hidden_num_units=6,  # number of units in hidden layer
                        #hidden2_num_units=8,  # number of units in hidden layer
                        #hidden3_num_units=4,  # number of units in hidden layer
                        output_nonlinearity=None,  # output layer uses the identity (linear) activation
                        output_num_units=1,  # output dimension is 1

                        # obejctive function
                        objective_loss_function = lasagne.objectives.squared_error,

                        # optimization method:
                        update=lasagne.updates.nesterov_momentum,
                        update_learning_rate=0.002,
                        update_momentum=0.4,

                        # use 20% as validation
                        train_split=TrainSplit(eval_size=0.2),

                        regression=True,  # flag to indicate we're dealing with regression problem
                        max_epochs=100,  # we want to train this many epochs
                        verbose=0,
                        )

        # Create linear regression object
        self.linRegr = linear_model.LinearRegression()

        # Create KNN regression object
        self.knn = neighbors.KNeighborsRegressor(86, weights='distance')

        # Create Decision Tree regression object
        self.decisionTree = DecisionTreeRegressor(max_depth=7, max_features=None)

        # Create AdaBoost regression object
        decisionReg = DecisionTreeRegressor(max_depth=10)
        rng = np.random.RandomState(1)
        self.adaReg = AdaBoostRegressor(decisionReg,
                          n_estimators=400,
                          random_state=rng)

        # Create linear regression object
        self.model = RandomForestRegressor(max_features='sqrt', n_estimators=32, max_depth=39)
    def __init__(self, isTrain):
        super(RegressionAdaBoost, self).__init__(isTrain)
        # data preprocessing
        #self.dataPreprocessing()

        # Create AdaBoost regression object
        decisionReg = DecisionTreeRegressor(max_depth=10)
        rng = np.random.RandomState(1)
        self.adaReg = AdaBoostRegressor(decisionReg,
                          n_estimators=400,
                          random_state=rng)
Example #30
def xgb_train(x_train, x_label, x_test):
    model = 'xgb'
    #model = 'adaboost'
    #if model.count('xgb') >0:
    params = {}
    params["objective"] = "reg:linear"
    params["eta"] = 0.005  # [0,1]
    params["min_child_weight"] = 6
    params["subsample"] = 0.7
    params["colsample_bytree"] = 0.7
    params["scale_pos_weight"] = 1.0
    params["silent"] = 1
    params["max_depth"] = 9
    if config.nthread > 1:
        params["nthread"] = 1

    num_rounds = 10000

    xgtrain = xgb.DMatrix(x_train, label=x_label)
    xgval = xgb.DMatrix(x_test)

    #train using early stopping and predict
    watchlist = [(xgtrain, "train")]
    #model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=120, feval=gini_metric)
    model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=120)
    pred1 = model.predict( xgval )

    #clf = RandomForestRegressor()
    #clf = LogisticRegression()
    #clf = GradientBoostingRegressor()
    clf = AdaBoostRegressor( ExtraTreesRegressor(max_depth=9), n_estimators=200 )
    clf.fit(x_train, x_label)
    pred2 = clf.predict(x_test)

    #pred = pred1 * pred2 / (pred1 + pred2)
    #pred = 0.7 * (pred1**0.01) + 0.3 * (pred2**0.01)
    #pred = (pred1.argsort() + pred2.argsort()) / 2
    pred = 0.6 * pred1 + 0.4 * pred2

    return pred
dat1 = df.loc[:, ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']]

X_train, X_test, y_train, y_test = train_test_split(dat1, target, test_size = 0.2, random_state=42)
y_train = y_train.values.ravel()

models = []
models.append(('SVR', SVR()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('DT', DecisionTreeRegressor()))
models.append(('RF', RandomForestRegressor()))
models.append(('l', Lasso()))
models.append(('EN', ElasticNet()))
models.append(('R', Ridge()))
models.append(('BR', BayesianRidge()))
models.append(('GBR', GradientBoostingRegressor()))
models.append(('AB', AdaBoostRegressor()))
models.append(('ET', ExtraTreesRegressor()))
models.append(('BgR', BaggingRegressor()))

scoring = 'neg_mean_squared_error'

results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=42)
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Clear MORE unused variable to free memory
del globals()['unqLikesUIDs']
del globals()['unqLikesLIDs']
del globals()['profilesDF']
del globals()['profiles']
del globals()['profilesLSo']
del globals()['profilesLS']
del globals()['row']
del globals()['tmpLS']
del globals()['tmpAGE']
del globals()['profsTOlikes']
del globals()['i']
del globals()['tmpIND']

# Training Model
###############
print("training started")

nEST = 100
lR = 1.0
adaBoost = AdaBoostRegressor(n_estimators=nEST, learning_rate=lR)
adaBoost.fit(likesMAT, neusARR)

print("training completed")

# Save model
###############
joblib.dump(adaBoost, "/Users/jamster/adaBoost-A-neus.xz", compress=9)
print("model saved to disk")

print("DONE")
    label_list = []
    for i in range(len(Candidates)):
        if output_piece.iloc[i,1] == training.iloc[L,10]:
            label_list.append(1)
        else:
            label_list.append(0)
    
    output_piece["lab"] = label_list
    Output = Output.append(output_piece)

weight = len(Output) / sum(Output.lab)
Output['Weight'] = Output['lab']*weight

from sklearn.ensemble import AdaBoostRegressor

m = AdaBoostRegressor()

X = Output.drop(["We","Wc", "lab", "Weight"],axis=1)
y = Output.lab
m.fit(X,y,sample_weight=Output.Weight)


for L in range(len(test)):
    if test.iloc[L,11] == False:
        We = test.iloc[L,5]
        output_piece = pd.DataFrame()
        for Threshold in range(20):
            Candidates = p4.candidate_search(Dictionary, We, Threshold)
            if len(Candidates) >= 10:
                break
def test_boston():
    """Check consistency on dataset boston house prices."""
    clf = AdaBoostRegressor(random_state=0)
    clf.fit(boston.data, boston.target)
    score = clf.score(boston.data, boston.target)
    assert score > 0.85
Example #35
        model = DecisionTreeRegressor(criterion="mse")
        model_name = "REGRESSION_TREE"
    elif selected_model == Model.RANDOM_FOREST:
        model = RandomForestRegressor(criterion="mse", n_estimators=20, min_samples_split=4, min_weight_fraction_leaf=0.01)
        model_name = "FOREST"
    elif selected_model == Model.EXTRA_TREE_REGRESSOR:
        model = ExtraTreesRegressor(criterion="mse")
        model_name = "EXTRA_TREE_REGRESSOR"
    elif selected_model == Model.GRADIENT_BOOSTING_REGRESSOR:
        model = GradientBoostingRegressor(loss="lad", n_estimators=200)
        model_name = "GRADIENT_BOOSTING_REGRESSOR"
    elif selected_model == Model.BAGGING_REGRESSOR:
        model = BaggingRegressor(oob_score=True)
        model_name = "BAGGING_REGRESSOR"
    elif selected_model == Model.ADABOOST_REGRESSOR:
        model = AdaBoostRegressor(loss="linear")
        model_name = "ADABOOST_REGRESSOR"
    else:
        Support.colored_print("No method selected!", "red")
        sys.exit(0)
    Support.colored_print("Method selected: " + model_name, "green")

    Support.colored_print("Training...", "green")
    t0 = time.time()
    model.fit(X[:train_size], y[:train_size])
    model_fit = time.time() - t0
    print(model_name + " complexity and bandwidth selected and model fitted in %.3f s" % model_fit)
    t0 = time.time()
    y_model = model.predict(X_plot)
    model_predict = time.time() - t0
    print(model_name + " prediction for %d inputs in %.3f s" % (X_plot.shape[0], model_predict))
Example #36
#Applied DecisionTreeRegressor technique and achieved accuracy of 72.76%

from sklearn.tree import DecisionTreeRegressor
tree_ = DecisionTreeRegressor()
tree_.fit(X_train,y_train)
y_pred_2 = tree_.predict(X_test)
print("Accuracy is: "+ str(tree_.score(X_test,y_test) * 100) + "%")
print("Mean Absolute Error: {}".format(mean_absolute_error(y_test,y_pred_2)))
print("Mean Squared Error: {}".format(mean_squared_error(y_test,y_pred_2)))
print("R Squared: {}".format(r2_score(y_test,y_pred_2)))

#Applied AdaBoostRegressor technique and achieved accuracy of 40.82%

from sklearn.ensemble import AdaBoostRegressor
ada = AdaBoostRegressor(loss = "exponential")
ada.fit(X_train,y_train)
y_pred_3 = ada.predict(X_test)
print("Accuracy is: "+ str(ada.score(X_test,y_test) * 100) + "%")
print("Mean Absolute Error: {}".format(mean_absolute_error(y_test,y_pred_3)))
print("Mean Squared Error: {}".format(mean_squared_error(y_test,y_pred_3)))
print("R Squared: {}".format(r2_score(y_test,y_pred_3)))

#Applied XGBRegressor technique and achieved accuracy of 87.05%

from xgboost import XGBRegressor
xgb = XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)
xgb.fit(X_train,y_train)
y_pred_8 = xgb.predict(X_test)
print("Accuracy is: "+ str(xgb.score(X_test,y_test) * 100) + "%")
Example #37
File: benchmark.py  Project: novieq/kaggle
    train)[:, 20], np.array(test)[:, :20], np.array(data_10)[:, :20], np.array(
        data_10)[:, 20],
print 'train', np.array(train).shape
print xtrain[1]
print 'xtrain', xtrain.shape
print 'ytrain', ytrain.shape
print 'test', xtest.shape
estimators = 100
#sup_vec = svm.SVC(C=11000, verbose = 2, probability=True)
#sup_vec = RandomForestRegressor(n_estimators=estimators, verbose=2, n_jobs=-1, max_leaf_nodes=100)
#sup_vec = ExtraTreesRegressor(n_estimators=estimators, verbose=2, n_jobs=-1, max_leaf_nodes=100)

#sup_vec =  AdaBoostRegressor(RandomForestRegressor(n_estimators=100, verbose=2, n_jobs = -1),n_estimators=100)
sup_vec = AdaBoostRegressor(ExtraTreesRegressor(n_estimators=100,
                                                verbose=2,
                                                n_jobs=-1),
                            n_estimators=160,
                            loss='exponential')

#sup_vec =  AdaBoostRegressor(DecisionTreeRegressor(max_depth=10),n_estimators=300)

#dt_stump = DecisionTreeClassifier(max_depth=4, min_samples_leaf=1)
#dt_stump.fit(xtrain, ytrain)
#dt_stump_err = 1.0 - dt_stump.score(xtrain, ytrain)
#n_estimators = 400
# A learning rate of 1. may not be optimal for both SAMME and SAMME.R
#learning_rate = 1.

#sup_vec = AdaBoostClassifier(
#    base_estimator=dt_stump,
#    learning_rate=learning_rate,
Example #38
filename = "blogData_train.csv"
train_data = pd.read_csv(filename, header=None)
#train_data = train_data.iloc[np.random.permutation(len(train_data))]
train_output = train_data[len(train_data.columns) - 1]
del train_data[len(train_data.columns) - 1]

filename = "blogData_test-2012.02.01.00_00.csv"
test_data = pd.read_csv(filename, header=None)
#test_data = test_data.iloc[np.random.permutation(len(test_data))]
test_output = test_data[len(test_data.columns) - 1]
del test_data[len(test_data.columns) - 1]

reg = LinearRegression()
rf = RandomForestRegressor()
gradBoost = GradientBoostingRegressor()
ada = AdaBoostRegressor()

#n_estimators=500

regressors = [reg, rf, gradBoost, ada]
regressor_names = [
    "Linear Regression", "Random Forests", "Gradient Boosting", "Adaboost"
]

#regressors = ada
#regressor_names = "Adaboost"

for regressor, regressor_name in zip(regressors, regressor_names):

    regressor.fit(train_data, train_output)
    predicted_values = regressor.predict(test_data)
Example #39
# importing necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

print(__doc__)
# Create the dataset
rng = np.random.RandomState(1)
X = np.linspace(0, 6, 100)[:, np.newaxis]
y = np.sin(X).ravel() + np.sin(6 * X).ravel() + rng.normal(0, 0.1, X.shape[0])
# dataArr, labelArr = loadDataSet("input/7.AdaBoost/horseColicTraining2.txt")
# Fit regression model
regr_1 = DecisionTreeRegressor(max_depth=4)
regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                           n_estimators=300,
                           random_state=rng)
regr_1.fit(X, y)
regr_2.fit(X, y)
# Predict
y_1 = regr_1.predict(X)
y_2 = regr_2.predict(X)
# Plot the results
plt.figure()
plt.scatter(X, y, c="k", label="training samples")
plt.plot(X, y_1, c="g", label="n_estimators=1", linewidth=2)
plt.plot(X, y_2, c="r", label="n_estimators=300", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Boosted Decision Tree Regression")
plt.legend()
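As a follow-up to the example above (my addition, reusing its regr_2, X and y), staged_predict yields the boosted prediction after each stage, which makes it easy to see how training error falls as estimators are added:

from sklearn.metrics import mean_squared_error

stage_mse = [mean_squared_error(y, y_stage) for y_stage in regr_2.staged_predict(X)]
print("MSE after 1 estimator: %.4f, after %d estimators: %.4f"
      % (stage_mse[0], len(stage_mse), stage_mse[-1]))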
Example #40
x_train = np.array(x_train)
x_test = np.array(x_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

print()
print("Train data shape : ", x_train.shape)
print("Test data shape : ", x_test.shape)

# defining models

LR = LinearRegression()
SVM = SVR()
RF = RandomForestRegressor()
KNN = KNeighborsRegressor()
AB = AdaBoostRegressor()
GB = GradientBoostingRegressor()

# function for training and prediction


def train_predict(model, trainX, trainY, testX, testY):
    model.fit(trainX, trainY)
    y_pred = model.predict(testX)
    acc_test = mse(testY, y_pred)
    return acc_test


LR_acc = train_predict(LR, x_train, y_train, x_test, y_test)
print("The error score of Linear Regression is : %f" % LR_acc)
SVM_acc = train_predict(SVM, x_train, y_train, x_test, y_test)
Example #41
def make_preds(start,stop,lookback):
    X = pd.read_pickle('Xdatanormalized.p')
    X = X.drop(['ebitdamultiple1','pctaffochange1','pctaffochange2','pctaffochange3','pctaffochange4'],axis=1)
    X = X[X['PMSector1']!='Hotel']
    y = pd.read_pickle('Ydatanormalized.p')
    X = X.merge(y,how='left',on=['PMSector1','date_bom'])
    X = X.dropna(subset=['trt'])

    test_X = X[X['date_bom']==stop]
    test_sectors = test_X['PMSector1']
    test_y = test_X['trt']
    test_X = test_X.drop(['trt','date_bom','PMSector1'],axis=1)

    X = X[(abs(X['trt'])<0.20)]
    y = X['trt']
    y = np.log(1+y)
    X = X.drop(['trt'],axis=1)
    X.reset_index(inplace=True)

    X_date_bom = pd.DataFrame(X['date_bom'],columns=['date_bom'])
    X=X[X.columns.difference(['date_bom','PMSector1','index'])]
    index_vals = X_date_bom

    pca = PCA()
    selection = SelectKBest()
    poly = PolynomialFeatures()
    combined_features = FeatureUnion([("pca", pca),
                                     ("univ_select", selection),
                                     ("poly",poly)])

    ada = AdaBoostRegressor()
    rfr = RandomForestRegressor()
    imputer = Imputer()

    pipeline = Pipeline(steps=[
                            ('imputer',imputer),
                            ('features',combined_features),
                            ('regressor',ada)])
    param_grid =[
    {
    'features__pca__n_components':[5],
    'features__univ_select__k':[2],
    'features__poly__degree':[1],
    'regressor':[rfr],
    'regressor__n_estimators':[400],
    'regressor__criterion':['mse'], 
    'regressor__min_samples_leaf':[1,2],
    'regressor__max_depth':[3,5,7],
    }
    ,
    {
    'features__pca__n_components':[5],
    'features__univ_select__k':[2],
    'features__poly__degree':[1],
    'regressor':[ada],
    'regressor__learning_rate':[1,0.1],
    'regressor__n_estimators':[400]
    }
    ]

    tscv = custom_timeseries_within(index_vals = X_date_bom,lookback=lookback,test=stop)

    grid_search = GridSearchCV(pipeline, param_grid=param_grid,scoring =my_scorer, cv=tscv, verbose=1)
    grid_search.fit(X,y)
    print(grid_search.best_score_)
    print(grid_search.best_params_)
    preds = grid_search.predict(test_X)
    final = pd.concat([pd.DataFrame(preds.reshape(-1)),test_y.reset_index(drop=True)],axis=1,ignore_index=True)
    final = pd.concat([final,test_sectors.reset_index(drop=True)],axis=1,ignore_index=True)
    final.columns = ['pred_trt','actual_trt','PMSector1']

    return(final)
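The param_grid above swaps the entire 'regressor' step between a random forest and AdaBoost inside one grid search. A stripped-down, self-contained sketch of that estimator-swapping pattern, without the project-specific feature union, imputer, or custom time-series splitter:

from sklearn.datasets import make_regression
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

X, y = make_regression(n_samples=200, n_features=6, noise=5.0, random_state=0)
pipeline = Pipeline(steps=[('regressor', AdaBoostRegressor())])
param_grid = [
    {'regressor': [RandomForestRegressor()], 'regressor__n_estimators': [100, 400]},
    {'regressor': [AdaBoostRegressor()], 'regressor__learning_rate': [1, 0.1]},
]
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=3)
grid_search.fit(X, y)
print(grid_search.best_params_)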
grid_result = grid.fit(rescaledX, Y_train)

print("Best: %f using %s" %
      (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f, (%f) with : %r" % (mean, stdev, param))

# Ensemble

ensembles = []
ensembles.append(('ScalesAB',
                  Pipeline([('Scaler', StandardScaler()),
                            ('AB', AdaBoostRegressor())])))
ensembles.append(('ScalesGBM',
                  Pipeline([('Scaler', StandardScaler()),
                            ('GBM', GradientBoostingRegressor())])))
ensembles.append(('ScalesRF',
                  Pipeline([('Scaler', StandardScaler()),
                            ('RF', RandomForestRegressor())])))
ensembles.append(('ScalesET',
                  Pipeline([('Scaler', StandardScaler()),
                            ('ET', ExtraTreesRegressor())])))

results = []
names = []
for name, model in ensembles:
    kfold = KFold(n_splits=num_folds, random_state=seed)
    cv_results = cross_val_score(model,
Example #43
from sklearn.ensemble import RandomForestRegressor

# In[54]:

rf = RandomForestRegressor(n_estimators=200, random_state=45)
rf.fit(x_train, y_train)

# In[55]:

pred = rf.predict(x_test)
pred

# In[56]:

from sklearn.ensemble import AdaBoostRegressor
model = AdaBoostRegressor()
model.fit(x_train, y_train)
print(model.score(x_train, y_train))
abpred = model.predict(x_test)
print(abpred)
model.score(x_test, y_test)

# In[57]:

from sklearn.externals import joblib
joblib.dump(abpred, 'abpredsave.obj')

#

# # Completed
Example #44
                       min_samples_split=2,
                       min_samples_leaf=1,
                       min_weight_fraction_leaf=0.0,
                       max_features=1,
                       max_leaf_nodes=None,
                       min_impurity_decrease=0.0,
                       min_impurity_split=None,
                       bootstrap=True,
                       oob_score=True,
                       n_jobs=None,
                       random_state=random_state,
                       verbose=0,
                       warm_start=False),
 AdaBoostRegressor(base_estimator=None,
                   n_estimators=50,
                   learning_rate=1.0,
                   loss='linear',
                   random_state=random_state),
 GradientBoostingRegressor(loss='ls',
                           learning_rate=0.1,
                           n_estimators=100,
                           subsample=1.0,
                           criterion='friedman_mse',
                           min_samples_split=2,
                           min_samples_leaf=1,
                           min_weight_fraction_leaf=0.0,
                           max_depth=3,
                           min_impurity_decrease=0.0,
                           min_impurity_split=None,
                           init=None,
                           random_state=random_state,
Example #45
File: adaboost.py  Project: elijahc/ml_v1
# Small Images
idxs = np.arange(540)[::2]

train_idxs, valid_idxs = train_test(idxs, 0.8)

images = [
    ki.img_to_array(
        ki.load_img('../data/images/%g.jpg' % id, target_size=(224, 224)))
    for id in np.arange(956)
]
images = np.array(images)

# X = train_idxs.reshape(-1,1)
X = idxs.reshape(-1, 1)
# X = images[train_idxs]

# y = l_activity[train_idxs][:,8]
y = l_activity[idxs][:, 8]

VX = images[valid_idxs]
Vy = l_activity[valid_idxs]

dor = SKDeepOracleRegressor()

abr = AdaBoostRegressor(base_estimator=dor, n_estimators=20)

scores = cross_val_score(abr, X, y)
import pdb

pdb.set_trace()
Example #46
modelfit(alg6, train, test, predictors, target, IDcol, 'alg6.csv')
coef6 = pd.Series(alg6.feature_importances_, predictors).sort_values(ascending=False)
coef6.plot(kind='bar', title='Feature Importances')
plt.show()

alg6_accuracy = round(alg6.score(X_train,Y_train) * 100,2)
alg6_accuracy


# In[85]:


#AdaBoost Model
from sklearn.ensemble import AdaBoostRegressor
predictors = [x for x in train.columns if x not in [target]+IDcol]
alg7= AdaBoostRegressor(n_estimators=2000, learning_rate=0.05)
modelfit(alg7, train, test, predictors, target, IDcol, 'alg7.csv')
coef7= pd.Series(alg7.feature_importances_, predictors).sort_values(ascending=False)
coef7.plot(kind='bar', title='Feature Importances')#ft

alg7_accuracy = round(alg7.score(X_train,Y_train) * 100,2)
alg7_accuracy


# In[86]:


#Gradient Boost Model
from sklearn.ensemble import GradientBoostingRegressor
predictors = [x for x in train.columns if x not in [target]+IDcol]
alg8 = GradientBoostingRegressor(n_estimators= 50, learning_rate= 0.03, max_depth= 4)
Example #47
    def book_meta_clf(self,meta_name="GradientBoostingRegressor", **params):
        """
        Book the meta clf (set all the parameters)
        """
        if meta_name == "AdaBoostRegressor":
            if "base_estimator" not in params:          params["base_estimator"]=self.book_base_clf(base_name="DecisionTreeRegressor")       
            if "n_estimators" not in params :           params["n_estimators"] = 100
            if "learning_rate" not in params :          params["learning_rate"] = 1.
            if "loss" not in params :                   params["loss"] = "square" # {'linear,square,exponential}
            if "random_state" not in params :           params["random_state"] = None
            clf = AdaBoostRegressor (**params)
            
        if meta_name == "BaggingRegressor":
            if "base_estimator" not in params:          params["base_estimator"]=self.book_base_clf()
            if "n_estimators" not in params:            params["n_estimators"] = 50
            if "max_samples" not in params:             params["max_samples"] = 1.
            if "max_features" not in params:            params["max_features"] = 1.

            if "bootstrap" not in params:               params["bootstrap"] = True
            if "bootstrap_features" not in params:      params["bootstrap_features"] = False
            if "oob_score" not in params:               params["oob_score"] = False
            if "warm_start" not in params:              params["warm_start"] = False
            if "n_jobs" not in params:                  params["n_jobs"] = 20
            if "random_state" not in params:            params["random_state"] = None
            if "verbose" not in params:                 params["verbose"] = 1
            clf = BaggingRegressor(**params)

        if meta_name == "ExtraTreesRegressor":
            if "n_estimators" not in params:            params["n_estimators"] = 200
            if "min_samples_split" not in params:       params["min_samples_split"] = 2
            if "min_samples_leaf" not in params:        params["min_samples_leaf"] = 200
            if "min_weight_fraction_leaf" not in params:params["min_weight_fraction_leaf"] =  0.02
            if "max_features" not in params:            params["max_features"] = 'auto'
            if "max_leaf_nodes" not in params:          params["max_leaf_nodes"] = None

            if "bootstrap" not in params:               params["bootstrap"] = False
            if "oob_score" not in params:               params["oob_score"] = False
            if "warm_start" not in params:              params["warm_start"] = False
            if "n_jobs" not in params:                  params["n_jobs"] = 20
            if "random_state" not in params:            params["random_state"] = None
            if "verbose" not in params:                 params["verbose"] = 1
            clf = ExtraTreesRegressor(**params)

        if meta_name == "RandomForestRegressor":
            if "n_estimators" not in params:            params["n_estimators"] = 100
            if "min_samples_split" not in params:       params["min_samples_split"] = 2
            if "min_samples_leaf" not in params:        params["min_samples_leaf"] = 200
            if "min_weight_fraction_leaf" not in params:params["min_weight_fraction_leaf"] =  0.02
            if "max_features" not in params:            params["max_features"] = 'auto'
            if "max_leaf_nodes" not in params:          params["max_leaf_nodes"] = None

            if "bootstrap" not in params:               params["bootstrap"] = True
            if "oob_score" not in params:               params["oob_score"] = False
            if "warm_start" not in params:              params["warm_start"] = False
            if "n_jobs" not in params:                  params["n_jobs"] = 20
            if "random_state" not in params:            params["random_state"] = None
            if "verbose" not in params:                 params["verbose"] = 1
            clf = RandomForestRegressor(**params)


        if meta_name == "GradientBoostingRegressor":
            if "n_estimators" not in params:            params["n_estimators"] = 200
            if "learning_rate" not in params :          params["learning_rate"] = 1.
            if "loss" not in params :                   params["loss"] = "ls" #"huber" # linear, square, exponential

            if "min_samples_split" not in params:       params["min_samples_split"] = 2
            if "min_samples_leaf" not in params:        params["min_samples_leaf"] = 50
            if "min_weight_fraction_leaf" not in params:params["min_weight_fraction_leaf"] =  0.005#0.02
            if "max_features" not in params:            params["max_features"] = 'auto'
            if "max_leaf_nodes" not in params:          params["max_leaf_nodes"] = None

            if "warm_start" not in params:              params["warm_start"] = False
            if "random_state" not in params:            params["random_state"] = None
            if "verbose" not in params:                 params["verbose"] = 1

            if "alpha" not in params :                  params["alpha"] = 0.9
            if "init" not in params :                   params["init"] = None
            if "presort" not in params :                params["presort"] = 'auto'

            clf = GradientBoostingRegressor (**params)
        
        return clf
Example #48
# input data
housing_data = datasets.load_boston()
# Shuffle the data; random_state controls how the shuffling is done
X , y = shuffle(housing_data.data,housing_data.target,random_state=7)

# Use 80% of the samples for training and 20% for testing
num_training = int(0.8 * len(X))
X_train, y_train = X[:num_training], y[:num_training]
X_test, y_test = X[num_training:], y[num_training:]

# Train a plain decision tree
dt_regressor = DecisionTreeRegressor(max_depth=4)
dt_regressor.fit(X_train,y_train)

# Fit the AdaBoost regressor
ab_regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),n_estimators=400, random_state=7)
ab_regressor.fit(X_train, y_train)

# Predict
y_pred_dt = dt_regressor.predict(X_test)
mse = mean_squared_error(y_test,y_pred_dt)
evs = explained_variance_score(y_test,y_pred_dt)
print("\n######决策树学习效果######")
print('Mean squared error  = ',round(mse, 2))
print('Explain variance error = ',round(evs, 2))

y_pred_ad = ab_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred_ad)
evs = explained_variance_score(y_test, y_pred_ad)
print('\n##### Improvement from the AdaBoost algorithm #####')
print('Mean squared error = ',round(mse, 2))
Example #49
def ContributingFeaturesByYear(model, year):
    fout.write('Years : ' + str(year) + ' - ' + str(year + 4) + '\n')
    print 'Years : ', year, ' - ', year + 4
    df_filtered = df[(df['year'] >= year) & (df['year'] <= year + 4)]
    X = df_filtered[names]
    Y = df_filtered['WinPercentage']

    #Convert the target to numpy array
    arr_target = Y.as_matrix()

    #Convert the dataframe to numpy array
    arr_X = X.as_matrix()

    arr_X_train, arr_X_val, arr_target_train, arr_target_val = train_test_split(
        arr_X, arr_target, test_size=0.1, random_state=20)

    #PCA
    num_comp = []
    per_variance = []
    fout.write('Variance vs No. Of Features\n')
    fout.write('-' * 30 + '\n')
    for n in (0.99, 0.95, 0.90, 0.75, 0.65, 0.5):
        p = PCA(n_components=n).fit(arr_X_train)
        #print(p.explained_variance_)
        print n * 100, len(p.explained_variance_)
        fout.write(
            str(n * 100) + '\t' + str(len(p.explained_variance_)) + '\n')
        num_comp.append(len(p.explained_variance_))
        per_variance.append(n * 100)

    #num_comp.append(11) #the entire feature set
    #hyperparameters
    pca_val = num_comp
    alpha_val = np.logspace(-5, 5, num=11, base=2)
    c_val = np.logspace(-5, 5, num=11, base=2)  #c for SVL and SVG
    g_val = np.logspace(-5, 5, num=11, base=2)  #gamma for SVG

    if model == 'lr':
        pipe = Pipeline([('pca', PCA()), ('scaled', StandardScaler()),
                         ('lr', linear_model.LinearRegression())])
        gs = GridSearchCV(pipe, dict(pca__n_components=pca_val), cv=10)
    elif model == 'rr':
        pipe = Pipeline([('pca', PCA()), ('scaled', StandardScaler()),
                         ('rr', linear_model.Ridge())])
        gs = GridSearchCV(pipe,
                          dict(pca__n_components=pca_val, rr__alpha=alpha_val),
                          cv=10)
    elif model == 'rf':
        pipe = Pipeline([('pca', PCA()), ('scaled', StandardScaler()),
                         ('rf',
                          RandomForestRegressor(n_estimators=20,
                                                max_depth=4,
                                                random_state=5))])
        gs = GridSearchCV(pipe, dict(pca__n_components=pca_val), cv=10)
    elif model == 'svrl':
        pipe = Pipeline([('pca', PCA()), ('scaled', StandardScaler()),
                         ('svr_lin', SVR(kernel='linear', C=1))])
        gs = GridSearchCV(pipe,
                          dict(pca__n_components=pca_val, svr_lin__C=c_val),
                          cv=10)
    elif model == 'svrg':
        pipe = Pipeline([('pca', PCA()), ('scaled', StandardScaler()),
                         ('svr_gaussian', SVR(kernel='rbf', C=1, gamma=1))])
        gs = GridSearchCV(pipe,
                          dict(pca__n_components=pca_val,
                               svr_gaussian__C=c_val,
                               svr_gaussian__gamma=g_val),
                          cv=10)
    elif model == 'adaboostRF':
        pipe = Pipeline([('pca', PCA()), ('scaled', StandardScaler()),
                         ('adaboost',
                          AdaBoostRegressor(RandomForestRegressor(),
                                            random_state=0))])
        gs = GridSearchCV(pipe, dict(pca__n_components=pca_val), cv=10)
    elif model == 'adaboostSVR':
        pipe = Pipeline([('pca', PCA()), ('scaled', StandardScaler()),
                         ('svr_adaboost',
                          AdaBoostRegressor(SVR(), random_state=0))])
        gs = GridSearchCV(pipe, dict(pca__n_components=pca_val), cv=10)

    gs.fit(arr_X_train, arr_target_train)

    #print gs.predict(arr_X_val)
    predictions = gs.predict(arr_X_val)
    print gs.score(arr_X_val, arr_target_val)
    print 'best_score'
    print gs.best_score_
    fout.write('Accuracy : ' + str(gs.best_score_) + '\n')
    print 'best_estimator'
    print gs.best_estimator_
    print 'best_params'
    print gs.best_params_
    fout.write('Best Number of Features : ' + str(gs.best_params_) + '\n')

    fout.write('\n\nImportant Features :\n')
    fout.write('-' * 30 + '\n')
    #Find the k best features
    k_val = list(set(num_comp))
    for j in range(0, len(k_val)):
        #if k_val[j] != 9:
        contributingFeatures = []
        skb = SelectKBest(f_regression, k=k_val[j])
        arr_X_train_reshape = skb.fit(arr_X_train, arr_target_train)
        #arr_patrons_sales_events_val_reshape = skb.transform(arr_patrons_sales_events_val)
        print 'The top ', k_val[j], ' features are: '
        fout.write('The top ' + str(k_val[j]) + ' features are: \n')
        get_features = skb.get_support()  # boolean mask: True for the features selected by SelectKBest
        for i in range(0, len(get_features)):
            if get_features[i]:
                contributingFeatures.append(cols_to_keep[i])
                print i, cols_to_keep[i]
                fout.write(cols_to_keep[i] + '\n')
        fout.write('\n')
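# The 'adaboostRF' branch above tunes only pca__n_components; a hedged, self-contained
# sketch (assumed data and parameter ranges, not the author's grid) of also searching the
# AdaBoost step's own hyperparameters through the same kind of pipeline:
from sklearn.datasets import make_regression
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X_demo, y_demo = make_regression(n_samples=200, n_features=11, noise=0.1, random_state=0)
pipe_demo = Pipeline([('pca', PCA()), ('scaled', StandardScaler()),
                      ('adaboost', AdaBoostRegressor(RandomForestRegressor(n_estimators=10),
                                                     random_state=0))])
gs_demo = GridSearchCV(pipe_demo,
                       dict(pca__n_components=[5, 8, 11],
                            adaboost__n_estimators=[10, 25],
                            adaboost__learning_rate=[0.1, 1.0]),
                       cv=5)
gs_demo.fit(X_demo, y_demo)
print(gs_demo.best_params_)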
Example #50
0
def model_training_regressor(X, Y, test_ratio, verbose_mode, name):
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=test_ratio,
                                                        shuffle=False)

    if name == "MLP":
        model = MLPRegressor(hidden_layer_sizes=(200, 50),
                             activation='relu',
                             solver='adam',
                             alpha=0.0002,
                             batch_size='auto',
                             learning_rate='adaptive',
                             learning_rate_init=0.01,
                             power_t=0.5,
                             max_iter=10000,
                             shuffle=True,
                             random_state=None,
                             tol=0.0001,
                             verbose=verbose_mode,
                             warm_start=False,
                             momentum=0.9,
                             nesterovs_momentum=True,
                             early_stopping=False,
                             validation_fraction=0.1,
                             beta_1=0.9,
                             beta_2=0.999,
                             epsilon=1e-08,
                             n_iter_no_change=10).fit(X_train, y_train)

        return get_model_performance(model, X_test, y_test)

    elif name == "NaiveBayes":
        model = linear_model.BayesianRidge().fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)

    elif name == "SVM":
        model = svm.LinearSVR(random_state=0, tol=1e-5).fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)

    elif name == "DT":
        model = tree.DecisionTreeRegressor().fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)

    elif name == "KNN":
        model = neighbors.KNeighborsRegressor().fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)

    elif name == "RandomForest":
        model = RandomForestRegressor().fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)

    elif name == "Adaboost":
        model = AdaBoostRegressor().fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)

    elif name == "GradientBoost":
        model = GradientBoostingRegressor().fit(X_train, y_train)
        return get_model_performance(model, X_test, y_test)

    else:
        ret = dict()
        print("no available model")
        return ret
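# A hedged usage sketch for the dispatcher above; X, Y and get_model_performance are
# defined elsewhere in the original project, so the call is shown only as an assumption:
# results = model_training_regressor(X, Y, test_ratio=0.2, verbose_mode=False, name="Adaboost")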
plt.scatter(dates,y_test,c="#ADD8E6",label="Actual Closing Stock Price")
plt.title("Predicted Closing Price of Stock and Actual Closing Price of Stock using Random Forest Regressor",fontsize=20)
plt.xlabel('Time in Years',fontsize=20)
plt.ylabel('Stock Closing Price Predictions(Random Forest Regressor)',fontsize=20)
plt.legend(loc='best')
plt.show()


# In[23]:


#Test Case 1: with learning_rate set to 0.001 the MSE obtained is 2.58
#Test Case 2: with learning_rate set to 0.001 the MSE obtained is 2.5819
#Test Case 3: with learning_rate set to 0.000167 the MSE obtained is 2.580
print("AdaBoost Regressor")
regr_ada = AdaBoostRegressor(random_forest,n_estimators=30,random_state=1,learning_rate=0.000167)
regr_ada.fit(df_train_norm,y_train)
regr_ada_pred = regr_ada.predict(df_test_norm)
acc_ada_mse = mean_squared_error(y_test,regr_ada_pred)
print("MSE      = "+ str(acc_ada_mse))
r2_ada = r2_score(y_test,regr_ada_pred)
print("r2_score = " + str(r2_ada))
plt.figure(figsize=(20,20))
plt.scatter(dates,regr_ada_pred,label="Predicted Closing Stock Price")
plt.scatter(dates,y_test,c="orange",label="Actual Closing Stock Price")
plt.title("Predicted Closing Price of Stock and Actual Closing Price of Stock using ADA Boost Regressor",fontsize=20)
plt.xlabel('Time in Years',fontsize=20)
plt.ylabel('Stock Closing Price Predictions(AdaBoost-RandomForest Regressor)',fontsize=20)
plt.legend(loc='best')
plt.show()
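# The test cases above were run by editing learning_rate by hand; a hedged sketch of
# automating that sweep, reusing df_train_norm, df_test_norm, y_train, y_test and
# random_forest from the snippet above (only the learning-rate values already listed are used):
for lr in (0.001, 0.000167):
    sweep_ada = AdaBoostRegressor(random_forest, n_estimators=30, random_state=1, learning_rate=lr)
    sweep_ada.fit(df_train_norm, y_train)
    sweep_mse = mean_squared_error(y_test, sweep_ada.predict(df_test_norm))
    print("learning_rate = %g -> MSE = %.4f" % (lr, sweep_mse))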
Example #52
0
# mean absolute error in $
mae = mean_absolute_error(y_test, y_pred)
print("The mean absolute error is: $", mae)
# checking r^2
from sklearn.metrics import r2_score

print("r_Score:", r2_score(y_test, y_pred))

bg = BaggingRegressor(RandomForestRegressor(), n_estimators=10)
bg.fit(X_train, y_train)
bg.score(X_train, y_train)
bg.score(X_test, y_test)

#Adaboosting
regr = AdaBoostRegressor()
regr.fit(X_train, y_train)
regr.score(X_test, y_test)

#Decision
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

#gradientBoost
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
gb.score(X_train, y_train)
gb.score(X_test, y_test)
def test_regression_toy():
    """Check regression on a toy dataset."""
    clf = AdaBoostRegressor(random_state=0)
    clf.fit(X, y_regr)
    assert_array_equal(clf.predict(T), y_t_regr)
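# The toy fixtures X, y_regr, T and y_t_regr above come from the surrounding test module;
# a self-contained variant of the same smoke test, with assumed toy data, might be:
def test_regression_toy_standalone():
    """Check that AdaBoostRegressor fits and predicts on a tiny dataset."""
    import numpy as np
    from sklearn.ensemble import AdaBoostRegressor

    X_toy = np.array([[-2.0], [-1.0], [0.0], [1.0], [2.0]])
    y_toy = np.array([-2.0, -1.0, 0.0, 1.0, 2.0])
    reg = AdaBoostRegressor(random_state=0)
    reg.fit(X_toy, y_toy)
    pred = reg.predict(np.array([[-1.5], [1.5]]))
    assert pred.shape == (2,)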
Example #54
0
score = clf.score(X_test, Y_test)

print(score)

y_pred = clf.predict(X_test)

names = [
    "Decision Tree Regressor", "MLP Regressor", "Random Forest Regressor",
    "AdaBoost", "Bagging Regressor", "Extra Trees Regressor"
]

classifiers = [
    DecisionTreeRegressor(max_depth=5, max_features=1),
    MLPRegressor(alpha=1, max_iter=200, power_t=0.9, batch_size=50),
    RandomForestRegressor(max_depth=5, max_features=1, n_estimators=10),
    AdaBoostRegressor(n_estimators=10),
    BaggingRegressor(max_features=1, n_estimators=10, base_estimator=clf),
    ExtraTreesRegressor(max_depth=5)
]

for name, clf in zip(names, classifiers):
    clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)
    y_pred = clf.predict(X_test)
    print(name + ": " + str(score))
    msle = mean_squared_log_error(Y_test, y_pred)
    print('MSLE: %.4f' % msle)
    # print(confusion_matrix(Y_test,y_pred,labels=None))
    # print(cohen_kappa_score(Y_test,y_pred, labels=None))
    # print(classification_report(Y_test,y_pred,labels=None))
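# Note: mean_squared_log_error raises a ValueError when either argument contains negative
# values, which can happen with regressors such as MLPRegressor; a hedged guard (not part
# of the original loop) is to clip predictions first, e.g.:
# msle = mean_squared_log_error(Y_test, np.clip(y_pred, 0, None))  # assumes numpy imported as np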
Example #55
0
    )
    # instantiating AdaBoostClassifier
    abc = AdaBoostClassifier(n_estimators=100, random_state=0)
    abc.fit(trainFeat, trainLabels)
    print("Feature importances for AdaBoostClassifier: ")
    print(abc.feature_importances_)
    # make predictions for test data
    predictions = abc.predict(testFeat)
    accuracy = accuracy_score(testLabels, predictions)
    print("Accuracy of AdaBoostClassifier: %.2f%%" % (accuracy * 100.0))
    cm = confusion_matrix(testLabels, predictions)
    # the count of true negatives is A00, false negatives is A10, true positives is A11 and false positives is A01
    print('confusion matrix:\n %s' % cm)

    # instantiating AdaBoostRegressor (its continuous predictions are rounded below to obtain class labels)
    abr = AdaBoostRegressor(random_state=0, n_estimators=100)
    abr.fit(trainFeat, trainLabels)
    print("Feature importances for AdaBoostRegressor: ")
    print(abr.feature_importances_)
    # make predictions for test data
    predictions = abr.predict(testFeat)
    accuracy = accuracy_score(testLabels, predictions.round())
    print("Accuracy of AdaBoostRegressor: %.2f%%" % (accuracy * 100.0))
    cm = confusion_matrix(testLabels, predictions.round())
    # the count of true negatives is A00, false negatives is A10, true positives is A11 and false positives is A01
    print('confusion matrix:\n %s' % cm)

    # instantiating XGBClassifier
    xgbc = XGBClassifier()
    xgbc.fit(trainFeat, trainLabels)
    print("Feature importances for XGBClassifier: ")
lgbm_early_stopping_rounds = 100
seed = 2017

#
#############################################################################################################################################
# parameters : xgb regression ###############################################################################################################
#############################################################################################################################################

randomforest = RandomForestRegressor(n_estimators=600,
                                     max_depth=10,
                                     n_jobs=20,
                                     random_state=2017,
                                     max_features="auto",
                                     verbose=1)
adaboost = AdaBoostRegressor(n_estimators=30,
                             random_state=2017,
                             learning_rate=0.01)
gbdt = GradientBoostingRegressor(learning_rate=0.04,
                                 n_estimators=100,
                                 subsample=0.8,
                                 random_state=2017,
                                 max_depth=5,
                                 verbose=1)
extratree = ExtraTreesRegressor(n_estimators=600,
                                max_depth=8,
                                max_features="auto",
                                n_jobs=20,
                                random_state=2017,
                                verbose=1)
lr_reg = LinearRegression(n_jobs=-1)
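# How these preset regressors are consumed is not shown in this excerpt; a hedged,
# self-contained sketch of fitting each preset and averaging its test predictions
# (a simple blend on synthetic data, not the original pipeline):
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X_blend, y_blend = make_regression(n_samples=500, n_features=10, noise=0.3, random_state=2017)
X_tr, X_te, y_tr, y_te = train_test_split(X_blend, y_blend, test_size=0.2, random_state=2017)
blend_preds = [est.fit(X_tr, y_tr).predict(X_te)
               for est in (randomforest, adaboost, gbdt, extratree, lr_reg)]
print(np.mean(blend_preds, axis=0)[:5])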
    def predict(self, X):
        return self.m.predict(X)[0]


regressors = [
    ("k-nearest Neighbors", None, KNeighborsRegressor(2)),
    ("SVM - Linear", None, SVR(kernel="linear")),
    ("SVM - RBF", None, SVR(gamma=2, C=1)),
    ("Decision Tree", None,
     DecisionTreeRegressor(min_samples_split=1024, max_depth=20)),
    ("Random Forest", None,
     RandomForestRegressor(n_estimators=10,
                           min_samples_split=1024,
                           max_depth=20)),
    ("AdaBoost", None, AdaBoostRegressor(random_state=13370)),
    ("Naive Bayes", None, GaussianNB()),
    #("Bagging with DTRegg", ["All"], BaggingRegressor(DecisionTreeRegressor(min_samples_split=1024,
    #                                                              max_depth=20))),
    #("GP isotropic RBF", None, gp.GaussianProcessRegressor(kernel=gp.kernels.RBF())),
    #("GP anisotropic RBF", ["All"], gp.GaussianProcessRegressor(kernel=gp.kernels.RBF(length_scale=np.array([1]*n_feats)))),
    ("GP ARD", ["All"],
     gp.GaussianProcessRegressor(
         kernel=ard_kernel(sigma=1.2, length_scale=np.array([1] * n_feats)))),
    #("GP isotropic matern nu=0.5", None, gp.GaussianProcessRegressor(kernel=gp.kernels.Matern(nu=0.5))),
    #("GP isotropic matern nu=1.5", None, gp.GaussianProcessRegressor(kernel=gp.kernels.Matern(nu=1.5))),
    ("GP Isotropic Matern", None,
     gp.GaussianProcessRegressor(kernel=gp.kernels.Matern(nu=2.5))),
    # bad performance
    ("GP Dot Product", ["CFS", "CIFE", "MFCC", "All"],
     gp.GaussianProcessRegressor(kernel=gp.kernels.DotProduct())),
Example #58
0
from sklearn.ensemble import AdaBoostRegressor
from sklearn.datasets import make_regression
from joblib import dump

X, y = make_regression(n_features=4,
                       n_informative=2,
                       random_state=0,
                       shuffle=False)
regr = AdaBoostRegressor(random_state=0, n_estimators=100)
regr.fit(X, y)
dump(regr, 'model.joblib')
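# The dumped model can later be restored with joblib and reused; a short hedged sketch:
from joblib import load

restored = load('model.joblib')
print(restored.predict(X[:3]))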
Example #59
0
print(r2_score(y_train,model.predict(X_train)))


model.intercept_
model.coef_
mean_squared_error(y_test,y_pred)
r2_score(y_test,y_pred)
r2_score(y_train,model.predict(X_train))




#ADABOOST WITH A DECISION TREE BASE ESTIMATOR


adaboostmodel = AdaBoostRegressor(DecisionTreeRegressor(max_depth=3),learning_rate = 4, n_estimators = 450)
adaboostmodel.fit(xencoded, y)
adaboostmodel.score(x_test, y_test)

#PLOT ENCODED VALUES
sns.pairplot(xencoded)

# In[ ]:

model4=RandomForestRegressor()
grid_params_RF={
           'n_estimators':range(50,90,10),
           'max_depth':[15,16,17,18,19,20,21]
           }
clf4=GridSearchCV(model4,grid_params_RF,cv=4,scoring='r2')
clf4.fit(x_scaled, y)
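# The grid above tunes only the RandomForestRegressor; a hedged sketch of the analogous
# search over AdaBoostRegressor hyperparameters (the parameter ranges are assumptions,
# not values from the original notebook):
model5 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=3))
grid_params_ada = {
           'n_estimators': range(100, 500, 100),
           'learning_rate': [0.01, 0.1, 1.0]
           }
clf5 = GridSearchCV(model5, grid_params_ada, cv=4, scoring='r2')
clf5.fit(x_scaled, y)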
Example #60
0
#df, attributes = preprocess.preprocess(df)
attributes = list(df.columns.values)[1:]
attributes.remove('DateTime')
attributes.remove('PredDelay')

# Initialise Regressors
regressors = {
    'gbr_reg':
    GradientBoostingRegressor(n_estimators=100,
                              learning_rate=0.1,
                              max_depth=1,
                              random_state=0,
                              loss='ls'),
    'ada_reg':
    AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                      n_estimators=300,
                      random_state=np.random.RandomState(1))
}

# Initialise Classifiers
classifiers = {
    'svm_clf': svm.SVC(),
    'bernolli_rbm_clf': BernoulliRBM(n_components=2),
    'decision_tree_clf': tree.DecisionTreeClassifier()
}

window_size = 20
window_start = 0
window_end = window_start + window_size
print "Window Size: ", window_size