Python RandomForestRegressor示例，sklearn.ensemble.forest.RandomForestRegressor Python示例

示例#1

0

显示文件

文件： ml_cms_heights.py 项目： ritviksahajpal/ML_CMS

    def __init__(self, config_file=''):
        """Configure the model, its feature/target columns and all file paths.

        Args:
            config_file: Path handed to the config parser's ``read``.
        """
        # Read the configuration file
        parser = SafeConfigParser()
        parser.read(config_file)
        self.parser = parser

        # Machine-learning settings; classification and regression use the
        # same target columns
        self.classify = constants.DO_CLASSIFICATION  # classify (truthy) or regress
        self.vars_features = constants.fixed_vars
        self.vars_target = constants.ML_TARGETS
        self.var_target = constants.ML_TARGETS

        # Both forests share the same hyper-parameters
        forest_kwargs = dict(n_estimators=2500, n_jobs=constants.ncpu, random_state=0)
        if self.classify:
            self.task = 'classification'
            self.model = RandomForestClassifier(**forest_kwargs)
        else:
            self.task = 'regression'
            self.model = RandomForestRegressor(**forest_kwargs)

        # Input file location
        self.path_inp = os.sep.join([constants.base_dir, constants.name_inp_fl])

        # Output directory, created when it does not exist yet
        self.path_out_dir = constants.out_dir
        utils.make_dir_if_missing(self.path_out_dir)

        # Pickle locations for the model and the features
        self.path_pickle_model = os.sep.join([self.path_out_dir, constants.model_pickle])
        self.path_pickle_features = os.sep.join([self.path_out_dir, 'pickled_features'])

示例#2

0

显示文件

文件： result_predict_TS_constant_mean_weekend.py 项目： xinbingzhe/TIANCHI_ALImusic

def RandomForest(x_train,y_train,x_test,degree):
    """Fit a 1000-tree random forest and predict ``x_test``.

    Args:
        x_train: Training features passed to ``fit``.
        y_train: Training targets passed to ``fit``.
        x_test: Features to predict.
        degree: Used as the forest's ``max_depth``.

    Returns:
        The predictions for ``x_test``.
    """
    # min_samples_split must be at least 2: a node holding a single sample
    # cannot be split, and current scikit-learn rejects the value 1 outright.
    params = {'n_estimators': 1000, 'max_depth': degree, 'min_samples_split': 2, 'warm_start': True}
    clf = RandomForestRegressor(**params)
    clf.fit(x_train, y_train)
    return clf.predict(x_test)

示例#3

0

显示文件

文件： RF.py 项目： emigmo/TC_CAINIAO

def RF_ST(trainFileName,testFilename):
    """Train one random forest per store and predict that store's test rows.

    Both files are loaded with ``ld.LoadData_DATA_ST`` and indexed by the
    store ids '1'..'5'. In every row the first two fields are kept as
    identifiers and the remaining fields are converted to float.

    Args:
        trainFileName: Training file; the last field of each row is the target.
        testFilename: Test file with the same layout minus the target.

    Returns:
        A list of ``[id0, id1, prediction]`` rows, where the prediction is
        clipped at 0 and formatted with four decimals.
    """
    trainData = ld.LoadData_DATA_ST(trainFileName)
    testData = ld.LoadData_DATA_ST(testFilename)

    res = []
    for store_id in ['1', '2', '3', '4', '5']:
        train_X = []
        train_y = []
        for row in trainData[store_id]:
            values = [float(x) for x in row[2:]]
            # NOTE(review): the first two converted values are dropped as
            # well, so fields 2 and 3 never become features - confirm intended.
            train_X.append(values[2:-1])
            train_y.append(values[-1])

        test_X = []
        items = []
        for row in testData[store_id]:
            items.append((row[0], row[1]))
            values = [float(x) for x in row[2:]]
            test_X.append(values[2:])

        clf = RandomForestRegressor(n_estimators=100, criterion='mse', max_depth=None, max_features='auto')
        clf.fit(train_X, train_y)
        pred_y = clf.predict(test_X)

        # Distinct names here: the original reused the store loop variable.
        for (first_id, second_id), prediction in zip(items, pred_y):
            res.append([first_id, second_id, '%.4f' % max(prediction, 0)])
    return res

示例#4

0

显示文件

文件： RF.py 项目： emigmo/TC_CAINIAO

def RF_ALL(trainFileName,testFileName):
    """Train a single random forest on all training rows and score the test items.

    Args:
        trainFileName: File loaded with ``ld.LoadData_DATA_LABEL_ITEM``.
        testFileName: File loaded with ``ld.LoadData_DATA_ITEM``.

    Returns:
        A list of ``[item, 'all', prediction]`` rows, where the prediction is
        clipped at 0 and formatted with four decimals.
    """
    train_X, train_y, _ = ld.LoadData_DATA_LABEL_ITEM(trainFileName)
    Eval_X, items = ld.LoadData_DATA_ITEM(testFileName)

    forest = RandomForestRegressor(
        n_estimators=100,
        criterion='mse',
        max_depth=None,
        max_features='auto',
        bootstrap=True,
    )
    forest.fit(train_X, train_y)
    pred_y = forest.predict(Eval_X)

    return [
        [items[row], 'all', '%.4f' % max(pred_y[row], 0)]
        for row in range(len(Eval_X))
    ]

示例#5

0

显示文件

文件： gradient_boosting_old.py 项目： mityinzer/lhcb_trigger_ml

 def __init__(self, sig_weight=1., pow_sig=1., pow_bg=1., gap=1., n_estimators=10,
              criterion="mse", max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features="auto",
              bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0,
              min_density=None, compute_importances=None):
     """Store the four loss-specific settings on top of a default forest.

     Only ``sig_weight``, ``pow_sig``, ``pow_bg`` and ``gap`` are stored here.
     The forest hyper-parameters in the signature are not forwarded to the
     base constructor, which is called without arguments.
     """
     RandomForestRegressor.__init__(self)
     # Everything else is expected to be set via set_params
     self.sig_weight, self.gap = sig_weight, gap
     self.pow_sig, self.pow_bg = pow_sig, pow_bg

示例#6

0

显示文件

文件： Trainer.py 项目： kosklain/AccelerometerCompetition

 def run(self):
     """Fit one random forest per entry of the saved index array and pickle each.

     Reads ``indexesTrain.npy``, replaces ``self.train`` by its ``.values``
     and writes the fitted models to ``models/models<i>.mod``.
     """
     # Single-argument print(...) behaves the same on Python 2 and 3.
     print("Reading device separations...")
     indexes = np.load("indexesTrain.npy")
     self.train = self.train.values
     print("Getting attributes...")
     trainFeatures = [self.getMainFeatures(self.train, indexes, i) for i in range(len(indexes))]
     for i in range(len(indexes)):
         (trainVect, targetVect) = self.getAttributes(trainFeatures, indexes, i)
         classifier = RandomForestRegressor(n_estimators=500, verbose=2, n_jobs=4, random_state=1)
         classifier.fit(trainVect, targetVect)
         # Binary mode plus a context manager: the handle is closed after
         # every model and the pickle stays valid on every platform.
         with open("models/models" + str(i) + ".mod", "wb") as model_file:
             pickle.dump(classifier, model_file)

示例#7

0

显示文件

文件： train_artist_selectModel_mean_withoutp.py 项目： xinbingzhe/TIANCHI_ALImusic

def RandomForest(x_train,y_train,x_test,y_test):
    """Pick the tree depth with the lowest RMSE on the test data.

    A 1000-tree forest is fitted for each depth in 1, 2, 3, 4, 7 on the
    single-feature inputs, which get a trailing axis added before fitting.

    Args:
        x_train: 1-D training feature values.
        y_train: Training targets.
        x_test: 1-D test feature values.
        y_test: Test targets used to score each depth.

    Returns:
        Tuple ``(best_rmse, [y_predict, depth])`` for the best depth. The
        original returned the RMSE of the last depth tried instead of the
        minimum.
    """
    best_rmse = None
    best_entry = None
    for depth in [1, 2, 3, 4, 7]:
        # min_samples_split=1 is rejected by current scikit-learn; 2 is the
        # smallest node size that can be split at all.
        params = {'n_estimators': 1000, 'max_depth': depth, 'min_samples_split': 2, 'warm_start': True}
        clf = RandomForestRegressor(**params)
        clf.fit(x_train[:, np.newaxis], y_train)
        y_predict = clf.predict(x_test[:, np.newaxis])
        rmsevalue = rmse(y_test, y_predict)
        # "<=" keeps the original tie-break: the last depth reaching the
        # minimal RMSE wins.
        if best_rmse is None or rmsevalue <= best_rmse:
            best_rmse = rmsevalue
            best_entry = [y_predict, depth]
    return best_rmse, best_entry

示例#8

0

显示文件

文件： estimators.py 项目： ehauckdo/machine-learning

def perform_random_forest_regressor(train_set, train_target, test_set, predictors, estimators=10, depth=None, splits=2):
    """Fit a random forest on the selected columns and predict the test set.

    Args:
        train_set: Training data, indexed with ``predictors``.
        train_target: Training targets.
        test_set: Test data, indexed with ``predictors``.
        predictors: Column selection applied to both data sets.
        estimators: Number of trees (``n_estimators``).
        depth: Maximum tree depth (``max_depth``).
        splits: Minimum samples needed to split a node (``min_samples_split``).

    Returns:
        The predictions for ``test_set``.
    """
    # The original accepted these three settings but never passed them on.
    alg = RandomForestRegressor(
        n_estimators=estimators,
        max_depth=depth,
        min_samples_split=splits,
        random_state=1,
    )
    alg.fit(train_set[predictors], train_target)
    return alg.predict(test_set[predictors])

示例#9

0

显示文件

文件： item_set_model.py 项目： A-Malone/Quartermaster

 def __init__(self):
     """Initialise the base class and attach the regressor as ``self.clf``."""
     super(ItemSetModel, self).__init__()
     # Alternatives left by the author: DecisionTreeRegressor(), Lasso(0.1),
     # SVR(kernel='rbf'), ElasticNetCV()
     forest_settings = {'max_depth': 7, 'n_estimators': 10}
     self.clf = RandomForestRegressor(**forest_settings)

示例#10

0

显示文件

文件： amazon_randomforestregressor.py 项目： sarwarbhuiyan/datascience-ga-amazon-kaggle

def main(train_file='train.csv', test_file='test.csv', output_file='predict.csv'):
    """Train a random forest, report its cross-validated AUC and write predictions.

    Args:
        train_file: CSV with an ``ACTION`` column plus the feature columns.
        test_file: CSV with the same feature columns.
        output_file: Destination passed to ``create_test_submission``.

    Returns:
        0 on completion.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Loading data...")

    # NOTE(review): ROLE_DEPTNAME is listed twice, so it enters the model as
    # two identical features; kept as-is to leave the trained model unchanged.
    feature_columns = ["RESOURCE", "MGR_ID", "ROLE_ROLLUP_1", "ROLE_ROLLUP_2",
                       "ROLE_DEPTNAME", "ROLE_FAMILY_DESC", "ROLE_FAMILY",
                       "ROLE_DEPTNAME", "ROLE_CODE"]

    train_data = pd.read_csv(train_file)
    test_data = pd.read_csv(test_file)
    # A single-column selection gives the 1-D target that the estimator and
    # roc_curve expect (the original built an (n, 1) column).
    y = np.array(train_data["ACTION"])
    X = np.array(train_data[feature_columns])
    X_test = np.array(test_data[feature_columns])

    SEED = 4

    clf = RandomForestRegressor(n_estimators=300, min_samples_split=15, min_density=0.1,compute_importances=True).fit(X,y)

    print(clf.feature_importances_)

    mean_auc = 0.0
    n = 10
    for i in range(n):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(X, y, test_size=.10, random_state=i*SEED)

        # feature selection / hyperparameter optimization would go here

        # train model and make predictions
        clf.fit(X_train, y_train)
        preds = clf.predict(X_cv)

        # compute AUC metric for this CV fold
        fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds, pos_label=1)
        roc_auc = metrics.auc(fpr, tpr)
        print("AUC (fold %d/%d): %f" % (i + 1, n, roc_auc))
        mean_auc += roc_auc

    print("Mean AUC: %f" % (mean_auc/n))
    # The estimator has no `predict_` method; the original call raised
    # AttributeError. Note the model used here is the one fitted on the last
    # CV fold, not on the full training set.
    predictions = clf.predict(X_test)

    create_test_submission(output_file, predictions)

    return 0

示例#11

0

显示文件

文件： tasks_common.py 项目： 151706061/MITK

    def run(self):
        """Train a random forest on the input batch and pickle it to the task output."""
        # load the batch; the CSV carries two header rows
        training_frame = pd.read_csv(self.input().path, header=[0, 1])
        X, y = preprocess2(training_frame, snr=10.)

        # fit the regressor
        forest = RandomForestRegressor(n_estimators=10, min_samples_leaf=10,
                                       max_depth=9, n_jobs=-1)
        forest.fit(X, y.values)

        # persist the fitted regressor
        out_handle = self.output().open('w')
        pickle.dump(forest, out_handle)
        out_handle.close()

示例#12

0

显示文件

文件： model.py 项目： mjstevens777/energy-portal

    def make_models(self, missing_columns):
        """Fit one weighted random forest per missing column.

        Args:
            missing_columns: Column names of ``self.full_table`` to predict.
                They are removed from the feature matrix.

        Returns:
            dict mapping each name in ``missing_columns`` to its fitted
            ``RandomForestRegressor``.
        """
        # Features are every column that is not itself a prediction target.
        available_table = self.full_table.copy()
        for column in missing_columns:
            del available_table[column]
        # `.values` replaces DataFrame.as_matrix(), which pandas 1.0 removed.
        available_features = available_table.values

        clfs = {}
        for column in missing_columns:
            # Selecting a single column already yields a 1-D label array.
            labels = self.full_table[column].values
            clf = RandomForestRegressor(n_estimators = 100)
            # 'WGTP' is read from the feature table, so it must not be one of
            # the missing columns.
            clf.fit(available_features, labels, sample_weight=available_table['WGTP'])
            clfs[column] = clf

        return clfs

示例#13

0

显示文件

文件： trainer.py 项目： blvp/ml_tasks

def predict_per_cpu_full():
    """Fit, score and plot one small random forest per ``cpuFull`` category.

    For every category the ``C0`` column is split into train and test parts,
    a 5-tree forest is fitted, the MAPE is printed, and a scatter plot of
    actual versus predicted values is saved to ``imgs/<category>.png``.
    """
    data, target = load_data()
    data, target, labels = normalize_data(data, target)

    # Copy so that adding the target column does not write into a slice of
    # the frame returned by normalize_data.
    data = data[['C0', 'cpuFull']].copy()
    data['target'] = target
    split_by_types = dict()

    for name, group in data.groupby('cpuFull'):
        # Series.reshape no longer exists in pandas; reshape the underlying
        # array into a single-feature column instead.
        features = group['C0'].values.reshape(-1, 1)
        X_train, X_test, y_train, y_test = train_test_split(features, group['target'])
        split_by_types[str(name)] = {
            'train': {
                'data': X_train,
                'target': y_train
            },
            'test': {
                'data': X_test,
                'target': y_test
            }
        }

    # items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for cpu, data_set in split_by_types.items():
        plt.figure()
        reg = RandomForestRegressor(n_estimators=5)
        reg.fit(data_set['train']['data'], data_set['train']['target'])
        test_data = data_set['test']['data']
        y_pred = reg.predict(test_data)
        print("%s %s" % (mape(data_set['test']['target'], y_pred), cpu))
        plt.scatter(test_data, data_set['test']['target'], s=3, color='g', label='actual')
        plt.scatter(test_data, y_pred, s=3, color='r', label='predicted')
        plt.legend(loc='upper left')
        plt.ylabel('mul time')
        plt.title('Category: {}'.format(cpu))
        plt.savefig('imgs/{}.png'.format(cpu))
        # Release the figure once saved so open figures do not pile up.
        plt.close()

示例#14

0

显示文件

文件： predict.py 项目： AntoinePrv/AXA-data-challenge

def train(data,val_ind,indices):
    """Fit one random forest per requested column of ``val_ind``.

    Args:
        data: Feature matrix shared by every model.
        val_ind: 2-D target array; column ``i`` is the target of model ``i``.
        indices: Column indices of ``val_ind`` to fit a model for.

    Returns:
        A list with one slot per column of ``val_ind``. It holds the fitted
        regressor at each requested index and 0 everywhere else.
    """
    regs = [0] * val_ind.shape[1]
    for column in indices:
        model = RandomForestRegressor()
        model.fit(data, val_ind[:, column])
        regs[column] = model
    return regs

示例#15

0

显示文件

文件： append_kwh.py 项目： mjstevens777/energy-portal

y = household.as_matrix(columns = ['KWH'])
y = np.reshape(y, (len(y)))
del household['KWH']
#del household['ST']
#del household['DIVISION']
#del household['ELEP']

#if 'CDD' in household.columns:
#    del household['CDD']
#    del household['HDD']
X = household.as_matrix()
X = np.nan_to_num(X)

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.25)

clf = RandomForestRegressor(n_estimators = 10, n_jobs = 8)
clf.fit(X_train, y_train)

print(metrics.mean_squared_error(y_test, clf.predict(X_test)))
print(metrics.r2_score(y_test, clf.predict(X_test)))

predictions = clf.predict(X_test)[:50]
'''
features = sorted(zip(household.columns, clf.feature_importances_), key = lambda x : x[1], reverse = True)
print("Features", features)
'''
pums = pd.read_csv("../joined_weather.csv")
pums = pums.sample(1000)
pums_puma_vector = pums.as_matrix(columns = ['PUMA'])
left_matrix = pums[['PUMA', 'WGTP', 'SERIALNO']]
del pums['PUMA']

示例#16

0

显示文件

文件： python_test.py 项目： paulstey/random_forests

# Times how long a 50-tree random forest takes to fit on the concrete data set.
import time

import pandas as pd
# Public import path; the private module `sklearn.ensemble.forest` is
# deprecated and absent from newer scikit-learn releases.
from sklearn.ensemble import RandomForestRegressor

dset = pd.read_csv("./data/concrete_data.csv")
# NOTE(review): the features are columns 0-6 and the target is column 8, so
# column 7 is never used - confirm this is intended.
X = dset.iloc[:, 0:7]
y = dset.iloc[:, 8]


estimator = RandomForestRegressor(max_features = 3, n_estimators = 50, n_jobs = 1, oob_score = True)

# Wall-clock time of the fit, in seconds
t0 = time.time()
estimator.fit(X, y)
print(time.time() - t0)

示例#17

0

显示文件

文件： losses.py 项目： particleist/lhcb_trigger_ml

 def predict(self, X):
     """Return the forest's predictions with a new axis inserted at position 1."""
     flat_predictions = RandomForestRegressor.predict(self, X)
     return flat_predictions[:, numpy.newaxis]

示例#18

0

显示文件

文件： linearRegresion3.py 项目： UnFrikienlaBolsa/AlgoritmosTrading

            s = s + '' + str(valoresDiff[i + j]) + ','
            d.append(valoresDiff[i + j])
        for j in range(0, 15):
            s = s + '' + str(valoresVol[i + j]) + ','
            d.append(valoresVol[i + j])
        #maxValores = max(valoresCopiar[longValores:i+j]) # Esta cambia porque no debemos tener los valores
        #maxValores = max(valoresCopiar[i+j:i+j+5])
        maxValores = max(valoresCopiar[i + 14 + longValoresTest:i +
                                       longValoresTest + 14 + 5])
        #s = s + str(150)
        #d.append(150)
        X_test.append(d)
        y_test.append(maxValores)

    # Entrenamos
    regr = RandomForestRegressor()
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)

    regr2 = SVR(kernel='rbf', C=1e3, gamma=0.1)
    regr2.fit(X_train, y_train)
    y_pred2 = regr2.predict(X_test)

    regr3 = linear_model.LinearRegression()
    regr3.fit(X_train, y_train)
    y_pred3 = regr3.predict(X_test)

    # Votacion
    # Si todos OK => Se invierte
    votacion = 0
    max_pred_array = [max(y_pred), max(y_pred2), max(y_pred3)]

示例#19

0

显示文件

文件： ex10.py 项目： gabormakrai/landuseregression

def doEval(dayNight, landuse, topo, traffic_static, traffic_dynamic, weather,
           time, output):
    """Cross-validate a random forest per location for one feature-group combination.

    Each of the six switches enables one group of input columns. For every
    location value a model is trained, and a line
    ``<dayNight>,<groupName>,<rmse[1]>`` is written to ``output``.

    Args:
        dayNight: Passed to ``splitDataForXValidation`` and written to the output.
        landuse, topo, traffic_static, traffic_dynamic, weather, time:
            Switches for the corresponding column groups.
        output: Object with ``write`` and ``flush`` that receives the result lines.

    Returns:
        None. It returns immediately when every switch equals False.
    """
    # (tag, switch, columns) per feature group, in group-name order
    featureGroups = [
        ("lu", landuse, ['leisure_area', 'landuse_area']),
        ("to", topo, ['buildings_number', 'buildings_area']),
        ("ts", traffic_static, ['lane_length', 'length']),
        ("td", traffic_dynamic,
         ['traffic_length_car', 'traffic_length_lgv', 'traffic_length_hgv']),
        ("we", weather,
         ['winddirection', 'windspeed', 'temperature', 'rain', 'pressure']),
        ("ti", time,
         ['hour', 'day_of_week', 'month', 'bank_holiday', 'race_day']),
    ]

    # Explicit ==True / ==False comparisons are kept on purpose so that
    # non-bool arguments are treated exactly as before.
    if all(switch == False for _, switch, _ in featureGroups):
        return

    groupName = ""
    columnsToUse = []
    for tag, switch, groupColumns in featureGroups:
        if switch == True:
            groupName += tag + "1"
            columnsToUse.extend(groupColumns)
        else:
            groupName += tag + "0"

    print("Group: " + groupName)

    data = {}
    columns = []
    loadData(dataFile, ['timestamp'], data, columns)

    for location in findOutKForValidation("location", data):
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, columnsToUse, "target", dayNight)

        print("\t" + str(len(trainX)) + "," + str(len(testX)))

        forest = RandomForestRegressor(min_samples_leaf=9,
                                       n_estimators=59,
                                       n_jobs=1,
                                       random_state=42)
        forest.fit(trainX, trainY)

        rmse = rmseEval(testY, forest.predict(testX))

        print("\t" + str(rmse))

        output.write(str(dayNight) + ",")
        output.write(groupName + ",")
        output.write(str(rmse[1]) + "\n")
        output.flush()

示例#20

0

显示文件

文件： ML_Random_Forest.py 项目： Alei35/4YP

# Log-differences of the series; rows with missing values are dropped
# (the first row has no predecessor).
btc_ld = np.log(btc_df) - np.log(btc_df.shift(1))
btc_ld = btc_ld.dropna()

# Split the dataframe into train (2014-2017) and test (January 2018)
train = btc_ld.loc[datetime.date(year=2014,month=1,day=1):datetime.date(year=2017,month=12,day=31)]
test = btc_ld.loc[datetime.date(year=2018,month=1,day=1):datetime.date(year=2018,month=1,day=31)]

# Inputs are every column except BTC; BTC is the output
trainX = np.asarray(train.drop(columns='BTC'))
trainY = np.asarray(train.BTC)

testX = np.asarray(test.drop(columns='BTC'))
testY = np.asarray(test.BTC)

# Define the RF model
RF_Model = RandomForestRegressor(n_estimators=100,
                                 max_features=1, oob_score=True)
# Fit the model
rf_fitted = RF_Model.fit(trainX,trainY)

# Predict the training data and turn the result into a single column
trainY_predict = rf_fitted.predict(trainX)
trainY_predict = trainY_predict.reshape(-1,1)
# Frame holding predicted and actual values for plotting.
# `columns` must be list-like; a bare string makes pandas raise a TypeError.
train_plot_df = pd.DataFrame(trainY_predict, columns=['Predicted BTC'])
train_plot_df = train_plot_df.set_index(train.index)
train_plot_df['BTC'] = train.BTC
# Goodness of fit on the training data
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from math import sqrt
r2_train = r2_score(train.BTC,trainY_predict)

示例#21

0

显示文件

文件： model_test.py 项目： royang2012/ML_summer_proj

import numpy as np
from sklearn.ensemble.forest import RandomForestRegressor
import stockPlot as sp

# Initiate the monthly trade object
monthData = trade_model.monthlyModel(1, 2009, 6, 2013, 6, 2012, 6, 2013)
# Download data from Yahoo finance
monthData.monthlyDataDownload()
# Pre-processing of training and testing data
monthData.trainFeaturePre()
# Read pre-processed data from hard drive
# monthData.trainFeaturePreHd()
# Number of training months
trainSpan = len(monthData.xTrain[:,0,0]) - monthData.testSpan
# Initiate a random forest regressor
clf = RandomForestRegressor(n_estimators=10)

totalReturn = 1
predictedReturn = np.zeros(monthData.stockNum)
monthlyReturn = np.zeros(monthData.testSpan)
# aggReturn[k] is the compounded return after k test months, starting at 1
aggReturn = np.zeros(monthData.testSpan+1)
aggReturn[0] = 1
# Rolling training and testing: the training window slides by one month per step
for j in range(monthData.testSpan):
    for i in range(monthData.stockNum):
        clf.fit(monthData.xTrain[j:trainSpan+j, :, i], monthData.yTrain[j:trainSpan+j, 0, i])
        # predict() needs a 2-D (n_samples, n_features) input, so the single
        # sample is reshaped to one row and the one prediction unpacked.
        sample = monthData.xTest[j, :, i].reshape(1, -1)
        predictedReturn[i] = clf.predict(sample)[0]
    monthlyReturn[j] = monthData.por10Returns(j, predictedReturn)
    # NOTE(review): totalReturn is never updated, so yearReturn is only ever
    # 1 + monthlyReturn[j]; the compounded value is tracked in aggReturn.
    yearReturn = totalReturn * (monthlyReturn[j]+1)
    aggReturn[j+1] = aggReturn[j]*(1+monthlyReturn[j])

示例#22

0

显示文件

文件： model_comparator.py 项目： sabyasachi087/LANL-Earthquake-Prediction

x_test = x_test.drop(['segment_id'], axis=1)

# prepare models
models = []
# models.append(('LR', LogisticRegression()))
# models.append(('LDA', LinearDiscriminantAnalysis()))
# models.append(('KNN', KNeighborsClassifier()))
# models.append(('CART', DecisionTreeClassifier()))
# models.append(('NB', GaussianNB()))
svReg = SVR(C=20.299419990722537, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma=0.06841395086207253, kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=True);

randForReg = RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=100,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=5,
           min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

models.append(('LassoReg', Lasso(alpha=0.1)))
models.append(('SVM', svReg))
models.append(('LinearReg', LinearRegression()))
models.append(('randForest', randForReg))

mas = make_scorer(mean_absolute_error, greater_is_better=False);
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    kfold = KFold(n_splits=10, random_state=7)

示例#23

0

显示文件

#         for j in range(np.shape(res)[1]):
#             if res[i][j] == 100:
#                 res[i][j] = 0
#             else:
#                 res[i][j] = -0.01 * res[i][j]
#     return res

# def normalizeY(arr):
#     arr=arr/100
#     return arr

if __name__ == '__main__':
    # Train a random forest and a decision tree, pickle both and print their
    # R^2 scores.

    train_x, test_x, train_y, test_y, x_data, y_data = load(train_data_path)

    # NOTE(review): both models are fitted on x_data/y_data rather than
    # train_x/train_y. If x_data contains the test rows, the "test" scores
    # are not held-out scores - confirm what load() returns.
    rf_model = RandomForestRegressor()
    rf_model.fit(x_data, y_data)
    # Indentation uses spaces only; the original mixed a tab with spaces
    # here, which Python 3 rejects with a TabError.
    with open(filename, 'wb') as file:
        pickle.dump(rf_model, file)
    rf_train_score = rf_model.score(x_data, y_data)
    rf_test_score = rf_model.score(test_x, test_y)
    print("RF train score:",rf_train_score)
    print("RF test score:",rf_test_score)

    dt_model =  DecisionTreeRegressor()
    dt_model.fit(x_data, y_data)
    with open(filename2, 'wb') as file:
        pickle.dump(dt_model, file)
    dt_train_score = dt_model.score(x_data, y_data)
    dt_test_score = dt_model.score(test_x, test_y)
    print("DT train score:",dt_train_score)

示例#24

0

显示文件

def create_model():
    """Build the unfitted 400-tree random forest regressor with a fixed seed."""
    settings = dict(
        min_samples_leaf=2,
        n_estimators=400,
        n_jobs=-1,
        random_state=42,
    )
    return RandomForestRegressor(**settings)

示例#25

0

显示文件

文件： main.py 项目： LeeMo2K10/GesturePredictor

build_auto(DummyRegressor(strategy="median"), "DummyAuto")
build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto")
build_auto(ExtraTreesRegressor(random_state=13, min_samples_leaf=5),
           "ExtraTreesAuto")
build_auto(GradientBoostingRegressor(random_state=13, init=None),
           "GradientBoostingAuto")
build_auto(LassoCV(random_state=13), "LassoAuto")
build_auto(
    OptimalLGBMRegressor(objective="regression",
                         n_estimators=17,
                         num_iteration=11), "LGBMAuto")
build_auto(LinearRegression(), "LinearRegressionAuto")
build_auto(
    BaggingRegressor(LinearRegression(), random_state=13, max_features=0.75),
    "LinearRegressionEnsembleAuto")
build_auto(RandomForestRegressor(random_state=13, min_samples_leaf=3),
           "RandomForestAuto",
           flat=True)
build_auto(RidgeCV(), "RidgeAuto")
build_auto(OptimalXGBRegressor(objective="reg:linear", ntree_limit=31),
           "XGBAuto")

auto_na_X, auto_na_y = load_auto("AutoNA.csv")

auto_na_X["cylinders"] = auto_na_X["cylinders"].fillna(-1).astype(int)
auto_na_X["model_year"] = auto_na_X["model_year"].fillna(-1).astype(int)
auto_na_X["origin"] = auto_na_X["origin"].fillna(-1).astype(int)


def build_auto_na(regressor, name):
    mapper = DataFrameMapper(

示例#26

0

显示文件

文件： plot_prediction_latency.py 项目： richardh9530/QingwuWIKI-static

            'name':
            'Linear Model',
            'instance':
            SGDRegressor(penalty='elasticnet',
                         alpha=0.01,
                         l1_ratio=0.25,
                         fit_intercept=True,
                         tol=1e-4),
            'complexity_label':
            'non-zero coefficients',
            'complexity_computer':
            lambda clf: np.count_nonzero(clf.coef_)
        },
        {
            'name': 'RandomForest',
            'instance': RandomForestRegressor(n_estimators=100),
            'complexity_label': 'estimators',
            'complexity_computer': lambda clf: clf.n_estimators
        },
        {
            'name': 'SVR',
            'instance': SVR(kernel='rbf'),
            'complexity_label': 'support vectors',
            'complexity_computer': lambda clf: len(clf.support_vectors_)
        },
    ]
}
benchmark(configuration)

# benchmark n_features influence on prediction speed
percentile = 90

示例#27

0

显示文件

文件： randomForest.py 项目： kamikat/DidiTech2016

def train(training, k):
    """Fit a random forest with ``k`` trees on a 2-D array.

    The last column of ``training`` is the target and all other columns are
    the features.
    """
    forest = RandomForestRegressor(n_estimators=k, n_jobs=-1)
    features = training[:, :-1]
    target = training[:, -1]
    forest.fit(features, target)
    return forest

示例#28

0

显示文件

x_max = np.max(np.array(X), axis=0)
outputs = ot.NumericalSample.ImportFromTextFile(base_dir + 'outputs.txt', '\t')
y = np.array(outputs).reshape((1, len(outputs)))[0]

x_min = np.min(X, axis=0)
x_max = np.max(X, axis=0)

n_train = 5000

X_train = np.array(X)[:n_train, :d]
y_train = y[:n_train]

X_test = np.array(X)[n_train:, :d]
y_true = y[n_train:]

reg = AdaBoostRegressor(RandomForestRegressor(),
                        n_estimators=50)  #, random_state=rng)
fit_train = reg.fit(X_train, y_train)

#plt.plot(y_true,fit_train.predict(X_test),'.')
#plt.plot(y_true,y_true,color="red",lw=2)

reg = AdaBoostRegressor(RandomForestRegressor(),
                        n_estimators=20)  #, random_state=rng)
fit_all = reg.fit(X, y)
s = 33

#plt.plot(y,fit_all.predict(X),'.')
#plt.plot(y,y,color="red",lw=2)

示例#29

0

显示文件

文件： model.py 项目： mjstevens777/energy-portal

        return False
    return True
    #return column in ['BDSP', 'RMSP', 'HFL', 'BLD', 'AGEP', 'NP', 'YBL', 'HINCP', 'HDD', 'CDD']

household = household[[column for column in household.columns if select_column(column)]]
X = household.as_matrix()
print(household.columns)
#X = household.as_matrix()

with open("kwh_model_features.json", "w") as f:
    json.dump(list(household.columns), f, indent = True)

print(y)

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.25)

clf = RandomForestRegressor(n_estimators = 50, n_jobs = 8)
clf.fit(X_train, y_train)

print(y_test[:100])
print(np.sqrt(metrics.mean_squared_error(y_test, clf.predict(X_test))))
print(metrics.r2_score(y_test, clf.predict(X_test)))


features = sorted(zip(household.columns, clf.feature_importances_), key = lambda x : x[1], reverse = True)
print("Features", features)


with open("kwh_model.pkl", 'wb') as f:
    pickle.dump(clf, f)

示例#30

0

显示文件

文件： plot_prediction_latency.py 项目： kennethriva/myscikit-learn

            'name':
            'Linear Model',
            'instance':
            SGDRegressor(penalty='elasticnet',
                         alpha=0.01,
                         l1_ratio=0.25,
                         fit_intercept=True,
                         tol=1e-4),
            'complexity_label':
            'non-zero coefficients',
            'complexity_computer':
            lambda clf: np.count_nonzero(clf.coef_)
        },
        {
            'name': 'RandomForest',
            'instance': RandomForestRegressor(),
            'complexity_label': 'estimators',
            'complexity_computer': lambda clf: clf.n_estimators
        },
        {
            'name': 'SVR',
            'instance': SVR(kernel='rbf'),
            'complexity_label': 'support vectors',
            'complexity_computer': lambda clf: len(clf.support_vectors_)
        },
    ]
}
benchmark(configuration)

# benchmark n_features influence on prediction speed
percentile = 90

示例#31

0

显示文件

文件： model.py 项目： mjstevens777/energy-portal

import numpy as np
import json

household = pd.read_csv("../household_complete_one_hot.csv")
if 'KWH' in household.columns:
    del household['KWH']

X_columns = [column for column in household.columns if column != "ELEP"]
X = household.as_matrix(columns = X_columns)
y = [label[0] for label in household.as_matrix(columns = ["ELEP"])]

#print(y)

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.25)

clf = RandomForestRegressor(n_estimators = 100, n_jobs = 8)
clf.fit(X_train, y_train)


print(y_test[:100])
print(metrics.mean_squared_error(clf.predict(X_test), y_test))
print(metrics.r2_score(y_test, clf.predict(X_test)))

features = sorted(zip(X_columns, clf.feature_importances_), key = lambda x : x[1], reverse = True)
print("Features", features)

#fill spaces in ELEP
normalized_pums = pd.read_csv("../joined_weather.csv", delimiter = ',')
print('pums shape', normalized_pums.shape)

with open("../vectorized_puma_regions/puma_list.json") as f:

示例#32

0

显示文件

文件： cveval_regression_robprior.py 项目： ysuter/brats20-survivalprediction

                   # "Huber",
                   "Linear",
                   "Passive Aggressive",
                   "SGD",
                   "Theil-Sen",
                   "RANSAC",
                   "K-Neighbors",
                   "Radius Neighbors",
                   "MLP",
                   "Decision Tree",
                   "Extra Tree",
                   "SVR"
                   ]

classifiers = [
    RandomForestRegressor(n_estimators=200, n_jobs=5, random_state=randomstate),
    ExtraTreesRegressor(n_estimators=200, n_jobs=5, random_state=randomstate),
    # GradientBoostingRegressor(random_state=randomstate),    # learning_rate is a hyper-parameter in the range (0.0, 1.0]
    # HistGradientBoostingClassifier(random_state=randomstate),    # learning_rate is a hyper-parameter in the range (0.0, 1.0]
    AdaBoostRegressor(n_estimators=200, random_state=randomstate),
    GaussianProcessRegressor(normalize_y=True),
    ARDRegression(),
    # HuberRegressor(),   # epsilon:  greater than 1.0, default 1.35
    LinearRegression(n_jobs=5),
    PassiveAggressiveRegressor(random_state=randomstate), # C: 0.25, 0.5, 1, 5, 10
    SGDRegressor(random_state=randomstate),
    TheilSenRegressor(n_jobs=5, random_state=randomstate),
    RANSACRegressor(random_state=randomstate),
    KNeighborsRegressor(weights='distance'),  # n_neighbors: 3, 6, 9, 12, 15, 20
    RadiusNeighborsRegressor(weights='distance'),   # radius: 1, 2, 5, 10, 15
    MLPRegressor(max_iter=10000000, random_state=randomstate),

示例#33

0

显示文件

class Estimators:
    """
    Estimators class. This class
    (i) fits a charging duration model and an energy consumption model with
    the designated regressor type (only "RF", a random forest, is wired in
    so far), and
    (ii) predicts charging duration and energy consumption with the trained
    models.

    The energy model takes the charging duration as an additional feature,
    so a prediction first estimates the duration and then feeds that
    estimate to the energy model.
    """

    # Format of the "Station Start Time" strings in the raw data.
    _START_TIME_FORMAT = "%m/%d/%Y %H:%M"
    # Name of the duration feature of the energy model. Fitting and
    # prediction must use the same name, otherwise estimators that check
    # feature names reject the query.
    _DURATION_COLUMN = "chargingDuration"

    def __init__(self,
                 filePath,
                 durationModelType="RF",
                 energyEstimatorType="RF"):
        """
        Load the data at ``filePath`` and fit both estimators.

        Params
        filePath: path of the raw data file; must contain "xlsx" or "csv"
        durationModelType: regressor type of the charging duration model
        energyEstimatorType: regressor type of the energy consumption model
        """

        # Load and process data
        data = self.loadData(filePath)

        # Set regression type for charging duration and energy consumption
        self.durationModelType = durationModelType
        self.energyEstimatorType = energyEstimatorType

        # Parse attributes and target columns
        attrColumns = [
            'Start Time Seconds From Midnight', 'Vehicle Battery Capacity'
        ]
        self.X = data[attrColumns]
        # OPTIONAL: User Id?
        # OPTIONAL: Vehicle Model Year?

        # Charging time is stored in seconds; the duration target is minutes
        self.durationData = pd.to_numeric(data["Charging Time Secs"],
                                          downcast='float') / 60.0
        # Unit of "Total Charge" is not established here (kW?)
        self.energyData = pd.to_numeric(data["Total Charge"],
                                        downcast='float')

        # Fit estimators
        self.fitDurationEstimator(durationModelType)
        self.fitEnergyEstimator(energyEstimatorType)

        # TODO: validate the models (e.g. k-fold cross-validation of the
        # energy estimator) -- to be developed.

        print("Done: fitting estimators")

    @classmethod
    def _secondsFromMidnight(cls, startTime):
        """ Return the seconds elapsed since midnight for a start-time string """
        parsed = datetime.strptime(startTime, cls._START_TIME_FORMAT)
        midnight = parsed.replace(hour=0, minute=0, second=0, microsecond=0)
        return (parsed - midnight).total_seconds()

    def loadData(self, filePath):
        """
        Load and process raw data

        Params
        filePath: path of an Excel ("xlsx" in the path) or CSV ("csv" in the
        path) file with "Station Start Time" and "Vehicle Battery Capacity"
        columns

        Returns the loaded dataframe with an added
        'Start Time Seconds From Midnight' column and without the rows whose
        "Vehicle Battery Capacity" is missing.

        Raises ValueError if the path names neither an xlsx nor a csv file.
        """

        ### Load data ###
        print("loading data...")
        # time.time() instead of time.clock(): the latter no longer exists
        # in current Python 3 releases
        timeStart = time.time()
        if "xlsx" in filePath:
            data = pd.read_excel(filePath)
        elif "csv" in filePath:
            data = pd.read_csv(filePath)
        else:
            raise ValueError("Wrong file path: " + filePath)
        print("Done: loading data. Time: " + str(time.time() - timeStart))

        ### Process data ###
        # - convert time in string to seconds from midnight
        # - filter out rows containing nan elements
        print("processing data...")
        timeStart = time.time()

        # Get Start Time in seconds from midnight
        startTimeSec = [
            self._secondsFromMidnight(startTime)
            for startTime in data["Station Start Time"]
        ]
        print("Done: processing data. Time: " + str(time.time() - timeStart))

        # Add new column for start time in seconds from midnight
        data['Start Time Seconds From Midnight'] = startTimeSec

        # Remove rows with NaN elements in columns that are used in training
        nanIndices = data["Vehicle Battery Capacity"].isnull().values
        dataFiltered = data[np.invert(nanIndices)]
        # {COPY THE ABOVE TWO LINES AND CHANGE THE INDEX KEY IF YOU HAVE ANOTHER COLUMN TO FILTER OUT}

        return dataFiltered

    def fitDurationEstimator(self, modelType="RF"):
        """
        Fit duration model with specified regressor type (Random forest by default)

        Any type other than "RF" currently leaves the estimator unset.
        """

        print("fitting charging duration model...")

        if modelType == "RF":
            self.durationEstimator = RandomForestRegressor(random_state=0,
                                                           n_estimators=50,
                                                           max_depth=50)
            self.durationEstimator.fit(self.X, self.durationData)

        # {ADD OTHER IF STATEMENTS FOR OTHER REGRESSOR MODELS, E.G., EXTRA-TREE REGRESSOR}

    def fitEnergyEstimator(self, modelType="RF"):
        """
        Fit energy consumption model with specified regressor type (Random forest by default)

        Any type other than "RF" currently leaves the estimator unset.
        """

        print("fitting energy consumption model...")

        # Stack the charging duration onto the attribute data before fitting
        # the energy model, i.e., we use a charging duration to predict an
        # energy consumption. Work on a copy so that self.X keeps only the
        # attributes the duration model is trained on.
        X = self.X.copy()
        X[self._DURATION_COLUMN] = self.durationData

        # Fit energy consumption regressor
        if modelType == "RF":
            self.energyEstimator = RandomForestRegressor(random_state=0,
                                                         n_estimators=50,
                                                         max_depth=50)
            self.energyEstimator.fit(X, self.energyData)

        # {ADD OTHER IF STATEMENTS FOR OTHER REGRESSOR MODELS, E.G., EXTRA-TREE REGRESSOR}

    def estimateChargingDuration(self, Xq):
        """ Return the duration model's predictions for the query rows Xq """
        return self.durationEstimator.predict(Xq)

    def estimateEnergyConsumptions(self, Xq):
        """ Return the energy model's predictions for the query rows Xq """
        return self.energyEstimator.predict(Xq)

    def predict(self, df):
        """
        Returns predicted charging duration and energy consumption based on the trained estimators

        Only the first row of df is used.

        Params
        df: dataframe with 'Station Start Time', 'Vehicle Battery Capacity' columns NOTE: add more if needed

        Returns a (duration, energy) tuple for that row.
        """

        # Get start time in seconds from midnight. Positional access so that
        # a dataframe whose index does not start at 0 works as well.
        startTimeSec = self._secondsFromMidnight(
            df["Station Start Time"].iloc[0])

        # Build input Dataframe, columns in the same order as in training
        Xq = pd.DataFrame()
        Xq['Start Time Seconds From Midnight'] = [startTimeSec]
        Xq['Vehicle Battery Capacity'] = [
            df["Vehicle Battery Capacity"].iloc[0]
        ]

        # Estimate charging duration
        estDuration = self.estimateChargingDuration(Xq)

        # Estimate energy consumption; the estimated duration becomes the
        # extra feature, under the same column name used when fitting
        Xq[self._DURATION_COLUMN] = estDuration
        estEnergy = self.estimateEnergyConsumptions(Xq)

        return estDuration[0], estEnergy[0]

示例#34

0

显示文件

文件： ts_rf.py 项目： yajiayang/previous_projects

def ts_rf(n, fea, step, ntrees, njobs):
    """
    Random Forest Model for time series prediction.

    Reads the series from a hard-coded, space-separated text file (first
    line skipped, second field the value, third field the date label),
    standardises it, builds rows of ``fea`` consecutive values as features,
    fits a random forest on the first ``n`` rows, then plots the forecast
    for the remaining rows against the actual values and prints the error
    metrics.

    :param n: number of leading rows used for training; the rest are tested
    :param fea: number of consecutive values used as features of one row
    :param step: how many steps after the last feature value the target lies
    :param ntrees: number of trees of the forest (``n_estimators``)
    :param njobs: number of parallel jobs of the forest (``n_jobs``)
    :return: None
    """
    from sklearn import metrics
    import matplotlib.pyplot as plt
    from scipy.linalg import hankel
    import numpy as np
    # Public import path; the private ``sklearn.ensemble.forest`` module is
    # not available in current scikit-learn releases.
    from sklearn.ensemble import RandomForestRegressor

    #input data from csv file
    path = '/Users/royyang/Desktop/time_series_forecasting/csv_files/coffee_ls.txt'
    result_tem = []
    date = []
    with open(path) as f:
        next(f)  # skip the header line
        for line in f:
            item = line.replace('\n', '').split(' ')
            result_tem.append(float(item[1]))
            date.append(item[2])
    mean = np.mean(result_tem)
    sd = np.std(result_tem)
    result = (np.asarray(result_tem) - mean) / sd

    #form hankel matrix: row i holds result[i:i + fea]
    # hankel(c, r) takes the entries below the anti-diagonal from r, so r has
    # to continue the series right where c ends. The previous slice
    # result[-1 - fea:-1] did that only for step == 1; for larger steps the
    # bottom rows were shifted forward by step - 1 values.
    n_rows = len(result) - fea - step + 1
    X = hankel(result[:n_rows], result[n_rows - 1:n_rows - 1 + fea])
    y = result[fea + step - 1:]

    #split data into training and testing
    Xtrain = X[:n]
    ytrain = y[:n]
    Xtest = X[n:]
    ytest = y[n:]

    # random forest
    rf = RandomForestRegressor(n_estimators=ntrees, n_jobs=njobs)
    rf_pred = rf.fit(Xtrain, ytrain).predict(Xtest)

    #plot results
    # tick labels: last six characters of the date of each test target
    LABELS = [
        x[-6:]
        for x in date[n + fea + step - 1:n + fea + step - 1 + len(ytest)]
    ]
    t = range(n, n + len(ytest))
    # undo the standardisation before plotting and scoring
    ypred = rf_pred * sd + mean
    ytest = ytest * sd + mean
    line1, = plt.plot(t, ypred, 'r*-')
    plt.xticks(t, LABELS)
    line2, = plt.plot(t, ytest, 'b*-')
    plt.legend([line1, line2], ["Predicted", "Actual"], loc=2)

    #plt.show()

    y_true = ytest
    y_pred = ypred
    metrics_result = {
        'rf_MAE': metrics.mean_absolute_error(y_true, y_pred),
        'rf_MSE': metrics.mean_squared_error(y_true, y_pred),
        'rf_MAPE': np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    }
    print(metrics_result)

示例#35

0

显示文件

文件： single_model.py 项目： modkzs/regression-predict

def train_random_forest(X, Y):
    """Fit a 20-tree ``RandomForestRegressor`` on ``(X, Y)`` and return it."""
    forest = RandomForestRegressor(n_estimators=20)
    forest.fit(X, Y)
    return forest

示例#36

0

显示文件

    x_test = X[-7:]
    y_test = Y[-7:]
    ###

    RigeLinearCV = linear_model.RidgeCV(cv=10)
    rcv = RigeLinearCV.fit(x_train, y_train)
    y_pre_rcv = rcv.predict(x_oob)
    ###
    params_rf = {
        'n_estimators': 500,
        'max_depth': 10,
        'min_samples_split': 2,
        'n_jobs': 4
    }

    rf = RandomForestRegressor(**params_rf)
    rf.fit(x_train, y_train)
    y_pre_rf = rf.predict(x_oob)
    ###
    y_pre_diff = mean_normal_weekend_diff(Y[-21:-14], xday[-35:-14],
                                          xweekend[-35:-14], -14, -7)
    ###

    Y_test.append(y_test)
    #y_pre_diff = mean_normal_weekend_diff(Y,xday,xweekend,-21,-14)

    ###
    loss_rcv = Evaluation([y_pre_rcv], [y_oob])
    loss_rf = Evaluation([y_pre_rf], [y_oob])
    loss_diffmean = Evaluation([y_pre_diff], [y_oob])

示例#37

0

显示文件

文件： predict.py 项目： AntoinePrv/AXA-data-challenge

def _read_rows_without_header(csv_path):
    """Return all rows of the CSV file at *csv_path* except its first (header) line."""
    with open(csv_path) as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # skip header
        return list(reader)


def _2011x2011_ (data_path):
    """
    Train a random forest regressor and report its error on a held-out split.

    Features are read from ``<data_path>_X.csv`` (all fields numeric) and
    labels from ``<data_path>_y.csv`` (first field a date, second field the
    value). 40 % of the rows are held out for testing. The original and the
    predicted test values are plotted against the date and the mean squared
    error is printed.

    :param data_path: common prefix of the two CSV files
    :return: None
    """
    # ``sklearn.cross_validation`` no longer exists in current scikit-learn
    # releases; fall back to it only on old installations.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    ##### LOADING #####
    sys.stdout.write("Loading data... ")

    # Feature rows, converted to floats
    data = [[float(x) for x in row]
            for row in _read_rows_without_header(data_path+'_X.csv')]
    # Label rows, kept as raw strings: [date, value]
    val_ind = _read_rows_without_header(data_path+'_y.csv')

    sys.stdout.write("done\n")


    ##### TRAINING #####
    # splitting
    data_train, data_test, val_ind_train, val_ind_test \
        = train_test_split(data, val_ind, test_size=0.4, random_state=42)

    # Cutting date / number value from labels
    val_train = [float(x[1]) for x in val_ind_train]
    date_test = [x[0] for x in val_ind_test]
    val_test = [float(x[1]) for x in val_ind_test]

    sys.stdout.write("Training regressor... ")
    reg = RandomForestRegressor()
#    reg = skl.tree.DecisionTreeRegressor()
#    reg = skl.linear_model.LinearRegression()
    reg.fit(data_train, val_train)
    sys.stdout.write("done\n")


    ##### PREDICTION #####
    sys.stdout.write("Predicting... ")
    val_predicted = reg.predict(data_test)
    sys.stdout.write("done\n")

    ##### ERROR #####
    df = pd.DataFrame()
    df['date'] = pd.to_datetime(date_test)
    df['original'] = val_test
    df['predicted'] = val_predicted.tolist()
    df = df.set_index('date')

    df.info()

    df.plot()
    plt.show()

    print("MSE : " + str(mean_squared_error(val_test,val_predicted.tolist())))

示例#38

0

显示文件

文件： OWShapSingle.py 项目： jpoullet2000/orange3-shap

                            matplotlib=True)
        else:  # model.skl_model should be RandomForestClassifier
            features = [
                feature.name for feature in self.dataset.domain.attributes
            ]
            explainer = shap.TreeExplainer(model.skl_model)
            shap_values = explainer.shap_values(X)
            for c in range(len(shap_values)):
                shap.force_plot(explainer.expected_value[c],
                                shap_values[c][idx, :],
                                X[idx, :],
                                feature_names=features,
                                matplotlib=True)


class Task:
    """
    Task class to perform computations in parallel
    """
    def __init__(self):
        # Both attributes start out unset.
        self.future = self.watcher = None


if __name__ == "__main__":  # pragma: no cover
    # Manual preview: fit a 10-tree forest on the 'housing' table and open the
    # widget with that data and model.
    data = Table('housing')
    rf = SKL_RF(n_estimators=10)
    rf.fit(data.X, data.Y)
    # NOTE(review): RandomForestRegressor is called with the fitted estimator,
    # so here it is presumably a model wrapper rather than the scikit-learn
    # class itself -- confirm against the file's imports.
    model_rf = RandomForestRegressor(rf)
    WidgetPreview(OWShapSingle).run(set_data=data, set_model=model_rf)

示例#39

0

显示文件

文件： main.py 项目： dtpryce/jpmml-sklearn

# Fit the mapper on the raw frame and keep the transformed array
auto = auto_mapper.fit_transform(auto_df)

store_pkl(auto_mapper, "Auto.pkl")

# Columns 0-6 of the transformed array are the features, column 7 the target
auto_X = auto[:, 0:7]
auto_y = auto[:, 7]

print(auto_X.dtype, auto_y.dtype)

def predict_auto(regressor):
    """Predict on the module-level ``auto_X`` and wrap the result in a one-column ``mpg`` frame."""
    predictions = regressor.predict(auto_X)
    return DataFrame(predictions, columns = ["mpg"])

# Each model below is fit on (auto_X, auto_y), pickled, and its predictions on
# auto_X are written to a CSV file of the same base name.

# Decision tree
auto_tree = DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5)
auto_tree.fit(auto_X, auto_y)

store_pkl(auto_tree, "DecisionTreeAuto.pkl")
store_csv(predict_auto(auto_tree), "DecisionTreeAuto.csv")

# Random forest
auto_forest = RandomForestRegressor(random_state = 13, min_samples_leaf = 5)
auto_forest.fit(auto_X, auto_y)

store_pkl(auto_forest, "RandomForestAuto.pkl")
store_csv(predict_auto(auto_forest), "RandomForestAuto.csv")

# Linear regression
auto_regression = LinearRegression()
auto_regression.fit(auto_X, auto_y)

store_pkl(auto_regression, "RegressionAuto.pkl")
store_csv(predict_auto(auto_regression), "RegressionAuto.csv")

示例#40

0

显示文件

文件： ML.py 项目： armbrustlab/opedia

def RandomForest(df, queryFile):
    """Run ``MLRegression`` on ``df`` and ``queryFile`` with a 200-tree random forest (seed 0, ``n_jobs=-1``)."""
    forest = RandomForestRegressor(n_jobs=-1, n_estimators=200, random_state=0)
    MLRegression(forest, df, queryFile)

示例#41

0

显示文件

文件： ex27_c.py 项目： gabormakrai/landuseregression

def _keepSelectedColumns(rows, columns):
    """Return copies of *rows* reduced to the positions of ``all_columns`` that are flagged in *columns*."""
    selected = [i for i in range(0, len(all_columns)) if columns[i]]
    return [[row[i] for i in selected] for row in rows]


def evalColumns(columns):
    """
    Evaluate a column selection for the model-choosing classifier.

    For every location, taken in turn as the test location:

    1. one random forest regressor is trained per data group of
       ``topDatagroups`` and per other location; its predictions on the second
       training split and on the test split are collected under the data
       group's tag;
    2. every sample of the second training split is labelled with the index
       (into ``topTags``) of the regressor whose prediction is closest to the
       observed target;
    3. a random forest classifier is trained to predict that label from the
       rows of the second training split, extended by one prediction column
       per tag and reduced to the columns flagged in ``columns``;
    4. for each test sample the classifier picks the regressor whose
       prediction becomes the final prediction.

    :param columns: one flag per entry of ``all_columns``; a truthy flag keeps
        the column at that position
    :return: element 1 of ``rmseEval`` for the final predictions of all
        locations
    """

    overallY = []
    overallPred = []

    for location in locations:
        location2s = [l for l in locations if l != location]

        print("Location: " + str(location) + ", location2: " + str(location2s))

        trainPreds = defaultdict(list)
        testPreds = defaultdict(list)

        # Step 1: per-data-group regressors
        for datagroup in topDatagroups:
            tag, features = getTagAndFeatures(datagroup)
            print("\ttag: " + str(tag) + ", features: " + str(features))
            for location2 in location2s:
                trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(
                    location, location2, "location", data, features, "target")
                model = RandomForestRegressor(min_samples_leaf=9,
                                              n_estimators=59,
                                              n_jobs=-1,
                                              random_state=42)
                model.fit(trainX1, trainY1)
                train1Prediction = model.predict(trainX1)
                train2Prediction = model.predict(trainX2)
                testPrediction = model.predict(testX)
                train1Rmse = str(rmseEval(trainY1, train1Prediction)[1])
                train2Rmse = str(rmseEval(trainY2, train2Prediction)[1])
                testRmse = str(rmseEval(testY, testPrediction)[1])
                print("\t\ttrain1 rmse: " + train1Rmse)
                print("\t\ttrain2 rmse: " + train2Rmse)
                print("\t\ttest rmse: " + testRmse)
                trainPreds[tag].extend(train2Prediction)
                testPreds[tag].extend(testPrediction)

        # Observed targets of the second training split, all other locations
        t2Y = []
        for location2 in location2s:
            trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(
                location, location2, "location", data, all_features, "target")
            t2Y = t2Y + trainY2

        # Step 2: label = index of the tag with the smallest absolute error
        # (the first one wins on ties)
        labelt2Y = []
        for i, observed in enumerate(t2Y):
            errors = [abs(observed - trainPreds[tag][i]) for tag in topTags]
            labelt2Y.append(errors.index(min(errors)))

        print("#labelt2Y:" + str(len(labelt2Y)))

        # Rows (all features) of the second training split and of the test
        # split; testY keeps the value of the last split, as before
        tX2 = []
        testX = []
        for location2 in location2s:
            trainX1, trainX2, trainY1, trainY2, tX, testY = splitDataForXValidationSampled2(
                location, location2, "location", data, all_features, "target")
            for row in trainX2:
                tX2.append(row)
            for row in tX:
                testX.append(row)

        # Append one prediction column per tag. The loop variable has to be
        # the one used for the lookup: indexing with a stale variable appended
        # the last tag's predictions once per tag.
        for tag in topTags:
            for i, prediction in enumerate(trainPreds[tag]):
                tX2[i].append(prediction)

        reducedTrainX2 = _keepSelectedColumns(tX2, columns)

        # Step 3: classifier choosing the regressor
        classifier = RandomForestClassifier(random_state=42,
                                            n_estimators=100,
                                            max_depth=15)
        classifier.fit(reducedTrainX2, labelt2Y)

        # Same per-tag prediction columns for the test rows
        for tag in topTags:
            for i, prediction in enumerate(testPreds[tag]):
                testX[i].append(prediction)

        reducedTestX = _keepSelectedColumns(testX, columns)

        chosen = classifier.predict(reducedTestX)

        # Step 4: take the prediction of the chosen regressor
        finalPrediction = [
            testPreds[topTags[chosen[i]]][i] for i in range(0, len(testY))
        ]
        rmse = str(rmseEval(testY, finalPrediction)[1])
        print("\tRMSE: " + str(rmse))

        overallY.extend(testY)
        overallPred.extend(finalPrediction)

    rmse = rmseEval(overallPred, overallY)[1]
    return rmse

示例#42

0

显示文件

文件： gradient_boosting_old.py 项目： mityinzer/lhcb_trigger_ml

 def predict_proba(self, X):
     """Return a ``(len(X), 2)`` array: column 1 is ``expit(prediction / 1000)``, column 0 its complement."""
     raw = RandomForestRegressor.predict(self, X)
     proba = numpy.zeros([len(X), 2])
     # Squash the scaled regression output into (0, 1)
     proba[:, 1] = special.expit(raw / 1000.)
     proba[:, 0] = 1. - proba[:, 1]
     return proba

示例#43

0

显示文件

文件： single_model.py 项目： modkzs/regression-predict

def train_random_forest(X, Y):
    """Return a ``RandomForestRegressor`` with 20 trees fitted on ``(X, Y)``."""
    model = RandomForestRegressor(n_estimators=20)
    model.fit(X, Y)
    return model

示例#44

0

显示文件

    output.close()
    
# Tag and feature list of the data group made of all six group letters
all_tags, all_features = getTagAndFeatures(['T','W', 'A', 'R', 'L', 'B'])

print(str(all_features))
    
# For every location: write the train/test split on all features, then train
# one random forest per data group and write its train and test predictions.
for location in locations:
    print("location: " + str(location))
    # save down trainX, trainY, testX, testY
    trainX, testX, trainY, testY, _, _ = splitDataForXValidation(location, "location", data, all_features, "target", timestampData)
    print("\t#train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
    
    # Output files are named z_<location>_<content>.csv
    writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_trainX.csv", all_features, trainX)
    writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_testX.csv", all_features, testX)
    writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_trainY.csv", ["target"], trainY)
    writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_testY.csv", ["target"], testY)
    
    for dataGroup in generateAllDataGroups():
        tag, features = getTagAndFeatures(dataGroup)
        # Same split call as above, restricted to this data group's features
        trainX, testX, trainY, testY, _, _ = splitDataForXValidation(location, "location", data, features, "target", timestampData)
        model = RandomForestRegressor(min_samples_leaf = 9, n_estimators = 59, n_jobs = -1, random_state=42)                    
        model.fit(trainX, trainY)
        trainPrediction = model.predict(trainX)
        testPrediction = model.predict(testX)
        # Element 1 of rmseEval's result is reported as the RMSE
        trainRmse = str(rmseEval(trainY, trainPrediction)[1])
        testRmse = str(rmseEval(testY, testPrediction)[1])
        print("\t" + tag + ": #train: " + str(len(trainY)) + ", #test:" + str(len(testY)) + ", trainRMSE: " + trainRmse + ", testRMSE: " + testRmse)
        writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_trainPred_" + tag + ".csv", ["trainPred_" + tag], trainPrediction)
        writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_testPred_" + tag + ".csv", ["testPred_" + tag], testPrediction)

示例#45

0

显示文件

文件： rf-kaggle_submit.py 项目： letfoolsdie/kaggle-rossman

# One-hot encode the store assortment.
# The axis is passed by keyword: current pandas releases no longer accept it
# as a second positional argument of drop().
store = store.drop("Assortment", axis=1).join(
    pd.get_dummies(store["Assortment"]).rename(columns=lambda x: "Assortment" + "_" + str(x))
)

train["StateHoliday"] = [mychange(x) for x in train.StateHoliday]
test["StateHoliday"] = [mychange(x) for x in test.StateHoliday]

# One-hot encode the state holiday in both sets
train = train.drop("StateHoliday", axis=1).join(
    pd.get_dummies(train["StateHoliday"]).rename(columns=lambda x: "StateHoliday" + "_" + str(x))
)
test = test.drop("StateHoliday", axis=1).join(
    pd.get_dummies(test["StateHoliday"]).rename(columns=lambda x: "StateHoliday" + "_" + str(x))
)

# Attach the store attributes to every row
train = pd.merge(train, store, on="Store")
test = pd.merge(test, store, on="Store")

repeat = 1
print("Splitting data...")
# The feature list is built once, before the loop: the loop adds a "mypred"
# column to `test`, which a later repetition would otherwise pick up as a
# feature although `train` has no such column.
features = [col for col in test.columns if col not in ["Customers", "Sales", "Date", "LogSale", "datetimes", "Id"]]
for i in range(repeat):
    rf = RandomForestRegressor(n_estimators=100)
    print("Starting training...")
    # Missing values are replaced by -1 for both fitting and predicting
    rf.fit(train[features].fillna(-1), train.LogSale)

    # The model is trained on LogSale; exp(.) - 1 maps its output back
    test["mypred"] = rf.predict(test[features].fillna(-1))
    test["mypred"] = np.exp(test["mypred"]) - 1

test["Sales"] = test.mypred
test[["Id", "Sales"]].to_csv("rand_for_kag_v4-9.csv", index=False)

示例#46

0

显示文件

文件： Random_Forests.py 项目： ChiuYeeLau/KaggleRossmanSales

# In[10]:

# Replace missing values by 0 in both sets
Train = Train.fillna(0)
Test = Test.fillna(0)

# print(Train.head())
# print(Test.head())


# In[11]:

print('Train Random Forests!')

# Public import path; the private ``sklearn.ensemble.forest`` module is not
# available in current scikit-learn releases.
from sklearn.ensemble import RandomForestRegressor
RF = RandomForestRegressor(n_estimators = 500, random_state = 0)


# In[12]:

# Sample of 400000 index labels, drawn with replacement. ``.loc`` selects by
# label, which is what ``.ix`` did for these labels before it was removed
# from pandas.
Rows = np.random.choice(Train.index.values, 400000)
Sampled_Train = Train.loc[Rows]
Sample_Train_Target = Train_Target.loc[Rows]

# The forest is currently fitted on the full training set, not on the sample
# RF.fit(Sampled_Train, Sample_Train_Target)
RF.fit(Train, Train_Target)


# In[ ]:

print('Predict!')

示例#47

0

显示文件

文件： main.py 项目： dclong/jpmml-sklearn

	store_pkl(pipeline, name + ".pkl")
	mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
	store_csv(mpg, name + ".csv")

# Build one artefact per regressor; the second argument is the base name
# handed to build_auto.
build_auto(AdaBoostRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 17), "AdaBoostAuto")
build_auto(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 2), "DecisionTreeAuto", compact = True)
build_auto(BaggingRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 3, max_features = 0.5), "DecisionTreeEnsembleAuto")
build_auto(DummyRegressor(strategy = "median"), "DummyAuto")
build_auto(ElasticNetCV(random_state = 13), "ElasticNetAuto")
build_auto(ExtraTreesRegressor(random_state = 13, min_samples_leaf = 5), "ExtraTreesAuto")
build_auto(GradientBoostingRegressor(random_state = 13, init = None), "GradientBoostingAuto", compact = True)
build_auto(LassoCV(random_state = 13), "LassoAuto")
build_auto(OptimalLGBMRegressor(objective = "regression", n_estimators = 17, num_iteration = 11), "LGBMAuto", compact = True)
build_auto(LinearRegression(), "LinearRegressionAuto")
build_auto(BaggingRegressor(LinearRegression(), random_state = 13, max_features = 0.75), "LinearRegressionEnsembleAuto")
build_auto(RandomForestRegressor(random_state = 13, min_samples_leaf = 3), "RandomForestAuto", compact = True)
build_auto(RidgeCV(), "RidgeAuto")
build_auto(OptimalXGBRegressor(objective = "reg:linear", ntree_limit = 31), "XGBAuto", compact = True)

# Variant of the Auto data set with missing values
auto_na_X, auto_na_y = load_auto("AutoNA.csv")

# Missing values of the categorical columns are encoded as -1 so that the
# columns can be cast to int
auto_na_X["cylinders"] = auto_na_X["cylinders"].fillna(-1).astype(int)
auto_na_X["model_year"] = auto_na_X["model_year"].fillna(-1).astype(int)
auto_na_X["origin"] = auto_na_X["origin"].fillna(-1).astype(int)

def build_auto_na(regressor, name):
	mapper = DataFrameMapper(
		[([column], [ContinuousDomain(missing_values = None), Imputer()]) for column in ["acceleration", "displacement", "horsepower", "weight"]] +
		[([column], [CategoricalDomain(missing_values = -1), CategoricalImputer(missing_values = -1), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year", "origin"]]
	)
	pipeline = PMMLPipeline([

示例#48

0

显示文件

文件： rf-kaggle.py 项目： letfoolsdie/kaggle-rossman

for i in range(repeat):
    # Fresh 80/20 split of the training data for this repetition
    newtrain, newtest = train_test_split(train, test_size = 0.2)
    newtrain = pd.DataFrame(newtrain, columns = cols)
    newtest = pd.DataFrame(newtest, columns = cols)

    # Every column except the targets and the raw date fields is a feature
    excluded = ['Customers', 'Sales', 'Date','LogSale','datetimes']
    features = [col for col in newtrain.columns if col not in excluded]

    rf = RandomForestRegressor(n_estimators=100)
    print('Starting training...')
    rf.fit(newtrain[features].fillna(-1),newtrain.LogSale)

    # The model is trained on LogSale; exp(.) - 1 maps its output back.
    # The error is computed on rows with positive sales only.
    print('Predicting train values...')
    newtrain['mypred'] = np.exp(rf.predict(newtrain[features].fillna(-1)))-1
    sold_train = newtrain[newtrain.Sales>0]
    train_error = rmspe(sold_train.Sales,sold_train.mypred)
    print('train set error',train_error)

    newtest['mypred'] = np.exp(rf.predict(newtest[features].fillna(-1)))-1
    sold_test = newtest[newtest.Sales>0]
    test_error = rmspe(sold_test.Sales,sold_test.mypred)
    print('test set error',test_error)

    train_results.append(train_error)
    test_results.append(test_error)

print('mean train error', np.mean(train_results))

示例#49

0

显示文件

文件： hw4_skeleton_code.py 项目： clintval/systems_micro

# We aren't classifying or separating samples according to their metadata, so we can just use all of the samples
x = data.abun_df[otu].values
y = [float(data.meta_df.loc[smpl, 'age']) for smpl in data.abun_df.index]  # note: want to make sure our y vector is in the same order as the x vector (i.e. that each x and y are for the same sample)
# Public name of the function; the scipy.stats.stats namespace is deprecated
r, p_val = scipy.stats.spearmanr(x,y)

# Look at scatter plot of OTU abundance vs. age to visualize the correlation
fig, ax= plt.subplots()
ax.scatter(x, y)
ax.set_xlabel('OTU #' + otu)
ax.set_ylabel('Age')
ax.text(0.01,0.95, r'$\rho$ = {:.2f}'.format(r), transform=ax.transAxes)

#%% 3. Build a Random Forest Regressor

## 3.1 Build the regressor
# oob_score=True makes the out-of-bag predictions available after fitting
rfreg = RandomForestRegressor(n_estimators=1000, oob_score=True)

# We aren't classifying samples here, so we can just use the whole OTU table to build our regression
X = data.abun_df.values
Y = [float(data.meta_df.loc[smpl, 'BMI']) for smpl in data.abun_df.index]
rfreg = rfreg.fit(X,Y)

## 3.1.1 Look at true vs. predicted values from out of bag estimations
fig, ax = plt.subplots()
ax.scatter(Y, rfreg.oob_prediction_)
ax.set_xlabel('True')
ax.set_ylabel('Predicted')
ax.set_title('RF regression on BMI')

## 3.2 Look at the important features in the regression by inspecting their feature importances
feats = pd.DataFrame(index=data.abun_df.columns, columns=['importance'], data=rfreg.feature_importances_)

示例#50

0

显示文件

文件： ex28_3.py 项目： gabormakrai/landuseregression

output = open(OUTPUT_FILE, 'w')
output.write("step,rmse_tw,rmse_twa,rmse_combined,accuracy\n")

output_log = open(OUTPUT_LOG_FILE, 'w')

log(output_log, "Generating Rmse RF+TW and Rmse RF+TWA")

allPredictionTW = []
allPredictionTWA = []

for location in locations:
    trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation(
        location, "location", data, tw_features, "target", timestampData)
    model = RandomForestRegressor(min_samples_leaf=9,
                                  n_estimators=59,
                                  n_jobs=-1,
                                  random_state=42)
    model.fit(trainX, trainY)
    testPredictionTW = model.predict(testX)
    rmse = str(rmseEval(testY, testPredictionTW)[1])
    log(output_log, "\tTW rmse: " + rmse)
    for x in testY:
        allObs.append(x)
    for x in testPredictionTW:
        allPredictionTW.append(x)
    trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation(
        location, "location", data, twa_features, "target", timestampData)
    model = RandomForestRegressor(min_samples_leaf=9,
                                  n_estimators=59,
                                  n_jobs=-1,
                                  random_state=42)

示例#51

0

显示文件

features = [
    'Store', 'SchoolHoliday', 'Promo', 'cmp msr', 'IsPromotionMonth', 'Year',
    'Month', 'Day', 'DayOfTheWeek', 'WeekOfTheYear', 'StoreType',
    'CompetitionOpenSinceMonth', 'CompetitionDistance', 'PromoOpen'
]  # Features used for prediction

# Same feature engineering for the training and the test frame
feature_engineering(rossman)
feature_engineering(rossman_test)

X = rossman[features]
y = rossman.Sales  # The value we are going to predict

train_features, test_features, train_predict, test_predict = train_test_split(
    X, y)

randomForest = RandomForestRegressor(n_estimators=35)
randomForest.verbose = True
# The forest is fitted on all of X / y, not on train_features / train_predict,
# so the rows of test_features are part of the training data.
# NOTE(review): confirm that fitting on the full set is intended.
randomForest.fit(X, y)

# 3-fold cross-validated score of the same estimator on the full data
errorValue = cross_validation.cross_val_score(randomForest,
                                              rossman[features],
                                              y,
                                              scoring='mean_squared_error',
                                              cv=3)

predicted_value = randomForest.predict(test_features)
predicted_value = np.array(predicted_value)

test_predict = np.array(test_predict)
# NOTE(review): the whole rossman_test frame is passed here, whereas the model
# was fitted on rossman[features] -- presumably this should be restricted to
# the same columns; verify.
finalResult = randomForest.predict(rossman_test)

示例#52

0

显示文件

文件： ml_cms_heights.py 项目： ritviksahajpal/ML_CMS

class MLCms:
    """
    Train, persist and apply a random-forest model (classifier or regressor).

    The task is chosen by ``constants.DO_CLASSIFICATION``. Input/output
    locations and the feature/target column names are read from ``constants``.

    NOTE(review): several methods read attributes that ``__init__`` does not
    assign (``crop``, ``country``, ``crop_lname``, ``dict_cc``, ``season``,
    ``time_peak_ndvi``, ``debug``, ``df_global``, ``forecast_yr``). They must
    be set on the instance before those code paths run -- confirm where.
    """
    def __init__(self, config_file=''):
        """
        Read the config file and set up the estimator and the I/O paths.

        :param config_file: Path passed to ``SafeConfigParser.read``
        """
        # Parse config file
        self.parser = SafeConfigParser()
        self.parser.read(config_file)

        # machine learning specific variables
        self.classify = constants.DO_CLASSIFICATION  # Regress or classify?
        self.vars_features = constants.fixed_vars
        self.vars_target = constants.ML_TARGETS
        # Same target for both tasks; only the task label and estimator differ
        self.var_target = constants.ML_TARGETS

        if self.classify:
            self.task = 'classification'
            self.model = RandomForestClassifier(n_estimators=2500, n_jobs=constants.ncpu, random_state=0)
        else:
            self.task = 'regression'
            self.model = RandomForestRegressor(n_estimators=2500, n_jobs=constants.ncpu, random_state=0)  # SVR()

        # Get path to input
        self.path_inp = constants.base_dir + os.sep + constants.name_inp_fl

        # Output directory is <dir>_<classification>_<2014>
        self.path_out_dir = constants.out_dir
        utils.make_dir_if_missing(self.path_out_dir)

        # Model pickle
        self.path_pickle_model = self.path_out_dir + os.sep + constants.model_pickle
        self.path_pickle_features = self.path_out_dir + os.sep + 'pickled_features'

    def output_model_importance(self, gs, name_gs, num_cols):
        """
        Rank feature importances, plot the top ones and write all of them to CSV.

        :param gs: Fitted grid search whose ``best_estimator_`` is a pipeline
        :param name_gs: Name of the pipeline step that exposes ``feature_importances_``
        :param num_cols: Number of ranked features to store in the CSV
        :return: None
        """
        rows_list = []
        name_vars = []

        # Scale importances so that the most important feature scores 100
        feature_importance = gs.best_estimator_.named_steps[name_gs].feature_importances_
        importances = 100.0 * (feature_importance / feature_importance.max())

        # Spread of each feature's importance across the individual trees
        std = np.std([tree.feature_importances_ for tree in self.model.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]  # most important first

        # Store feature ranking in a dataframe
        # NOTE(review): names are looked up in self.vars_features, which is not
        # necessarily the column set the fitted step was trained on -- confirm.
        for rank in range(num_cols):
            name = self.vars_features[indices[rank]]
            name_vars.append(name)
            rows_list.append({'Variable': name, 'Importance': importances[indices[rank]]})

        df_results = pd.DataFrame(rows_list)
        num_bars = min(10, len(indices))  # Plot upto a maximum of 10 features
        plot.plot_model_importance(num_bars=num_bars, xvals=importances[indices][:num_bars],
                                   std=std[indices][:num_bars], fname=self.task + '_importance_' + self.crop,
                                   title='Importance of variable (' + self.country + ' ' + self.crop_lname + ')',
                                   xlabel=name_vars[:num_bars], out_path=self.path_out_dir)

        df_results.to_csv(self.path_out_dir + os.sep + self.task + '_importance_' + self.crop + '.csv')

    def get_data(self):
        """
        Load the input CSV and split it into training and testing sets.

        :return: ``(cols, splits)`` where ``cols`` are the retained column names
                 and ``splits`` is the ``train_test_split`` output
                 (data_train, data_test, target_train, target_test)
        """
        df = pd.read_csv(self.path_inp)
        # Keep only the columns that are NOT listed in self.vars_features
        cols = [col for col in df.columns if col not in self.vars_features]
        df = df[cols]

        # The first retained column is left out of the feature matrix.
        # `df[...].values` replaces DataFrame.as_matrix, which pandas removed in 1.0.
        data = df[cols[1:]].values
        target = df[[self.var_target]].values.ravel()
        # Get training and testing splits
        splits = train_test_split(data, target, test_size=0.2)

        return cols, splits

    def train_ml_model(self):
        """
        Fit the model with feature selection and a grid search, or reload it.

        If no model pickle exists at ``self.path_pickle_model``, a pipeline of
        feature selection followed by ``self.model`` is tuned with
        ``GridSearchCV``, the best estimator is evaluated on the held-out split,
        importances are written out and the model and features are pickled.
        Otherwise the model and features are read back from disk.

        :return: None
        """
        logger.info('#########################################################################')
        logger.info('train_ml_model')
        logger.info('#########################################################################')

        ######################################################
        # Load dataset
        ######################################################
        cols, splits = self.get_data()
        data_train, data_test, target_train, target_test = splits

        if not os.path.isfile(self.path_pickle_model):
            # For details in scikit workflow: See http://stackoverflow.com/questions/
            # 35256876/ensuring-right-order-of-operations-in-random-forest-classification-in-scikit-lea
            # TODO Separate out a dataset so that even the grid search cv can be tested
            ############################
            # Select features from model
            ############################
            logger.info('Selecting important features from model')
            # The same regressor ranks features for both tasks (the original
            # classify/regress branches were identical).
            rf_feature_imp = ExtraTreesRegressor(150, n_jobs=constants.ncpu)
            feat_selection = SelectFromModel(rf_feature_imp)

            pipeline = Pipeline([
                      ('fs', feat_selection),
                      ('clf', self.model),
                    ])

            #################################
            # Grid search for best parameters
            #################################
            logger.info('Tuning hyperparameters')
            param_grid = {
                'fs__threshold': ['mean', 'median'],
                'fs__estimator__max_features': ['auto', 'log2'],
                'clf__max_features': ['auto', 'log2'],
                'clf__n_estimators': [1000, 2000]
            }

            gs = GridSearchCV(pipeline, param_grid=param_grid, verbose=2, n_jobs=constants.ncpu, error_score=np.nan)
            # Fit the data before getting the best parameter combination. Different data sets will have
            # different optimized parameter combinations, i.e. without data, there is no optimal parameter combination.
            gs.fit(data_train, target_train)
            logger.info(gs.best_params_)

            data_test = pd.DataFrame(data_test, columns=cols[1:])

            # Update features that should be used in model: pushing the column
            # names through the fitted selector keeps only the selected names
            selected_features = gs.best_estimator_.named_steps['fs'].transform([cols[1:]])
            cols = selected_features[0]
            data_test = data_test[cols]

            # Update model with the best parameters learnt in the previous step
            self.model = gs.best_estimator_.named_steps['clf']

            # Quick check of the tuned model on the held-out split
            predict_val = self.model.predict(data_test)
            results = compute_stats.ols(predict_val.tolist(), target_test.tolist())
            # Single-argument print() behaves identically on Python 2 and 3
            print(results.rsquared)
            print(cols)
            plt.scatter(target_test, predict_val)
            plt.show()
            ###################################################################
            # Output and plot importance of model features, and learning curves
            ###################################################################
            self.output_model_importance(gs, 'clf', num_cols=len(cols[1:]))

            if constants.plot_model_importance:
                # NOTE(review): `data`, `target` and `k_fold` are not defined in
                # this method; unless they exist at module level this branch
                # raises NameError. Confirm the intended inputs / CV splitter.
                train_sizes, train_scores, test_scores = learning_curve(self.model, data, target, cv=k_fold,
                                                                        n_jobs=constants.ncpu)
                plot.plot_learning_curve(train_scores, test_scores, train_sizes=train_sizes, fname='learning_curve',
                                         ylim=(0.0, 1.01), title='Learning curves', out_path=self.path_out_dir)

            # Save the model to disk
            logger.info('Saving model and features as pickle on disk')
            with open(self.path_pickle_model, 'wb') as f:
                cPickle.dump(self.model, f)
            with open(self.path_pickle_features, 'wb') as f:
                cPickle.dump(self.vars_features, f)
        else:
            # Read model from pickle on disk
            # NOTE: unpickling executes arbitrary code; only load pickles this
            # program wrote itself.
            with open(self.path_pickle_model, 'rb') as f:
                logger.info('Reading model from pickle on disk')
                self.model = cPickle.load(f)

            logger.info('Reading features from pickle on disk')
            self.vars_features = pd.read_pickle(self.path_pickle_features)

        # The original ended with `return df_cc`, a name never defined in this
        # method (NameError on every call); nothing is returned now.

    def do_forecasting(self, df_forecast, mon_names, available_target=False, name_target='yield'):
        """
        1. Does classification/regression based on already built model.
        2. Plots confusion matrix for classification tasks, scatter plot for regression
        3. Plots accuracy statistics for classification/regression
        :param df_forecast: Dataframe holding the ``self.vars_features`` columns
                            (and ``name_target`` when ``available_target`` is set)
        :param mon_names: String used in plot titles and output file names
        :param available_target: Is target array available?
        :param name_target: Name of target array (defaults to yield)
        :return: ``(predicted, stats)``; ``stats`` holds RMSE/MAE/r^2/Bias for
                 regression, Accuracy/Precision for classification, and NaN
                 placeholders when no target is available
        """
        # `df[...].values` replaces DataFrame.as_matrix, which pandas removed in 1.0
        data = df_forecast[list(self.vars_features)].values
        predicted = self.model.predict(data)

        if available_target:
            expected = df_forecast[[name_target]].values.ravel()
            if not self.classify:  # REGRESSION
                # Compute stats
                results = compute_stats.ols(predicted.tolist(), expected.tolist())
                bias = compute_stats.bias(predicted, expected)
                rmse = compute_stats.rmse(predicted, expected)
                mae = compute_stats.mae(predicted, expected)

                # Plot!
                plot.plot_regression_scatter(expected, np.asarray(predicted),
                                             annotate=r'$r^{2}$ ' + '{:0.2f}'.format(results.rsquared) + '\n' +
                                             'peak NDVI date: ' + self.time_peak_ndvi.strftime('%b %d'),
                                             xlabel='Expected yield',
                                             ylabel='Predicted yield',
                                             title=mon_names + ' ' + str(int(df_forecast[self.season].unique()[0])),
                                             fname=self.task + '_' + '_'.join([mon_names]) + '_' + self.crop,
                                             out_path=self.path_out_dir)

                # global expected vs predicted
                if self.debug:
                    # any non-existing index will add row
                    self.df_global.loc[len(self.df_global)] = [np.nanmean(expected), np.nanmean(predicted), mon_names,
                                                               self.forecast_yr]

                return predicted, {'RMSE': rmse, 'MAE': mae, r'$r^{2}$': results.rsquared, 'Bias': bias}
            else:  # CLASSIFICATION
                # Convert from crop condition class (e.g. 4) to string (e.g. exceptional)
                expected, predicted = compute_stats.remove_nans(expected, predicted)
                # Transposed so that rows are predicted classes and columns true classes
                cm = confusion_matrix(expected, predicted, labels=self.dict_cc.keys()).T

                # Compute and plot class probabilities
                proba_cc = self.model.predict_proba(data)
                df_proba = pd.DataFrame(proba_cc, columns=self.dict_cc.values())
                plot.plot_class_probabilities(df_proba, fname='proba_' + '_'.join([mon_names]) + '_' + self.crop,
                                              out_path=self.path_out_dir)

                # Plot confusion matrix
                plot.plot_confusion_matrix(cm, normalized=False, fname='cm_' + '_'.join([mon_names]) + '_' + self.crop,
                                           xlabel='True class', ylabel='Predicted class', ticks=self.dict_cc.values(),
                                           out_path=self.path_out_dir)

                # Normalize and plot confusion matrix
                cm_normalized = normalize(cm.astype(float), axis=1, norm='l1')
                plot.plot_confusion_matrix(cm_normalized, fname='norm_cm_' + '_'.join([mon_names]) + '_' + self.crop,
                                           xlabel='True class', ylabel='Predicted class', normalized=True,
                                           ticks=self.dict_cc.values(), out_path=self.path_out_dir)

                score_accuracy = accuracy_score(expected, predicted) * 100.0
                score_precision = precision_score(expected, predicted, average='weighted') * 100.0
                return predicted, {'Accuracy': score_accuracy, 'Precision': score_precision}
        else:
            # No target to compare against: return placeholder statistics
            return predicted, {'RMSE': np.nan, 'MAE': np.nan, r'$r^{2}$': np.nan, 'Bias': np.nan,
                               'Nash-Sutcliff': np.nan}

示例#53

0

显示文件

        'random_state': 3
    }
    adb = AdaBoostRegressor(**params_adb)
    adb.fit(x_train, y_train)
    y_pre_adb = adb.predict(x_test)
    adb_pre.append(y_pre_adb)
    params_rf = {
        'n_estimators': 500,
        'max_depth': 10,
        'min_samples_split': 2,
        'warm_start': True,
        'n_jobs': 4,
        'oob_score': True,
        'max_features': 'log2'
    }
    rf = RandomForestRegressor(**params_rf)
    rf.fit(x_train, y_train)
    y_pre_rf = rf.predict(x_test)
    rf_pre.append(y_pre_rf)

    ###

    RigeLinearCV = linear_model.RidgeCV(cv=8,
                                        normalize=True,
                                        gcv_mode='auto',
                                        scoring='neg_mean_absolute_error')
    rcv = RigeLinearCV.fit(x_train, y_train)
    y_pre_rcv = rcv.predict(x_test)
    rcv_pre.append(y_pre_rcv)

    br = BayesianRidge(n_iter=300)

示例#54

0

显示文件

    'Soft drinks (inc. fizzy and ready to drink fruit drinks)',
    'Alcoholic drink, tobacco and narcotics', 'Alcoholic drinks',
    'Spirits and liqueurs (brought home)',
    'Wines, fortified wines (brought home)',
    'Beer, lager, ciders and perry (brought home)', 'Alcopops (brought home)',
    'Tobacco and narcotics1', 'Cigarettes',
    'Cigars, other tobacco products and narcotics'
]

# Predictor columns: the three feature-name lists concatenated in order.
predictors = feats_of_interest + food_feats1 + food_feats2
target = "admitted"

#print len(predictors)

# Estimator with default settings; linear_model.LinearRegression() can be
# swapped in here to try a linear model instead of the random forest.
rf = RandomForestRegressor()

# Predictor frame and target values (the target is held as a NumPy array).
X = merge_df[predictors]
Y = np.array(merge_df[target])

# Reserve a quarter of the rows for testing.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)
#print rf.fit(X_train, Y_train)
#print np.mean((rf.predict(X_test) - Y_test) ** 2)
# regressor for all persons as target
#print regr.fit(X_train, Y_train)
#print regr.coef_
#print np.mean((regr.predict(X_test) - Y_test) ** 2)
# regressor for male as target
#print merge_df.columns

#perform random forest with top k features, print the predictor error