Example #1
def return_best_rf_regressor(df, target, num_trees_hyperparameter, num_trees_final_clf, num_iterations):
    print("entering return best rf regressor function")
    # use the full dataset when it is small; otherwise sample 70% of it
    if df.shape[0] < 10000:
        num_samples = df.shape[0]
    else:
        num_samples = int(df.shape[0] * 0.7)

    print("Sample dataframe")
    X, y, column_list_for_sampled = sample_data_frame_return_x_y_column_name(df, True, target, num_samples)

    # TODO: find a way to vary this search space
    # (an earlier draft also searched "criterion": ["gini", "entropy"],
    # but those criteria apply to classifiers, not regressors)
    param_dist = {"max_depth": [5, None],
                  "max_features": sp_randint(1, df.shape[1]),
                  "min_samples_split": sp_randint(2, 15),  # must be >= 2
                  "min_samples_leaf": sp_randint(1, 15),
                  "bootstrap": [True]}

    clf = RandomForestRegressor(n_estimators=num_trees_hyperparameter)
    print("starting hyperparameter search")
    clf_best, best_params = hyperparameter_search_random(X, y, clf, param_dist, num_iterations)

    print("sample data for fitting model")
    # train a new regressor on the entire dataset
    X, y, column_list_for_sampled = sample_data_frame_return_x_y_column_name(df, True, target, num_samples=df.shape[0])

    clf_final = RandomForestRegressor(n_estimators=num_trees_final_clf,
                                      max_depth=best_params["max_depth"],
                                      min_samples_leaf=best_params["min_samples_leaf"],
                                      min_samples_split=best_params["min_samples_split"],
                                      bootstrap=best_params["bootstrap"],
                                      max_features=best_params["max_features"])

    print("Fitting Random Forest Regressor")
    clf_final.fit(X, y)
    return clf_final, column_list_for_sampled
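The helper `hyperparameter_search_random` is not shown here. A minimal sketch, assuming it simply wraps scikit-learn's `RandomizedSearchCV` and returns the best estimator together with its parameter dict:

from sklearn.model_selection import RandomizedSearchCV

def hyperparameter_search_random(X, y, clf, param_dist, num_iterations):
    # sample num_iterations parameter settings from param_dist and keep the best
    search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                n_iter=num_iterations, cv=3, n_jobs=-1)
    search.fit(X, y)
    return search.best_estimator_, search.best_params_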
Example #2
def buildCoordinationTreeRegressor(predictorColumns, element, coordinationDir = 'coordination/', md = None):
    """
    Build a coordination predictor for a given element from compositional structure data of structures containing that element. Returns a model trained on all data, a table of true vs. predicted values, and a mean_absolute_error score.
    """
    try:
        df = pd.read_csv(coordinationDir + element + '.csv')
    except Exception:
        print('No data for ' + element)
        return None, None, None
    df = df.dropna()
    if('fracNobleGas' in df.columns):
        df = df[df['fracNobleGas'] <= 0]
    
    if len(df) < 4:
        print('Not enough data for ' + element)
        return None, None, None
    s = StandardScaler()
    
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = df['avgCoordination'].values

    rfr = RandomForestRegressor(max_depth = md)
    acc = mean(cross_val_score(rfr, X, y, scoring=make_scorer(mean_absolute_error)))

    X_train, X_test, y_train, y_test = train_test_split(X,y)
    rfr.fit(X_train,y_train)
    y_predict = rfr.predict(X_test)
    
    t = pd.DataFrame({'True':y_test, 'Predicted':y_predict})
    
    rfr.fit(X, y)

    return rfr, t, round(acc,2)
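One caveat on the `acc` computed above: scikit-learn treats scorers as higher-is-better, so `make_scorer(mean_absolute_error)` yields a raw MAE that any selection logic would try to maximize. For reporting alone it is harmless; for model selection the negated built-in convention is safer. A small self-contained sketch on synthetic data, illustrative only:

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

X, y = make_regression(n_samples=200, n_features=5, random_state=0)
rfr = RandomForestRegressor(n_estimators=50, random_state=0)
# 'neg_mean_absolute_error' returns negated MAE so that bigger really is better;
# negate it back to report the error itself
mae = -cross_val_score(rfr, X, y, scoring='neg_mean_absolute_error', cv=3).mean()
print(round(mae, 3))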
Example #3
def set_missing_ages(df):

    # Pull the numeric features into a frame for the Random Forest Regressor
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]

    # Split passengers into known-age and unknown-age groups
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values

    # y is the target age
    y = known_age[:, 0]

    # X holds the feature values
    X = known_age[:, 1:]

    # fit a RandomForestRegressor
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)

    # predict the unknown ages with the fitted model
    predictedAges = rfr.predict(unknown_age[:, 1:])

    # fill the missing values with the predictions
    df.loc[(df.Age.isnull()), 'Age'] = predictedAges

    return df, rfr
Example #4
def main():
    fi = open('25-75_microcap_list.txt', 'r')
    symbols = []
    for i in fi:
        symbols.append(i.strip())
    #symbols = symbols[0:6]

    train, test = get_data(symbols, n = 30, flag = 1, blag = 12)

    train = train.replace([np.inf, -np.inf], np.nan)
    test = test.replace([np.inf, -np.inf], np.nan)

    train = train.dropna(axis=0)
    test = test.dropna(axis=0)

    print('Fitting\n')
    m = RandomForestRegressor(n_estimators=250, n_jobs=1)
    m.fit(train.iloc[:, 6:], train.iloc[:, 5])
    print('Predicting\n')
    preds = m.predict(test.iloc[:, 5:])

    result = test.iloc[:, :4]
    result['Prediction'] = preds
    result = result.sort_values('Prediction', ascending=False)
    print(result.head())
    result.to_csv('trade_result.csv', sep=',', index=False)
Example #5
def buildTreeRegressor(predictorColumns, structurestable = 'structures.csv',  targetcolumn = 'c_a', md = None):
    """
    Build a random forest regressor model to predict some structure feature from compositional data. Returns the model trained on all data, a table of true vs. predicted values, and a mean_absolute_error score.
    """
    df = pd.read_csv(structurestable)
    df = df.dropna()
    if('fracNobleGas' in df.columns):
        df = df[df['fracNobleGas'] <= 0]
    
    s = StandardScaler()
    
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = df[targetcolumn].values

    rfr = RandomForestRegressor(max_depth = md)
    acc = mean(cross_val_score(rfr, X, y, scoring=make_scorer(mean_absolute_error)))

    X_train, X_test, y_train, y_test = train_test_split(X,y)
    rfr.fit(X_train,y_train)
    y_predict = rfr.predict(X_test)
    
    t = pd.DataFrame({'True':y_test, 'Predicted':y_predict})
    
    rfr.fit(X, y)

    return rfr, t, round(acc,2)
Example #6
def get_preds(features, trees=3000, depth=19):  # "features" is the number of latent features the NMF should use
    # Create dataframes
    df = get_nmf(k=features)
    df_full = add_yahoo_to_df(df)
    df_train = add_dummies(df_full)   # NOTE: df_train is never used; the models below are built from df_full

    df_test = get_nmf('data_wednesday', k=features) # put in folder name where the json data is
    df_test_full = add_yahoo_to_df(df_test)
    df_test_full = add_dummies(df_test_full)

    # Create models
    X_model_class, y_model_class = get_classifier_data(df_full)
    rf_class = RandomForestClassifier(n_estimators=trees, max_depth=depth)
    rf_class.fit(X_model_class, y_model_class)
    #
    X_model_regress, y_model_regress = get_regressor_data(df_full)
    rf_regress = RandomForestRegressor(n_estimators=trees, max_depth=depth)
    rf_regress.fit(X_model_regress, y_model_regress)

    # Get X and y values
    X_classify, y_classify = get_classifier_data(pd.DataFrame(df_test_full.loc['2016-04-11']))
    X_regress, y_regress = get_regressor_data(pd.DataFrame(df_test_full.loc['2016-04-11']))

    # Run models

    classifier_preds = rf_class.predict(X_classify)
    classifier_accuracy = accuracy_score(classifier_preds, y_classify)

    regressor_preds = rf_regress.predict(X_regress)
    regressor_mse = mean_squared_error(regressor_preds, y_regress)

    # I want to return the number of features, k, along with the accuracy of the classifier
    # and the MSE of the regressor.  This will give me an idea of how well things are doing
    # based on the number of features.
    return [features, classifier_accuracy, regressor_mse]
def train_sklearn_forest(XAlltr, XAllcv, yAlltr, yAllcv, trees=20):
    errors = []
    models = []

    X = XAlltr
    Xcv = XAllcv

    print "training sklearn forset"

    for feature in range(np.shape(yAlltr)[1]):
        y = yAlltr[:, feature]
        ycv = yAllcv[:, feature]

        # train a random forest with different number of trees and plot error

        # print "training forest %d" % trees
        clf = RandomForestRegressor(n_estimators=trees, min_samples_leaf=30, max_depth=20)
        clf = RandomForestRegressor(n_estimators=trees)
        clf.fit(X, y)
        pred = clf.predict(X)
        err = pred_error(y, pred, feature)

        predcv = clf.predict(Xcv)
        errcv = pred_error(ycv, predcv, feature)

        print([trees, feature, err, errcv])

        errors.append((trees, feature, err, errcv))
        models.append(clf)

    return models, errors
def pred_pH(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    pH_lassoed_vars = lass_varselect(train, all_vars, 'pH', .00000001)
    univ_selector = SelectKBest(score_func = f_regression, k = 1200)
    univ_selector.fit(train[all_vars], train['pH'])
    pvals = univ_selector.get_support()
    chosen =  []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only =  []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x]:
            lass_only.append(all_vars[x])
    # random forest (the variable name 'neigh' is a leftover from a nearest-neighbors version)
    neigh = RandomForestRegressor(n_estimators=100)
    neigh.fit(train.loc[:, chosen], train['pH'])
    for dset in data:
        dset['pH_for_prds'] = neigh.predict(dset.loc[:, chosen])
    # lasso
    lass = Lasso(alpha=.000000275, positive=True)
    lass.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    pH_ridge = RidgeCV(np.array([.6]), normalize=True)
    pH_ridge.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars])
    # combination
    models = ['pH_rdg_prds', 'pH_las_prds',
              'pH_for_prds', 'pH_for_prds']  # forest listed twice, doubling its weight in the blend
    name = 'pH_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'pH')
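`write_preds` is defined elsewhere; judging from the call, it plausibly blends the listed prediction columns into one combined column named `name` on each dataset. A hypothetical reconstruction under that assumption:

def write_preds(models, name, train, val, test, target):
    # hypothetical helper: average the listed prediction columns
    # (duplicate entries count twice, up-weighting that model)
    for dset in (train, val, test):
        dset[name] = sum(dset[m] for m in models) / float(len(models))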
def pipeline():
    val = data[data.watch == 1]
    val_a_b = val[['item_id', 'store_code', 'a', 'b']]
    val_y = val.label
    val_x = val.drop(['label', 'watch', 'item_id', 'store_code', 'a', 'b'], axis=1)

    train = data[(data.watch != 1) & (data.watch != 0)]
    train_y = train.label

    # per-row sample weight: the smaller of a and b
    a = list(train.a)
    b = list(train.b)
    train_weight = []
    for i in range(len(a)):
        train_weight.append(min(a[i], b[i]))
    train_weight = np.array(train_weight)

    train_x = train.drop(['label', 'watch', 'item_id', 'store_code', 'a', 'b'], axis=1)

    train_x.fillna(train_x.median(), inplace=True)
    val_x.fillna(val_x.median(), inplace=True)

    model = RandomForestRegressor(n_estimators=500, max_depth=5, max_features=0.6, n_jobs=-1, random_state=1024)

    # train
    model.fit(train_x, train_y, sample_weight=train_weight)

    # predict the validation set
    val_a_b['pred'] = model.predict(val_x)
    val_a_b['y'] = val_y
    cost = cal_cost(val_y.values, val_a_b.pred.values, val_a_b.a.values, val_a_b.b.values)
    val_a_b.to_csv('val_{0}.csv'.format(cost[1]), index=None)
Example #10
def stepwise_best_features_per_cluster(X, Y, all_feature_metadata):
    best_features_per_cluster = {}
    for c in sorted(X['cluster'].unique()):
        seg_X, seg_Y = X[X['cluster'] == c], Y[Y['cluster'] == c].ALSFRS_slope
        print "cluster:", c, "with size:", seg_X.shape, "with mean target:", seg_Y.mean(), "std:", seg_Y.std()
        seg_Y = seg_Y.fillna(seg_Y.mean())
        
        model = RandomForestRegressor(min_samples_leaf=60, random_state=0, n_estimators=1000)
        #model = LassoCV(cv=5)
        model = model.fit(seg_X, seg_Y)
        
        print "best we can do with all features:", np.sqrt(np.mean((model.predict(seg_X) - seg_Y) ** 2))
        print "using model:", model

        selected_fams = set()
        selected_derived = set()
        for i in range(6):
            score_per_family = {}
            t1 = time.time()
            for family, fm in all_feature_metadata.items():
                if family not in selected_fams:                    
                    X_feature_fam = seg_X[list(selected_derived) + list(fm["derived_features"])]
                    model = RandomForestRegressor(min_samples_leaf=60, random_state=0, n_estimators=1000)
                    #model = LassoCV(cv=5)
                    model = model.fit(X_feature_fam, seg_Y)
                    score_per_family[family] = np.sqrt(np.mean((model.predict(X_feature_fam) - seg_Y) ** 2))
            t_lasso_cv = time.time() - t1
            best_fam = sorted(score_per_family.items(), key=operator.itemgetter(1))[0]
            print "adding best family:", best_fam, "time:", t_lasso_cv
            selected_fams.add(best_fam[0])
            selected_derived.update(all_feature_metadata[best_fam[0]]["derived_features"])
        best_features_per_cluster[c] = list(selected_fams)                          
    return best_features_per_cluster
Example #11
def do_regression(df, j, i, k):
    # input: a pandas dataframe with the columns used below
    # output: a regression object trained on the data in the input dataframe

    # convert dataframe columns into vectors
    y   = df.loc[(df['workingday'] == j) & (df['Hour'] == i) & (df['Year'] == 2011 + k), 'count'].astype(int).values
    x_1 = df.loc[(df['workingday'] == j) & (df['Hour'] == i) & (df['Year'] == 2011 + k), 'humidity'].astype(int).values
    x_2 = df.loc[(df['workingday'] == j) & (df['Hour'] == i) & (df['Year'] == 2011 + k), 'temp'].astype(int).values
    x = list(zip(x_1, x_2))  # list() so sklearn gets a 2-D array-like, not an iterator

    ## Create linear regression object
    #regr = linear_model.LinearRegression()

    # create random forest object, should include all parameters
    regr = RandomForestRegressor(n_estimators=100)
    #forest = DecisionTreeRegressor(max_depth = 4)

    ## Train the model using the training sets
    regr.fit(x, y)

    return regr
Example #12
def fill_missing_age(df):
    # Pull the numeric features into a frame for the Random Forest Regressor
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    #print(age_df)

    # Split passengers into known-age and unknown-age groups
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values
    # print("known_age.......")
    # print(known_age)
    # print("unknown age ........")
    # print(unknown_age)

    # target age
    y = known_age[:, 0]

    # feature values
    x = known_age[:, 1:]

    # fit a RandomForestRegressor
    RFR = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    RFR.fit(x, y)

    # predict the unknown ages with the fitted model
    predictedAge = RFR.predict(unknown_age[:, 1:])

    # fill the missing values with the predictions
    df.loc[(df.Age.isnull()), 'Age'] = predictedAge
    return df, RFR
Example #13
from sklearn.model_selection import KFold  # replaces the removed sklearn.cross_validation API

def RFscore_one(x, y, id):
    folds = 3
    print("RFscore " + id)
    # shuffle x and y in unison
    r = np.random.permutation(len(x))
    x = x[r]
    y = y[r]
    # standardize both variables
    x = (x - np.mean(x)) / np.std(x)
    y = (y - np.mean(y)) / np.std(y)

    x = np.array(x, ndmin=2)
    y = np.array(y, ndmin=2)

    x = x.T
    y = y.T

    # note: compute_importances was removed from sklearn; importances are always available
    rf = RandomForestRegressor(n_estimators=50, verbose=0, n_jobs=1, min_samples_split=10, random_state=1)
    fit = rf.fit(x, y.ravel())

    s = fit.score(x, y)

    cv = KFold(n_splits=folds)
    score = 0
    median = dist(y)  # dist() is an external helper defined elsewhere
    for traincv, testcv in cv.split(x):
        fit = rf.fit(x[traincv], y[traincv].ravel())
        score += fit.score(x[testcv], y[testcv])

    score /= folds
    score /= median
    return score
Example #14
def cross_val(seq, ft):
    n_folds = 10
    X, y = load_train_data(seq, ft)

    print('%d-fold cross validation. Dataset: %d samples, %d features' % (n_folds, X.shape[0], X.shape[1]))

    kf = KFold(n_splits=n_folds)
    n_est = range(30, 110, 20)

    results = []
    for n_estimators in n_est:
        scores = []
        for i, (train, test) in enumerate(kf.split(X)):
            rf = RandomForestRegressor(n_estimators=n_estimators, n_jobs=mp.cpu_count())
            # the (default) score for each regression tree in the ensemble is regression
            # r2 determination coefficient (e.g., how much variance in y is explained
            # by the model)
            # https://www.khanacademy.org/math/probability/regression/regression-correlation/v/r-squared-or-coefficient-of-determination
            rf.fit(X[train], y[train])

            if False:
                y_pred = rf.predict(X[test])
                score = mean_squared_error(y_pred, y[test])
            else:
                score = rf.score(X[test], y[test])
            scores.append(score)
        scores = np.array(scores)
        print("n_estimators=%d; accuracy (R^2 score): %0.2f (+/- %0.2f)" % (n_estimators, scores.mean(), scores.std() * 2))
        results.append([seq, ft, X.shape[0], n_estimators, scores.mean(), scores.std()*2])
    return results
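As the comment above notes, `RandomForestRegressor.score` is the R² coefficient of determination. A quick self-contained check of that equivalence on synthetic data:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=300, n_features=8, noise=5.0, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
rf = RandomForestRegressor(n_estimators=50, random_state=0).fit(X_train, y_train)
# .score() and r2_score() agree on held-out data
assert np.isclose(rf.score(X_test, y_test), r2_score(y_test, rf.predict(X_test)))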
Example #15
 def fit(self, X, y, **kwargs):
     # override any default INITPARAMS passed as keyword arguments
     for key, value in kwargs.items():
         if key in self.INITPARAMS:
             self.INITPARAMS[key] = value
     model = RandomForestRegressor(**self.INITPARAMS)
     model.fit(X, y)
     self.model = model
Example #16
def regression(X_train, y_train, X_test, y_test):
    """
    Train the regressor from Scikit-Learn.
    """
    # Random forest regressor w/ param optimization
    # (earlier experiments: n_estimators=400, max_features='log2';
    #  the old 'min_density' parameter was removed from scikit-learn)
    params = {'n_estimators': 1000, 'criterion': 'mse', 'max_depth': 20,
              'min_samples_split': 2,  # must be >= 2
              'min_samples_leaf': 1, 'max_features': 2, 'bootstrap': True,
              'oob_score': False, 'n_jobs': 32, 'random_state': 0,
              'verbose': 0, 'max_leaf_nodes': None}
    if config.DEBUG: params['verbose'] = 1

    regr = RandomForestRegressor(**params)

    # Train the model using the training sets
    regr.fit(X_train, y_train)

    # Plot the results
    save_semeval_data.plot_results(regr, params, X_test, y_test, feature_names)

    if config.DEBUG:
        # Show the mean squared error
        print("Residual sum of squares: %.2f" % np.mean((regr.predict(X_test) - y_test) ** 2))
        # Explained variance score: 1 is perfect prediction
        print('Variance score: %.2f' % regr.score(X_test, y_test))

    return regr
Example #17
def random_forest(X_train, y_train, y_test, X_test, num_trees=100):
	model = RandomForestRegressor(n_estimators=num_trees, oob_score=True)
	model.fit(X_train, y_train)
	prediction = model.predict(X_test)
	mean_squared_error = mse(y_test, prediction)  # reuse the prediction instead of predicting twice
	r2 = model.score(X_test, y_test)
	return (mean_squared_error, r2)
Example #18
File: model.py Project: kymo/kaggle
class RandomForestModel(Model):
	""" random forest model """
	def __init__(self, *argv, **args):
		super(RandomForestModel, self).__init__(*argv)
        
		self.rf = RandomForestRegressor(**args)
	
	def pretreat_feature(self):
		# preprocess the feature data
		pass

	def train(self):
		# train the samples
		self.rf.fit(self.x, self.y)
	
	def assess(self):
		# assess the regression model
		error = 0.0
		for j in range(len(self.test_x)):
			pre_val = self.predict(self.test_x[j])
			error += (pre_val - self.test_y[j]) ** 2
		print('Training Error: ', error)
		
    
	def predict(self, x):
		# predict the output for x (sklearn expects a 2-D array of samples)
		return self.rf.predict(x)

	def validate(self):
		# use cross-validation to choose the best meta-parameter
		pass
Example #19
def main():
    train = pd.read_csv('../train.csv', parse_dates=['datetime'])
    train['hour'] = pd.DatetimeIndex(train['datetime']).hour
    train['weekday'] = pd.DatetimeIndex(train['datetime']).weekday
    train['isweekend'] = 0
    train.loc[(train['weekday'] == 5) | (train['weekday'] == 6), 'isweekend'] = 1

    test = pd.read_csv('../test.csv', parse_dates=['datetime'])
    test['hour'] = pd.DatetimeIndex(test['datetime']).hour
    test['weekday'] = pd.DatetimeIndex(test['datetime']).weekday
    test['isweekend'] = 0
    test.loc[(test['weekday'] == 5) | (test['weekday'] == 6), 'isweekend'] = 1

    # fit one model per hour of the day
    results = pd.DataFrame(columns=['datetime', 'count'])
    for hour, test_subset in test.groupby(test['hour']):
        train_subset = train[train['hour'] == hour]
        model = RandomForestRegressor(n_estimators=100)
        model.fit(np.array(get_features(train_subset)), np.array(train_subset['count']))
        predictions = model.predict(np.array(get_features(test_subset)))
        dt = test_subset['datetime']
        predictions = pd.Series(predictions, index=dt.index)
        res = pd.concat([dt, predictions], axis=1)
        res.columns = ['datetime', 'count']
        results = pd.concat([results, res])

    results['count'] = results['count'].astype('int64')
    results = results.sort_values('datetime')
    results.to_csv('../submissions/seventhSubmission.csv', index=False)
Example #20
File: trade.py Project: iswdp/trade
def main():
    fi = open('45-165caps.txt', 'r')
    symbols = []
    for i in fi:
        symbols.append(i.strip())
    #symbols = symbols[0:6]

    train, test = build_data(symbols, n = 200, flag = 1, blag = 20)

    train = train.replace([np.inf, -np.inf], np.nan)
    test = test.replace([np.inf, -np.inf], np.nan)

    train = train.dropna(axis=0)
    test = test.dropna(axis=0)

    #print train.head().T
    #print test.head().T

    print('Fitting\n')
    m = RandomForestRegressor(n_estimators=500, n_jobs=10)
    m.fit(train.iloc[:, 5:], train.iloc[:, 4])
    print('Predicting\n')
    preds = m.predict(test.iloc[:, 4:])

    result = test.iloc[:, :4]
    result['Prediction'] = preds
    result = result.sort_values('Prediction', ascending=False)
    print(result.head())
    result.to_csv('trade_result.csv', sep=',', index=False)
Example #21
def randomforest(data, targets, num, fnum):
    """
    7:1205
    """
    # compute_importances was removed from sklearn; importances are always computed
    model = RandomForestRegressor(n_estimators=num, verbose=0, oob_score=True, n_jobs=10, criterion="mse", max_features=fnum)
    model.fit(data, targets)
    return model
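Since `compute_importances` is gone, feature importances are always populated after `fit` and can be read directly. A short usage sketch, assuming `data` is a pandas DataFrame so column names are available:

model = randomforest(data, targets, num=100, fnum='sqrt')
for importance, column in sorted(zip(model.feature_importances_, data.columns), reverse=True)[:10]:
    print('%.4f  %s' % (importance, column))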
Example #22
def random_forest_regressor(train_x, train_y, pred_x, review_id, v_curve=False, l_curve=False, get_model=True):
    """
    :param train_x: train
    :param train_y: text
    :param pred_x: test set to predict
    :param review_id: takes in a review id
    :param v_curve: run the code for validation curve
    :param l_curve: run the code for learning curve
    :param get_model: run the code
    :return:the predicted values,learning curve, validation curve
    """
    rf = RandomForestRegressor(n_estimators=20,criterion='mse',max_features='auto', max_depth=10)
    if get_model:
        print("Fitting RF...")
        rf.fit(train_x, np.log(train_y + 1))
        print(rf.score(train_x, np.log(train_y + 1)))
        rf_pred = np.exp(rf.predict(pred_x))-1.0
        Votes = rf_pred[:,np.newaxis]
        Id = np.array(review_id)[:,np.newaxis]
        submission_rf = np.concatenate((Id,Votes),axis=1)
        # create submission csv for Kaggle
        np.savetxt("submission_rf.csv", submission_rf,header="Id,Votes", delimiter=',',fmt="%s, %0.2f", comments='')
    # plot validation and learning curves
    if v_curve:
        train_y = np.log(train_y+1.0)
        plot_validation_curve(RandomForestRegressor(), "Random Forest: Validation Curve(No: of trees)", train_x,train_y,'n_estimators',[5,10,20,50,100])
    if l_curve:
        train_y = np.log(train_y+1.0)
        plot_learning_curve(RandomForestRegressor(), "Random Forest: Learning Curve", train_x,train_y)
Example #23
def do_rf(filename):
    df, Y = create_merged_dataset(filename)
    rf = RandomForestRegressor(n_estimators=100)
    X = df.drop(['driver', 'trip'], axis=1)
    rf.fit(X, Y)
    probs = rf.predict(X[:200])
    return pd.DataFrame({'driver': df['driver'][:200], 'trip': df['trip'][:200], 'probs': probs})
Example #24
    def refit_from_scratch(self):
        """ Create a new model directly from the database, rather
         than rely on the one saved from last time."""
        # In the background fit a much larger random forest.
        self.threaded_fit = ThreadedFit()
        self.threaded_fit.signal_finished.connect(self.__init__)
        self.threaded_fit.start()

        temp_model = RandomForest(max_features="sqrt", n_jobs=-1)
        temp_enc   = CountVectorizer()
        X = []   # binary matrix the presence of tags
        Z = []   # additional numerical data
        Y = []   # target (to predict) values
        db_size = self.db.size()
        for data in self.db.yield_some(250):
            feedback = data["feedback"]
            tags     = data[  "tags"  ]
            if feedback and tags:
                Y.append(   feedback   )
                X.append(" ".join(tags))
                Z.append(self.fmt_numerical(data))

        X = temp_enc.fit_transform(X)
        X = hstack((X, coo_matrix(Z)))
        self.allX = X
        pca = PCA(min(X.shape[0], 200))
        reduced_X = pca.fit_transform(X.todense())
        temp_model.fit(reduced_X, Y)

        self.pca   = pca
        self.model = temp_model
        self.enc   = temp_enc
Example #25
    def train_with_features(self, features):
        X = self.data_folder.truncate(self.A, features)

        rfc = RandomForestRegressor()
        rfc.fit(X, self.target)

        return rfc
def round2(X, y):
    # Set parameters
    min_score = {}
    for tree in [50, 100, 200, 500]:
        for feature in ['auto', 'log2']:
            model = RandomForestRegressor(n_estimators=tree, max_features=feature)
            n = len(y)

            # Perform 5-fold cross validation
            scores = []
            kf = KFold(n_splits=5, shuffle=True)

            # Calculate root mean squared error for train/test for each fold
            for train_idx, test_idx in kf.split(X):
                X_train, X_test = X[train_idx], X[test_idx]
                y_train, y_test = y[train_idx], y[test_idx]
                model.fit(X_train, y_train)
                prediction = model.predict(X_test)
                rmse = np.sqrt(mean_squared_error(y_test, prediction))
                scores.append(rmse)
            if len(min_score) == 0:
                min_score['estimator'] = tree
                min_score['max_feature'] = feature
                min_score['scores'] = scores
            else:
                if np.mean(scores) < np.mean(min_score['scores']):
                    min_score['estimator'] = tree
                    min_score['max_feature'] = feature
                    min_score['scores'] = scores

            print "Estimator:", tree
            print "Max Features:", feature
            print scores
            print np.mean(scores)
    return min_score
Example #27
    def test_rrf_vs_sklearn_reg(self):
        """Test R vs. sklearn on boston housing dataset. """
        from sklearn.datasets import load_boston
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import mean_squared_error
        from sklearn.ensemble import RandomForestRegressor

        boston = load_boston()
        X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target,
                                                            test_size=0.2, random_state=13)

        n_samples, n_features = X_train.shape
        mtry = int(np.floor(0.3 * n_features))
        # do 100 trees
        r_rf = RRFEstimatorR(**{'ntree': 100, 'nodesize': 1, 'replace': 0,
                                'mtry': mtry, 'corr.bias': False,
                                'sampsize': n_samples, 'random_state': 1234})
        r_rf.fit(X_train, y_train)
        y_pred = r_rf.predict(X_test)
        r_mse = mean_squared_error(y_test, y_pred)

        p_rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=1, bootstrap=False,
                                     max_features=mtry, random_state=1)
        p_rf.fit(X_train, y_train)
        y_pred = p_rf.predict(X_test)
        p_mse = mean_squared_error(y_test, y_pred)
        print('%.4f vs %.4f' % (r_mse, p_mse))
        # should be roughly the same (7.6 vs. 7.2)
        np.testing.assert_almost_equal(r_mse, p_mse, decimal=0)
Example #28
	def rf_regressor(self):
		# assumes the feature matrix and targets are stored on the instance
		X = self.X.toarray()  # convert X from sparse to dense array
		y = self.y
		X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

		model = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=42)
		model.fit(X_train, y_train)
		return model.score(X_test, y_test).round(2)
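Since the model above is built with `oob_score=True`, it also exposes an out-of-bag R² after fitting, a test-like estimate that needs no held-out split. A self-contained sketch on synthetic data:

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=500, n_features=10, noise=3.0, random_state=42)
model = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=42)
model.fit(X, y)
print(round(model.oob_score_, 2))  # out-of-bag R^2, comparable to a test-set score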
Example #29
def train_year(train_fea, trees):
    values = train_fea['SaleYear'].values
    years = sorted(list(set(values)))
    rfs = []
    for i in range(0, len(years)):
        print('train model %d' % (years[i]))
        # compute_importances was removed from sklearn; importances are always computed
        rf = RandomForestRegressor(n_estimators=trees, n_jobs=1)
        y = train_fea[train_fea['SaleYear'] == years[i]]
        y_fea = y.copy()
        del y_fea['SalePrice']
        rf.fit(y_fea, y["SalePrice"])
        rfs.append(rf)
    errors = None
    for i in range(1, len(years)):
        pairs = get_pairs(years, i)
        for p in pairs:
            print('compare %d, %d' % (p[0], p[1]))
            y1 = train_fea[train_fea['SaleYear'] == p[0]]
            y2 = train_fea[train_fea['SaleYear'] == p[1]]
            y1_fea, y2_fea = y1.copy(), y2.copy()
            del y1_fea['SalePrice']
            del y2_fea['SalePrice']
            rf = rfs[years.index(p[0])]
            y2_p = rf.predict(y2_fea)
            y2_r = np.array([v for v in y2['SalePrice']])
            error_rates = np.array(list(map(lambda x, y: math.fabs(x - y) / y, y2_p, y2_r)))
            new_row = pd.DataFrame({'dist': i, 'mean': error_rates.mean(), 'var': error_rates.var(), 'std': error_rates.std()}, index=[i])
            errors = new_row if errors is None else pd.concat([errors, new_row])
    errors_list = []
    for i in range(1, len(years)):
        errors_list.append(errors.loc[i]['mean'].mean())
    return rfs, errors_list
Example #30
    def _fit(self, image, dot, tags, boxConstraints = []):
        img = self.normalize(image)
        if type(boxConstraints) is dict:
            boxConstraints["boxFeatures"] = self.normalize(boxConstraints["boxFeatures"])
        numFeatures = img.shape[1]
        if self._method == "RandomForest":
            from sklearn.ensemble import RandomForestRegressor as RFR
            
            regressor = RFR(n_estimators=self._ntrees,max_depth=self._maxdepth)
            regressor.fit(img, dot)

        elif self._method == "svrBoxed-gurobi":
            regressor = RegressorGurobi(C = self._C, epsilon = self._epsilon)
            regressor.fit(img, dot, tags, self.getOldBoxConstraints(boxConstraints, numFeatures
                                                                   ))
        elif self._method == "BoxedRegressionGurobi":
            regressor = RegressorC(C = self._C, epsilon = self._epsilon)
            regressor.fitgurobi(img, dot, tags, boxConstraints)
        
        elif self._method == "BoxedRegressionCplex":
            regressor = RegressorC(C = self._C, epsilon = self._epsilon)
            regressor.fitcplex(img, dot, tags, boxConstraints)

        return regressor
Example #31
### Decision Tree Regression ###
################################
tree_regressor = DecisionTreeRegressor(criterion="mse")
tree_regressor.fit(X, y)

# Predict
tree_pred = tree_regressor.predict([[6.5]])
print(
    'The predicted salary of a person at 6.5 Level with Decision Tree Regression is ',
    tree_pred)

################################
### Random Forest Regression ###
################################
forest_regressor = RandomForestRegressor(n_estimators=300, random_state=0)
forest_regressor.fit(X, y)

# Predict
forest_pred = forest_regressor.predict([[6.5]])
print(
    'The predicted salary of a person at 6.5 Level with Random Forest Regression is ',
    forest_pred)

################################
### Visualizations ###
################################

X_grid = np.arange(min(X), max(X), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))

plt.scatter(X, y, color="red")
Example #32
#	cv=n_folds, n_jobs=n_jobs, verbose=verbose_grid)
#gs = gs.fit(X_new, y)
#print(gs.scorer_)
#print('best score from grid search: %.3f' % gs.best_score_)
#print(gs.best_params_)
#best = gs.best_params_
#n_estimators_gs = best['n_estimators']
#max_depth_gs = best['max_depth']
#max_features_gs = best['max_features']

# run some cross validation
print('running cross validation to determine accuracy of model...')
scores = []
splits = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
for train, test in splits.split(X):
    tree.fit(X[train], y[train])
    predicted = tree.predict(X[test])
    score = mean_absolute_error(y[test], predicted)
    scores.append(score)
print(scores)

# determine which features to write to the file
n_estimators = n_estimators_def
max_depth = max_depth_def
max_features = max_features_def
score = np.mean(scores)

print('writing the data to file...')
params = (n_folds, n_estimators, max_depth, max_features, score)
write_hyperparams(params, hyperParamFile)
n_folds, n_estimators, max_depth, max_features, \
Example #33
    print('Initial number of examples:', len(histograms))
    refined_histo, refined_label = train_and_refine(histograms, pesos,
                                                    'refined')

    print('RANSAC')
    ransac = RANSACRegressor(LinearRegression(), min_samples=100)
    ransac.fit(histograms, labels)
    labels_predicted = ransac.predict(histograms[-200:])
    labels_test = labels[-200:]
    evaluate(labels_test, labels_predicted, labels_test, labels_predicted)
    plt.scatter(labels_test, labels_predicted)
    plt.show()

    print('Random Forest')
    forest = RandomForestRegressor()
    forest.fit(histograms[:800], labels[:800])
    labels_predicted = forest.predict(histograms[-200:])
    labels_test = labels[-200:]
    evaluate(labels_test, labels_predicted, labels_test, labels_predicted)
    plt.scatter(labels_test, labels_predicted)
    plt.show()

    print()
    print('REMOVING OUTLIERS')
    cont = len(refined_histo)
    print('number of examples', cont)
    train = (cont // 5 + 1) * 4
    test = (cont // 5 + 1)
    print(train, test)
    print('Quadratic - Without outliers')
    quadratic = PolynomialFeatures(degree=2)
def validateRF():
    """
    run KFOLD method for regression 
    """

    #defining directories
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/merraRFValidation"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    #cd to the lagged predictors directory
    os.chdir(dir_in)

    x = 206
    y = 207

    #empty dataframe for model validation
    df = pd.DataFrame(columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse'])

    #looping through
    for tg in range(x, y):

        os.chdir(dir_in)

        #filter only .csv files
        tgNames = []
        for file in glob.glob("*.csv"):
            tgNames.append(file)

        tg_name = sorted(tgNames)[tg]
        print(tg_name)

        ##########################################
        #check if this tg is already taken care of
        ##########################################
        os.chdir(dir_out)
        if os.path.isfile(tg_name):
            print("this tide gauge is already taken care of")
            return "file already analyzed!"

        os.chdir(dir_in)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis=1, inplace=True)

        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1)

        #standardize predictor data
        dat = pred.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis=1)

        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis=1, inplace=True)

        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index,
                   axis=0,
                   inplace=True)
        surge.reset_index(inplace=True)
        surge.drop('index', axis=1, inplace=True)

        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])),
                                  columns=['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]],
                              axis=1)

        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized,
                              surge_new.iloc[:, :2],
                              on='date',
                              how='right')
        pred_surge.sort_values(by='date', inplace=True)

        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis=1)]
        pred_surge.drop(row_nan.index, axis=0, inplace=True)
        pred_surge.reset_index(inplace=True)
        pred_surge.drop('index', axis=1, inplace=True)

        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-' * 80)
            print("Predictors and Surge don't overlap")
            print('-' * 80)
            continue


        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])

        #prepare data for training/testing
        X = pred_surge.iloc[:, 1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis=1, inplace=True)

        #apply PCA
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)

        #apply 10 fold cross validation
        kf = KFold(n_splits=10, shuffle=True, random_state=29)  # random_state requires shuffle=True in current sklearn

        metric_corr = []
        metric_rmse = []
        #combo = pd.DataFrame(columns = ['pred', 'obs'])
        for train_index, test_index in kf.split(X):
            X_train, X_test = X_pca[train_index], X_pca[test_index]
            y_train, y_test = y['surge'][train_index], y['surge'][test_index]

            #train regression model
            rf= RandomForestRegressor(n_estimators = 50, random_state = 101, \
                                      min_samples_leaf = 1)
            rf.fit(X_train, y_train)

            #predictions
            predictions = rf.predict(X_test)
            # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
            #                       pd.DataFrame(np.array(y_test))], \
            #                      axis = 1)
            # pred_obs.columns = ['pred', 'obs']
            # combo = pd.concat([combo, pred_obs], axis = 0)

            #evaluation matrix - check p value
            if stats.pearsonr(y_test, predictions)[1] >= 0.05:
                print("insignificant correlation!")
                continue
            else:
                print(stats.pearsonr(y_test, predictions))
                metric_corr.append(stats.pearsonr(y_test, predictions)[0])
                print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
                print()
                metric_rmse.append(
                    np.sqrt(metrics.mean_squared_error(y_test, predictions)))

        #number of years used to train/test model
        num_years = (pred_surge['date'][pred_surge.shape[0]-1] -\
                             pred_surge['date'][0]).days/365
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1]  #number of principal components
        corr = np.mean(metric_corr)
        rmse = np.mean(metric_rmse)

        print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',np.mean(metric_corr), ' -  avg_rmse (m) = ', \
              np.mean(metric_rmse), '\n')

        #original size and pca size of matrix added
        new_df = pd.DataFrame(
            [tg_name, longitude, latitude, num_years, num_pc, corr, rmse]).T
        new_df.columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse']
        df = pd.concat([df, new_df], axis=0)

        #save df as cs - in case of interruption
        os.chdir(dir_out)
        df.to_csv(tg_name)
Example #35
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 17 11:57:18 2020

@author: edith
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, [1]].values
y = dataset.iloc[:, [2]].values

t = np.array([6.5])
t = t.reshape(1, 1)
from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor(criterion='mse', n_estimators=10, random_state=0)
RFR.fit(X, y)
y_pred = RFR.predict(t)

X_x = np.arange(min(X), max(X), 0.01)
X_x = X_x.reshape(len(X_x), 1)
plt.scatter(X, y, color='red')
plt.scatter(t, y_pred, color='black')
plt.plot(X_x, RFR.predict(X_x), color='blue')
    mse = mean_squared_error(y_test, bagging.predict(X_test))
    estimators[i] = step_factor*(i+1)
    bagging_mse[i] = mse
    
# Estimate the Random Forest MSE over the full number
# of estimators, across a step size ("step_factor")
for i in range(0, axis_step):
    print("Random Forest Estimator: %d of %d..." % (
        step_factor*(i+1), n_estimators)
    )
    rf = RandomForestRegressor(
        n_estimators=step_factor*(i+1),
        n_jobs=n_jobs,
        random_state=random_state
    )
    rf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, rf.predict(X_test))
    estimators[i] = step_factor*(i+1)
    rf_mse[i] = mse
    
# Estimate the AdaBoost MSE over the full number
# of estimators, across a step size ("step_factor")
for i in range(0, axis_step):
    print("Boosting Estimator: %d of %d..." % (
        step_factor*(i+1), n_estimators)
    )
    boosting = AdaBoostRegressor(
        DecisionTreeRegressor(),
        n_estimators=step_factor*(i+1),
        random_state=random_state,
        learning_rate=0.01
X = sc.fit_transform(X)


import keras
from keras.utils.np_utils import to_categorical
y_binary = to_categorical(y)


'''
model = DecisionTreeRegressor(max_depth=10)
cross_val_score(model, X, y, cv=3, scoring='neg_mean_absolute_error')
'''

model = RandomForestRegressor(max_depth=15, n_estimators=25, n_jobs=8)

model.fit(X,y_binary)
feats = {}
for feature, importance in zip(df[['start_treat','doxy','ilads','buhner','cowden','liposomal','other_herbs','vitaminD','supp','oil','sugar-free','gluten-free','dairy-free','bioresonance','antimicrobial','oxygen','cannabis_oil','binaural','tobacco','alcohol','coffee','marijuana','other_stim','num_antibiotics','method_antibiotics']], model.feature_importances_):
    feats[feature] = importance #add the name/value pair 
scores = cross_val_score(model, X, y_binary, cv=3, scoring='neg_mean_absolute_error')
np.mean(scores), np.std(scores)

#adding feature importances
MostImportant = model.feature_importances_

importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
importances.sort_values(by='Gini-importance').plot(kind='bar', rot=90)

#predicting
model.predict(X)
y_pred = model.predict(X)
Example #38
plt.bar(range(train.shape[1]), importances[index],
       color="r", yerr=std[index], align="center")
plt.xticks(range(train.shape[1]), index, rotation='vertical')
plt.xlim([-1, train.shape[1]])
ax.set_xticklabels(ordered_labels)
plt.show()


# In[8]:


# Retrain the model on best settings
best_forest = RandomForestClassifier(n_estimators=500, criterion='entropy')

best_forest.fit(train, adoptionSpeed_train)
forest_predicted = best_forest.predict(test)
print(accuracy_score(adoptionSpeed_test, forest_predicted))


# In[11]:


# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42, criterion='mse')

# Train the model on training data
rf.fit(train, adoptionSpeed_train);
forest_regr = rf.predict(test)
print(mean_squared_error(adoptionSpeed_test, forest_regr))

Example #39
                 how='left')
train.fillna(0, inplace=True)

feats = [
    'weights_sum', 'weights_mean', 'order_weight_max', 'order_weight_count',
    'order_weight_sum', 'time_weight_max', 'time_weight_sum', 'days_sum',
    'days_max', 'days_min', 'days_count', 'mean_gap', 'weights',
    'product_user_reorder_ratio', 'product_reorder_ratio',
    'product_user_ratio', 'aisle_reorder_ratio', 'dept_reorder_ratio'
]

gc.collect()

print("running random forest..........")
rf = RandomForestRegressor(max_depth=5, n_estimators=10, max_features=1)
rf.fit(train[feats], train.y)
train['y_rf'] = rf.predict(train[feats])
gc.collect()

print("running xgboost..........")
model = XGBRegressor()
model.fit(train[feats], train.y)
train['y_xgb'] = model.predict(train[feats])

gc.collect()


def getProduct(row):
    l = int(np.ceil(row['average_product_per_order']))
    return ' '.join([str(x) for x in row['product_id'][:l]])
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 21 17:29:04 2018

@author: Ashlin
"""

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import numpy as np
mydata=pd.read_csv("Position_Salaries.csv")
X=mydata.iloc[:,1:2]
y=mydata.iloc[:,-1]

regressor=RandomForestRegressor(max_features='sqrt',n_estimators=300,criterion='mse',random_state=0)
regressor.fit(X,y)

plt.title("Regression")
plt.xlabel("X")
plt.ylabel("Predicted Value")
X_grid=np.arange(min(X.values),max(X.values),0.01)
X_grid=X_grid.reshape(len(X_grid),1)
plt.scatter(X,y,color='blue',label="Actual")
plt.plot(X_grid,regressor.predict(X_grid),color='red',label="RFR")
plt.legend()
plt.show()

prediction = regressor.predict([[6.5]])  # predict() expects a 2-D array of samples
print("The predicted value for the Salary is %0.4f" % (prediction))
Example #41
class RandomTree(object):
    def __init__(self, train, test):
        '''
        prepare datas
        :param train: training data
        :param test: testing data
        '''
        self.train = train
        self.test = test

        # prepare data
        for i in range(1, 7):
            self.train['RSSI_' + str(i)] = abs(self.train['RSCP_' + str(i)] -
                                               self.train['EcNo_' + str(i)])
            self.test['RSSI_' + str(i)] = abs(self.test['RSCP_' + str(i)] -
                                              self.test['EcNo_' + str(i)])

        self.total = pd.concat([self.test, self.train], ignore_index=True)
        self.tests = self.total.loc[:, [
            u'SRNCID', u'BestCellID', u'RSSI_1', u'RSSI_2', u'RSSI_3',
            u'RSSI_4', u'RSSI_5', u'RSSI_6'
        ]]

        self.regressor_y = self.total[['Longitude', 'Latitude']]
        self.classifier_y = self.total['GridID']

        self.regressor = RandomForestRegressor(random_state=0, n_estimators=10)
        self.classifier = RandomForestClassifier(n_estimators=10)

        self.classifier_train_x, self.classifier_test_x, self.classifier_train_y, self.classifier_test_y = \
            train_test_split(self.tests, self.classifier_y, test_size=0.2)
        self.regressor_train_x, self.regressor_test_x, self.regressor_train_y, self.regressor_test_y = \
            train_test_split(self.tests, self.regressor_y, test_size=0.2)

        # calculate center mark
        self.min_location = utm.from_latlon(self.total['Latitude'].min(),
                                            self.total['Longitude'].min())
        self.max_location = utm.from_latlon(self.total['Latitude'].max(),
                                            self.total['Longitude'].max())
        width = self.max_location[1] - self.min_location[1]
        height = self.max_location[0] - self.min_location[0]
        self.grid_x = math.ceil(width / 20)
        self.grid_y = math.ceil(height / 20)

    def findCenter(self, num):
        '''
        find the center
        :param num: grid id
        :return: center mark
        '''
        dr = math.ceil(num / self.grid_x)
        dc = num % self.grid_y
        c_x = self.min_location[1] + dc * 20 - 10
        c_y = self.min_location[0] + dr * 20 - 10
        return [c_x, c_y]

    def distance(self, lo1, la1, lo2, la2):
        '''
        calculate distance
        :param lo1: longitude1
        :param la1: latitude1
        :param lo2: longitude2
        :param la2: latitude2
        :return: distance
        '''
        dlon = lo2 - lo1
        dlat = la2 - la1
        return math.sqrt(dlon * dlon + dlat * dlat)

    def predict(self):
        '''
        train ans predict
        :return: regressor_res and classifier_res
        '''
        self.classifier_train_x, self.classifier_test_x, self.classifier_train_y, self.classifier_test_y = \
            train_test_split(self.tests, self.classifier_y, test_size=0.2)
        self.regressor_train_x, self.regressor_test_x, self.regressor_train_y, self.regressor_test_y = \
            train_test_split(self.tests, self.regressor_y, test_size=0.2)

        self.regressor.fit(self.regressor_train_x, self.regressor_train_y)
        self.classifier.fit(self.classifier_train_x, self.classifier_train_y)

        regressor_res = self.regressor.predict(self.regressor_test_x)
        classifier_res = self.classifier.predict(self.classifier_test_x)
        r_score = self.regressor.score(self.regressor_test_x,
                                       self.regressor_test_y)
        c_score = self.classifier.score(self.classifier_test_x,
                                        self.classifier_test_y)
        print('regressor score: ' + str(r_score))
        print('classifier score: ' + str(c_score))

        return regressor_res, classifier_res, r_score, c_score

    def compare(self):
        '''
        compare
        :return:
        '''
        regressor_com = []
        classifier_com = []
        r_score = []
        c_score = []
        for i in range(0, 10):
            regressor_res, classifier_res, r, c = self.predict()
            r_score.append(r)
            c_score.append(c)
            self.regressor_test_y.index = range(0, len(self.regressor_test_y))
            regressor_res = pd.DataFrame(regressor_res, columns=['PLO', 'PLA'])

            c_ls = []
            for j in range(len(classifier_res)):  # use j to avoid shadowing the outer loop's i
                center = self.findCenter(classifier_res[j])  # index each result, not just the first
                c_ls.append(
                    utm.to_latlon(center[1], center[0], self.min_location[2],
                                  self.min_location[3]))

            c_ls = pd.DataFrame(c_ls, columns=['PLA', 'PLO'])

            r_eval = pd.concat([self.regressor_test_y, regressor_res], axis=1)
            c_eval = pd.concat([self.regressor_test_y, c_ls], axis=1)

            for i in range(0, len(r_eval)):
                r_dis = self.distance(r_eval.loc[i, 'Longitude'],
                                      r_eval.loc[i, 'Latitude'],
                                      r_eval.loc[i, 'PLO'], r_eval.loc[i,
                                                                       'PLA'])
                c_dis = self.distance(c_eval.loc[i, 'Longitude'],
                                      c_eval.loc[i, 'Latitude'],
                                      c_eval.loc[i, 'PLO'], c_eval.loc[i,
                                                                       'PLA'])
                regressor_com.append(r_dis)
                classifier_com.append(c_dis)

        # r_score = np.average(r_score)
        # c_score = np.average(c_score)

        plt.plot(r_score, color='red')
        plt.xlabel('time')
        plt.ylabel('average')
        plt.show()

        plt.plot(c_score, color='blue')
        plt.xlabel('time')
        plt.ylabel('average')
        plt.show()

        regressor_com.sort()
        classifier_com.sort()

        plt.plot(regressor_com, color='red')
        plt.xlabel('index')
        plt.ylabel('distance')
        plt.show()

        plt.plot(classifier_com, color='blue')
        plt.xlabel('index')
        plt.ylabel('distance')
        plt.show()
Example #42
class Solution(object):
    def __init__(self):

        self.dataframe_all = su.load()

    def setup_training(self):
        ''' Fits a regression model to the training data. '''

        split = StratifiedShuffleSplit(n_splits=1,
                                       test_size=0.2,
                                       random_state=42)
        for train_index, test_index in split.split(
                self.dataframe_all, self.dataframe_all["income_cat"]):
            self.strat_train_set = self.dataframe_all.loc[train_index]
            self.strat_test_set = self.dataframe_all.loc[test_index]

        self.dataframe_all = self.strat_train_set.drop(
            "median_house_value", axis=1)  # drop labels for training set
        self.feature_labels = self.strat_train_set["median_house_value"].copy()

    def preprocess(self):
        self.prepared_data = prep.process_pipeline(self.dataframe_all)

    def predict_values(self):
        ''' Fits a random forest regressor and evaluates it with RMSE and 10-fold cross-validation. '''

        self.forest_reg = RandomForestRegressor(random_state=42)
        self.forest_reg.fit(self.prepared_data, self.feature_labels)
        price_predictions = self.forest_reg.predict(self.prepared_data)
        forest_mse = mean_squared_error(self.feature_labels, price_predictions)
        forest_rmse = np.sqrt(forest_mse)
        print(" Forest RMSE ", forest_rmse)
        forest_scores = cross_val_score(self.forest_reg,
                                        self.prepared_data,
                                        self.feature_labels,
                                        scoring="neg_mean_squared_error",
                                        cv=10)
        forest_rmse_scores = np.sqrt(-forest_scores)
        print("Scores:", forest_rmse_scores)
        print("Mean:", forest_rmse_scores.mean())
        print("Standard deviation:", forest_rmse_scores.std())

    def grid_search(self):
        param_grid = [
            # try 12 (3×4) combinations of hyperparameters
            {
                'n_estimators': [3, 10, 30],
                'max_features': [2, 4, 6, 8]
            },
            # then try 6 (2×3) combinations with bootstrap set as False
            {
                'bootstrap': [False],
                'n_estimators': [3, 10],
                'max_features': [2, 3, 4]
            },
        ]

        forest_reg = self.forest_reg
        # train across 5 folds, that's a total of (12+6)*5=90 rounds of training
        grid_search = GridSearchCV(forest_reg,
                                   param_grid,
                                   cv=5,
                                   scoring='neg_mean_squared_error',
                                   return_train_score=True)
        grid_search.fit(self.prepared_data, self.feature_labels)

        return grid_search.best_estimator_

    def test(self, final_model):
        self.final_model = final_model
        X_test = self.strat_test_set.drop("median_house_value", axis=1)
        y_test = self.strat_test_set["median_house_value"].copy()

        X_test_prepared = prep.process_pipeline(X_test)
        final_predictions = final_model.predict(X_test_prepared)

        some_data = X_test.iloc[:10]
        some_labels = y_test[:10]
        some_data_prepared = prep.process_pipeline(some_data)
        print("Predictions:", final_model.predict(some_data_prepared))
        print("Labels:", list(some_labels))

        final_mse = mean_squared_error(y_test, final_predictions)
        final_rmse = np.sqrt(final_mse)
        print(" Final RMSE ", final_rmse)
Example #43
# to fit the model with and without some variables

# Reordering our dataset
X = X[[
    'Absolute Magnitude', 'Est Dia in M(average)',
    'Relative Velocity km per sec', 'Miss Dist.(kilometers)',
    'Minimum Orbit Intersection', 'Eccentricity', 'Semi Major Axis',
    'Inclination', 'Asc Node Longitude', 'Perihelion Distance',
    'Perihelion Arg', 'Perihelion Time', 'Mean Anomaly'
]]

# Using Random Forest Feature importance to select the most important features
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=1, max_depth=10)

model.fit(X, y)

features = X.columns
importances = model.feature_importances_
indices = np.argsort(importances)[-1:-4:-1]

plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

# The feature importance graph clearly shows that just three variables
# contribute more than 96% of the predictive power; each of the remaining
# variables contributes less than 1%, so we will keep only those three:
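
# A minimal sketch of that selection step, using the importances computed
# above; `top_features` is an illustrative name introduced here:
top_features = [features[i] for i in indices]  # the three most important columns
X = X[top_features]                            # keep only those three variables
print('Keeping features:', top_features)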
Exemplo n.º 44
0
name = 'jeremy'  # assumed example value so the f-string demo below runs
age = 43
f'Hello {name.upper()}, you are {age}'

# 1.1 we can even store PATH of a directory and use {PATH} to read files
# 1.2 notes on read_csv: use low_memory=False and parse_dates every time.
df_raw = pd.read_csv(f'{PATH}Train.csv', low_memory=False, parse_dates=['saledate'])

# 2. a trick of display long columns lists
df_raw.tail().transpose()

# 3. Fix Stationary problem: log transformation:
df_raw.SalePrice = np.log(df_raw.SalePrice)

# 4. a fast way to initiate an ML model
m = RandomForestRegressor(n_jobs=-1)
m.fit(df_raw.drop('SalePrice', axis=1), df_raw.SalePrice)

# 5. Fastai tools
# 5.1 strip datetimes and assign the parts to separate columns
add_datepart(df_raw, 'saledate')
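
# A rough sketch of what add_datepart does (simplified; the real fastai
# helper also adds parts such as week and is_month_end, plus an
# elapsed-time column, before dropping the original field):
def add_datepart_simple(df, col):
    dt_col = df[col].dt
    for part in ['year', 'month', 'day', 'dayofweek', 'dayofyear']:
        df[f'{col}_{part}'] = getattr(dt_col, part)  # e.g. saledate_year
    df.drop(col, axis=1, inplace=True)  # the model sees numeric parts, not raw dates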

# 6. CleanData
# 6.1 Access date parts (any attribute is available after .dt, e.g.:)
df_raw.saledate.dt.year
# 6.2 Access categorical parts (this returns all categories)
df_raw.UsageBand.cat.categories
# 6.2.1 We can reorder the categories (so the corresponding numeric codes are ordered meaningfully)
df_raw.UsageBand.cat.set_categories(['High', 'Medium', 'Low'], ordered=True, inplace=True)

Exemplo n.º 45
0
Y_train_pred = plr.predict(X_train)
Y_test_pred = plr.predict(X_test)

print(plr.score(X_test,Y_test))

# Already good: the model predicts the cost of treatment of patients well. We could probably limit ourselves to two or three polynomial features, but the data set is so small that we took the easy way.
# Finally, let's try RandomForestRegressor. I've never used this algorithm in regression analysis before.

# In[ ]:


forest = RandomForestRegressor(n_estimators = 100,
                              criterion = 'mse',  # 'squared_error' in scikit-learn >= 1.0
                              random_state = 1,
                              n_jobs = -1)
forest.fit(x_train,y_train)
forest_train_pred = forest.predict(x_train)
forest_test_pred = forest.predict(x_test)

print('MSE train data: %.3f, MSE test data: %.3f' % (
mean_squared_error(y_train,forest_train_pred),
mean_squared_error(y_test,forest_test_pred)))
print('R2 train data: %.3f, R2 test data: %.3f' % (
r2_score(y_train,forest_train_pred),
r2_score(y_test,forest_test_pred)))

# In[ ]:


pl.figure(figsize=(10,6))
Exemplo n.º 46
0
class RandomForest:
    def __init__(self, criterion, max_features,
                 max_depth, min_samples_split, min_samples_leaf,
                 min_weight_fraction_leaf, bootstrap, max_leaf_nodes,
                 min_impurity_decrease, random_state=None, n_jobs=1):
        self.n_estimators = self.get_max_iter()
        self.criterion = criterion
        self.max_features = max_features
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.bootstrap = bootstrap
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.estimator = None
        self.time_limit = None

    @staticmethod
    def get_max_iter():
        return 100

    def get_current_iter(self):
        return self.estimator.n_estimators

    def fit(self, X, y, sample_weight=None):
        from sklearn.ensemble import RandomForestRegressor

        if self.estimator is None:
            self.n_estimators = int(self.n_estimators)
            if check_none(self.max_depth):
                self.max_depth = None
            else:
                self.max_depth = int(self.max_depth)

            self.min_samples_split = int(self.min_samples_split)
            self.min_samples_leaf = int(self.min_samples_leaf)
            self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf)

            if self.max_features not in ("sqrt", "log2", "auto"):
                max_features = int(X.shape[1] ** float(self.max_features))
            else:
                max_features = self.max_features

            self.bootstrap = check_for_bool(self.bootstrap)

            if check_none(self.max_leaf_nodes):
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(self.max_leaf_nodes)

            self.min_impurity_decrease = float(self.min_impurity_decrease)

            # initial fit; warm_start lets subsequent fit() calls grow the ensemble incrementally
            self.estimator = RandomForestRegressor(
                n_estimators=self.get_max_iter(),
                criterion=self.criterion,
                max_features=max_features,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                bootstrap=self.bootstrap,
                max_leaf_nodes=self.max_leaf_nodes,
                min_impurity_decrease=self.min_impurity_decrease,
                random_state=self.random_state,
                n_jobs=self.n_jobs,
                warm_start=True)

        self.estimator.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)
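
# A minimal sketch of the warm_start mechanism this wrapper builds on,
# in plain scikit-learn (demo data and names are illustrative):
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=200, n_features=4, random_state=0)
rf = RandomForestRegressor(n_estimators=10, warm_start=True, random_state=0)
rf.fit(X_demo, y_demo)      # trains the first 10 trees
rf.n_estimators += 10       # ask for 10 more trees
rf.fit(X_demo, y_demo)      # only the 10 new trees are fitted
print(len(rf.estimators_))  # -> 20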
Exemplo n.º 47
0
rmse = metrics.mean_squared_error(y_train, y_pred, squared=False)
print("Train Lasso r2-score is {}".format(r2))
print("Train Lasso RMSE is {}".format(rmse))

# lasso feature selection
sel = SelectFromModel(lasso)
sel.fit(X_train_chi_sel, y_train)
selected_feat = X_train_chi_sel[:, sel.get_support()]

X_train_selected = sel.transform(X_train_chi_sel)
X_test_selected = sel.transform(test_chi_sel[:, :-1])
print("datasets trasformed to {} features...".format(X_train_selected.shape[1]))

# Random Forest
clf = RandomForestRegressor(random_state=0)
clf.fit(X_train_selected, y_train)

y_pred = clf.predict(X_test_selected)
r2 = metrics.r2_score(y_test, y_pred)
rmse = metrics.mean_squared_error(y_test, y_pred, squared=False)

print("\nRANDOM FOREST")
print('selected features by lasso: {}'.format(X_train_selected.shape[1]))
print("Test Random forest r2-score is {}".format(r2))
print("Test Random forest RMSE is {}".format(rmse))

y_pred = clf.predict(X_train_selected)
r2 = metrics.r2_score(y_train, y_pred)
rmse = metrics.mean_squared_error(y_train, y_pred, squared=False)

print("Train Random forest r2-score is {}".format(r2))
param1 = {'n_estimators': [100, 500, 50]}
model1 = GridSearchCV(estimator=rf, param_grid=param1, scoring='neg_mean_squared_error', cv=5)
model1.fit(windspeed_trainX, windspeed_trainY)
model1.best = model1.best_params_
print('model1 best param:', model1.best_params_)
print('model1 best score:', model1.best_score_)
param2 = {'max_depth': [5, 10, 15], 'min_samples_split': [10, 5, 2]}
model2 = GridSearchCV(estimator=RandomForestRegressor(random_state=10, n_estimators=450), param_grid=param2,
                      scoring='neg_mean_squared_error', cv=5)
model2.fit(windspeed_trainX, windspeed_trainY)
model2.best = model2.best_params_
print('model2 best param:', model2.best_params_)
print('model2 best score:', model2.best_score_)
# Predict with the best parameters found
speed_model = RandomForestRegressor(n_estimators=450, random_state=10, max_depth=10, min_samples_split=5)
speed_model.fit(windspeed_trainX, windspeed_trainY)
windspeed_testY = speed_model.predict(windspeed_testX)
data.loc[data.windspeed == 0, 'windspeed'] = windspeed_testY

# Feature distributions after imputation
fig, axes = plt.subplots(2, 2)
plt.subplots_adjust(wspace=0.3, hspace=0.5)
sn.distplot(data['temp'], ax=axes[0, 0])
sn.distplot(data['atemp'], ax=axes[0, 1])
sn.distplot(data['humidity'], ax=axes[1, 0])
sn.distplot(data['windspeed'], ax=axes[1, 1])
axes[0, 0].set(xlabel='temp', title='Temperature distribution')
axes[0, 1].set(xlabel='atemp', title='Feels-like temperature distribution')
axes[1, 0].set(xlabel='humidity', title='Humidity distribution')
axes[1, 1].set(xlabel='windspeed', title='Wind speed distribution')
plt.savefig('corrected_distribution_analysis.png')
Exemplo n.º 49
0
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
locs, labels = plt.xticks()
plt.title("Confusion matrix (Decision tree)", fontsize=15)

# Random forest
heart_features = [
    'Age', 'Gender', 'Chest_Pain', 'Resting_BP', 'Cholesterol', 'Fasting_BS',
    'RECG', 'Max_Heart_Rate', 'Exercise_Ang', 'ST_Depression', 'ST_Segmen',
    'Major_Vessels', 'Thalassemia'
]
features = heart_data[heart_features]
train_features, val_features, train_target, val_target = train_test_split(
    features, target, random_state=0)
forest_model = RandomForestRegressor(n_estimators=100, random_state=0)
forest_model.fit(train_features, train_target)
melb_preds = forest_model.predict(val_features)
print('MAE_random_forest:')
MAE_RF = mean_absolute_error(val_target, melb_preds)
print(MAE_RF)

# random forest - cross validation
heart_features = [
    'Age', 'Gender', 'Chest_Pain', 'Resting_BP', 'Cholesterol', 'Fasting_BS',
    'RECG', 'Max_Heart_Rate', 'Exercise_Ang', 'ST_Depression', 'ST_Segmen',
    'Major_Vessels', 'Thalassemia'
]
features = heart_data[heart_features]
my_pipeline = Pipeline(
    steps=[('preprocessor', SimpleImputer()),
           ('model', RandomForestRegressor(n_estimators=100, random_state=0))])  # model step assumed to mirror the forest above; the original line was cut off
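
# A sketch of the cross-validation step the comment above announces,
# assuming my_pipeline, features and target as defined in this example:
from sklearn.model_selection import cross_val_score
cv_scores = -1 * cross_val_score(my_pipeline, features, target,
                                 cv=5, scoring='neg_mean_absolute_error')
print('Average cross-validated MAE:', cv_scores.mean())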
Exemplo n.º 50
0
png = graph.create_png()
graph.write_png("decision_tree.png")
im = Image.open('decision_tree.png')
#im.show()

# Split into training and test sets, build a decision tree, and measure its accuracy
data_train, data_test, target_train, target_test = \
    train_test_split(housing.data, housing.target, test_size = 0.1, random_state = 42)
dtr = tree.DecisionTreeRegressor(random_state=42)
dtr.fit(data_train, target_train)
accuracy_dtr = dtr.score(data_test, target_test)
print('Decision tree accuracy:', accuracy_dtr)

# Random forest
rfr = RandomForestRegressor(random_state=42)
rfr.fit(data_train, target_train)
accuracy_rfr = rfr.score(data_test, target_test)
print('Random forest accuracy:', accuracy_rfr)

# Search for suitable hyperparameters
tree_param_grid = {
    'min_samples_split': [3, 6],
    'n_estimators': [50, 100]
}
# e.g. 'min_samples_split': [3, 6, 9] asks which of 3, 6, 9 works best; cv=5 means 5-fold cross-validation
grid = GridSearchCV(RandomForestRegressor(), param_grid=tree_param_grid, cv=5)
grid.fit(data_train, target_train)
cv_results_ = grid.cv_results_
best_params_ = grid.best_params_
best_score_ = grid.best_score_
print(cv_results_)
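
# A small sketch for reading the rather verbose cv_results_ dict, assuming
# pandas is imported as pd in this script:
results = pd.DataFrame(cv_results_)
print(results[['params', 'mean_test_score', 'rank_test_score']]
      .sort_values('rank_test_score'))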
Exemplo n.º 51
0
def Random_forest_regress(X_train,X_test,y_train,y_test,CARE_df,n_estimators,name):
    regressor = RandomForestRegressor(n_estimators=n_estimators, random_state=43,min_samples_leaf= 2, max_features="sqrt", max_depth= 12, bootstrap= True)
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    logit_roc_auc = roc_auc_score(np.where(y_test > 0, 1, 0), y_pred)


    plt.figure()
    plt.plot(y_test,y_pred,'o')
    plt.xlabel("Test Set")
    plt.ylabel("Prdiction- binary")
    plt.title(f'{name}- results')
    plt.show()
    # plt.savefig(f'/home/michal/MYOR Dropbox/R&D/Allergies Product Development/Prediction/Algorithm_Beta/18_01_2021_CARE_results/{name}-results-RandomForest.jpeg')

    fpr, tpr, thresholds = roc_curve(np.where(y_test > 0, 1, 0), y_pred)

    # # export to excel
    # df = pd.DataFrame(data={'fpr': fpr, 'tpr': tpr, 'threshold':thresholds})
    # df.to_excel(f'/home/michal/MYOR Dropbox/R&D/Allergies Product Development/Prediction/Algorithm_Beta/18_01_2021_CARE_results/{name}_randomForestValues.xlsx',index=False)

    CARE_predict=regressor.predict(CARE_df)

    accuracy=[]
    specificity=[]
    sensitivity=[]
    pred_yes=[]
    percent_yes=[]
    for threshold in thresholds:
        tn, fp, fn, tp = confusion_matrix(np.where(y_test > 0, 1, 0), np.where(y_pred > threshold, 1, 0).reshape(-1)).ravel()
        accuracy_score=(tn+tp)/(tn+fp+fn+tp)
        specificity_score = tn / (tn + fp)
        sensitivity_score=tp/(tp+fn)
        accuracy.append(accuracy_score)
        specificity.append(specificity_score)
        sensitivity.append(sensitivity_score)
        pred_yes.append(sum(np.where(CARE_predict > threshold, 1, 0)))
        percent_yes.append((sum(np.where(CARE_predict > threshold, 1, 0)))/len(CARE_predict))

    df = pd.DataFrame(data={'thresholds': thresholds, 'specificity': specificity, 'sensitivity': sensitivity,'pred_yes':pred_yes,'percent_yes':percent_yes})
    df.to_excel(f'/home/michal/MYOR Dropbox/R&D/Allergies Product Development/Prediction/Algorithm_Beta/18_01_2021_CARE_results/{name}_CARE_values_forest_1.xlsx',index=False)


    index_80=np.argwhere(np.array(sensitivity)>0.8)[0][0]
    index_65=np.argwhere(np.array(sensitivity)>0.65)[0][0]

    plt.figure()
    plt.plot(fpr, tpr, label='AUC = %0.2f' % logit_roc_auc)
    plt.plot(sensitivity,specificity, label="recall vs. specificity")
    plt.plot(sensitivity[np.argmax(accuracy)],specificity[np.argmax(accuracy)],'o')
    plt.text(sensitivity[np.argmax(accuracy)]-0.1, specificity[np.argmax(accuracy)]-0.1,f'Threshold for max\naccuracy={round(thresholds[np.argmax(accuracy)],2)}')
    plt.plot(sensitivity[index_80], specificity[index_80],'o')
    plt.text(sensitivity[index_80]-0.1, specificity[index_80]-0.1,f'recall={round(sensitivity[index_80],2)}, spec={round(specificity[index_80],2)}\n Threshold={round(thresholds[index_80],2)}')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Random Forest- {name}\n #of trees={n_estimators},Max accuracy={round(max(accuracy),2)}')
    plt.legend(loc="lower right")
    plt.show()
Exemplo n.º 52
0
def routes():
    chosenroute = request.form.get('chosenroute')

    chosenorigin = request.form.get('chosenorigin')
    chosendestination = request.form.get('chosendestination')
    chosenday = request.form.get('chosenday')
    chosentime = request.form.get('chosentime')
    chosentemp = request.form.get('chosentemp')
    chosenhumid = request.form.get('chosenhumid')
    chosenpres = request.form.get('chosenpres')
    #run the prediction model
    dataframe = pd.read_csv('cleangps.csv')
    array = dataframe.values
    X = array[:, 0:7]
    Y = array[:, 7]
    test_size = 0.33
    seed = 7
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        X, Y, test_size=test_size, random_state=seed)
    # Fit the model on 33%
    model = RandomForestRegressor()
    model.fit(X_train, Y_train)
    # save the model to disk
    filename = 'finalized_model.sav'
    pickle.dump(model, open(filename, 'wb'))
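    # A hedged alternative: scikit-learn's docs recommend joblib over raw
    # pickle for persisting estimators (not what this app uses):
    # from joblib import dump, load
    # dump(model, 'finalized_model.joblib')
    # model = load('finalized_model.joblib')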
    #calculating the average time between two adjacent stops
    chosend = float(re.search(r'\d+', chosenday).group())
    chosent = float(re.search(r'\d+', chosentime).group())
    chosenro = float(re.search(r'\d+', chosenroute).group())
    chosenro1 = str(re.search(r'\d+', chosenroute).group())
    #chosenro1=chosenroute.split(":")
    #value= str(chosenro1[0])
    chosenorig = float(re.search(r'\d+', chosenorigin).group())
    chosendest = float(re.search(r'\d+', chosendestination).group())
    data = []
    for i in range(0, len(X)):
        if X[i][0] == chosenro and X[i][2] == chosent and X[i][3] == chosend:
            data.append(X[i])

    # load the model from disk
    loaded_model = pickle.load(open(filename, 'rb'))
    #calculating the time between adjacent stops
    result = loaded_model.predict(data)
    total = 0
    for j in range(len(result)):
        total += (result[j])
    seconds = (total // len(result))

    #calculating the number of stops between origin and destination
    df = pd.read_csv('stops.csv')
    arr = df.values
    matched_stops = []  # renamed from `list` to avoid shadowing the builtin
    for j in range(len(arr)):
        if (arr[j][0]) == chosenorig or (arr[j][0]) == chosendest:
            matched_stops.append(arr[j])

    route_stops = []
    for i in range(len(matched_stops)):
        if (matched_stops[i][4]) == chosenro1:
            route_stops.append(matched_stops[i])
    nums = abs(route_stops[0][6] - route_stops[1][6])
    second = seconds * nums
    times = str(datetime.timedelta(seconds=second))
    #calculating the predict time between origin and destination
    #list =[]
    #for i in range(len(data)):
    #if data[i][1]==chosenorig:
    #list.append(i)
    #if data[i][1]==chosendest:
    #list.append(i)

    #result = loaded_model.predict(data[list[0]:list[1]])
    #total = 0
    #for i in range(len(result)):
    #total += (result[i])
    #time = total//60
    #create variables for time +1 hour and time -1hour
    chosentime1 = chosent + 1
    chosentime2 = chosent - 1
    data1 = []
    for i in range(0, len(X)):
        if X[i][0] == chosenro and X[i][2] == chosentime1 and X[i][
                3] == chosend:
            data1.append(X[i])

    # load the model from disk
    loaded_model = pickle.load(open(filename, 'rb'))
    #calculating the time between adjacent stops
    result1 = loaded_model.predict(data1)
    total1 = 0
    for j in range(len(result1)):
        total1 += (result1[j])
    seconds = (total1 // len(result1))
    second1 = seconds * nums
    time1 = str(datetime.timedelta(seconds=second1))

    # model 3 (uses the hour before the chosen time)
    data2 = []
    for i in range(0, len(X)):
        if X[i][0] == chosenro and X[i][2] == chosentime2 and X[i][
                3] == chosend:
            data2.append(X[i])

    # load the model from disk
    loaded_model = pickle.load(open(filename, 'rb'))
    #calculating the time between adjacent stops
    result2 = loaded_model.predict(data2)
    total2 = 0
    for j in range(len(result2)):
        total2 += (result2[j])
    seconds = (total2 // len(result2))
    second2 = seconds * nums
    time2 = str(datetime.timedelta(seconds=second2))

    return render_template("display.html",
                           chosenroute=chosenroute,
                           chosenorigin=chosenorigin,
                           chosendestination=chosendestination,
                           chosent=chosent,
                           chosentime1=chosentime1,
                           chosentime2=chosentime2,
                           times=times,
                           time1=time1,
                           time2=time2)
Exemplo n.º 53
0
#import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# load data
datas = pd.read_csv("maaslar.csv")

x = datas.iloc[:, 1:2]
y = datas.iloc[:, 2:]

from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(
    n_estimators=10,
    random_state=0)  # n_estimators : how many decision trees will be created
rfr.fit(x, y)

print(rfr.predict([[6.6]]))  # predict expects a 2D array of samples
print(rfr.predict([[11]]))
z = x + 0.5
k = x - 0.5

plt.scatter(x, y, color="red")
plt.plot(x, rfr.predict(x), color="blue")
plt.plot(x, rfr.predict(z), color="green")
plt.plot(x, rfr.predict(k), color="yellow")

from sklearn.metrics import r2_score
print(r2_score(y, rfr.predict(x)))
Exemplo n.º 54
0
#compare actual vs predicted values
df_output = pd.DataFrame({'Actual': test_y, 'Predicted': pred})
df_output

# Calculate mean absolute percentage error (MAPE)
mape = 100 * np.mean(np.abs((test_y - pred) / test_y))
# Calculate and display accuracy
accuracy = 100 - mape
print('Accuracy of Linear Regression:', round(accuracy, 2), '%.')


###################################################################################################################
# random forest model
###################################################################################################################
model = RandomForestRegressor()
model.fit(train_x,train_y)

# Get the mean absolute error on the test data :
pred = model.predict(test_x)

# Calculate mean absolute percentage error (MAPE)
mape = 100 * np.mean(np.abs((test_y - pred) / test_y))
# Calculate and display accuracy
accuracy = 100 - mape
print('Accuracy of Random Forest Regressor:', round(accuracy, 2), '%.')
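
# Recent scikit-learn versions (0.24+) ship this metric directly; a hedged
# equivalent of the manual MAPE calculation above:
from sklearn.metrics import mean_absolute_percentage_error
print('MAPE (sklearn):', 100 * mean_absolute_percentage_error(test_y, pred))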

###################################################################################################################
#XGBoost Model
###################################################################################################################
XGBModel = xgb.XGBRegressor()
XGBModel.fit(train_x,train_y , verbose=False)
Exemplo n.º 55
0
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
X, y = make_regression(n_features=4,
                       n_informative=2,
                       random_state=0,
                       shuffle=False)
regr = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=100)

regr.fit(X, y)

print(regr.feature_importances_)

print(regr.predict([[0, 0, 0, 0]]))

for i in range(20):
    l = list(np.round(np.random.random(4), 2))
    print(l, '      ', np.round(regr.predict([l]), 2))
Exemplo n.º 56
0
            'RAD': RAD,
            'TAX': TAX,
            'PTRATIO': PTRATIO,
            'B': B,
            'LSTAT': LSTAT}
    features = pd.DataFrame(data, index=[0])
    return features

df = user_input_features()

# Main Panel

# Print specified input parameters
st.header('Specified Input parameters')
st.write(df)
st.write('---')

# Build Regression Model
model = RandomForestRegressor()
model.fit(X, Y)
# Apply Model to Make Prediction
prediction = model.predict(df)

st.header('Prediction of MEDV')
st.write(prediction)
st.write('---')




Exemplo n.º 57
0
# split data into features (x) and target (y)
x = ds.iloc[:, :-1].values
y = ds.iloc[:, 1].values

# Divide the dataset into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=1 / 4,
                                                    random_state=0)

# Fitting Random Forest Regression to the dataset
# (note: fitted on the full data, so the test predictions below are in-sample)
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=10, random_state=0)
regressor.fit(x, y)

# Predicting a test result
y_pred = regressor.predict(x_test)

# Visualising the Random Forest Regression results (higher resolution)
X_grid = np.arange(min(x), max(x), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(x_train, y_train, color='green')
plt.scatter(x_test, y_test, color='red')
plt.plot(X_grid, regressor.predict(X_grid), color='blue')
plt.title('Truth or Bluff (Random Forest Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
Exemplo n.º 58
0
df_train_feature,df_test_feature, df_train_goal, df_test_goal = train_test_split(x, y, test_size=0.33, random_state=42)
# Regression model candidates (alternatives commented out)
#clf=GradientBoostingRegressor( learning_rate= 0.01,n_estimators=300)
#clf = LinearRegression()
clf = RandomForestRegressor(n_estimators=400)
#clf = Ridge(alpha=0.03)
#clf = Lasso()
#clf =SVR()
# Neural network: two hidden layers, 28 neurons in the first, 16 in the second
# model = Sequential()
# model.add(Dense(28,activation='relu',input_dim=10))
# model.add(Dense(16,activation='relu'))
# model.add(Dense(1))
# model.compile(optimizer='adam',loss='mse')
# model.fit(df_train_feature,df_train_goal,epochs=1000,batch_size=4,verbose=2)
clf.fit(df_train_feature, df_train_goal)
# Validation dataset
x_value = df_value[['AQI指数','PM2.5','PM10','So2','No2','Co','O3','最高气温','最低气温']]
#x_value = df_value[['AQI指数','So2','No2','O3','最低气温']]
y_value = df_value['急诊人次']
# Run predictions on the validation data
prediction = clf.predict(x_value)
prediction = pd.DataFrame(prediction)
# Process the validation target
y_value= y_value.reset_index()
y_value = y_value['急诊人次']
# Evaluation metric
r2 = r2_score(y_value,prediction)
print(r2)
df_wucha = pd.concat([prediction, y_value], axis=1)
df_wucha.columns = ['Predicted ER visits', 'Actual ER visits']
Exemplo n.º 59
0
# COMMAND ----------

X_train, X_test, y_train, y_test = train_test_split(
    d[['SoPOR', 'SoPDel']],
    d['SOBD_CSEVAD_Difference'],
    test_size=0.33,
    random_state=42)

# COMMAND ----------

# Create random forest regression object
regr = RandomForestRegressor(max_depth=10)

# Train the model using the training sets
regr.fit(X_train, y_train)

# COMMAND ----------

# Make predictions over the full dataset (both train and test rows)
y_pred = regr.predict(d[['SoPOR', 'SoPDel']])

# COMMAND ----------

import pyspark
from pyspark.sql.functions import *
from pyspark.sql import DataFrame
from pyspark.sql.types import *
from pyspark.ml.feature import *

# COMMAND ----------
#----------------------------------------------------------------------------------------------------

#Array Creation for med1
x1 = list()
y1 = list()
for i in range(len(med1)-10):
    x = np.array(med1.loc[i:i+9])
    y = np.array(med1.loc[i+10])
    x1.append(x)
    y1.append(y)
x1 = np.array(x1).reshape(-1, 10)  # -1 infers len(med1) - 10 rows
y1 = np.array(y1).reshape(-1)

#Random Forests 85%
reg = RandomForestRegressor(max_depth=10,random_state=10)
reg.fit(x1,y1)
print(reg.score(x1,y1))
p = reg.predict(x1)

#Save model1
filename = 'model1.sav'
pickle.dump(reg, open(filename, 'wb'))
model1 = pickle.load(open(filename, 'rb'))
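
# A reusable sketch of the sliding-window construction above (helper name
# is illustrative; assumes a pandas object with a 0..n-1 integer index):
def make_windows(series, window=10):
    xs, ys = [], []
    for i in range(len(series) - window):
        xs.append(np.array(series.loc[i:i + window - 1]))  # `window` lagged values (.loc is inclusive)
        ys.append(np.array(series.loc[i + window]))        # the next value is the target
    return np.array(xs).reshape(-1, window), np.array(ys).reshape(-1)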

#----------------------------------------------------------------------------------------------------

# Array Creation for med2
x2 = list()
y2 = list()
for i in range(len(med2)-10):
    x = np.array(med2.loc[i:i+9])