def make_mf_regression(X, y, qid, X_test, n_round=5, batch_size=1024 * 6, nb_epoch=10):
    '''
    Fit metafeature by @clf and get prediction for test.
    Assumed that @clf -- regressor
    '''
    u, i = X
    n = u.shape[0]
    mf_tr = np.zeros(u.shape[0])
    mf_te = np.zeros(X_test[:, 0].shape[0])
    for rnd in range(n_round):  # renamed from `i`, which shadowed the item array above
        skf = KFold(n, n_folds=2, shuffle=True, random_state=42 + rnd * 1000)
        for ind_tr, ind_te in skf:
            clf = build_model()
            u_tr, u_te = u[ind_tr], u[ind_te]
            i_tr, i_te = i[ind_tr], i[ind_te]
            y_tr, y_te = y[ind_tr], y[ind_te]
            clf.fit([u_tr, i_tr], y_tr,  # was [u_t, i_tr]: u_t is undefined
                    batch_size=batch_size, nb_epoch=nb_epoch,
                    verbose=1, shuffle=True,
                    validation_data=([u_te, i_te], y_te))
            mf_tr[ind_te] += clf.predict([u_te, i_te]).ravel()  # was clf.predict(X_te): X_te is undefined
            mf_te += clf.predict(X_test).ravel() * 0.5  # 0.5 because there are two folds per round
            y_pred = np.clip(clf.predict([u_te, i_te]).ravel(), 1, 3)
            score = rmse(y_te, y_pred)
        print('round', rnd, 'finished')
        # print 'pred[{}] score:{}'.format(rnd, score)
    return (mf_tr / n_round, mf_te / n_round)
def run(self, hyper_classifier, training_data, training_target, testing_data, testing_target):
    '''
    TODO DOCUMENTATION
    '''
    results = {'name': self.name,
               'parameterization': self.parameterization,
               'exception': None}
    try:
        self.classifier = hyper_classifier.make_classifier(training_data, training_target,
                                                           **self.parameterization)
        self.classifier.fit(training_data, training_target)
        results['predicted'] = self.classifier.predict(testing_data)
    except MemoryError as e:
        raise e
    except Exception as e:
        print(repr(e))
        results['exception'] = e
    else:
        # attempt to save memory
        del self.classifier
        self.classifier = None
        results['ml_metric_ce'] = ml_metrics.ce(testing_target, results['predicted'])
        results['ml_metric_rmse'] = ml_metrics.rmse(testing_target, results['predicted'])
        results['sklearn_metric_accuracy'] = sklearn.metrics.accuracy_score(testing_target, results['predicted'])
        results['sklearn_metric_f1'] = sklearn.metrics.f1_score(testing_target, results['predicted'])
        results['sklearn_metric_precision'] = sklearn.metrics.precision_score(testing_target, results['predicted'])
        results['sklearn_metric_recall'] = sklearn.metrics.recall_score(testing_target, results['predicted'])
        results['ml_metric_auc'] = {}
        results['sklearn_metric_auc'] = {}
        for label in set(testing_target):
            # list comprehensions instead of bare map() so this also works on Python 3, where map() is lazy
            binary_testing_target = np.array([1 if x == label else 0 for x in testing_target])
            binary_predicted = np.array([1 if x == label else 0 for x in results['predicted']])
            results['ml_metric_auc'][label] = ml_metrics.auc(binary_testing_target, binary_predicted)
            # note: auc_score was renamed roc_auc_score in newer scikit-learn releases
            results['sklearn_metric_auc'][label] = sklearn.metrics.auc_score(binary_testing_target, binary_predicted)
    return results
def score(pred, y):
    '''
    Score the final test predictions. The metric depends on the task,
    so this function has to be adjusted each time.
    '''
    metric = rmse(y, pred)
    print(metric)
    return metric
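# For reference, a minimal NumPy sketch of what the rmse() helper used throughout these
# snippets computes (assumption: it is the usual root-mean-squared error, as in ml_metrics;
# rmse_sketch and its example arrays below are illustrative, not part of any snippet here).
import numpy as np

def rmse_sketch(actual, predicted):
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    return np.sqrt(np.mean((actual - predicted) ** 2))

# e.g. rmse_sketch([1, 2, 3, 4], [1, 2, 3, 5]) == 0.5, matching the test_rmse case further down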
def compare(a, b):
    m = pc.from_file(a).points
    n = pc.from_file(b).points
    m = [tuple(m.x), tuple(m.y), tuple(m.z)]
    m = m[0]
    n = [tuple(n.x), tuple(n.y), tuple(n.z)]
    n = n[0]
    v1, v2 = verify_rmse(m, n), rmse(m, n)
    print(v1, v2)
def custom_valid_scheme(model, train, valid, feats, target, agg_function,
                        early_stopping=5, val_at_num_epoch=5):
    def _train(X_train, model, y_train, iteration):
        # TODO: Train based on model from previous iteration instead of from scratch
        # (although it's not a real bottleneck)
        model.get_model().set_params(n_estimators=(iteration * val_at_num_epoch))
        model.fit(X_train, y_train)

    extract_test_func = lambda df: df[df['split'] == 'valid']
    X_train = train[feats]
    y_train = train[target]
    epochs_without_improvement = 0
    best_score = np.inf
    best_iter = 0
    iteration = 1  # renamed from `iter`, which shadows the builtin
    while epochs_without_improvement < early_stopping:
        _train(X_train, model, y_train, iteration)
        new_valid = predict_one_by_one(train=train, test=valid, feats=feats, model=model,
                                       agg_function=agg_function, extract_test_func=extract_test_func)
        score = rmse(valid[target].values, new_valid[target].values)
        print(f'RMSE on valid: {score}')
        if score < best_score:
            best_score = score
            best_iter = iteration
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
        iteration += 1
    model.get_model().set_params(n_estimators=(best_iter * val_at_num_epoch))
    model.fit(X_train, y_train)
    print(f"score didn't improve for {epochs_without_improvement} epochs - "
          f"finished training with best score of {best_score}")
    return best_score
mod1)  # checking the data points which are influencing
e_new = e_toy.drop(e_toy.index[[80, 960, 221]], axis=0)  # Data points 80, 960 and 221 are influencing, hence we remove them.
mod1_new = smf.ols('Price ~ Age+KM+HP+cc+Dr+gr+Qt+Wt', data=e_new).fit()  # Applying model 1 to the newly created data set
mod1_new.summary()  # Looks good here as all the variables' p values are below 0.05
act1 = e_new.Price
#sm.graphics.plot_partregress_grid(mod1_new)

# Predicting prices using mod1
pred1 = mod1_new.predict(e_new)  # Predicting the price using model 1
rootmse = rmse(pred1, e_new.Price)  # calculating the root mean square error
rootmse  # = 1227.473986005888
df = pd.DataFrame(list(zip(pred1, act1)),
                  columns=['Predicted Prices', 'Actual Prices'])  # creating the data set of predicted and actual prices
df
'''
############################## Building Model 2 ##############################
mod2 = smf.ols('Price ~ np.log(Age)+KM+HP+cc+Dr+gr+Qt+Wt', data=e_new).fit()
mod2.summary()  # Since the p values for cc and Dr are above 0.05, let's check them for significance
mod_2d = smf.ols('Price~Dr', data=e_new).fit()  # applying model 2 for only Dr against price
mod_2d.summary()  # Shows that it is significant
#e_new2 = e_new2.drop(['Dr'], axis = 1)
# Discard the label row
# if sample[0] == 'gas [m3]': continue
label, x = sample[0], sample[1:]
#
# Insert the sample into the Dataset
#
tstdata.appendLinked(x, label)
#
# Write the output of the final network
#
predictedA, actualA = predict(n, tstdata['input'], tstdata['target'])
print "MAPE: ", mape(actualA, predictedA)
print "RMSE: ", metrics.rmse(actualA, predictedA)
print "MAE: ", metrics.mae(actualA, predictedA)
data = [["actual", "predicted"]]
data.extend(np.hstack([actualA, predictedA]))
with open('results/' + filename, 'w') as fp:
    a = csv.writer(fp, delimiter=',')
    a.writerows(data)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from ml_metrics import rmse

# loading the data
Computerdata = pd.read_csv("D:\\ExcelR Data\\Assignments\\Multi linear Regration\\Computer_Data.csv")
Le = preprocessing.LabelEncoder()  # LabelEncoder() converts levels of categorical features into numerical values
Computerdata['Cd'] = Le.fit_transform(Computerdata['cd'])
Computerdata = Computerdata.drop('cd', axis=1)
Computerdata['Multi'] = Le.fit_transform(Computerdata['multi'])
Computerdata = Computerdata.drop('multi', axis=1)
Computerdata['Premium'] = Le.fit_transform(Computerdata['premium'])
Computerdata = Computerdata.drop('premium', axis=1)
Computerdata.describe()
sns.pairplot(Computerdata)
Computerdata.columns
Computerdata.corr()  # correlation coefficients

import statsmodels.formula.api as smf

# Building a model
# To predict the price of computers, regress speed+hd+ram+screen+ads+trend+Cd+Multi+Premium against price
Model = smf.ols("price~speed+hd+ram+screen+ads+trend+Cd+Multi+Premium", data=Computerdata).fit()
Model.params
Model.summary()
# In my first model every variable is significant (P-value less than 0.05),
# so here I'm predicting the price of computers from my model
Pred = Model.predict(Computerdata)
Pred
rootmse = rmse(Pred, Computerdata.price)  # was Computerdata.Pr; the target column in this dataset is price
rootmse
print "Number of training patterns: ", len(DS) print "Input and output dimensions: ", DS.indim, DS.outdim print "number of units in hidden layer: ", nNeurons # # Build network with # n = buildNetwork(nFeatures, nNeurons, nOutput) trainer = BackpropTrainer( n, dataset=DS, verbose=True,momentum=0.01) # # Training graph # graph = [("training", "test")] a,b=predict(n, tstdata['input'], tstdata['target']) bestValError = metrics.rmse(a,b) epochsThreshold = 12 epochsCount = 0 bestweights = trainer.module.params.copy() i = 0 while True: trainer.trainEpochs(1) predictedA, actualA = predict(n, DS['input'], DS['target']) trainingError = metrics.rmse(actualA, predictedA) predictedA, actualA = predict(n, tstdata['input'], tstdata['target']) validationError = metrics.rmse(actualA, predictedA) if validationError > bestValError: epochsCount = epochsCount+1 else:
for i in tqdm(range(N)):
    logging.info('Starting batch {}'.format(i))
    data = features.make_train_batch(i)
    logging.info('Got data')
    X = data.drop(dropped_cols, 1)
    y = data.adjusted_demand
    logging.info('Training...')
    cls.fit(X, y)
    logging.info('Trained!')

ys = []
y_preds = []
for i in tqdm(range(N)):
    data = features.make_test_batch(i)
    X = data.drop(dropped_cols, 1)
    ys.append(data.adjusted_demand)
    y_pred = np.maximum(cls.predict(X), 1)
    y_preds.append(y_pred)

y = np.concatenate(ys)
y_pred = np.concatenate(y_preds)
del ys, y_preds
print(y_pred.shape)
print(y.shape)
print(y_pred[:10])
print(y[:10])
print(ml_metrics.rmse(y, y_pred))
print(ml_metrics.rmsle(y, y_pred))
print(pandas.Series(cls.coef_, index=X.columns).sort_values())
def test_rmse(self):
    self.assertAlmostEqual(metrics.rmse(range(0, 11), range(1, 12)), 1)
    self.assertAlmostEqual(metrics.rmse([0, .5, 1, 1.5, 2], [0, .5, 1, 1.5, 2]), 0)
    self.assertAlmostEqual(metrics.rmse(range(1, 5), [1, 2, 3, 5]), 0.5)
# predicting the price from model6
pred6 = Model6.predict(TC_new)
pred6
# In model6 every variable has a p-value below 0.05 and the R^2 value is 0.852,
# so the R^2 value hasn't improved.

# Here I'm building a table to see which model has the highest R^2 value
values = list([Model1.rsquared, Model2.rsquared, Model3.rsquared,
               Model4.rsquared, Model5.rsquared, Model6.rsquared])
coded_variables = list(['Model1.rsquared', 'Model2.rsquared', 'Model3.rsquared',
                        'Model4.rsquared', 'Model5.rsquared', 'Model6.rsquared'])
variables = list(['Model 1', 'Model 2', 'Model 3', 'Model 4', 'Model 5', 'Model 6'])
Rsquared_model = pd.DataFrame(list(zip(variables, coded_variables, values)),
                              columns=['Models', 'Variables Named in the code', 'R^Squared Values'])
Rsquared_model
# From my analysis Model3 is the best model: it has the highest R^2 value (0.8789),
# and no variables had to be removed from it because all p-values are below 0.05.

# Finally, checking the root mean square error of pred3 against the actual data in TC_new
import statsmodels.api as sm
rootmse = rmse(pred3, TC_new.Price)
rootmse  # Out[215]: 1227.2689621781449
# Next I'm putting the predicted and actual data into a data frame called df
Actual = TC_new.Price
df = pd.DataFrame(list(zip(pred3, Actual)),
                  columns=['Predicted Prices', 'Actual Prices'])
def scores(actuals, predicteds):
    rmses = rmse(actual=actuals, predicted=predicteds)
    mses = mean_squared_error(actuals, predicteds)
    maes = mae(actual=actuals, predicted=predicteds)
    r2s = r2_score(actuals, predicteds)
    return rmses, maes, r2s, mses
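# A hypothetical call to scores() above (y_true and y_hat are made-up example values,
# not from any snippet here). Note the return order is (rmse, mae, r2, mse), which is
# not the order in which the metrics are computed inside the function.
y_true = [3.0, -0.5, 2.0, 7.0]
y_hat = [2.5, 0.0, 2.0, 8.0]
rmse_val, mae_val, r2_val, mse_val = scores(y_true, y_hat)
print(rmse_val, mae_val, r2_val, mse_val)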
def rmse_loss(self, y, y_pred):
    return rmse(y, y_pred)
rsq_Ad = smf.ols('Ad~MS+RD', data=e_stu).fit().rsquared  # RD and MS against Ad
vif_Ad = 1 / (1 - rsq_Ad)  # = 3.04709935040856, hence significant

d1 = {'Variables': ['RD', 'MS', 'Ad'], 'VIF': [vif_RD, vif_MS, vif_Ad]}  # Combining the VIF values with their variables
Vif_frame = pd.DataFrame(d1)  # into a data frame
Vif_frame

sm.graphics.plot_partregress_grid(mod1_new)  # Plotting partial regression plots to check which variables explain the most
fmod1_new = smf.ols('Pr~RD+MS', data=e_stu).fit()  # We remove Ad: even though it has a feasible VIF value,
                                                   # its p value in model 1 is not feasible, hence model 1 is
                                                   # rebuilt without Ad
fmod1_new.summary()  # The R squared value of the model and the p values of the variables look feasible
pred1 = fmod1_new.predict(e_stu)  # Predicting the price using model 1
rootmse = rmse(pred1, e_stu.Pr)  # calculating the root mean square error
rootmse  # = 7076.114277848526
act1 = e_stu.Pr
df = pd.DataFrame(list(zip(pred1, act1)), columns=['Predicted Prices', 'Actual Prices'])
df  # the data set of predicted and actual prices

# Creating a table of the R squared values of the different models that were built
# while correcting for influencing points in the data set.
values = list([mod1.rsquared, mod1_new.rsquared, fmod1_new.rsquared])
coded_variables = list(['mod1.rsquared', 'mod1_new.rsquared', 'fmod1_new.rsquared'])
variables = list(['Model 1', 'Model 1 New', 'Final Model 1'])
#R_Squared_value_Of_models = {'Variables':[],'R^2 Value':[]}
Rsquared_model = pd.DataFrame(list(zip(variables, coded_variables, values)),
                              columns=['Models', 'Variables Named in the code', 'R^Squared Values'])
Rsquared_model  # The table below shows how removing those outliers improved the R squared value.
'''
        Models  Variables Named in the code  R^Squared Values
0      Model 1                mod1.rsquared          0.950746
1  Model 1 New            mod1_new.rsquared          0.962343
#print(descriptionwithoutlinks[0])

# create matrix
indptr = [0]
indices = []
traindata = []
vocabulary = {}
for dec in descriptionwithoutlinks:
    for words in dec:
        index = vocabulary.setdefault(words, len(vocabulary))
        indices.append(index)
        traindata.append(1)
    indptr.append(len(indices))

matrix = csr_matrix((traindata, indices, indptr), dtype=float).toarray()
length = len(matrix)
X_train, X_test, y_train, y_test = train_test_split(matrix, views, test_size=0.33, random_state=42)

#params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2, 'learning_rate': 0.01, 'loss': 'ls'}
#clf = ensemble.GradientBoostingRegressor(**params)
#clf.fit(X_train, y_train)
#y_predicted = clf.predict(X_test)
#print(len(y_test))
#print(len(y_predicted))

rf = LinearRegression()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(rmse(y_test, y_pred))
def rmse_est(estimator, x, y):
    # negated so that higher is better, as scikit-learn scoring callables expect
    pred = estimator.predict(x)
    return -rmse(pred, y)
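# A hedged usage sketch: because rmse_est above has the (estimator, X, y) signature and
# returns a negated error, it can be passed directly as a `scoring` callable. The ridge
# model and the toy arrays below are illustrative assumptions, not from the snippet above.
import numpy as np
from ml_metrics import rmse  # assumed to be the rmse used by rmse_est
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

X_toy = np.random.rand(100, 3)
y_toy = X_toy @ np.array([1.0, -2.0, 0.5]) + 0.1 * np.random.randn(100)
neg_rmse_scores = cross_val_score(Ridge(), X_toy, y_toy, cv=5, scoring=rmse_est)
print(-neg_rmse_scores.mean())  # average RMSE across folds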
r1 = sm.stats.DescrStatsW(otg1diff[m:])
r2 = sm.stats.DescrStatsW(otg1diff[:m])
print 'p-value: ', sm.stats.CompareMeans(r1, r2).ttest_ind()[1]

otg1diff.plot(figsize=(12, 6))
fig = plt.figure(figsize=(12, 8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(otg1diff.values.squeeze(), lags=25, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(otg1diff, lags=25, ax=ax2)

src_data_model = otg[:'2013-05-26']
model = sm.tsa.ARIMA(src_data_model, order=(1, 1, 1), freq='W').fit(full_output=False, disp=0)
print model.summary()

q_test = sm.tsa.stattools.acf(model.resid, qstat=True)  # resid holds the model residuals;
                                                        # qstat=True applies the Ljung-Box test to the coefficients
print DataFrame({'Q-stat': q_test[1], 'p-value': q_test[2]})

pred = model.predict('2013-05-26', '2014-12-31', typ='levels')
trn = otg['2013-05-26':]
r2 = r2_score(trn, pred[1:32])
print 'R^2: %1.2f' % r2
metrics.rmse(trn, pred[1:32])
metrics.mae(trn, pred[1:32])
otg.plot(figsize=(12, 6))
pred.plot(style='r--')
# Determine p and q
# fig = plt.figure(figsize=(12, 8))
# ax1 = fig.add_subplot(211)
# fig = sm.graphics.tsa.plot_acf(otg1diff.values.squeeze(), lags=25, ax=ax1)
# ax2 = fig.add_subplot(212)
# fig = sm.graphics.tsa.plot_pacf(otg1diff, lags=25, ax=ax2)
p = 1
q = 1
d = 1

# Forecast (building the ARIMA model)
dat = '2018-11-30'
src_data_model = otg[:dat]
model = sm.tsa.ARIMA(src_data_model, order=(p, d, q), freq='M').fit(disp=0)
print(model.summary())

# Model residuals and their ACF
q_test = sm.tsa.stattools.acf(model.resid, qstat=True)  # resid holds the model residuals;
                                                        # qstat=True applies the Ljung-Box test to the coefficients

# Computing the coefficient of determination R^2
pred = model.predict(dat, '2018-12-31', typ='levels')
trn = otg[dat:'2018-11-30']
l = len(trn) + 1
r2 = r2_score(trn, pred[1:l])
print('R^2: %1.2f' % r2)
print("Root mean squared error of our model:")
print(metrics.rmse(trn, pred[1:l]))
print("Mean absolute error of the forecast:")
print(metrics.mae(trn, pred[1:l]))
otg.plot()
pred.plot(style='r--')
plt.show()
print "number of units in hidden layer: ", nNeurons # # Build network with # n = buildNetwork(nFeatures, nNeurons, nOutput) trainer = BackpropTrainer( n, dataset=trainData, verbose=True,momentum=momentum) # # Training graph # graph = [("training", "test")] for i in range(0,nEpochs): trainer.trainEpochs(1) predictedA, actualA = predict(n, trainData['input'], trainData['target']) error = metrics.rmse(actualA, predictedA) predictedA, actualA = predict(n, tstdata['input'], tstdata['target']) error2 = metrics.rmse(actualA, predictedA) graph.append((i, error, error2)) with open('results/graphs/'+filename, 'w') as fp: a = csv.writer(fp, delimiter=',') a.writerows(graph) # # Write the output of the final network # predictedA, actualA = predict(n, tstdata['input'], tstdata['target'])
model_am = smf.ols("Pr~Ad+Ms", data=Startups).fit()
model_am.summary()

# Added variable plot
sm.graphics.plot_partregress_grid(Model2)

# Final Model
Model3 = smf.ols("Pr~Rd+Ms", data=Startups).fit()
Model3.params
Model3.summary()
pred3 = Model3.predict(Startups_new)  # was Model.predict(...), presumably a typo for the final model Model3
pred3

# Finally I'm using the root mean square error (RMSE) to check the average error in my data set
rootmse = rmse(pred3, Startups_new.Pr)
rootmse
Actual = Startups_new.Pr

# Creating a data frame of actual and predicted prices
df = pd.DataFrame(list(zip(pred3, Actual)), columns=['Predicted Prices', 'Actual Prices'])

# Next I'm creating an R^2 value table for my three models
values = list([Model1.rsquared, Model2.rsquared, Model3.rsquared])  # R^2 values
coded_variables = list(['Model1.rsquared', 'Model2.rsquared', 'Model3.rsquared'])
variables = list(['Model 1', 'Model 2', 'Model 3'])
Rsquared_model = pd.DataFrame(list(zip(variables, coded_variables, values)),
                              columns=['Models', 'Variables Named in the code', 'R^Squared Values'])
Rsquared_model
#     Models  Variables Named in the code  R^Squared Values
# 0  Model 1              Model1.rsquared          0.950746
# 1  Model 2              Model2.rsquared          0.950746
# 2  Model 3              Model3.rsquared          0.950450
    # determine test indices
    test_idxs = perms[i]
    train_ds = SupervisedDataSet(nFeatures, nOutput)
    train_ds.setField("input", inp[train_idxs])
    train_ds.setField("target", tar[train_idxs])
    #
    # Build network with
    #
    n = buildNetwork(nFeatures, nNeurons, nOutput)
    trainer = BackpropTrainer(n, dataset=train_ds, verbose=True, momentum=0.01)
    trainer.trainEpochs(nEpochs)
    predictedA, actualA = predict(n, inp[test_idxs], tar[test_idxs])
    performances += metrics.rmse(actualA, predictedA)

print "CROSSVALIDATOR: ", performances / n_folds
'''predictedA, actualA = predict(n, tstdata['input'], tstdata['target'])
print "MAPE: ", mape(actualA, predictedA)
print "RMSE: ", metrics.rmse(actualA, predictedA)
print "MAE: ", metrics.mae(actualA, predictedA)
data = [["actual", "predicted"]]
data.extend(np.hstack([actualA, predictedA]))
with open('results/'+filename, 'w') as fp:
tr_gen = X_train_generatetor(128, um, im, batch_size=batch_size, name='X_train.csv')
te_gen = X_train_generatetor(128, um, im, batch_size=batch_size, name='X_test.csv')
for X_tr, y_tr in tr_gen:
    p_tr = model.predict_on_batch(X_tr)
    y_tr_preds.append(p_tr)
for X_te, y_te in te_gen:
    p_te = model.predict_on_batch(X_te)
    y_te_preds.append(p_te)
y_tr_preds = np.concatenate(y_tr_preds).ravel()
y_te_preds = np.concatenate(y_te_preds).ravel()
train_score = rmse(y_train, y_tr_preds)
print('rmse train', train_score)
test_score = rmse(y_test, y_te_preds)
print('rmse test', test_score)

print('Start Training')
for epoch in range(nb_epoch):
    tr_gen = X_train_generatetor(128, um, im, batch_size=batch_size, name='X_train.csv')
    te_gen = X_train_generatetor(128, um, im, batch_size=batch_size, name='X_test.csv')
    # train
    # y_tr_preds = []
    # y_te_preds = []
    start_time = time.time()
    for X_tr, y_tr in tr_gen:
        model.train_on_batch(X_tr, y_tr)
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(otg1diff.values.squeeze(), lags=25, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(otg1diff, lags=25, ax=ax2)

print 'otg'
print otg

src_data_model = otg[:'2015-11-09 03:26:26']
model = sm.tsa.ARIMA(src_data_model, order=(4, 1, 0)).fit()  # trend='nc' if needed
print model.summary()

q_test = sm.tsa.stattools.acf(model.resid, qstat=True)
print DataFrame({'Q-stat': q_test[1], 'p-value': q_test[2]})

pred = model.predict('2015-11-09 03:26:16', '2015-11-09 03:29:06', typ='levels')
trn = otg['2015-11-09 03:26:26':]
r2 = r2_score(trn, pred)
print 'R^2: %1.2f' % r2

# root-mean-square error and mean absolute error
metrics.rmse(trn, pred)
metrics.mae(trn, pred)

fig, (ax1) = plt.subplots(nrows=1, sharex=True)
ax1.plot(otg.index, otg.values)
ax1.plot_date(pred.index, pred.values, 'r--')
plt.show()
#print pred.values
    model.resid, qstat=True)  # resid holds the model residuals; qstat=True applies the Ljung-Box test to the coefficients
print(DataFrame({'Q-stat': q_test[1], 'p-value': q_test[2]}))

# prediction
# pred = model.predict(start=src_data_model.shape[0], end=src_data_model.shape[0]+100)
pred = model.predict(start='2017-10-25 00:00:00', end='2017-10-30 00:00:00')
trn = p['2017-10-25 00:00:00':'2017-10-30 00:00:00']
print(pred)
# pred.plot(figsize=(12, 8), color='red')
plt.show()
# r2 = r2_score(trn, pred[1:32])
# print('R_2= %1.2f' % r2)

# RMSE for ARIMA
rmse = metrics.rmse(trn, pred)
print(rmse)
print(type(rmse))

# MAE for ARIMA
mae = metrics.mae(trn, pred)
print(mae)

scale = 1
deviation = float(rmse)
lower = pred - deviation * scale
lowerDF = pd.DataFrame({'Box': lower.values}, index=lower.index)
print(lowerDF)
# as_matrix() is deprecated in newer pandas; .values / .to_numpy() is the replacement
lower_arr = lowerDF.as_matrix().squeeze()
p_arr = p.loc[lowerDF.index].as_matrix().squeeze()
pred_arr = pred.loc[lowerDF.index].as_matrix().squeeze()
X_train, y_train = resample(X_train, y_train, n_samples=X_train.shape[0] // 10, random_state=seed)  # integer division so n_samples is an int
X_test, y_test = resample(X_test, y_test, n_samples=X_test.shape[0] // 10, random_state=seed)

X = np.concatenate([X_train, X_test, test])
user_le = LabelEncoder()
item_le = LabelEncoder()
user_le.fit(X[:, 0])
item_le.fit(X[:, 1])
X_train[:, 0] = user_le.transform(X_train[:, 0])
X_train[:, 1] = item_le.transform(X_train[:, 1])

if use_all:
    X = np.concatenate([X_train, X_test])
    y_train = np.concatenate([y_train, y_test])
    X_train[:, 0] = user_le.transform(X[:, 0])
    X_train[:, 1] = item_le.transform(X[:, 1])

X_test[:, 0] = user_le.transform(X_test[:, 0])
X_test[:, 1] = item_le.transform(X_test[:, 1])

rf = RandomForestRegressor(n_estimators=100, max_depth=12, n_jobs=7, random_state=seed)
rf.fit(X_train, y_train)
y_preds = rf.predict(X_test).ravel()
score = rmse(y_test, y_preds)
print('rf rmse score', score)
def rmse(self):
    actual = self.get_series()
    fitted = self.get_fitted_values()
    return ml_metrics.rmse(actual, fitted)
fig, axes = plt.subplots(ncols=1, figsize=(5, 4))
TestModels.R2_Y1.plot(kind='bar', title='R2 metrics for different models - Random Forest wins')
#TestModels.R2_Y2.plot(ax=axes[1], kind='bar', color='green', title='R2_Y2')

# random_forest is the best ->
model = models[1]
model.fit(Xtrn, Ytrn)
#model.summary()
inf = model.feature_importances_
print inf

# then predict with
#print trn
res = model.predict(trn)
print(res)
#print trg
print "rmse"
m = metrics.rmse(trg.sure.values, res)
err = metrics.mae(trg.sure.values, res)
print m
print "mean_abs_err"
print err
print "in percentage:"
print(100.0 * err / (trg.sure.values.max() - trg.sure.values.min()))
#plot.grid()
#plt.show()

fig = plt.figure()
from mpl_toolkits.mplot3d import Axes3D
ax = Axes3D(fig)
ys = t1.dpr.values