def test_rmsle(self):
    self.assertAlmostEqual(metrics.rmsle(np.exp(2) - 1, np.exp(1) - 1), 1)
    self.assertAlmostEqual(
        metrics.rmsle([0, .5, 1, 1.5, 2], [0, .5, 1, 1.5, 2]), 0)
    self.assertAlmostEqual(
        metrics.rmsle([1, 2, 3, np.exp(1) - 1], [1, 2, 3, np.exp(2) - 1]), 0.5)
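# A minimal reference implementation of RMSLE (not part of the original snippets;
# the metrics/ml_metrics modules used throughout are assumed to behave like this),
# useful for reading the tests above:
#   rmsle(y, p) = sqrt(mean((log(1 + p) - log(1 + y))^2))
import numpy as np

def rmsle_reference(actual, predicted):
    """Root Mean Squared Logarithmic Error of predicted values against actual values."""
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    return np.sqrt(np.mean((np.log1p(predicted) - np.log1p(actual)) ** 2))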
def rmsle2(y, ypred):
    """
    Calculate Root Mean Squared Logarithmic Error.
    Note: uses the external ml_metrics library, which is more stable than the version above.
    :param y: list (int, float)
    :param ypred: list (int, float)
    :return: RMSLE score
    """
    return ml_metrics.rmsle(y, ypred)
def score_rmsle(self, df, df_true):
    """
    Calculate the CV score of the predictions in the given dataframe using the RMSLE metric.
    Scores each target individually as well as the total across all targets.
    df_true must be loaded prior to running.
    """
    all_true = []
    all_preds = []
    target_scores = []
    # Transform predictions back to normal space for scoring
    self.transform_targets_exp()
    for target in self.targets:
        all_true.append(df_true[target].tolist())
        all_preds.append(df[target].tolist())
        target_score = ml_metrics.rmsle(df_true[target], df[target])
        target_scores.append(target_score)
        utils.info('RMSLE score for %s: %f' % (target, target_score))
    utils.info('Total RMSLE score: %f' % (ml_metrics.rmsle(all_true, all_preds)))
    # Transform predictions back to log space for averaging
    self.transform_targets_log()
def training_run(self, field_vals, model_class):
    cls = self.get_sklearn_like_model(model_class)
    train_data, test_data, score_data, non_score_data = self.read_dataset(
        field_vals)
    logging.info('Training')
    x, y, _ = make_xy(non_score_data)
    cls.fit(x, y)
    logging.info('Testing')
    tr_x, tr_y, _ = make_xy(test_data)
    # print(pandas.Series(cls.feature_importances_, index=tr_x.columns).sort_values())
    pred = cls.predict(tr_x)
    return rmsle(pred, tr_y), cls
def cross_validate_temporal(mtxTrn, mtxTest, mtxTrnTarget, mtxTestTarget, model):
    start_time = datetime.now()
    log.info('Temporal CV started at: %s' % (datetime.now().strftime('%m-%d-%y %H:%M')))
    utils.line_break()
    train_cv = mtxTrn
    test_cv = mtxTest
    y_target = mtxTrnTarget
    y_true = mtxTestTarget
    # If the target variable has been transformed, transform y_true back to normal space for comparison to predictions
    y_true = [np.exp(x) - 1 for x in y_true]
    #--------Hyperparameter optimization---------#
    # Make predictions
    try:
        model.estimator.fit(train_cv, y_target)
        preds = model.estimator.predict(test_cv)
    except TypeError:
        model.estimator.fit(train_cv.todense(), y_target)
        preds = model.estimator.predict(test_cv.todense())
    #----------Post processing rules----------#
    # If the target variable has been transformed, transform predictions back to original space
    preds = [np.exp(x) - 1 for x in preds]
    # Apply scalar
    if model.postprocess_scalar != 1:
        preds = [x * model.postprocess_scalar for x in preds]
    # Set <0 predictions to 0 if views or comments, set <1 predictions to 1 if votes
    if model.target == 'num_votes':
        preds = [1 if x < 1 else x for x in preds]
    else:
        preds = [0 if x < 0 else x for x in preds]
    # Score the prediction by measuring the error using the chosen error metric
    score = ml_metrics.rmsle(y_true, preds)
    finish_time = datetime.now()
    log.info('Error Measure: %f' % score)
    log.info('Prediction metrics: mean=%f, std dev=%f, min/max= %f/%f'
             % (np.mean(preds), np.std(preds), np.min(preds), np.max(preds)))
    utils.line_break()
    log.info('Temporal CV completed at: %s. Total runtime: %s'
             % (datetime.now().strftime('%m-%d-%y %H:%M'), str(finish_time - start_time)))
    utils.line_break()
    return preds
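# The [np.exp(x) - 1 for x in ...] lists above invert a log1p transform on the
# target (an assumption based on the pattern; np.expm1 is the vectorized
# equivalent). A short round-trip check:
import numpy as np

y = np.array([0.0, 1.0, 10.0])
y_log = np.log1p(y)                        # transform applied before training
assert np.allclose(np.expm1(y_log), y)     # np.exp(y_log) - 1 recovers the original values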
def main():
    logging.info('Getting sample data')
    unique_items = train.get_dataset_unique_items()
    data = train.read_dataset_sample(1, unique_items)
    logging.info('Calculating total dataset size')
    total_size = calc_dataset_size()
    logging.info('Building model')
    mdl, input_tensors, output_rvs = model.build_model(data, unique_items)
    minibatches = train.make_training_minibatch_iterator(unique_items)
    with mdl:
        logging.info('Doing ADVI batches...')
        v_params = pymc3.variational.advi_minibatch(
            n=100,
            minibatch_tensors=input_tensors,
            minibatch_RVs=output_rvs,
            minibatches=minibatches,
            total_size=total_size,
            n_mcsamples=10,
            verbose=True
        )
        trace = pymc3.variational.sample_vp(v_params, draws=500)
        # print(pymc3.summary(trace))
    plt.plot(v_params.elbo_vals)
    plt.savefig('./elbo.png')
    plt.show()
    test_frame = read_test_dataset(unique_items)
    with mdl:
        for i in range(0, test_frame.shape[0], 10000):
            samp = test_frame.ix[i:i + 10000]
            frame_parts = frame_vector_split(samp)
            for t, v in zip(input_tensors, frame_parts):
                t.set_value(v)
            samples = pymc3.sample_ppc(trace, samples=500)
            print(ml_metrics.rmsle(samp.adjusted_demand,
                                   samples['adjusted_demand'].mean(axis=0)))
def cross_validate_using_benchmark(benchmark_name, dfTrn, mtxTrn, mtxTarget, model, folds=5, SEED=42, test_size=.15):
    fold_scores = []
    SEED = SEED * time.localtime().tm_sec
    start_time = datetime.now()
    log.info('Benchmark CV started at: %s' % (datetime.now().strftime('%m-%d-%y %H:%M')))
    utils.line_break()
    for i in range(folds):
        # For each fold, create a CV set by randomly holding out X% of the data, where X is test_size (default .15)
        train_cv, test_cv, y_target, y_true = cross_validation.train_test_split(
            mtxTrn, mtxTarget, test_size=test_size, random_state=SEED * i + 10)
        # If the target variable has been transformed, transform y_true back to normal space for comparison to predictions
        y_true = [np.exp(x) - 1 for x in y_true]
        # Calculate the benchmark and use it as the prediction
        benchmark_preds = 0
        if benchmark_name == 'global_mean':
            benchmark_preds = [13.899 for x in test_cv]
        if benchmark_name == 'all_ones':
            benchmark_preds = [1 for x in test_cv]
        if benchmark_name == '9999':
            benchmark_preds = [9999 for x in test_cv]
        log.info('Using benchmark %s:' % (benchmark_name))
        # For this CV fold, measure the error
        score = ml_metrics.rmsle(y_true, benchmark_preds)
        fold_scores += [score]
        log.info('RMSLE (fold %d/%d): %f' % (i + 1, folds, score))
    # Now that the folds are complete, calculate and print the results
    finish_time = datetime.now()
    log.info('Fold score metrics: mean=%f, std dev=%f, min/max= %f/%f'
             % (np.mean(fold_scores), np.std(fold_scores), np.min(fold_scores), np.max(fold_scores)))
    utils.line_break()
    log.info('CV completed at: %s. Total runtime: %s'
             % (datetime.now().strftime('%m-%d-%y %H:%M'), str(finish_time - start_time)))
    utils.line_break()
def cross_validate_kfold(mtxTrn, mtxTarget, model, folds=5, SEED=42, test_size=.15, pred_fg='false'):
    fold_scores = []
    SEED = SEED * time.localtime().tm_sec
    start_time = datetime.now()
    log.info('K-Fold CV started at: %s' % (datetime.now().strftime('%m-%d-%y %H:%M')))
    utils.line_break()
    # If predictions are wanted, initialize the dict so that its length will match all records in the training set,
    # even if not all records are predicted during the CV (the random splits may skip some records)
    if pred_fg == 'true':
        cv_preds = {key[0]: [] for key in mtxTrn.getcol(0).toarray()}
    for i in range(folds):
        # For each fold, create a CV set (test_cv) by randomly holding out test_size% of the data
        train_cv, test_cv, y_target, y_true = \
            cross_validation.train_test_split(mtxTrn, mtxTarget, test_size=test_size, random_state=i * SEED + 1)
        # If the target variable has been transformed, transform y_true back to normal space for comparison to predictions
        y_true = [np.exp(x) - 1 for x in y_true]
        # If predictions are wanted, slice off the first column from the train and test CV sets; it contains the ID
        if pred_fg == 'true':
            # TODO: create dense matrix copies for the clf's that only use dense matrices
            train_cv = sparse.csr_matrix(train_cv)[:, 1:]
            test_cv2 = sparse.csr_matrix(test_cv)  # keep the ID column for mapping predictions back to records
            test_cv = sparse.csr_matrix(test_cv)[:, 1:]
        #----------Hyperparameter optimization------#
        try:
            model.estimator.fit(train_cv, y_target)
            preds = model.estimator.predict(test_cv)
        except TypeError:
            model.estimator.fit(train_cv.todense(), y_target)
            preds = model.estimator.predict(test_cv.todense())
        #----------Post processing rules----------#
        # If the target variable has been transformed, transform predictions back to original space
        preds = [np.exp(x) - 1 for x in preds]
        # Apply scalar
        if model.postprocess_scalar != 1:
            preds = [x * model.postprocess_scalar for x in preds]
        # Set <0 predictions to 0 if views or comments, set <1 predictions to 1 if votes
        if model.target == 'num_votes':
            preds = [1 if x < 1 else x for x in preds]
        else:
            preds = [0 if x < 0 else x for x in preds]
        # For each fold, score the prediction by measuring the error using the chosen error metric
        score = ml_metrics.rmsle(y_true, preds)
        fold_scores += [score]
        log.info('RMSLE (fold %d/%d): %f' % (i + 1, folds, score))
        # If we want to record predictions, then for each fold add the predictions to the cv_preds dict for later output
        if pred_fg == 'true':
            for row in range(0, test_cv2.shape[0]):
                row_id = test_cv2.getcol(0).toarray()[row][0]
                if row_id in cv_preds.keys():
                    cv_preds[row_id] += [preds[row]]
                else:
                    cv_preds[row_id] = [preds[row]]
    # Now that the folds are complete, calculate and print the results
    finish_time = datetime.now()
    log.info('Fold score metrics: mean=%f, std dev=%f, min/max= %f/%f'
             % (np.mean(fold_scores), np.std(fold_scores), np.min(fold_scores), np.max(fold_scores)))
    utils.line_break()
    log.info('K-Fold CV completed at: %s. Total runtime: %s'
             % (datetime.now().strftime('%m-%d-%y %H:%M'), str(finish_time - start_time)))
    utils.line_break()
    if pred_fg == 'true':
        return cv_preds
print('Division_Set_Shapes:', X.shape, y.shape)
print('Validation_Set_Shapes:', X_train.shape, X_test.shape)

params = {}
params['objective'] = "reg:linear"
params['eta'] = 0.025
params['max_depth'] = 5
params['subsample'] = 0.8
params['colsample_bytree'] = 0.6
params['silent'] = True

print('')

test_preds = np.zeros(test.shape[0])
xg_train = xgb.DMatrix(X_train, label=y_train)
xg_test = xgb.DMatrix(X_test)

watchlist = [(xg_train, 'train')]
num_rounds = 100

xgclassifier = xgb.train(params, xg_train, num_rounds, watchlist,
                         feval=evalerror, early_stopping_rounds=20, verbose_eval=10)
preds = xgclassifier.predict(xg_test, ntree_limit=xgclassifier.best_iteration)

print('RMSLE Score:', rmsle(y_test, preds))

fxg_test = xgb.DMatrix(test)
fold_preds = np.around(xgclassifier.predict(fxg_test, ntree_limit=xgclassifier.best_iteration), decimals=1)
test_preds += fold_preds

submission = pd.DataFrame({'id': ids, 'Demanda_uni_equil': test_preds})
submission.to_csv('submission.csv', index=False)
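# The script above passes feval=evalerror to xgb.train without defining it.
# A plausible definition (an illustration, not the original author's code) is an
# RMSLE metric written in xgboost's custom-eval form, which takes the raw
# predictions and the DMatrix and returns a (name, value) pair:
import numpy as np

def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    # Clip negative predictions so the log transform stays defined
    preds = np.clip(preds, 0, None)
    return 'rmsle', np.sqrt(np.mean((np.log1p(preds) - np.log1p(labels)) ** 2))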
def score(actual, prediction):
    return rmsle(actual, prediction.clip(0))
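# Usage sketch for the score() helper above (an illustration, not from the
# original source). rmsle is assumed to come from the ml_metrics package used
# elsewhere in these snippets; clipping at 0 matters because the log of a
# negative prediction is undefined.
import numpy as np

actual = np.array([3.0, 0.0, 12.0])
prediction = np.array([2.5, -0.4, 11.0])  # the negative value is clipped to 0 before scoring
print(score(actual, prediction))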
y_test_s = y_train[upto:upto2]
X_train = []
y_train = []
#char_model.fit(X_train_s, y_train_s)
print "training done"
print countvect_char.fit_transform(X_train_s)
#print countvect_char
X_train = []
y_train = []
X_test = []
with open("data/test_review.csv") as fi:
    fir = csv.reader(fi)
    fir.next()
    for i in fir:
        X_test.append(i[4])
print "test data read"
# Assumed fix: vectorize the raw test reviews with the fitted character vectorizer before predicting
X_test_s = countvect_char.transform(X_test)
preds = char_model.predict(X_test_s)
print "prediction done"
X_test = []
#with open("predictions/tfidf_cngram.csv", "wb") as fo:
#    fow = csv.writer(fo)
#    fow.writerows(preds)
print "all done"
print ml_metrics.rmsle(preds, y_test_s)
test_preds = np.zeros(test.shape[0])
xg_train = xgb.DMatrix(X_train, label=y_train)
xg_test = xgb.DMatrix(X_test)

watchlist = [(xg_train, 'train')]
num_rounds = 100

xgclassifier = xgb.train(params, xg_train, num_rounds, watchlist,
                         feval=evalerror, early_stopping_rounds=20, verbose_eval=10)
preds = xgclassifier.predict(xg_test, ntree_limit=xgclassifier.best_iteration)

print('RMSLE Score:', rmsle(y_test, preds))

fxg_test = xgb.DMatrix(test)
fold_preds = np.around(xgclassifier.predict(
    fxg_test, ntree_limit=xgclassifier.best_iteration), decimals=1)
test_preds += fold_preds

submission = pd.DataFrame({'id': ids, 'Demanda_uni_equil': test_preds})
submission[["id", "Demanda_uni_equil"]].to_csv(
    '../submissions/' + datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + '.csv', index=False)
print('done')
for i in tqdm(range(N)):
    logging.info('Starting batch {}'.format(i))
    data = features.make_train_batch(i)
    logging.info('Got data')
    X = data.drop(dropped_cols, 1)
    y = data.adjusted_demand
    logging.info('Training...')
    cls.fit(X, y)
    logging.info('Trained!')

ys = []
y_preds = []
for i in tqdm(range(N)):
    data = features.make_test_batch(i)
    X = data.drop(dropped_cols, 1)
    ys.append(data.adjusted_demand)
    y_pred = np.maximum(cls.predict(X), 1)
    y_preds.append(y_pred)

y = np.concatenate(ys)
y_pred = np.concatenate(y_preds)
del ys, y_preds

print(y_pred.shape)
print(y.shape)
print(y_pred[:10])
print(y[:10])
print(ml_metrics.rmse(y, y_pred))
print(ml_metrics.rmsle(y, y_pred))
print(pandas.Series(cls.coef_, index=X.columns).sort_values())