def test_step(self, batch, batch_nb):
    """Evaluate the backward model on one test batch.

    The model maps ``emiss`` (y) to ``laser_params`` (x).  The parameter RMSE
    is always logged.  When a forward model is attached, the predicted
    parameters are pushed back through it, the resulting emiss RMSE is logged,
    and the true/predicted tensors plus the integral-emiss points are written
    to disk.

    Args:
        batch: ``(emiss, laser_params, uids)`` triple.
        batch_nb: Batch index (unused).

    Returns:
        The emiss RMSE when ``self.forward_model`` is set, otherwise the
        parameter RMSE.
    """
    # TODO: plot
    y, x, uids = batch  # y: emiss, x: laser_params
    x_pred = self(y)
    with torch.no_grad():
        x_loss = rmse(x_pred, x)
        self.log("backward/test/x/loss", x_loss, prog_bar=True)

    if self.forward_model is None:
        # There is no y-loss without a forward model.  This path used to fail
        # on an unbound ``loss``; report the parameter error instead.
        return x_loss

    y_pred = self.forward_model(x_pred)
    y_loss = rmse(y_pred, y)
    self.log(
        "backward/test/y/loss",
        y_loss,
        prog_bar=True,
    )

    torch.save(x, "/data-new/alok/laser/params_true_back.pt")
    torch.save(y, "/data-new/alok/laser/emiss_true_back.pt")
    torch.save(y_pred, "/data-new/alok/laser/emiss_pred.pt")
    torch.save(x_pred, "/data-new/alok/laser/param_pred.pt")
    nngraph.save_integral_emiss_point(
        y_pred,
        y,
        "/data-new/alok/laser/backwards_test_points.txt",
        all_points=True,
    )
    return y_loss
def training_step(self, batch, _batch_nb):
    """Run one optimisation step of the backward (emiss -> laser_params) model.

    With a forward model attached, the training loss is the RMSE between the
    true emiss and the emiss re-predicted from the estimated parameters.
    Without one, the loss falls back to the parameter RMSE.

    Args:
        batch: ``(emiss, laser_params, uids)`` triple.
        _batch_nb: Batch index (unused).

    Returns:
        The loss tensor to back-propagate.
    """
    # TODO: plot
    y, x, uids = batch  # y: emiss, x: laser_params
    x_pred = self(y)
    with torch.no_grad():
        # Logged for monitoring only, so no graph is needed.
        x_loss = rmse(x_pred, x)
        self.log("backward/train/x/loss", x_loss, prog_bar=True)

    if self.forward_model is not None:
        y_pred = self.forward_model(x_pred)
        y_loss = rmse(y_pred, y)
        self.log(
            "backward/train/y/loss",
            y_loss,
            prog_bar=True,
        )
        loss = y_loss
        if self.current_epoch == self.config["backward_num_epochs"] - 5:
            nngraph.save_integral_emiss_point(
                y_pred,
                y,
                "/data-new/alok/laser/backwards_train_points.txt",
                all_points=True,
            )
    else:
        # ``loss`` used to be unbound on this path.  Recompute the parameter
        # error outside ``no_grad`` so gradients can flow.
        loss = rmse(x_pred, x)

    self.log("backward/train/loss", loss, prog_bar=True)
    return loss
def evaluate(model, loader, task):
    """Score ``model`` over every batch of ``loader``.

    For ``task == 'regression'`` the score is ``rmse(y, y_hat)``.  Otherwise
    it is ROC-AUC: a single call when there is one label column, else the
    nan-mean of per-label AUCs computed only on entries whose target is
    ``>= 0``.  Labels with no such entries are skipped, and labels whose
    remaining targets hold a single class contribute NaN.
    """
    model.eval()

    predictions, targets = [], []
    for _ in range(loader.steps):
        graph_2d, graph_3d, batch_y = loader.next_batch()
        batch_pred = model(graph_2d, graph_3d)
        predictions.extend(batch_pred.tolist())
        targets.extend(batch_y.tolist())

    y_hat = np.array(predictions)
    y = np.array(targets)

    if task == 'regression':
        return rmse(y, y_hat)

    if y.shape[1] <= 1:
        return roc_auc_score(y, y_hat)

    per_label_auc = []
    for label in range(y.shape[1]):
        true, pred = y[:, label], y_hat[:, label]
        labelled = np.where(true >= 0)
        known = true[labelled]
        if len(known) == 0:
            # Nothing to score for this label.
            continue
        if len(set(known)) == 1:
            # all 0's or all 1's: AUC is undefined
            per_label_auc.append(float('nan'))
        else:
            per_label_auc.append(roc_auc_score(known, pred[labelled]))
    return np.nanmean(per_label_auc)
def main():
    """Fit and score every predictor on every SKU and period.

    For each (predictor, SKU, period) combination the series is loaded up to
    the period end, split into train/test, and cleaned of holidays.  The
    predictor is fitted and asked for ``configuration.N_PREDS`` steps.  A
    plot and the forecast/residuals are saved under the result path.  MAPE is
    taken on the raw values and RMSE on the max-scaled values, and both are
    recorded in ``results``.
    """
    results = init_results()
    for model_name, predictor in predictors.items():
        for sku in configuration.SKUS:
            for period_ind, period in enumerate(configuration.PERIODS):
                # The result directory is built with literal backslashes.
                res_path = (configuration.FORECAST_RES_DIR + model_name +
                            "\\" + sku + "\\" + str(period_ind))
                end_of_period = period[1]

                real_series = loader.load_test_sku(
                    sku,
                    base_dir=configuration.BASE_DIR,
                    end_of_period=end_of_period)
                train, test = train_test_split(real_series,
                                               configuration.N_PREDS)
                train = utils.remove_holidays(train)

                predictor.fit(train, configuration.N_PREDS)
                forecast = predictor.predict(configuration.N_PREDS)
                resid = predictor.resid

                forecast_scaled = utils.scale_by_max(forecast)
                test_scaled = utils.scale_by_max(test)
                save_plot(test_scaled, forecast_scaled, end_of_period,
                          res_path)
                save_forecast_resid(forecast, resid, res_path)

                mape_score = utils.mape(y_true=test, y_pred=forecast)
                rmse_score = utils.rmse(y_true=test_scaled,
                                        y_pred=forecast_scaled)
                save_result(results, model_name, sku, period_ind, mape_score,
                            rmse_score, predictor.describe())
def addnoise(name: str,
             zero_mean_gaussian_noise_sd: int = 5,
             percent_gaussian_impulse_noise: int = 5,
             impulse_noise_sd: int = 100):
    """Create a noisy copy of ``images/original/<name>`` in ``images/noisy``.

    ``percent_gaussian_impulse_noise`` percent of the pixels (chosen
    independently per pixel) are replaced by Gaussian impulse noise with mean
    128 and standard deviation ``impulse_noise_sd``.  The remaining pixels get
    zero-mean Gaussian noise with standard deviation
    ``zero_mean_gaussian_noise_sd`` added.  The result is clipped to
    ``[0, 255]``, stored as ``uint8`` and written to
    ``<stem>_<sd>_<percent>_<impulse_sd>.png``.  The RMSE between the original
    and the noisy image is printed.

    Args:
        name: File name of the source image inside ``images/original``.
        zero_mean_gaussian_noise_sd: Std-dev of the additive noise.
        percent_gaussian_impulse_noise: Percentage of impulse-noise pixels.
        impulse_noise_sd: Std-dev of the impulse noise.
    """
    imagepath = os.path.join('images', 'original', name)
    img = imageio.imread(imagepath).astype(float)
    inp_img = np.array(img, dtype=float, copy=True)
    assert img.shape == (256, 256)

    percent_noise = percent_gaussian_impulse_noise / 100
    rows, cols = img.shape
    for i in range(rows):
        # Iterate over the real column count, not the row count.
        for j in range(cols):
            r = random.uniform(0, 1)
            if r < percent_noise:
                img[i][j] = random.gauss(128, impulse_noise_sd)
            else:
                img[i][j] += random.gauss(0, zero_mean_gaussian_noise_sd)

    # splitext strips the extension whatever its length; ``name[:-4]`` only
    # worked for three-letter extensions.
    stem = os.path.splitext(name)[0]
    oppath = os.path.join(
        'images', 'noisy',
        '{}_{}_{}_{}.png'.format(stem, zero_mean_gaussian_noise_sd,
                                 percent_gaussian_impulse_noise,
                                 impulse_noise_sd))
    img = np.clip(img, 0, 255)
    img = img.astype(np.uint8)
    imageio.imwrite(oppath, img)
    print('RMSE between generated noisy image and original image: {}'.format(
        rmse(inp_img, img)))
def predict(self, filename = 'result.txt'):
    """Predict the last step of every test frame and append the RMSE to a file.

    Each frame in ``self.test_x`` goes through ``self.model``.  The last
    predicted value is stored in ``self.prediction`` and compared with the
    last target value, both mapped back through ``self.scaler`` when
    ``self.norm`` is set.  Per-frame values and the overall RMSE are printed.
    A ``companies<TAB>timeseries_type<TAB>rmse`` line is appended to
    ``filename``.

    The prints use the single-argument form so the method runs unchanged on
    Python 2 and Python 3 with identical output.
    """
    y = []
    y_pred = []
    self.prediction = []
    for i in range(len(self.test_x)):
        curr_frame = self.test_x[i]
        pred = self.model.predict(curr_frame[newaxis, :, :])
        self.prediction.append(pred[0, -1])
        if not self.norm:
            y.append(self.test_y[i, -1])
            y_pred.append(pred[0, -1])
            print('%s %s %s' % (self.test_x[i, :, -1], y[-1], y_pred[-1]))
        else:
            test_x_inverse = self.scaler.inverse_transform(
                self.test_x[i, :, -1])
            y.append(self.scaler.inverse_transform([self.test_y[i, -1]])[0])
            y_pred.append(self.scaler.inverse_transform([pred[0, -1]])[0])
            print('%s %s %s' % (test_x_inverse, y[-1], y_pred[-1]))

    r = rmse(y, y_pred)
    print('RMSE: %s' % (r,))
    with open(filename, 'a') as fout:
        fout.write('%s\t%s\t%.4f\n' % (
            self.companies, self.timeseries_type, r
        ))
def evaluate(self, test_data, scale):
    """Super-resolve every test image with ``self.G`` and score the results.

    Images are processed one at a time (batch size 1).  Ground-truth and
    generated tensors are clamped to ``[0, 1]``, stripped of the batch
    dimension, permuted with ``(1, 2, 0)`` and converted to NumPy before
    being handed to ``utils.rmse`` and ``utils.LPIPS``.

    Returns:
        ``(rmse, lpips)`` as returned by the two ``utils`` functions.
    """
    test_loader = generate_loader(
        path=test_data,
        scale=scale,
        train=False,
        batch_size=1,
        num_workers=1,
        shuffle=False,
        drop_last=False,
    )

    def to_image(tensor):
        # Clamp, drop the batch dimension, permute, convert to NumPy.
        return tensor.cpu().clamp(0, 1).squeeze(0).permute(1, 2, 0).numpy()

    HRs, SRs = [], []
    for inputs in test_loader:
        hr_batch = inputs[0].to(self.device)
        lr_batch = inputs[1].to(self.device)
        with torch.no_grad():
            sr_batch = self.G(lr_batch, scale).detach()
        HRs.append(to_image(hr_batch))
        SRs.append(to_image(sr_batch))

    rmse_score = utils.rmse(HRs, SRs, scale)
    lpips_score = utils.LPIPS(HRs, SRs, scale)
    return rmse_score, lpips_score
def main():
    """Complete the Netflix rating matrix with (scaled) ASD and print the RMSE.

    The algorithm comes from the first command-line argument: ``"sASD"``
    selects the scaled variant, anything else (or no argument) selects plain
    ASD.  Zero entries of the data matrix are treated as unobserved.
    """
    matplotlib.rcParams.update({"figure.dpi": 200, "savefig.dpi": 600})
    sns.set(style="darkgrid")

    algorithm = sys.argv[1] if len(sys.argv) > 1 else "ASD"

    data = utils.read_netflix_data()
    mask = (data != 0)
    print("data shape:", data.shape)
    print("data density:", mask.sum() / data.size)

    rank, iter_max, norm_tol = 50, 10000, 1e-4

    # Any value other than "sASD" falls back to plain ASD.
    minimize = (ASD.scaled_alternating_steepest_descent
                if algorithm == "sASD"
                else ASD.alternating_steepest_descent)

    results = minimize(data, rank, mask, iter_max, norm_tol, verbose=True)
    completed_data = results.matrix
    final_rmse = utils.rmse(data, completed_data, mask)
    print("RMSE:", final_rmse)
def xgb_boost_model():
    """Train an XGBoost regressor on the cached features and report hold-out RMSE.

    Rows with ``relevance == -1`` form the submission set.  The rest are
    split by ``utils.split_dataset`` into train / eval / hold-out parts.
    Only the columns in ``relevant_columns`` are fed to the booster.
    """
    # Open the pickle in binary mode and close the handle deterministically.
    # SECURITY: pickle.load executes arbitrary code; only load trusted files.
    with open("../output/features/basic_features.pkl", 'rb') as features_file:
        df_all = pickle.load(features_file)

    test_ind = df_all.relevance == -1
    test_data = df_all[test_ind]
    train_data = df_all[~test_ind]
    test_data = test_data.drop(['relevance'], axis=1)

    train, test, hold_out = utils.split_dataset(train_data)
    relevant_columns = ['title_similarity', 'product_desc_similarity',
                        'title_similarity_common',
                        'product_desc_similarity_common',
                        'description_length', 'search_length']

    dTrain = xgb.DMatrix(train['X'][relevant_columns], label=train['Y'])
    dTest = xgb.DMatrix(test['X'][relevant_columns], label=test['Y'])
    dHold_out = xgb.DMatrix(hold_out['X'][relevant_columns],
                            label=hold_out['Y'])
    dSubmit = xgb.DMatrix(test_data[relevant_columns])

    param = {'bst:max_depth': 5, 'bst:eta': 0.05, 'silent': 1,
             'objective': 'reg:linear', 'eval_metric': 'rmse'}
    evallist = [(dTest, 'eval'), (dTrain, 'train')]
    numRound = 200
    bst = xgb.train(param, dTrain, numRound, evallist)

    predHoldout = bst.predict(dHold_out)
    # Single-argument print: same output on Python 2, valid on Python 3.
    # The value is an RMSE even though the label says "Mean square".
    print("Mean square hold out error  %s"
          % (utils.rmse(hold_out['Y'], predHoldout),))

    predY = bst.predict(dSubmit)
    # NOTE(review): this passes submission-set predictions alongside hold-out
    # X/Y; ``predHoldout`` may have been intended - confirm.
    utils.debug_model(hold_out['X'], hold_out['Y'], predY)
def run_ridge_on_cat(cat):
    """Fit the per-category blend model for ``cat`` unless it is already cached.

    Rows whose ``parent_category_name`` equals ``cat`` are taken from the
    module-level train/test frames.  The model is cross-validated with
    ``run_cv_model`` and the predictions are stored in the cache as
    ``item_id`` / ``cat_ridge`` columns.

    Returns:
        ``True`` in every case.
    """
    cache_key = 'cat_ridges_blend_l3_' + cat
    if is_in_cache(cache_key):
        print_step('Already have ' + cat + '...')
        return True

    print_step(cat + ' > Subsetting')
    train_c = train_[train['parent_category_name'] == cat].copy()
    test_c = test_[test['parent_category_name'] == cat].copy()
    print(train_c.shape)
    print(test_c.shape)

    # Keep the target and ids aside; they must not be used as features.
    target = train_c['deal_probability'].values
    train_id = train_c['item_id']
    test_id = test_c['item_id']
    train_c.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
    test_c.drop('item_id', axis=1, inplace=True)

    print_step(cat + ' > Modeling')
    results = run_cv_model(train_c, test_c, target, runLasso, params, rmse,
                           cat + '-ridge-blend')
    train_c['cat_ridge'] = results['train']
    test_c['cat_ridge'] = results['test']
    print_step(cat + ' > RMSE: ' + str(rmse(target, train_c['cat_ridge'])))
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

    print_step(cat + ' > Saving in Cache')
    train_c['item_id'] = train_id
    test_c['item_id'] = test_id
    save_in_cache(cache_key,
                  train_c[['item_id', 'cat_ridge']],
                  test_c[['item_id', 'cat_ridge']])
    return True
def eval(data, model_path):
    """
    Load the trained matrices and evaluate the RMSE on the test data.

    The full prediction matrix is rebuilt as
    ``b + P_b[:, None] + Q_b[None, :] + P.Q^T``.  For every test row the
    prediction is looked up, replaced by the global bias ``b`` when it falls
    outside ``[0, 8]``, and written to ``<model_path>/result.csv`` as
    ``user_id,movie_id,pred,timestamp`` (one row per line).

    :param data: list of test data, iterated as
        ``(user_id, movie_id, rating, timestamp)`` tuples
    :param model_path: directory holding the saved matrices and id maps;
        ``result.csv`` is written there
    :return: rmse
    """
    pred_ratings, true_ratings = [], []
    Q, P, Q_b, P_b, b = utils.load_each_matrix(model_path)
    userId2idx, movieId2idx = utils.load_id2idx(model_path)

    # ``Q_b[np.newaxis:, ]`` was a slice typo that only broadcast correctly
    # by accident; spell out the intended row vector.
    complete_matrix = (b + P_b[:, np.newaxis] + Q_b[np.newaxis, :]
                       + P.dot(Q.T))

    with open(os.path.join(model_path, 'result.csv'), 'w',
              encoding='utf8') as f:
        for (user_id, movie_id, rating, timestamp) in data:
            true_ratings.append(rating)
            pred = complete_matrix[int(userId2idx[user_id]),
                                   int(movieId2idx[movie_id])]
            # Out-of-range predictions fall back to the global bias.
            if pred < 0 or pred > 8:
                pred = b
            pred_ratings.append(pred)
            # Terminate each record; without the newline every row ended up
            # on one line of the CSV.
            f.write(','.join(
                (str(user_id), str(movie_id), str(pred), str(timestamp)))
                + '\n')

    return utils.rmse(pred_ratings, true_ratings)
def test(self, data):
    """Return the RMSE of the stored predictions on ``data``.

    Args:
        data: Iterable of ``(movie, user, rating)`` triples.  Ids are shifted
            by one before indexing ``self.predictions``.

    Triples whose shifted ids fall outside ``[0, n_movies)`` or
    ``[0, n_users)`` are ignored.  The lower bound matters because an id of 0
    would otherwise become index -1 and silently read the last row or column.
    """
    predicted = []
    real = []
    for movie, user, rating in data:
        movie_idx, user_idx = movie - 1, user - 1
        if 0 <= movie_idx < self.n_movies and 0 <= user_idx < self.n_users:
            predicted.append(self.predictions[movie_idx, user_idx])
            real.append(rating)
    return rmse(real, predicted)
def calculateMetrics(time_range,model,ovitrap_eggs_i):
    """Return the RMSE between observed ovitrap eggs and modelled weekly oviposition.

    For every ``t`` in ``time_range`` the OVIPOSITION columns of ``model.Y``
    at the sample closest to ``t - 7`` are subtracted from those at the
    sample closest to ``t``.  The difference is reshaped to ``(m, n)``,
    summed over axis 0 and divided by ``BS_a * vBS_d``.  Only the first
    column is compared, and only where ``ovitrap_eggs_i != [None]``.
    """
    params = model.parameters
    BS_a, vBS_d = params.BS_a, params.vBS_d
    m, n = params.m, params.n
    OVIPOSITION = params.OVIPOSITION
    Y = model.Y

    def nearest_index(t):
        # Index of the time sample closest to ``t``.
        return (np.abs(time_range - t)).argmin()

    weekly = []
    for t in time_range:
        delta = (Y[nearest_index(t), OVIPOSITION]
                 - Y[nearest_index(t - 7), OVIPOSITION])
        weekly.append(delta.reshape(m, n).sum(axis=0))
    lwO = np.array(weekly) / (BS_a * vBS_d)

    # If there are multiple containers, assume the first one is the ovitrap
    # and the rest are wild containers.
    lwO = lwO[:, 0]

    observed = ovitrap_eggs_i != [None]
    return utils.rmse(ovitrap_eggs_i[observed], lwO[observed])
def test_step(self, batch, batch_nb):
    """Compute and log the forward-model RMSE for one test batch.

    The predictions and targets are also passed to
    ``nngraph.save_integral_emiss_point``.

    Args:
        batch: ``(x, y, uids)`` triple.
        batch_nb: Batch index (unused).

    Returns:
        The RMSE between the prediction and ``y``.
    """
    inputs, targets, _uids = batch
    predictions = self(inputs)
    test_loss = rmse(predictions, targets)
    self.log("forward/test/loss", test_loss, prog_bar=True)
    # NOTE(review): the test step writes to the *val* points file, the same
    # path used by validation_step - confirm this is intended.
    nngraph.save_integral_emiss_point(
        predictions,
        targets,
        "/data-new/alok/laser/forwards_val_points.txt",
        all_points=True,
    )
    return test_loss
def alternating_steepest_descent(z0, rank, mask, max_iter, norm_tol, verbose=False):
    """Low-rank matrix completion by alternating steepest descent (ASD).

    The factors ``x`` and ``y`` start from the rank-truncated SVD of the
    observed entries ``mask * z0``.  Each iteration takes one exact
    line-search gradient step in ``x`` and then one in ``y`` on the masked
    residual.

    Args:
        z0: Data matrix; only entries where ``mask`` is non-zero are fitted.
        rank: Target rank of the factorisation.
        mask: Array multiplied element-wise with ``z0`` to select entries.
        max_iter: Maximum number of iterations.
        norm_tol: Stop once ``norm(residual) / norm(mask * z0)`` drops below
            this value.
        verbose: Print the residual per iteration and a summary at the end.

    Returns:
        A ``Result`` namedtuple with fields ``algorithm``, ``matrix``,
        ``time``, ``residual``, ``num_iterations`` and ``rmse``.
    """
    begin = time.time()

    # Initialize
    observed = mask * z0
    U, s, V = np.linalg.svd(observed, full_matrices=False)
    s[rank:] = 0
    x = (U @ np.diag(s))[:, :rank]
    y = V[:rank, :]
    diff = mask * (z0 - x @ y)
    norm_z0 = norm(observed)

    # Defined up front so ``max_iter == 0`` yields a valid result instead of
    # failing on unbound names after the loop.
    residual = norm(diff) / norm_z0
    num_iterations = 0

    for num_iter in range(max_iter):
        # Exact line-search step in x.
        grad_x = -diff @ y.T
        delta_xy = mask * (grad_x @ y)
        tx = norm(grad_x)**2 / norm(delta_xy)**2
        x = x - tx * grad_x
        diff = diff + tx * delta_xy

        # Exact line-search step in y, using the updated x.
        grad_y = -x.T @ diff
        delta_xy = mask * (x @ grad_y)
        ty = norm(grad_y)**2 / norm(delta_xy)**2
        y = y - ty * grad_y
        diff = diff + ty * delta_xy

        num_iterations = num_iter + 1
        residual = norm(diff) / norm_z0
        if verbose:
            print(num_iter, residual)
        if residual < norm_tol:
            break

    xy = x @ y
    asd_time = time.time() - begin
    rmse = utils.rmse(z0, xy, mask)

    Result = namedtuple("Result", ["algorithm", "matrix", "time", "residual",
                                   "num_iterations", "rmse"])
    result = Result(algorithm="ASD", matrix=xy, time=asd_time,
                    residual=residual, num_iterations=num_iterations,
                    rmse=rmse)
    if verbose:
        print("Algoritmo: ASD")
        print("Tiempo:", asd_time)
        # Report the same count that is returned in the result.
        print("Iteraciones:", num_iterations)
    return result
def _calc_msve(self):
    """Record the error between the current value estimates and the true values.

    A value estimate is computed for every state yielded by the environment,
    from the state's feature vector and ``self.theta``.  The estimates are
    compared with ``self._true_values`` via ``utils.rmse`` and the result is
    appended to the ``msve`` list.
    """
    estimates = [
        utils.state_value(self._features.vector(state), self.theta)
        for state in self._env.state_iterator()
    ]
    error = utils.rmse(estimates, self._true_values)
    self.msve.append(error)
def loss_function(batch_x, batch_y):
    """Return the de-normalised RMSE plus the model's regularisation losses.

    The model output and the targets are both mapped through ``denorm`` with
    the module-level ``_min`` / ``_max`` before the RMSE is taken.

    Args:
        batch_x: Model inputs.
        batch_y: Normalised targets.

    Returns:
        ``rmse(denorm(pred), denorm(batch_y))`` plus the sum of
        ``model.losses`` when the model has any.
    """
    predictions = model(batch_x, training=True)
    denorm_pred = denorm(predictions, _min, _max)
    denorm_true = denorm(batch_y, _min, _max)
    data_loss = rmse(denorm_pred, denorm_true)

    # tf.add_n raises ValueError on an empty list, which happens for models
    # with no regularisers.  Only add the penalty when there is one.
    reg_losses = model.losses
    if reg_losses:
        return data_loss + tf.add_n(reg_losses)
    return data_loss
def runTFFM(train_X, train_y, test_X, test_y, test_X2, params):
    """Fit a TFFM regressor for ``rounds`` rounds and predict on both test sets.

    ``rounds`` and ``iters`` are read from module scope.  After every round
    the RMSE on ``(test_X, test_y)`` is reported through ``print_step``.

    Returns:
        ``(pred_test_y, pred_test_y2)``: predictions for ``test_X`` from the
        last round and predictions for ``test_X2``.
    """
    model = TFFMRegressor(**params)
    print_step('Fit TFFM')
    for round_no in range(1, rounds + 1):
        model.fit(train_X, train_y.values, n_epochs=iters)
        pred_test_y = model.predict(test_X)
        message = 'Iteration {}/{} -- RMSE: {}'.format(
            round_no, rounds, rmse(pred_test_y, test_y))
        print_step(message)
    print_step('TFFM Predict 2/2')
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2
def validation_step(self, batch, batch_nb):
    """Compute and log the forward-model RMSE for one validation batch.

    Once ``current_epoch`` exceeds ``forward_num_epochs - 5`` the
    predictions and targets are also passed to
    ``nngraph.save_integral_emiss_point``.

    Args:
        batch: ``(x, y, uids)`` triple.
        batch_nb: Batch index (unused).

    Returns:
        The RMSE between the prediction and ``y``.
    """
    inputs, targets, _uids = batch
    predictions = self(inputs)
    val_loss = rmse(predictions, targets)
    # The drawn value is never used; the call is kept so the global NumPy
    # RNG advances exactly as before.
    np.random.uniform()
    self.log("forward/val/loss", val_loss, prog_bar=True)

    if self.current_epoch > self.config["forward_num_epochs"] - 5:
        nngraph.save_integral_emiss_point(
            predictions,
            targets,
            "/data-new/alok/laser/forwards_val_points.txt",
            all_points=True,
        )
    return val_loss
def runFM(train_X, train_y, test_X, test_y, test_X2, params):
    """Fit an FM_FTRL model for ``params['rounds']`` rounds and predict.

    Args:
        train_X, train_y: Training features and targets.
        test_X, test_y: Evaluation set scored after every round.
        test_X2: Second set to predict once training is done.
        params: Keyword arguments for ``FM_FTRL`` plus a ``'rounds'`` entry.
            The dict is not modified.

    Returns:
        ``(pred_test_y, pred_test_y2)``: predictions for ``test_X`` from the
        last round and predictions for ``test_X2``.
    """
    # Work on a copy.  Popping 'rounds' from the caller's dict made every
    # later call with the same dict fail with KeyError.
    fm_params = dict(params)
    fm_params['D'] = train_X.shape[1]
    rounds = fm_params.pop('rounds')

    model = FM_FTRL(**fm_params)
    print_step('Fit FM')
    for i in range(rounds):
        # reset=False continues training the same model each round.
        model.fit(train_X, train_y, reset=False)
        pred_test_y = model.predict(test_X)
        print_step('Iteration {}/{} -- RMSE: {}'.format(
            i + 1, rounds, rmse(pred_test_y, test_y)))
    print_step('FM Predict 2/2')
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2
def training_step(self, batch, batch_nb):
    """Compute and log the forward-model RMSE for one training batch.

    In the epoch equal to ``forward_num_epochs - 5`` the predictions and
    targets are also passed to ``nngraph.save_integral_emiss_point``.

    Args:
        batch: ``(x, y, uids)`` triple.
        batch_nb: Batch index (unused).

    Returns:
        The RMSE loss to back-propagate.
    """
    inputs, targets, _uids = batch
    predictions = self(inputs)
    train_loss = rmse(predictions, targets)

    dump_epoch = self.config["forward_num_epochs"] - 5
    if self.current_epoch == dump_epoch:
        nngraph.save_integral_emiss_point(
            predictions,
            targets,
            "/data-new/alok/laser/forwards_train_points.txt",
            all_points=True,
        )

    self.log("forward/train/loss", train_loss, prog_bar=True)
    return train_loss
def evaluate(model, loader):
    """Run ``model`` over ``loader`` and return ``(rmse, mae, sd, pearson)``.

    Each batch is a 7-tuple.  The second output of the model is taken as the
    prediction.  Predictions and targets are flattened to 1-D before scoring.
    """
    model.eval()

    collected_pred, collected_true = [], []
    for a2a_g, b2a_g, b2b_gl, feats, types, counts, batch_y in loader:
        _, batch_pred = model(a2a_g, b2a_g, b2b_gl, types, counts)
        collected_pred.extend(batch_pred.tolist())
        collected_true.extend(batch_y.tolist())

    y_hat = np.array(collected_pred).reshape(-1,)
    y = np.array(collected_true).reshape(-1,)

    metrics = (rmse(y, y_hat), mae(y, y_hat), sd(y, y_hat), pearson(y, y_hat))
    return metrics
def forecasting(self):
    """Rolling-window forecast evaluation over ``self.series_norm``.

    The first column is the target and the remaining columns are candidate
    features.  A window of ``self.window_size`` rows is slid forward by
    ``self.step_size``.  In each position a ``TSModel`` (``'ar'`` or
    ``'var'``) is fitted on the window and asked to forecast the next
    ``step_size`` rows.  Fit and forecast RMSEs are collected and their means
    are printed.

    Raises:
        ValueError: If ``self.model_type`` is neither ``'ar'`` nor ``'var'``.
    """
    # Single-argument prints: same output on Python 2, valid on Python 3.
    print('Data len %.0f' % (len(self.series_norm)))
    print('Forecasting...%s %s' % (self.model_type, self.feature_type))

    # ``.ix`` was removed from pandas; these selections are positional.
    y = self.series_norm.iloc[:, 0]
    x = self.series_norm.iloc[:, 1:]
    errsfit = []
    errsfor = []
    start = 0
    end = self.window_size
    while end <= len(self.series_norm) - self.step_size:
        ytrain = y[start:end]
        ytest = y[end:end + self.step_size]
        if self.model_type == 'ar':
            fmodel = TSModel(endog=ytrain, method=self.model_type,
                             steps=self.step_size, isnpa=False,
                             verbose=False)
        elif self.model_type == 'var':
            xtrain = x[start:end]
            # Drop features that are constant over this window, comparing
            # every row against the first row of the window.
            xtrain = xtrain.loc[:, (xtrain != xtrain.iloc[0]).any()]
            fmodel = TSModel(endog=ytrain, feature=xtrain,
                             method=self.model_type, steps=self.step_size,
                             isnpa=False, verbose=False)
        else:
            # This used to fall through and fail obscurely on a list.
            raise ValueError(
                'unsupported model_type: %r' % (self.model_type,))

        fmodel.fit_forecast()
        if fmodel.result is not None:
            efit = rmse(ytrain, fmodel.result['fit'])
            efor = rmse(ytest, fmodel.result['forecast'])
            errsfit.append(efit)
            errsfor.append(efor)
        start += self.step_size
        end += self.step_size

    print(' '.join(str(value) for value in (
        self.model_type, self.feature_type, self.companies,
        len(errsfit), len(errsfor), np.mean(errsfit), np.mean(errsfor))))
def GetScores(actual, pred):
    """Return the RMS error of each model's predictions against ``actual``.

    NaN predictions are excluded from the comparison.  A 1-D ``pred`` gives a
    single score.  Otherwise one score is computed per row of ``pred`` and
    the scores are returned as an array.
    """
    if len(pred.shape) == 1:
        finite = ~np.isnan(pred)  # don't account for NaNs
        return utils.rmse(actual[finite], pred[finite])

    def _row_score(r):
        finite = ~np.isnan(pred[r])  # don't account for NaNs
        return utils.rmse(actual[finite], pred[r, finite])

    return np.array([_row_score(r) for r in range(pred.shape[0])])
def main():
    """Fit a simple regression of brain weight on head size and evaluate it.

    Reads ``headbrain.csv``, estimates the coefficients, runs a sample
    prediction, calls the RMSE and R^2 helpers, and plots the regression
    line.
    """
    # Read the data set.
    frame = pd.read_csv('headbrain.csv')
    print("data.shape = {}".format(frame.shape))

    # Predictor and response as NumPy arrays.
    head_size = frame['Head Size(cm^3)'].values
    brain_weight = frame['Brain Weight(grams)'].values
    print(head_size.shape)

    coefficients = estimate_coefficients(head_size, brain_weight)

    # Smoke test of the prediction helper.
    predict(3000, coefficients)

    # Evaluate the model; the return values are not used here.
    rmse(head_size, brain_weight, coefficients)
    r2_score(head_size, brain_weight, coefficients)

    plot_regression_line(head_size, brain_weight, coefficients,
                         xlabel='Head Size in cm3',
                         ylabel='Brain Weight in grams')
def test():
    """Evaluate the module-level ``net`` on ``test_loader`` and print mean MAE/RMSE.

    The metrics are averaged over batches, not over samples.  Nothing is
    returned.
    """
    print("------------test------------")
    test_mae = 0
    test_rmse = 0
    num_batches = 0
    net.eval()
    with torch.no_grad():
        for data in test_loader:
            x, e, y = data
            output = net(x, adj, e)
            test_mae += utils.mae(output, y)
            test_rmse += utils.rmse(output, y)
            num_batches += 1

    if num_batches == 0:
        # An empty loader used to fail on the unbound loop index.
        print("no test batches; metrics undefined")
        return

    # NOTE(review): "{:2f}" is width 2 with default precision; "{:.2f}" may
    # have been intended.  Left as-is to keep the output unchanged.
    print("mae:{:2f} , rmse:{:2f}".format(test_mae / num_batches,
                                          test_rmse / num_batches))
def Error(truth, pred):
    """Return the overall RMS error and the per-column RMSE list.

    ``scores`` holds ``utils.rmse`` for each column.  The overall score is
    the square root of the mean squared difference over all
    ``(row, column)`` entries; its first element is rounded to six decimals.

    Returns:
        ``(score, scores)``.
    """
    n_rows, n_cols = truth.shape[0], truth.shape[1]

    scores = [
        utils.rmse(y_obs=truth[:, i], y_hat=pred[:, i])
        for i in range(n_cols)
    ]

    # Accumulate entry by entry, in row-major order.
    squared_error_sum = 0
    for row in range(n_rows):
        for col in range(n_cols):
            squared_error_sum += (truth[row, col] - pred[row, col])**2

    score = np.sqrt(squared_error_sum / (n_rows * n_cols))
    # Indexing with [0] implies each entry is itself array-like.
    score = np.round(score[0], 6)
    return score, scores
def distance(ovitrap_eggs_i,lwO):
    """Distance between observed ovitrap eggs and the modelled series ``lwO``.

    The metric is chosen by ``sys.argv[2]`` and compared against the
    module-level constants ``RMSE``, ``D``, ``FRECHET`` and ``DTW``.  For
    RMSE and D only entries where ``ovitrap_eggs_i != [None]`` are compared.
    For Frechet and DTW both series are turned into ``(time, value)`` curves
    using the module-level ``time_range``.  An unknown metric prints a
    message and quits.
    """
    metric = sys.argv[2]
    if metric == RMSE:
        return utils.rmse(ovitrap_eggs_i[ovitrap_eggs_i != [None]],
                          lwO[ovitrap_eggs_i != [None]])
    elif metric == D:
        return utils.D(ovitrap_eggs_i[ovitrap_eggs_i != [None]],
                       lwO[ovitrap_eggs_i != [None]])
    elif metric in [FRECHET, DTW]:
        # ``np.float`` was removed from NumPy; the builtin ``float`` is the
        # same dtype.  The conversion turns None into np.nan.
        ovitrap_eggs_i = np.array(ovitrap_eggs_i, dtype=float)
        valid_ovi_idx = ~np.isnan(ovitrap_eggs_i)
        reversed_valid_ovi_idx = valid_ovi_idx[::-1]
        # First and last positions holding a valid observation.
        first = np.argmax(valid_ovi_idx)
        last = (len(reversed_valid_ovi_idx)
                - np.argmax(reversed_valid_ovi_idx) - 1)
        # NOTE(review): range(first, last) leaves out index ``last`` itself -
        # confirm whether the final valid point should be included.
        x = np.array([[time_range[idx], lwO[idx]]
                      for idx in range(first, last)])
        y = np.array([[time_range[idx], ovitrap_eggs_i[idx]]
                      for idx, isValid in enumerate(valid_ovi_idx)
                      if isValid])
        if metric == FRECHET:
            return sm.frechet_dist(x, y)
        if metric == DTW:
            return sm.dtw(x, y)[0]
    else:
        print('Metric %s not found' % metric)
        quit()
def predict_step(self, batch, _batch_nb):
    """Predict laser parameters for a batch and collect everything in a dict.

    ``batch`` is either a 1-tuple ``(emiss,)`` or a triple
    ``(emiss, laser_params, uids)``.  In the second case the true parameters
    and uids are included in the output.  When a forward model is attached,
    the re-predicted emiss and its RMSE against the input are added.

    Returns:
        Dict with keys ``params``, ``pred_emiss``, ``pred_loss`` and
        ``true_emiss``, plus ``true_params`` and ``uids`` for 3-tuple batches.
    """
    out = dict(params=None, pred_emiss=None, pred_loss=None)

    # Step data carries no corresponding laser params.
    try:
        (y, ) = batch  # y is emiss
    except ValueError:
        y, x, uids = batch  # y is emiss, x is laser_params
        out["true_params"] = x
        out["uids"] = uids
    out["true_emiss"] = y

    x_pred = self(y)
    out["params"] = x_pred

    if self.forward_model is None:
        return out

    y_pred = self.forward_model(x_pred)
    out["pred_emiss"] = y_pred
    out["pred_loss"] = rmse(y_pred, y)
    return out
def train(self, data):
    """
    Train the internal models to predict the behavior of each sensor.

    It is important to ensure that the training data is taken from
    models that are correct.

    One SVR is fitted per column of ``data``, with ``utils.split_xy``
    providing the features and target for that sensor.  When
    ``self.cross_validate`` is set, a grid search over ``self.cvparams``
    picks the hyper-parameters; otherwise ``self.defaults`` is used.  The
    fitted model is stored in ``self.models[sensor]`` and its training RMSE
    is printed.

    All prints use the single-argument form so the method runs unchanged on
    Python 2 and Python 3.
    """
    print("Training {0} sensors with {1} rows".format(
        data.shape[1], data.shape[0]))

    for sensor in range(data.shape[1]):
        print("Training model for sensor {0}".format(sensor))
        X, Y = utils.split_xy(data, sensor)

        if self.cross_validate:
            svr = SVR()
            clf = GridSearchCV(svr, self.cvparams, verbose=3)
            model = clf.fit(X, Y)
            print("Best model params for sensor {0}:".format(sensor))
            print(model.best_params_)
        else:
            C = self.defaults["C"]
            kernel = self.defaults["kernel"]
            gamma = self.defaults["gamma"]
            clf = SVR(C=C, kernel=kernel, gamma=gamma)
            model = clf.fit(X, Y)

        # Check the training RMSE to ensure we are on track
        print("Testing <sensor={0}> model with {1} training rows".format(
            sensor, data.shape[0]))
        Yhat = model.predict(X)
        train_rmse = utils.rmse(Yhat, Y)
        self.models[sensor] = model
        print("RMSE for <sensor={0}> on training data is {1}".format(
            sensor, train_rmse))
def _exp(num_episodes, estimators, data_name, verbose=0):
    """Repeat ``single_run`` on one dataset and aggregate bias and RMSE.

    Each episode stores the reward estimates and, per estimator, the RMSE
    between the mean estimate and the true reward.  Both buffers are reduced
    with ``aggregator``.  One extra run supplies the reference estimates for
    the bias.

    Returns:
        ``(dict_bias, dict_rmse)`` keyed by estimator name.
    """
    # Loop for a number of experiments on the single dataset
    rmse_buffer, reward_est_buffer = [], []
    for _episode in range(num_episodes):
        print("=== {} ===".format(data_name))
        reward_est, reward_true = single_run(estimators=estimators,
                                             data_name=data_name)
        episode_rmse = {}
        for name, estimates in reward_est.items():
            episode_rmse[name] = rmse(a=np.mean(estimates), b=reward_true)
        rmse_buffer.append(episode_rmse)
        reward_est_buffer.append(reward_est)

    # Compute overall Bias and RMSE: aggregate the results over all epochs.
    _bias_ = aggregator(buffer=reward_est_buffer)
    _rmse_ = aggregator(buffer=rmse_buffer)

    # Run one more experiment to compute the bias.
    reward_est, _unused_true = single_run(estimators=estimators,
                                          data_name=data_name)

    dict_bias = {}
    for (name, total), (_name, reference) in zip(_bias_.items(),
                                                 reward_est.items()):
        dict_bias[name] = np.mean((total / num_episodes) - reference)

    dict_rmse = {}
    for name, total in _rmse_.items():
        dict_rmse[name] = total / num_episodes

    if verbose:
        for (key, value_bias), (_name, value_rmse) in zip(
                dict_bias.items(), dict_rmse.items()):
            print("[{}: {}] RMSE over {}-run: {}".format(
                data_name, key, num_episodes, value_rmse))
            print("[{}: {}] Bias over {}-run: {}".format(
                data_name, key, num_episodes, value_bias))
    return dict_bias, dict_rmse
# Score every recommender variant on each row of the test frame.
for row in test_df.itertuples():
    # itertuples() puts the index at row[0]; fields 1-3 are read as user id,
    # item id and rating, with both ids shifted down by one.
    user, item, actual = row[1]-1, row[2]-1, row[3]
    predictions_baseline.append(pre.predict_baseline(user, item))
    predictions_itemCF.append(pre.predict_itemCF(user, item))
    predictions_userCF.append(pre.predict_userCF(user, item))
    predictions_itemCF_baseline.append(pre.predict_itemCF_baseline(user, item))
    predictions_userCF_baseline.append(pre.predict_userCF_baseline(user, item))
    predictions_itemCF_bias.append(pre.predict_itemCF_bias(user, item))
    # The trailing integers (20 / 30) are passed through to the top-k and
    # normalised predictors - presumably neighbourhood sizes; TODO confirm.
    predictions_topkCF_item.append(pre.predict_topkCF_item(user, item, 20))
    predictions_topkCF_user.append(pre.predict_topkCF_user(user, item, 30))
    predictions_normCF_item.append(pre.predict_normCF_item(user, item, 20))
    predictions_normCF_user.append(pre.predict_normCF_user(user, item, 30))
    # Blend call: 20, 30 and 0.7 - presumably the item/user sizes and a
    # mixing weight; TODO confirm against predict_blend.
    predictions_blend.append(pre.predict_blend(user, item, 20, 30, 0.7))
    targets.append(actual)
# One RMSE per method is appended to its running list.
# NOTE(review): the prediction/target lists are not cleared here, so if this
# runs once per fold they must be reset outside this fragment - confirm.
rmse_baseline.append(utils.rmse(np.array(predictions_baseline), np.array(targets)))
rmse_itemCF.append(utils.rmse(np.array(predictions_itemCF), np.array(targets)))
rmse_userCF.append(utils.rmse(np.array(predictions_userCF), np.array(targets)))
rmse_itemCF_baseline.append(utils.rmse(np.array(predictions_itemCF_baseline), np.array(targets)))
rmse_userCF_baseline.append(utils.rmse(np.array(predictions_userCF_baseline), np.array(targets)))
rmse_itemCF_bias.append(utils.rmse(np.array(predictions_itemCF_bias), np.array(targets)))
rmse_topkCF_item.append(utils.rmse(np.array(predictions_topkCF_item), np.array(targets)))
rmse_topkCF_user.append(utils.rmse(np.array(predictions_topkCF_user), np.array(targets)))
rmse_normCF_item.append(utils.rmse(np.array(predictions_normCF_item), np.array(targets)))
rmse_normCF_user.append(utils.rmse(np.array(predictions_normCF_user), np.array(targets)))
rmse_blend.append(utils.rmse(np.array(predictions_blend), np.array(targets)))
# Printed messages, in order: "testing finished", "test results",
# "RMSE of each method under cross-validation".
print('测试完成')
print('------ 测试结果 ------')
print('各方法在交叉验证下的RMSE值:')
# Mean of the collected RMSE values for each method.
print('baseline: %.4f' % np.mean(rmse_baseline))
print('itemCF: %.4f' % np.mean(rmse_itemCF))
def kfold_lightgbm(train_df, test_df, num_folds):
    """Train LightGBM with grouped K-fold CV and write OOF and test predictions.

    Folds are built with ``GroupKFold`` on a ``"<month>_<year>"`` key, so all
    rows of one calendar month stay in the same fold.  For every fold a model
    with the Tweedie objective is trained on ``demand``, saved to
    ``../output/``, used for out-of-fold predictions on the validation rows,
    and its test predictions are averaged into the submission.

    Side effects visible in this function:
      * the ``demand`` column of both ``train_df`` and ``test_df`` is
        overwritten with predictions;
      * one model file per fold, the importance plot/CSV, the OOF CSV
        (``oof_file_name``) and the submission CSV (``submission_file_name``)
        are written;
      * ``line_notify`` is called with the full RMSE and on completion.

    Args:
        train_df: Training frame; must contain ``month``, ``year``,
            ``demand``, and (after ``reset_index``) ``id`` and ``d``.
        test_df: Test frame with the same feature columns.
        num_folds: Number of ``GroupKFold`` splits.
    """
    print('Starting LightGBM. Train shape: {}'.format(train_df.shape))

    # Cross validation
    folds = GroupKFold(n_splits=num_folds)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # Group key: rows sharing month and year never straddle train/valid.
    group = train_df['month'].astype(str) + '_' + train_df['year'].astype(str)

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], groups=group)):

        # split train/valid
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'demand'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'demand'].iloc[valid_idx]

        # set data structure
        lgb_train = lgb.Dataset(train_x, label=train_y, free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x, label=valid_y, free_raw_data=False)

        # Seeds depend on the fold number, so each fold is seeded differently
        # but the same fold gets the same seed on every run.
        params = {
            # 'device' : 'gpu',
            # 'gpu_use_dp':True,
            'boosting': 'gbdt',
            'metric': ['rmse'],
            'objective': 'tweedie',
            'learning_rate': 0.05,
            'tweedie_variance_power': 1.1,
            'subsample': 0.5,
            'subsample_freq': 1,
            'num_leaves': 2**8 - 1,
            'min_data_in_leaf': 2**8 - 1,
            'feature_fraction': 0.8,
            'verbose': -1,
            'seed': int(2**n_fold),
            'bagging_seed': int(2**n_fold),
            'drop_seed': int(2**n_fold),
            'num_threads': -1
        }

        # train model
        # NOTE(review): ``early_stopping_rounds`` / ``verbose_eval`` keyword
        # arguments were dropped from ``lgb.train`` in LightGBM 4.0 in favour
        # of callbacks - confirm the pinned version.
        reg = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        reg.save_model(f'../output/lgbm_group_k_fold_21days_{n_fold}.txt')

        # save predictions
        oof_preds[valid_idx] = reg.predict(valid_x,
                                           num_iteration=reg.best_iteration)
        # Each fold contributes an equal share to the averaged test prediction.
        sub_preds += reg.predict(
            test_df[feats],
            num_iteration=reg.best_iteration) / folds.n_splits

        # save feature importances (log1p of the gain importance)
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = feats
        fold_importance_df['importance'] = np.log1p(
            reg.feature_importance(importance_type='gain',
                                   iteration=reg.best_iteration))
        fold_importance_df['fold'] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)

        print('Fold %2d RMSE : %.6f' %
              (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))
        # Drop the per-fold objects before the next fold is built.
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    # display importances
    display_importances(
        feature_importance_df,
        '../imp/lgbm_importances_group_k_fold_21days.png',
        '../imp/feature_importance_lgbm_group_k_fold_21days.csv')

    # Full RMSE score and LINE Notify
    full_rmse = rmse(train_df['demand'], oof_preds)
    line_notify('Full RMSE score %.6f' % full_rmse)

    # save out of fold prediction
    # The true ``demand`` values are overwritten from here on, so the full
    # RMSE above has to be computed first.
    train_df.loc[:, 'demand'] = oof_preds
    train_df = train_df.reset_index()
    train_df[['id', 'd', 'demand']].to_csv(oof_file_name, index=False)

    # reshape prediction for submit
    test_df.loc[:, 'demand'] = sub_preds
    test_df = test_df.reset_index()
    # NOTE(review): this second reset_index() adds an ``index`` column to the
    # submission frame - confirm it is wanted in the CSV.
    preds = test_df[['id', 'd', 'demand']].reset_index()

    # save csv
    preds.to_csv(submission_file_name, index=False)

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
# standard deviation). Randomly select 75% of the data for training.
# Return the structure of the built tree.
nclasses = np.union1d(y, y).size
n = len(y)
randind = np.arange(0, n)
np.random.shuffle(randind)
# Slice bounds must be integers; a float bound such as ``0.75 * n`` is
# rejected by current NumPy.  int() truncates the same way old NumPy did.
n_train = int(0.75 * n)
ind_train = randind[0:n_train]
ind_test = randind[n_train:n]
tree = RegressionTree(nclasses)
tree.train(x[ind_train, :], y[ind_train], SDRMIN=0.1, NMIN=3)
g, pos = tree.gerar_grafo()
utils.draw_graph(g, pos)

# b) Use the remaining 25% of the data for evaluation. Return the MAPE and
# RMSE measures.
yhat = tree.estimate(x[ind_test, :])
rmse = utils.rmse(y[ind_test], yhat)
mape = utils.mape(y[ind_test], yhat)
# Single-argument print: same output on Python 2, valid on Python 3.
print('RMSE encontrado: {:3.2f}\nMAPE encontrado: {:3.2f}'.format(rmse, mape))
# ``plt.hold(True)`` was dropped: hold was removed from Matplotlib, and
# successive plot() calls already draw on the same axes.
plt.plot(y[ind_test])
plt.plot(yhat)
plt.legend(['real','estimado'])
plt.show()

# c) Try to extract the decision rules from the built tree.
plt.title('Polinomio original')
plt.ylabel('y')
plt.xlabel('x')
# ``plt.hold(True)`` was dropped: hold was removed from Matplotlib, and
# successive plot() calls already draw on the same axes.
# ``n // 2`` keeps the slice bound an integer on Python 3 as well.
plt.plot(x[n // 2:, :], yhat)
plt.savefig('./bases/results/polinomio_estimado')
plt.clf()

# Single-argument prints: same output on Python 2, valid on Python 3.
print('Letra A')
print('Polinomio encontrado: ')
print('y = {:3.3f} + {:3.3f}x {: 3.3f}x^2\n'.format(what[0][0], what[1][0], what[2][0]))

# b) Compute the RMSE and MAPE of the fitted model on the second half of the
# data;
print('Letra B')
rmse = utils.rmse(y[n // 2:, :], yhat)
print('RMSE = ' + str(rmse) + '\n')
mape = utils.mape(y[n // 2:, :], yhat)
print('MAPE = ' + str(mape) + '\n')

# c) Estimate the model that best fits the data using all of the data.
# Report the parameters of the model found. Use the model-complexity
# determination factors to help find the model. Compute the RMSE and MAPE
# of the resulting model on the data.
print('Letra C')
MAXDEGREE = 5
# NOTE(review): this bare attribute access does nothing; ``plt.figure()``
# may have been intended - confirm before changing.
plt.Figure
plt.grid(True)
plt.plot(x, y)
plt.title('Ajuste Polinomial')
# Out-of-fold prediction buffer for the gradient-boosting model; the matching
# ``y_pred_rf`` buffer is created before this fragment.
y_pred_gb = np.zeros(y.shape)
# NOTE(review): ``KFold(n, n_folds=5)`` is the signature of the old
# ``sklearn.cross_validation.KFold``; ``sklearn.model_selection.KFold`` uses
# ``n_splits`` and ``.split(X)`` - confirm which one is imported.
for trn_idx, val_idx in KFold(X.shape[0], n_folds=5):
    # split training data
    X_trn, X_tst, y_trn, y_tst = X[trn_idx,:], X[val_idx,:], y[trn_idx], y[val_idx]

    # Initialize the famous Random Forest Regressor from scikit-learn
    clf = RandomForestRegressor(n_estimators=50, n_jobs=4, random_state=23)
    clf.fit(X_trn, y_trn)
    # Store the held-out predictions at their original positions.
    y_pred_rf[val_idx] = clf.predict(X_tst)

    # or the Gradient Boosting Regressor
    clf = GradientBoostingRegressor(n_estimators=200, max_depth=3, random_state=23)
    clf.fit(X_trn, y_trn)
    y_pred_gb[val_idx] = clf.predict(X_tst)

    # Per-fold RMSE of both models on the held-out rows.
    print(' Score RFR/GBR: %.4f, %.4f' % (rmse(y_tst, y_pred_rf[val_idx]), rmse(y_tst, y_pred_gb[val_idx])))

# save prediction result to file
# Overall RMSE over all out-of-fold predictions.
err_rf = rmse(y, y_pred_rf)
err_gb = rmse(y, y_pred_gb)
# The result key is the file name with its 'train_pp_' prefix and '.csv'
# suffix removed.
id_ = filename.replace('train_pp_','').replace('.csv','')
res[id_] = {'size':X.shape[0], 'd_th':th1, 'rf':err_rf, 'gb':err_gb}
print('Total Score: %.4f, %.4f' % (err_rf, err_gb))

# Protocol -1 selects the highest pickle protocol available.
with open('training_result_TVT.pkl', 'wb') as fp:
    pickle.dump(res, fp, -1)