def best_split_lin_reg_dynamic(x, y):
    sort_i = np.argsort(x)
    n = len(y)
    xm = x[sort_i] - np.mean(x)
    ym = y[sort_i] - np.mean(y)
    # Running sums let each candidate split be scored in O(1).
    xy_sum_false = 0
    x2_sum_false = 0
    xy_sum_true = np.sum(xm * ym)
    x2_sum_true = np.sum(xm ** 2)
    node_betta = xy_sum_true / x2_sum_true
    node_betta = 1 if np.isnan(node_betta) else node_betta
    node_score = mse(ym + np.mean(y), node_betta * xm)  # parent-node baseline
    best_score = np.inf
    split_value = xm[0] + np.mean(x)
    split_ind = 0
    for i in range(1, n):
        # Move sample i from the "true" branch to the "false" branch.
        xy_sum_false += xm[i] * ym[i]
        x2_sum_false += xm[i] ** 2
        xy_sum_true -= xm[i] * ym[i]
        x2_sum_true -= xm[i] ** 2
        false_betta = xy_sum_false / x2_sum_false
        false_ratio = i / float(n)
        false_score = mse(ym[:i] + np.mean(y), false_betta * xm[:i]) if len(ym[:i]) else 0
        true_betta = xy_sum_true / x2_sum_true
        true_ratio = (n - i) / float(n)
        true_score = mse(ym[i:] + np.mean(y), true_betta * xm[i:])
        # Weighted-gain criterion, kept for reference; the split actually
        # selected below uses the smaller of the two child MSEs.
        # score = node_score - (false_ratio * false_score + true_ratio * true_score)
        score = min(false_score, true_score)
        if score < best_score:
            best_score = score
            split_value = x[sort_i[i]]
            split_ind = i
    return sort_i[:split_ind], sort_i[split_ind:], split_value, best_score
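# A minimal usage sketch for the splitter above, on synthetic data; it assumes
# the same imports the function relies on (numpy as np, and sklearn's
# mean_squared_error bound to the name mse):
import numpy as np
from sklearn.metrics import mean_squared_error as mse

x = np.random.rand(100)
y = 2.0 * x + np.random.normal(scale=0.1, size=100)
left_idx, right_idx, split_value, best_score = best_split_lin_reg_dynamic(x, y)
print(split_value, best_score)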
def _calculate_mse(self, reconstruction_tof, original_tof, test_data):
    # Drop -1 entries in test_data together with the corresponding tof values.
    test_data, reconstruction_tof, original_tof, _ = self._remove_unobserved_data(
        test_data, reconstruction_tof, original_tof
    )
    mse_recon = -1
    mse_origin = -1
    if len(reconstruction_tof) > 0 and len(original_tof) > 0:
        mse_recon = mse(test_data, reconstruction_tof)
        mse_origin = mse(test_data, original_tof)
    return mse_recon, mse_origin
def best_split_lin_reg(x_vect, y):
    node_lg = LinearRegression(n_jobs=NUM_CORES).fit(x_vect[:, np.newaxis], y)
    node_score = mse(y, node_lg.predict(x_vect[:, np.newaxis]))
    best_score = -np.inf
    best_split_value = None
    best_true_inds = None
    best_false_inds = None
    for split_value in np.unique(x_vect):
        true_inds = x_vect > split_value
        true_ratio = np.sum(true_inds) / float(len(y))
        true_score = ling_reg_score(true_inds, x_vect, y)
        false_inds = np.invert(true_inds)
        false_ratio = 1 - true_ratio
        false_score = ling_reg_score(false_inds, x_vect, y)
        # Gain over the parent node: larger is better.
        score = node_score - (true_ratio * true_score + false_ratio * false_score)
        if score > best_score:
            best_score = score
            best_split_value = split_value
            best_true_inds = true_inds
            best_false_inds = false_inds
    return best_false_inds, best_true_inds, best_split_value, best_score
def fit_and_predict(clf, rgr, X, y_clf, y_rgr, train, test, out_folder, fold):
    clf_model = clf.fit(X[train], y_clf[train])
    y_clf_true = y_clf[test]
    y_rgr_true = y_rgr[test]
    y_clf_pred = clf_model.predict(X[test])
    class_scores = np.array(precision_recall_fscore_support(y_clf_true, y_clf_pred))
    micro_f1 = f1_score(y_clf_true, y_clf_pred, average='micro')
    macro_f1 = f1_score(y_clf_true, y_clf_pred, average='macro')
    rgr_model = rgr.fit(X[train], y_rgr[train])
    y_rgr_pred = rgr_model.predict(X[test])
    general_r2 = r2_score(y_rgr_true, y_rgr_pred)
    mse_score = mse(y_rgr_true, y_rgr_pred)
    mrse_score = mrse(y_rgr_true, y_rgr_pred)
    clf_pred_fpath = os.path.join(out_folder, '%d-clf.pred' % fold)
    clf_true_fpath = os.path.join(out_folder, '%d-clf.true' % fold)
    rgr_pred_fpath = os.path.join(out_folder, '%d-rgr.pred' % fold)
    rgr_true_fpath = os.path.join(out_folder, '%d-rgr.true' % fold)
    np.savetxt(clf_pred_fpath, y_clf_pred, fmt="%d")
    np.savetxt(clf_true_fpath, y_clf_true, fmt="%d")
    np.savetxt(rgr_pred_fpath, y_rgr_pred)
    np.savetxt(rgr_true_fpath, y_rgr_true)
    return class_scores, micro_f1, macro_f1, general_r2, mse_score, mrse_score
def random_forest(X_train, y_train, y_test, X_test, num_trees=100):
    model = RandomForestRegressor(n_estimators=num_trees, oob_score=True)
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    mean_squared_error = mse(y_test, prediction)
    r2 = model.score(X_test, y_test)
    return (mean_squared_error, r2)
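# A small usage sketch for the helper above (note its y_train, y_test, X_test
# argument order); the synthetic data and sklearn's train_test_split are
# illustrative assumptions, not part of the original snippet:
from sklearn.model_selection import train_test_split
import numpy as np

X = np.random.rand(200, 3)
y = X @ np.array([1.0, -2.0, 0.5]) + np.random.normal(scale=0.1, size=200)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
err, r2 = random_forest(X_tr, y_tr, y_te, X_te, num_trees=50)
print(err, r2)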
def eval_sts(ycat, y, name):
    """ Evaluate given STS regression-classification predictions and print results. """
    ypred = loader.sts_categorical2labels(ycat)
    pr = pearsonr(ypred, y)[0]
    print('%s Pearson: %f' % (name, pr))
    print('%s Spearman: %f' % (name, spearmanr(ypred, y)[0]))
    print('%s MSE: %f' % (name, mse(ypred, y)))
    return pr
def ling_reg_score(indices, x, y):
    # An empty leaf gets a fixed sentinel score.
    if not np.any(indices):
        return 1
    leaf_x = x[indices][:, np.newaxis]
    leaf_y = y[indices]
    lg = LinearRegression(n_jobs=NUM_CORES).fit(leaf_x, leaf_y)
    score = mse(leaf_y, lg.predict(leaf_x))
    return score
def linear_regression(model, features, target, y_test, X_test):
    model.fit(features, target)
    intercept = model.intercept_
    coef = np.hstack([intercept, model.coef_])
    prediction = model.predict(X_test)
    mean_squared_error = mse(y_test, prediction)
    r2 = model.score(X_test, y_test)
    return (mean_squared_error, r2)
def fit_select_best(X, y):
    """
    Selects the best fit of the estimators already implemented by choosing the
    model with the smallest mean square error metric for the trained values.
    """
    models = [fit(X, y) for fit in [fit_linear, fit_quadratic]]
    errors = map(lambda model: mse(y, model.predict(X)), models)
    return min(zip(models, errors), key=itemgetter(1))[0]
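# Hedged usage sketch: fit_linear and fit_quadratic are assumed to be helpers
# defined elsewhere in this module, each returning a fitted model exposing a
# .predict() method, so fit_select_best can compare them by training-set MSE:
# best_model = fit_select_best(X, y)
# print(mse(y, best_model.predict(X)))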
def regress(attributes, targets, model):
    # Split data into 'test' and 'train' for cross validation
    splits = cv.train_test_split(attributes, targets, test_size=0.2)
    X_train, X_test, y_train, y_test = splits
    model.fit(X_train, y_train)
    y_true = y_test
    y_pred = model.predict(X_test)
    print("Mean squared error = {:0.3f}".format(mse(y_true, y_pred)))
    print("R2 score = {:0.3f}".format(r2_score(y_true, y_pred)))
def linear(model, features, target, y_test, X_test):
    model.fit(features, target)
    intercept = model.intercept_
    coef = np.hstack([intercept, model.coef_])
    if X_test is not None:
        prediction = model.predict(X_test)
        mean_squared_error = mse(y_test, prediction)
        r2 = model.score(X_test, y_test)
        return mean_squared_error, r2
    else:
        return coef
def validate_k_fold(df, poly_degree, n_folds=5, trainer=train_model):
    features, prices = get_regression_ctx(df, poly_degree)
    mse_scores = []
    for train_idx, val_idx in KFold(len(df), n_folds=n_folds):
        features_train, features_val = features[train_idx], features[val_idx]
        prices_train, prices_val = prices[train_idx], prices[val_idx]
        model = trainer(features_train, prices_train)
        mse_scores.append(mse(prices_val, model.predict(features_val)))
    return pd.Series(mse_scores).mean()
def main():
    # get the processed data
    X, y = preprocess_data()
    # get the dummy clf: very important, it creates a baseline!
    dummy_clf = get_dummy_clf()
    dummy_clf.fit(X, y)
    y_hat = dummy_clf.predict(X)
    # Get the baseline predictions for x and y
    print("Dummy MSE x", mse(y[:, 0], y_hat[:, 0]))
    print("Dummy MSE y", mse(y[:, 1], y_hat[:, 1]))
    # create 5 different cross-validation folds
    ss = ShuffleSplit(len(y), n_iter=5, random_state=0)
    scores_x = []
    scores_y = []
    for i, (train_index, test_index) in enumerate(ss):
        # Choose a classifier
        # clf = get_linear_clf()
        clf = get_nn_clf()
        clf.fit(X[train_index], y[train_index])
        y_hat = clf.predict(X[test_index])
        # Save the score for each fold
        score_x = mse(y[test_index, 0], y_hat[:, 0])
        score_y = mse(y[test_index, 1], y_hat[:, 1])
        # You can print the coefficients/intercept for the linear classifier
        # print(clf.steps[-1][1].coef_, clf.steps[-1][1].intercept_)
        scores_x.append(score_x)
        scores_y.append(score_y)
    print(scores_x, scores_y)
    print("MSE CV x", np.array(scores_x).mean())
    print("MSE CV y", np.array(scores_y).mean())
def calculate_mse(self, region, data, month, year):
    if self.spectrum_selection != 0:
        reconstruction_tof, original_tof = self.reconstruct_tof_from_spectrum(
            region, self.addition_technique, self.spectrum_selection
        )
    else:
        reconstruction_tof, original_tof = self.reconstruct_tof_from_spectrum(
            region, self.addition_technique
        )
    test_data = trajectories_full_dates_periodic(
        data, month, year, self.length_of_periodicity,
        self.window_interval, self.minute_interval
    )
    # Keep every fifth sample.
    reconstruction_tof = [i for j, i in enumerate(reconstruction_tof) if j % 5 == 0]
    original_tof = [i for j, i in enumerate(original_tof) if j % 5 == 0]
    test_data = [i for j, i in enumerate(test_data) if j % 5 == 0]
    # Drop -1 entries in test_data together with the corresponding tof values.
    test_data, reconstruction_tof, original_tof, _ = self._remove_unobserved_data(
        test_data, reconstruction_tof, original_tof
    )
    mse_recon = -1
    mse_origin = -1
    if len(reconstruction_tof) > 0 and len(original_tof) > 0:
        mse_recon = mse(test_data, reconstruction_tof)
        mse_origin = mse(test_data, original_tof)
    rospy.loginfo("Calculated MSE for original tof Region %s: %.2f" % (region, mse_origin))
    rospy.loginfo("Calculated MSE for reconstruction tof Region %s: %.2f" % (region, mse_recon))
    return mse_recon, mse_origin
def evaluate(model, seed=1234, evaltest=False):
    """ Run experiment """
    print('Preparing data...')
    train, dev, test, scores = load_data()
    train[0], train[1], scores[0] = shuffle(train[0], train[1], scores[0], random_state=seed)

    print('Computing training skipthoughts...')
    trainA = skipthoughts.encode(model, train[0], verbose=False, use_eos=True)
    trainB = skipthoughts.encode(model, train[1], verbose=False, use_eos=True)

    print('Computing development skipthoughts...')
    devA = skipthoughts.encode(model, dev[0], verbose=False, use_eos=True)
    devB = skipthoughts.encode(model, dev[1], verbose=False, use_eos=True)

    print('Computing feature combinations...')
    trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
    devF = np.c_[np.abs(devA - devB), devA * devB]

    print('Encoding labels...')
    trainY = encode_labels(scores[0])
    devY = encode_labels(scores[1])

    print('Compiling model...')
    lrmodel = prepare_model(ninputs=trainF.shape[1])

    print('Training...')
    bestlrmodel = train_model(lrmodel, trainF, trainY, devF, devY, scores[1])

    if evaltest:
        print('Computing test skipthoughts...')
        testA = skipthoughts.encode(model, test[0], verbose=False, use_eos=True)
        testB = skipthoughts.encode(model, test[1], verbose=False, use_eos=True)

        print('Computing feature combinations...')
        testF = np.c_[np.abs(testA - testB), testA * testB]

        print('Evaluating...')
        # Expected score is the probability-weighted average over labels 1..5.
        r = np.arange(1, 6)
        yhat = np.dot(bestlrmodel.predict_proba(testF, verbose=2), r)
        pr = pearsonr(yhat, scores[2])[0]
        sr = spearmanr(yhat, scores[2])[0]
        se = mse(yhat, scores[2])
        print('Test Pearson: ' + str(pr))
        print('Test Spearman: ' + str(sr))
        print('Test MSE: ' + str(se))
        return yhat
def gradient(X_train, y_train, y_test, X_test, file_loc, target):
    '''
    Runs the grid search function to pick the best parameters for each
    gradient boosted model, depending on the target variable we are
    trying to predict.
    '''
    grid = grid_search(file_loc, target)
    best_params = grid.best_params_
    learn_rate = best_params['learning_rate']
    n_estimators = best_params['n_estimators']
    max_feat = best_params['max_features']
    model = GradientBoostingRegressor(learning_rate=learn_rate,
                                      n_estimators=n_estimators,
                                      max_features=max_feat)
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    mean_squared_error = mse(y_test, prediction)
    r2 = model.score(X_test, y_test)
    return (mean_squared_error, r2)
def run_grid_search(m, parameters, params, name, Xtrain, Ytrain, Xtest, Ytest):
    print('=' * 80)
    print("Training %s Model" % name)
    print('=' * 80)
    t0 = time()
    clf = RandomizedSearchCV(m, parameters, cv=3, n_jobs=4, verbose=3, error_score=0)
    clf.fit(Xtrain, Ytrain)
    Yhat = clf.predict(Xtest)
    print("\tDone in %1.2f seconds" % float(time() - t0))
    print("\tScore: %1.2f\n" % mse(Yhat, Ytest))
    print("Best Parameters" + str(clf.best_params_))
    print("Writing Solution")
    submit = pd.DataFrame(data={'id': ids, 'quality': Yhat})
    submit.to_csv('./submissions/' + name + '.csv', index=False)
def eval_sts(ycat, y, name, quiet=False):
    """ Evaluate given STS regression-classification predictions and print results. """
    if ycat.ndim == 1:
        ypred = ycat
    else:
        ypred = loader.sts_categorical2labels(ycat)
    if y.ndim == 1:
        ygold = y
    else:
        ygold = loader.sts_categorical2labels(y)
    pr = pearsonr(ypred, ygold)[0]
    if not quiet:
        print('%s Pearson: %f' % (name, pr))
        print('%s Spearman: %f' % (name, spearmanr(ypred, ygold)[0]))
        print('%s MSE: %f' % (name, mse(ypred, ygold)))
    return pr
def _frame_indexing(image_idx, prob, name):
    image_matches = {}
    if image_idx == -1:
        frame_prob = prob
        image_match = image_matches[name] = {}
    else:
        frame_prob = _map_frame_prob(vd_df['prob'][image_idx])
        image_match = image_matches[vd_df['frame'][image_idx]] = {}
    # Score every other frame against the reference frame by probability MSE.
    for i in range(0, len(vd_df) - 1):
        mse_prob = mse(frame_prob, _map_frame_prob(vd_df['prob'][i + 1]))
        image_match[mse_prob] = {}
        image_match[mse_prob]['img'] = vd_df['frame'][i + 1]
    return image_matches
def prediction_accuracy(self, region, data, month, year, percentile=0.1, plot=False):
    if self.spectrum_selection != 0:
        reconstruction_tof, original_tof = self.reconstruct_tof_from_spectrum(
            region, self.addition_technique, self.spectrum_selection
        )
    else:
        reconstruction_tof, original_tof = self.reconstruct_tof_from_spectrum(
            region, self.addition_technique
            # region, False, 26
        )
    test_data = trajectories_full_dates_periodic(
        data, month, year, self.length_of_periodicity,
        self.window_interval, self.minute_interval
    )
    original_predict = self._get_prediction(original_tof, test_data, percentile)
    reconstr_predict = self._get_prediction(reconstruction_tof, test_data, percentile)
    _, clean_ori_pred, clean_recon_pred, indices = self._remove_unobserved_data(
        test_data, reconstr_predict, original_predict
    )
    if len(clean_ori_pred) and len(clean_recon_pred):
        mse_predict = mse(clean_ori_pred, clean_recon_pred)
    else:
        mse_predict = -1
    rospy.loginfo(
        "Calculated MSE for prediction between original and reconstruction for Region %s: %.2f"
        % (region, mse_predict)
    )
    if plot:
        # Mark unobserved points before plotting.
        for index in indices:
            original_predict[index] = -1
            reconstr_predict[index] = -1
        x = np.linspace(0, len(test_data), len(test_data))
        xticks = time_ticks(self.minute_interval, self.window_interval, self.periodic_type)
        plt.plot(x, original_predict, "-o", label="Prediction Original TOF")
        plt.plot(x, reconstr_predict, "-^", label="Prediction Reconstruction TOF")
        plt.title("Prediction for Region %s" % region)
        plt.xticks(x, xticks, rotation="vertical")
        plt.xlabel("One Week Period with %d minutes interval and %d window time"
                   % (self.minute_interval, self.window_interval))
        plt.ylabel("Prediction (1=Anomalous, 0=Normal, -1=Unobserved)")
        plt.ylim(ymin=-2, ymax=2)
        plt.legend()
        plt.show()
def on_epoch_end(self, epoch, logs=None):
    logs = logs or {}
    self.train_losses.append(mse(self.y_train, self.model.predict(self.X_train)))
    self.val_losses.append(logs.get('val_loss'))
    if self.score_func == 'accuracy':
        true_train = np_utils.probas_to_classes(self.y_train)
        pred_train = np_utils.probas_to_classes(self.model.predict(self.X_train))
        self.add_train_scores.append(accuracy_score(true_train, pred_train))
        true_test = np_utils.probas_to_classes(self.y_test)
        pred_test = np_utils.probas_to_classes(self.model.predict(self.X_test))
        val_score = accuracy_score(true_test, pred_test)
        self.add_val_scores.append(val_score)
    elif self.score_func == 'r2_score':
        val_score = r2_score(self.y_test, self.model.predict(self.X_test))
        self.add_val_scores.append(val_score)
        self.add_train_scores.append(r2_score(self.y_train, self.model.predict(self.X_train)))
    self.best_score = max(self.best_score, val_score)
    self.printCurrentStage(epoch)
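# A minimal sketch of how a callback exposing on_epoch_end like the one above
# might be wired into training; the class name LossHistory and its constructor
# signature are assumptions, not part of the original snippet:
# history_cb = LossHistory(X_train, y_train, X_test, y_test, score_func='r2_score')
# model.fit(X_train, y_train, validation_split=0.1, callbacks=[history_cb])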
def Result_Evaluation(outputpath, accuracy, testing_Labels, predict_Labels):
    # acc_rate[d] counts predictions that are off by d stars (d = 0 is spot on).
    acc_rate = [0, 0, 0, 0, 0]
    testingSamples = len(testing_Labels)
    if os.path.isfile(outputpath):
        os.remove(outputpath)
    item_labels = ["spot on!", "off by 1 star", "off by 2 star", "off by 3 star", "off by 4 star"]
    headers = ["spot on", "off by 1 star", "off by 2 star", "off by 3 star", "off by 4 star"]
    with io.open(outputpath, 'a', encoding='utf-8') as output_file:
        for i in range(testingSamples):
            rounded_result = int(round(predict_Labels[i]))
            diff = min(abs(rounded_result - testing_Labels[i]), 4)
            acc_rate[diff] += 1
            result_item = "%s: %s - %s --> %s\n" % (i, predict_Labels[i], testing_Labels[i], item_labels[diff])
            output_file.write(result_item)
        finalResult = "".join(" #%s: %s\n" % (h, acc_rate[d]) for d, h in enumerate(headers))
        output_file.write(finalResult)
        finalResultPercentage = "".join(
            " #%s: %s\n" % (h, acc_rate[d] * 1.0 / testingSamples) for d, h in enumerate(headers))
        output_file.write(finalResultPercentage)
    # Off-by-1 predictions are counted as neither right nor wrong below.
    print(" #Right: " + str(acc_rate[0] * 1.0 / testingSamples) + '\n')
    print(" #Wrong: " + str((acc_rate[2] + acc_rate[3] + acc_rate[4]) * 1.0 / testingSamples) + '\n')
    r2Score = r2_score(testing_Labels, predict_Labels)
    print(" #R2 score: " + str(r2Score))
    print(" #sqrt(mse): {:f}".format(np.sqrt(mse(testing_Labels, predict_Labels))))
    print("Look at the evaluation_file for details!")
def evaluate(dsA, dsB, _scores):
    tA, tB, idsA, idsB, lengthsA, lengthsB = None, None, None, None, None, None
    e_off = 0
    ps = np.zeros((len(dsA), 5))
    op_weights_monitor = {w.name[-11:]: [] for w in op_weights}
    while e_off < len(dsA):
        tA, tB, idsA, idsB, lengthsA, lengthsB = batchify(
            dsA[e_off:e_off + batch_size], dsB[e_off:e_off + batch_size],
            vocab["<padding>"], tA, tB, idsA, idsB, lengthsA, lengthsB,
            max_length=max_l, max_batch_size=batch_size)
        size = min(len(dsA) - e_off, batch_size)
        allowed_conds = ["/cond_%d/" % (2 * i)
                         for i in range(min(np.min(lengthsA), np.min(lengthsB)))]
        current_weights = [w for w in op_weights
                           if any(c in w.name for c in allowed_conds)]
        random.shuffle(current_weights)
        result = sess.run([model["probs"]] + current_weights[:10],
                          feed_dict={model["inpA"]: tA[:, :size],
                                     model["inpB"]: tB[:, :size],
                                     model["idsA"]: idsA[:, :size],
                                     model["idsB"]: idsB[:, :size],
                                     model["lengthsA"]: lengthsA[:size],
                                     model["lengthsB"]: lengthsB[:size]})
        ps[e_off:e_off + batch_size] = result[0]
        e_off += batch_size
        for probs, w in zip(result[1:], current_weights):
            op_weights_monitor[w.name[-11:]].extend(probs.tolist())
    for k, v in op_weights_monitor.items():
        hist, _ = np.histogram(np.array(v), bins=5, range=(0.0, 1.0))
        hist = (hist * 1000) / np.sum(hist)
        print(k, hist.tolist())
    # Expected score is the probability-weighted average over labels 1..5.
    r = np.arange(1, 6)
    yhat = np.dot(ps, r)
    pr = pearsonr(yhat, _scores)[0]
    sr = spearmanr(yhat, _scores)[0]
    se = mse(yhat, _scores)
    return pr, sr, se
def run_experiment(X, y_clf, y_rgr, feature_ids, out_foldpath, k=500):
    clf, rgr = create_learners()
    # Hold out the last k examples for testing.
    n = len(y_clf)
    train_index = np.ones(n, dtype=np.bool)
    train_index[-k:] = False
    test_index = np.logical_not(train_index)
    clf_model = clf.fit(X[train_index], y_clf[train_index])
    rgr_model = rgr.fit(X[train_index], y_rgr[train_index])
    clf_true = y_clf[test_index]
    clf_pred = clf_model.predict(X[test_index])
    rgr_true = y_rgr[test_index]
    rgr_pred = rgr_model.predict(X[test_index])
    clf_pred_fpath = os.path.join(out_foldpath, 'clf.pred')
    clf_true_fpath = os.path.join(out_foldpath, 'clf.true')
    rgr_pred_fpath = os.path.join(out_foldpath, 'rgr.pred')
    rgr_true_fpath = os.path.join(out_foldpath, 'rgr.true')
    np.savetxt(clf_pred_fpath, clf_pred, fmt="%d")
    np.savetxt(clf_true_fpath, clf_true, fmt="%d")
    np.savetxt(rgr_pred_fpath, rgr_pred)
    np.savetxt(rgr_true_fpath, rgr_true)
    print('Micro F1: ', f1_score(clf_true, clf_pred, average='micro'))
    print('Macro F1: ', f1_score(clf_true, clf_pred, average='macro'))
    print()
    print('R2: ', r2_score(rgr_true, rgr_pred))
    print('MAE: ', mae(rgr_true, rgr_pred))
    print('MSE: ', mse(rgr_true, rgr_pred))
    print()
    print_importance(feature_ids,
                     clf_model.best_estimator_.feature_importances_,
                     rgr_model.best_estimator_.feature_importances_)
def evaluate(seed=1234, evaltest=False):
    """ Run experiment """
    print('Preparing data...')
    X = np.genfromtxt(os.path.join(FLAGS.data_dir, FLAGS.relatedness_regression_factors))
    print(X.size)
    y = np.genfromtxt(os.path.join(FLAGS.data_dir, FLAGS.relatedness_regression_targets))
    print(y.size)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    print('Encoding labels...')
    trainY = encode_labels(y_train)
    devY = encode_labels(y_test)

    print('Compiling model...')
    lrmodel = prepare_model(ninputs=len(X[0]))

    print('Training...')
    bestlrmodel = train_model(lrmodel, X_train, trainY, X_test, devY, y_test)

    if evaltest:
        print('Evaluating...')
        # Expected score is the probability-weighted average over labels 1..5.
        r = np.arange(1, 6)
        yhat = np.dot(bestlrmodel.predict_proba(X_test, verbose=2), r)
        pr = pearsonr(yhat, y_test)[0]
        sr = spearmanr(yhat, y_test)[0]
        se = mse(yhat, y_test)
        print('Test Pearson: ' + str(pr))
        print('Test Spearman: ' + str(sr))
        print('Test MSE: ' + str(se))
        return yhat
def gd(train_X, train_y, test_X, test_y, learning_rate=0.0001, iterations=1000,
       lr_dampening=1.0, reg=0):
    [m, n] = train_X.shape
    w = np.asarray(np.hstack((100, train_X.mean())))[:, np.newaxis]
    # w = np.random.randn(n + 1, 1)
    train_X = np.concatenate((np.ones((m, 1)), train_X), axis=1)
    train_y = train_y.as_matrix()
    if train_y.shape == (1, m):
        train_y = train_y.T
    if train_y.shape != (m, 1):
        print('train_y shape should be m rows, 1 column')
    loss_history = np.zeros((iterations, 2))      # tracks MSE [train, test]
    residual_history = np.zeros((iterations, 2))  # tracks MAE [train, test]
    for it in range(iterations):
        predictions = np.dot(train_X, w)
        loss = predictions - train_y
        loss_train = mse(train_y, predictions)
        residual_train = mae(train_y, predictions)
        predictions = w[0] + np.dot(test_X, w[1:])
        loss_test = mse(test_y, predictions)
        residual_test = mae(test_y, predictions)
        loss_history[it] = [loss_train, loss_test]
        residual_history[it] = [residual_train, residual_test]
        grad = np.dot(train_X.T, loss) / m
        if ((it + 1) % 100) == 0:
            print('it=%4d, loss=%.3f, residual=%.3f 1000*lr=%.12f'
                  % (it + 1, loss_train, residual_train, 1000 * learning_rate))
        if np.isnan(np.sum(grad)) or np.isinf(np.sum(grad)):
            print('NaN or Inf detected, stopping at it=' + str(it))
            break
        # Gradient step with decaying learning rate plus L2 shrinkage.
        w = w - (learning_rate / (1 + it)) * grad - reg * w
        learning_rate = lr_dampening * learning_rate
        if np.isnan(np.sum(w)) or np.isinf(np.sum(w)):
            print('NaN or Inf detected after w update, stopping at it=' + str(it))
            break
    return w.flatten(), loss_history, residual_history
data_dir_path = path.join('..', 'dataset', 'target', target)
X_train, y_train, X_test, y_test = read_data_from_dataset(data_dir_path)
period = (len(y_train) + len(y_test)) // 30
RPG = ReccurentPredictingGenerator(X_test, batch_size=1, timesteps=period)

# Collect one prediction per saved model.
prediction = []
for path_model in tqdm(listdir(path.join(write_result_out_dir, 'model'))):
    file_path = path.join(write_result_out_dir, 'model', path_model)
    best_model = load_model(file_path)
    y_test_pred = best_model.predict_generator(RPG)
    prediction.append(y_test_pred)
prediction = np.array(prediction)

# Score the bagged (averaged) predictions against the aligned tail of y_test.
list_score = []
size_test = prediction.shape[1]
y_test = y_test[-size_test:]
for i_prediction in range(prediction.shape[0])[:1]:
    pred = np.mean(prediction[:i_prediction + 1], axis=0)
    accuracy = mse(y_test, pred.flatten())
    list_score.append(accuracy)
np.save('sru', prediction)

plt.rcParams['font.size'] = 25
plt.figure(figsize=(15, 7))
plt.plot(list_score)
plt.xlabel('the number of subsets / -')
plt.ylabel('MSE / -')
plt.savefig('bagging_sru')
# ANN
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(30, activation='relu', input_dim=len(x_train.T), kernel_initializer='normal'))
model.add(Dense(30, activation='relu', kernel_initializer='normal'))
model.add(Dense(20, activation='relu', kernel_initializer='normal'))
model.add(Dense(10, activation='relu', kernel_initializer='normal'))
model.add(Dense(1, kernel_initializer='normal'))
model.compile(loss='mse', optimizer='adam')
model.fit(x_train, y_train, epochs=70, batch_size=30)
ypred = model.predict(x_test)
ypred = np.ravel(ypred)

# Inverse transforming
ypred = sc1.inverse_transform(ypred)
y_test = sc1.inverse_transform(y_test)
ydiff = y_test - ypred

# Checking data validity
rmse = np.sqrt(mse(y_test, ypred))
print('\nRoot Mean Square: ', rmse)
# gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
# gbrt.fit(X[:, None], y.ravel())

# One more example: early stopping via staged predictions.
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import mean_squared_error as mse

X = X[:, None]
y = y.ravel()
X_train, X_val, y_train, y_val = tts(X, y)
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)
errors = [mse(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors)
gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)
y_pred = gbrt_best.predict(X_val)

# In fact, early stopping does not require training a large ensemble and then
# searching back for the optimum; training can simply be stopped early
# (warm_start=True). The code below stops training once the validation error
# has failed to improve for 5 consecutive iterations:
# gbrt_test = GradientBoostingRegressor(max_depth=2, warm_start=True)
# min_val_error = float("inf")
# error_going_up = 0
# for n_estimators in range(1, 120):
#     gbrt_test.n_estimators = n_estimators
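# A sketch of how that truncated warm-start loop is usually completed (assuming
# the X_train/X_val split above; the early-stopping pattern itself is standard):
#     gbrt_test.fit(X_train, y_train)
#     val_error = mse(y_val, gbrt_test.predict(X_val))
#     if val_error < min_val_error:
#         min_val_error = val_error
#         error_going_up = 0
#     else:
#         error_going_up += 1
#         if error_going_up == 5:
#             break  # validation error has not improved for 5 iterations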
    learning_rate='adaptive',
    learning_rate_init=0.001,
    power_t=0.5,
    max_iter=175,
    shuffle=True,
    random_state=None,
    tol=0.000011,
    verbose=False,
    warm_start=False,
    momentum=0.5,
    nesterovs_momentum=True,
    early_stopping=False,
    validation_fraction=0.1,
    # beta_1=0.9,
    # beta_2=0.999,
    # epsilon=1e-08
)
mlp.fit(Xt, Yt)
YHat = mlp.predict(Xv)
err = mse(YHat, Yv)
print('size:', i, 'err', err)

Xte, _ = ml.transforms.rescale(Xte, param)
Yhat = mlp.predict(Xte)
fh = open('pred_nnet%d.csv' % i, 'w')
fh.write('ID,Prediction\n')
for row_id, yi in enumerate(Yhat):  # avoid shadowing the outer loop variable i
    fh.write('{},{}\n'.format(row_id, yi))
fh.close()
# Stack the LGB and XGB out-of-fold predictions and fit a Bayesian-ridge blender.
stack_trn = np.vstack([pred_trn_lgb['pred'], pred_trn_xgb['pred']]).transpose()
stack_tst = np.vstack([pred_lgb['pred'], pred_xgb['pred']]).transpose()
stack_folds = KFold(n_splits=4, random_state=19960101)
stack_oof = np.zeros(stack_trn.shape[0])
pred_tst['pred'] = np.zeros(stack_tst.shape[0])
# '收率' is the yield (target) column.
for _fold, (trn_idx, val_idx) in enumerate(stack_folds.split(stack_trn, df_trn['收率'])):
    trn_x, trn_y = stack_trn[trn_idx], df_trn['收率'].iloc[trn_idx].values
    val_x, val_y = stack_trn[val_idx], df_trn['收率'].iloc[val_idx].values
    clf_3 = BayesianRidge()
    clf_3.fit(trn_x, trn_y)
    stack_oof[val_idx] = clf_3.predict(val_x)
    pred_tst['pred'] += clf_3.predict(stack_tst) / 4
print('\nThe Bagging Loss', mse(df_trn['收率'].values, stack_oof))

del val_x, val_y, trn_x, trn_y, trn_idx, val_idx, cat_features
del params_lgb, params_xgb, fit_params
del pred_trn_xgb, pred_trn_lgb
del _fold, clf_3, stack_oof, stack_folds, stack_trn, stack_tst
del KFold, RepeatedKFold, BayesianRidge, trn, tst
gc.collect()

# pred_tst1 = pred_tst.copy()
# pred_tst['pred'] = pred_tst1['pred']*0.5 + pred_tst2['pred']*0.5
pred_tst.to_csv(f'submit/submit_{datetime.now().strftime("%m%d%H%M")}.csv', index=False, header=None)
def normalizedRMSE(real, predicted):
    # RMSE scaled by the range of all observed and predicted values.
    combined = np.concatenate((real, predicted))
    return np.sqrt(mse(real, predicted)) / (combined.max() - combined.min())
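# Example: with real = [0., 2.] and predicted = [1., 1.], the RMSE is 1.0 and
# the combined value range is 2.0, so normalizedRMSE returns 0.5 (assumes numpy
# as np and sklearn's mean_squared_error as mse, matching the function above):
# normalizedRMSE(np.array([0., 2.]), np.array([1., 1.]))  # -> 0.5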
def create_prophet_m(app_name, z1, cpu_perc_list, delay=24):
    # --- Realtime prediction: fit on the full series --- #
    full_df = z1.bw.iloc[0:len(z1)]
    full_df = full_df.reset_index()
    full_df.columns = ['ds', 'y']
    # Removing outliers.
    q50 = full_df.y.median()
    q100 = full_df.y.quantile(1)
    q75 = full_df.y.quantile(.75)
    if (q100 - q50) >= (2 * q50):
        full_df.loc[full_df.y >= (2 * q50), 'y'] = None
    model_r = Prophet(yearly_seasonality=False, changepoint_prior_scale=.1,
                      seasonality_prior_scale=0.05)
    model_r.fit(full_df)
    cpu_perc_list.append(py.cpu_percent())
    cpu_perc_list = [max(cpu_perc_list)]
    future_r = model_r.make_future_dataframe(periods=delay, freq='D')
    forecast_r = model_r.predict(future_r)
    forecast_r.index = forecast_r['ds']
    # Forecast for the realtime horizon.
    pred_r = pd.DataFrame(forecast_r['yhat'][len(z1):(len(z1) + delay)])
    pred_r = pred_r.reset_index()
    # --- Completes realtime prediction --- #

    # --- Train/test evaluation --- #
    train_end_index = len(z1.bw) - delay
    train_df = z1.bw.iloc[0:train_end_index]
    test_df = z1.bw.iloc[train_end_index:len(z1)]
    train_df = train_df.reset_index()
    test_df = test_df.reset_index()
    train_df.columns = ['ds', 'y']
    # Removing outliers in the train set.
    q50 = train_df.y.median()
    q100 = train_df.y.quantile(1)
    q75 = train_df.y.quantile(.75)
    if (q100 - q50) >= (2 * q50):
        train_df.loc[train_df.y >= (2 * q50), 'y'] = None
    test_df.columns = ['ds', 'y']
    test_df['ds'] = pd.to_datetime(test_df['ds'])
    model = Prophet(yearly_seasonality=False, changepoint_prior_scale=.1,
                    seasonality_prior_scale=0.05)
    model.fit(train_df)
    cpu_perc_list.append(py.cpu_percent())
    cpu_perc_list = [max(cpu_perc_list)]
    future = model.make_future_dataframe(periods=len(test_df), freq='D')
    forecast = model.predict(future)
    forecast.index = forecast['ds']
    pred = pd.DataFrame(forecast['yhat'][train_end_index:len(z1)])
    print('length forecasted non realtime=', len(pred))
    pred = pred.reset_index()
    pred_df = pd.merge(test_df, pred, on='ds', how='left')
    pred_df.dropna(inplace=True)

    df = pd.DataFrame()
    if len(pred_df) > 0:
        pred_df['error_test'] = pred_df.y - pred_df.yhat
        MSE = mse(pred_df.y, pred_df.yhat)
        RMSE = math.sqrt(MSE)
        pred_df['APE'] = abs(pred_df.error_test * 100 / pred_df.y)
        MAPE = pred_df.APE.mean()
        min_error_rate = pred_df['APE'].quantile(0) / 100
        max_error_rate = pred_df['APE'].quantile(1) / 100
        median_error_rate = pred_df['APE'].quantile(.50) / 100
        print("App name:", app_name)
        print("RMSE :", RMSE)
        print("MAPE :", MAPE)
        mape_q98 = pred_df['APE'][pred_df.APE < pred_df['APE'].quantile(0.98)].mean()
        std_MAPE = math.sqrt(((pred_df.APE - MAPE) ** 2).mean())
        df = pd.DataFrame({'length': len(z1),
                           'test_rmse': RMSE,
                           'test_mape': MAPE,
                           'std_mape': std_MAPE,  # standard deviation of MAPE
                           'min_error_rate': min_error_rate,
                           'max_error_rate': max_error_rate,
                           'median_error_rate': median_error_rate,
                           'test_mape_98': mape_q98}, index=[app_name])
    return (df, model, forecast, pred_df, pred_r)
# Load the dataset
from sklearn.datasets import load_linnerud

linnerud_data = load_linnerud()
X = linnerud_data.data
y = linnerud_data.target

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression
from sklearn import cross_validation

# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.4, random_state=0)

reg = DecisionTreeRegressor()
reg.fit(X_train, y_train)
print("Decision Tree mean squared error: {:.2f}".format(
    mse(reg.predict(X_test), y_test)))

reg = LinearRegression()
reg.fit(X_train, y_train)
print("Linear regression mean squared error: {:.2f}".format(
    mse(reg.predict(X_test), y_test)))

results = {"Linear Regression": 0, "Decision Tree": 0}
XvCV.append(X[va_idx])
YtCV.append(Y[tr_idx])
YvCV.append(Y[va_idx])

errTD = []
errVD = []
D = list(range(5, 60, 5))
for d in D:
    errti = []
    errvi = []
    # Average over 5 forests (fold 0 only) to smooth out RF randomness.
    for i in range(5):
        rfr = RFR(n_estimators=50, max_depth=d)
        rfr.fit(XtCV[0], YtCV[0])
        YtHat = rfr.predict(XtCV[0])
        YvHat = rfr.predict(XvCV[0])
        errti.append(mse(YtCV[0], YtHat))
        errvi.append(mse(YvCV[0], YvHat))
    errti = np.array(errti)
    errvi = np.array(errvi)
    errTD.append(np.mean(errti))
    errVD.append(np.mean(errvi))

#%%
plt.plot(D, errTD, '*-', label='Train Err')
plt.plot(D, errVD, '*-', label='Valid Err')
plt.legend()
plt.title('RandomForest Err vs MaxDepth')
plt.xticks(D, D)
plt.xlabel('depth')
plt.ylabel('err')
plt.savefig('rf_depth', dpi=2000)
def root_mean_squared_error(y_true, y_pred, sample_weight=None, multioutput='uniform_average'):
    rmse = np.sqrt(mse(y_true, y_pred, sample_weight=sample_weight, multioutput=multioutput))
    return rmse
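# Example: for y_true = [3., 5.] and y_pred = [2., 6.] both squared errors are
# 1, so the RMSE is 1.0; passing multioutput='raw_values' instead returns one
# RMSE per output column (mirroring sklearn's mean_squared_error API):
# root_mean_squared_error([3., 5.], [2., 6.])  # -> 1.0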
def train(examples, labels, features=None, bucket_sizes=None, crosses=None,
          lr=1e-4, steps=100, batch_size=1, model=None):
    '''Create and train a linear regression model.

    Args:
        examples: pandas.DataFrame with examples
        labels: pandas.DataFrame with labels
        features: list of selected features from examples
        bucket_sizes: dict with size of buckets
        crosses: list of lists of features to be crossed
        lr: float, learning rate
        steps: int, number of steps to train
        batch_size: int, number of examples per batch
        model: tensorflow.estimator.LinearRegressor, previously trained model

    Returns:
        A trained tensorflow.estimator.LinearRegressor.
    '''
    # Create feature columns and a dictionary mapping feature names to them.
    if not features:
        features = examples.columns
    fcdict = {
        feature: tf.feature_column.numeric_column(feature)
        for feature in features
    }
    fcs = fcdict.values()
    # Use buckets if bucket_sizes is specified.
    if bucket_sizes:
        if len(bucket_sizes) != len(features):
            raise ValueError(
                'The number of buckets must match the number of features.')
        fcdict = {
            feature: bucketize(examples[feature], fc, bucket_sizes[feature])
            if bucket_sizes[feature] else fc
            for feature, fc in fcdict.items()
        }
        fcs = fcdict.values()
    # Use crossed columns if crosses is specified.
    if crosses:
        for cross in crosses:
            cross_name = '_x_'.join(cross)
            cross_fc = [fcdict[feature] for feature in cross]
            fcdict[cross_name] = tf.feature_column.crossed_column(cross_fc, 1000)
        fcs = fcdict.values()
    ds = Ds.from_tensor_slices(
        ({feature: examples[feature] for feature in features}, labels))
    opt = tf.contrib.estimator.clip_gradients_by_norm(
        tf.train.FtrlOptimizer(learning_rate=lr), 5.0)
    if not model:
        model = tf.estimator.LinearRegressor(fcs, optimizer=opt)
    for _ in range(10):
        model.train(train_fn(ds, batch_size=batch_size), steps=steps // 10)
    predictions = get_predictions(model, ds)
    print("Mean squared error: ", mse(predictions, labels))
    return model
def rmse(y_test, y_predict):
    return np.sqrt(mse(y_test, y_predict))
def create_prophet_m(app_name, z1, delay=24):
    # --- Realtime prediction: fit on the full series --- #
    full_df = z1.bw.iloc[0:len(z1)]
    full_df = full_df.reset_index()
    full_df.columns = ['ds', 'y']
    # Removing outliers.
    q50 = full_df.y.median()
    q100 = full_df.y.quantile(1)
    q75 = full_df.y.quantile(.75)
    if (q100 - q50) >= (2 * q75):
        full_df.loc[full_df.y >= (2 * q75), 'y'] = None
    model_r = Prophet(yearly_seasonality=False, changepoint_prior_scale=.2)
    model_r.fit(full_df)
    future_r = model_r.make_future_dataframe(periods=delay, freq='H')
    forecast_r = model_r.predict(future_r)
    forecast_r.index = forecast_r['ds']
    pred_r = pd.DataFrame(forecast_r['yhat'][len(z1):(len(z1) + delay)])
    pred_r = pred_r.reset_index()
    # --- Completes realtime prediction --- #

    # --- Train/test evaluation --- #
    train_end_index = len(z1.bw) - delay
    train_df = z1.bw.iloc[0:train_end_index]
    test_df = z1.bw.iloc[train_end_index:len(z1)]
    train_df = train_df.reset_index()
    test_df = test_df.reset_index()
    train_df.columns = ['ds', 'y']
    # Removing outliers in the train set.
    q50 = train_df.y.median()
    q100 = train_df.y.quantile(1)
    q75 = train_df.y.quantile(.75)
    if (q100 - q50) >= (2 * q75):
        train_df.loc[train_df.y >= (2 * q75), 'y'] = None
    test_df.columns = ['ds', 'y']
    model = Prophet(yearly_seasonality=False, changepoint_prior_scale=.2)
    model.fit(train_df)
    future = model.make_future_dataframe(periods=len(test_df), freq='H')
    forecast = model.predict(future)
    forecast.index = forecast['ds']
    pred = pd.DataFrame(forecast['yhat'][train_end_index:len(z1)])
    pred = pred.reset_index()
    pred_df = pd.merge(test_df, pred, on='ds', how='left')
    pred_df.dropna(inplace=True)

    df = pd.DataFrame()
    if len(pred_df) > 0:
        pred_df['error_test'] = pred_df.y - pred_df.yhat
        MSE = mse(pred_df.y, pred_df.yhat)
        RMSE = math.sqrt(MSE)
        pred_df['APE'] = abs(pred_df.error_test * 100 / pred_df.y)
        MAPE = pred_df.APE.mean()
        # Quantiles are taken over the APE column, not the whole frame.
        min_error_rate = pred_df['APE'].quantile(0) / 100
        max_error_rate = pred_df['APE'].quantile(1) / 100
        median_error_rate = pred_df['APE'].quantile(.50) / 100
        print("App name:", app_name)
        print("RMSE :", RMSE)
        print("MAPE :", MAPE)
        mape_q98 = pred_df['APE'][pred_df.APE < pred_df['APE'].quantile(0.98)].mean()
        std_MAPE = math.sqrt(((pred_df.APE - MAPE) ** 2).mean())
        df = pd.DataFrame({'length': len(z1),
                           'test_rmse': RMSE,
                           'test_mape': MAPE,
                           'std_mape': std_MAPE,  # standard deviation of MAPE
                           'min_error_rate': min_error_rate,
                           'max_error_rate': max_error_rate,
                           'median_error_rate': median_error_rate,
                           'test_mape_98': mape_q98}, index=[app_name])
    return (df, model, forecast, pred_df, pred_r)
YvCV.append(Y0[va_idx])

#%% KNN method
K = [2**k for k in range(2, 13)]
errt = []
errv = []
for k in K:
    print("k=", k)
    errtk = 0
    errvk = 0
    knnL = neighbors.KNeighborsRegressor(k)
    for i in range(10):
        Xt, Yt = XtCV[i], YtCV[i]
        Xv, Yv = XvCV[i], YvCV[i]
        knnL.fit(Xt, Yt)
        errvi = mse(knnL.predict(Xv), Yv)
        errti = mse(knnL.predict(Xt), Yt)
        print("errt: %.5f\t" % errti, "errv: %.5f" % errvi)
        errtk += errti
        errvk += errvi
    errtk /= 10
    errvk /= 10
    errv.append(errvk)
    errt.append(errtk)

#%% KNN plot
plt.semilogx(K, np.array(errt) * 2, "*-", label="Train Err")
plt.semilogx(K, np.array(errv) * 2, "*-", label="Valid Err")
plt.xticks(K, K)
plt.title("KNN Err vs K")
plt.xlabel("k")
print("shuffling data...") examples = list(zip(X, y)) X, y = list(zip(*examples)) X = np.array(X) y = np.array(y) del examples kf = KFold(n=len(X), n_folds=5, shuffle=True, random_state=np.random) train_index, test_index = next(iter(kf)) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] del X del y print("fitting model...") mlp.fit(X_train, y_train) print("scoring model...") # print("predicted:", mlp.predict(X_test)) # print("actual:", y_test) print("R^2 score =", mlp.score(X_test, y_test)) y_pred = mlp.predict(X_test) print("MSE score =", mse(y_pred, y_test)) print("MAE score =", mae(y_pred, y_test)) print("accuracy_score =", accuracy_score([[round(y[0])] for y in y_pred], y_test)) fn = os.path.join(settings['data-base'], 'nn_tanh3.pickle') pickle.dump(mlp, open(fn, 'wb'))
def RMSE(y2_test, y_predict):
    return np.sqrt(mse(y2_test, y_predict))
data_diff = data.diff(periods=1)
data_diff = data_diff[1:]

### ACF and PACF for differenced Time Series ###
plot_acf(data_diff)
plot_pacf(data_diff)

### Stationarity Conversion ###
second_diff = data.diff(periods=2)
second_diff = second_diff[2:]

### ACF and PACF for second differenced Time Series ###
plot_acf(second_diff)
plot_pacf(second_diff)

# ARMA Model Selection (Second Differencing) #
# Print summaries for AR orders 9 down to 1 to compare candidate models.
for i in range(9):
    train = data.values
    test = test_data.values
    model_arima = ARIMA(train, order=(9 - i, 2, 2))
    model_arima_fit = model_arima.fit()
    print(model_arima_fit.summary())
    print('#############################################################################\n')

chosen_model = ARIMA(train, order=(1, 2, 2))
chosen_model_fit = chosen_model.fit()
forecast = chosen_model_fit.forecast(steps=len(test))
rmse_forc = mse(forecast[0], test, squared=False)
print('RMSE: ', rmse_forc)
# Make predictions with the best weights
deeper = True
wider = False
dropout = 0.5
learning_rate = 0.001

# Need to rebuild the model in case it differs from the most recently trained one.
model = build_model()
model.load_weights('question_pairs_weights_deeper={}_wider={}_lr={}_dropout={}.h5'.format(
    deeper, wider, learning_rate, dropout))
predictions = model.predict([x_test, x_test], verbose=True)

# Compare testing loss to training and validating loss
print("mse " + str(mse(y_test, predictions)))

max_price = 82.7999880000001
min_price = -111.34997599999997

# In[314]:
rk = y_test.values
# print(len(rk))
normpreds = pd.read_csv("input/preds.csv")
unnorm_preds = normpreds.values

def unnormalize(price):
    # Map a normalized value back to the original price scale.
    price = price * (max_price - min_price) + min_price
    return price
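# Hedged sanity check of the inverse transform above, using the arrays loaded
# in this snippet (unnorm_preds from input/preds.csv and rk from y_test):
# print(unnormalize(unnorm_preds[0]))  # first prediction on the original price scale
# print(unnormalize(rk[0]))            # corresponding test label, for comparison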
def lasso_pred(x, y):
    lasso = Lasso(normalize="True", tol=1e-35, max_iter=5000)
    w, q = x.shape
    coefs = []
    preds = []

    def fit_country(i):
        # Fit for country i with its own column zeroed out of x, then restore it.
        population_vector = x.iloc[:, i]
        x.iloc[:, i] = np.zeros(w)
        y_true = y.iloc[:, i]
        lasso.fit(x, population_vector)
        prediction = lasso.predict(y)
        err = mse(y_true, prediction)
        x.iloc[:, i] = population_vector
        return prediction, err

    for i in range(q):
        start = timer()
        print("\n------------------------------------\n")
        print("Fitting for country no.: %s" % (i + 1))

        # Phase 1: bisect alpha in [0.1, 10e30] until at most 5 coefficients
        # are non-zero (up to 100 bisection steps).
        alpha_min, alpha_max = 0.1, 10e30
        alpha_avg = (alpha_min + alpha_max) / 2
        lasso.set_params(alpha=alpha_avg)
        prediction, error = fit_country(i)
        mean_error = error
        for j in range(100):
            n_nonzero = np.count_nonzero(lasso.coef_)
            if n_nonzero == 5:
                break
            if n_nonzero > 5:
                alpha_min = alpha_avg
            else:
                alpha_max = alpha_avg
            alpha_avg = (alpha_min + alpha_max) / 2
            lasso.set_params(alpha=alpha_avg)
            prediction, mean_error = fit_country(i)

        # Phase 2: if there are still too many coefficients, bisect again in
        # [1, 10e20] with a looser tolerance and more solver iterations.
        if np.count_nonzero(lasso.coef_) > 5:
            alpha_min, alpha_max = 1, 10e20
            alpha_avg = (alpha_min + alpha_max) / 2
            lasso.set_params(alpha=alpha_avg, normalize="True", tol=1e-20, max_iter=50000)
            prediction, error = fit_country(i)
            mean_error = error
            for j in range(150):
                n_nonzero = np.count_nonzero(lasso.coef_)
                if n_nonzero == 5:
                    break
                if n_nonzero > 5:
                    alpha_min = alpha_avg
                else:
                    alpha_max = alpha_avg
                alpha_avg = (alpha_min + alpha_max) / 2
                lasso.set_params(alpha=alpha_avg, normalize="True", tol=1e-20, max_iter=50000)
                prediction, mean_error = fit_country(i)

        assert np.count_nonzero(lasso.coef_) <= 5, "too many non-zero"
        if mean_error < error:
            coefs_best = lasso.coef_
            pred_best = prediction
            error = mean_error
            alpha_value = alpha_avg
            country_number = np.count_nonzero(lasso.coef_)
        coefs.append(coefs_best)
        preds.append(pred_best)

        end_timer = timer() - start
        print("Fitting time for country no. %s: %s seconds." % (i + 1, round(end_timer, 3)))
        print("Alpha value: " + str(alpha_value))
        print("Number of countries used for fitting: " + str(country_number))
    return coefs, preds
# Plot y_test vs y_pred
plt.scatter(y_test, predictions)
plt.xlim([3, 5])
plt.ylim([3, 5])
plt.xlabel("Test Data", fontsize=16)
plt.ylabel("Predictions", fontsize=16)
np.subtract(predictions, y_test)

# Check accuracy
print("Training set score: {:.3f}".format(lm.score(X_train, y_train)))
print("Test set score: {:.3f}".format(lm.score(X_test, y_test)))

# Get MSE
mse(y_test, predictions)

# DETERMINE OUR APP RATING BASED ON OUR CURRENT CONTENT RATING AND PRICE
# Calling the googleplaystore sheet that has our app data -- update this path
# to wherever the file is saved on your drive.
xls = pd.ExcelFile(r'C:\Users\206581774\Documents\DataScienceClass\google_play_store.xlsx')
df_app = pd.read_excel(xls, 'app')

# Assign x and y
x_test = df_app.iloc[:, [3, 4]]
print(x_test)
y_test = df_app.iloc[:, 2]
print(y_test)

# Predict our app score
def rmse(y_true, y_pred):
    return mse(y_true, y_pred) ** 0.5
    w, losses, residuals = gd(train_X.iloc[train_idx], train_y.iloc[train_idx],
                              train_X.iloc[test_idx], train_y.iloc[test_idx],
                              iterations=its, learning_rate=1e-3,
                              lr_dampening=0.999, reg=1e-7)
    mean_losses = mean_losses + (1 / 5) * losses
    mean_residuals = mean_residuals + (1 / 5) * residuals
    xval_preds = w[0] + np.dot(test_X, w[1:])
    xval_losses_residuals[fold_it] = [
        mse(test_y, xval_preds),
        mae(test_y, xval_preds)
    ]
    fold_it = fold_it + 1

print('Cross-validation (a) loss: %.2f +- %.2f (b) residual: %.2f +- %.2f' % (
    np.mean(xval_losses_residuals[:, 0]),
    np.std(xval_losses_residuals[:, 0]),
    np.mean(xval_losses_residuals[:, 1]),
    np.std(xval_losses_residuals[:, 1]),
))

# Final model
w, final_losses, final_residuals = gd(train_X, train_y,
def reg_evaluation(
        ori_train_price, ori_test_price, pred_train_price, pred_test_price,  # original prices
        y_train, train_pred, y_test, test_pred,
        price_split, print_result=True):
    under_train = (ori_train_price <= price_split).nonzero()[0]
    above_train = (ori_train_price > price_split).nonzero()[0]
    under_test = (ori_test_price <= price_split).nonzero()[0]
    above_test = (ori_test_price > price_split).nonzero()[0]

    if print_result:
        sections = [
            ("For All Price", None, None),
            ("For price under $%d" % price_split, under_train, under_test),
            ("For price above $%d" % price_split, above_train, above_test),
        ]
        for header, trn_idx, tst_idx in sections:
            print("-" * 50)
            print(header)
            for label, oy, py, oly, ply, idx in [
                    ("Train Result ----------", ori_train_price, pred_train_price,
                     y_train, train_pred, trn_idx),
                    ("\nTest Result ----------", ori_test_price, pred_test_price,
                     y_test, test_pred, tst_idx)]:
                if idx is not None:
                    oy, py, oly, ply = oy[idx], py[idx], oly[idx], ply[idx]
                print(label)
                get_max_min_percentage_diff(oy, py)
                print("RMSLE is ", mse(oly, ply, squared=False))
                print("R^2 is ", r2(oly, ply))
                print("Mean Absolute Percentage Error is ", mape(oy, py))
                print("Mean Absolute Error is ", mae(oy, py))
        # Plot train/test predictions on both sides of the price split
        # (the train plots are indexed with the train index sets).
        plot_prediction_price(
            ori_train_price[under_train], pred_train_price[under_train],
            title="Predict Price for Item in Train Set with Price <= %d" % price_split)
        plot_prediction_price(
            ori_train_price[above_train], pred_train_price[above_train],
            title="Predict Price for Item in Train Set with Price > %d" % price_split)
        plot_prediction_price(
            ori_test_price[under_test], pred_test_price[under_test],
            title="Predict Price for Item in Test Set with Price <= %d" % price_split)
        plot_prediction_price(
            ori_test_price[above_test], pred_test_price[above_test],
            title="Predict Price for Item in Test Set with Price > %d" % price_split)

    # Collect the same metrics into a summary table: train rows first, then test.
    all_temp = []
    for ori, pred, oly_all, ply_all, idx_sets in [
            (ori_train_price, pred_train_price, y_train, train_pred,
             [None, under_train, above_train]),
            (ori_test_price, pred_test_price, y_test, test_pred,
             [None, under_test, above_test])]:
        for indexes in idx_sets:
            oy, py = (ori, pred) if indexes is None else (ori[indexes], pred[indexes])
            oly, ply = ((oly_all, ply_all) if indexes is None
                        else (oly_all[indexes], ply_all[indexes]))
            all_temp.append([
                *get_max_min_percentage_diff(oy, py),
                mse(oly, ply, squared=False),
                r2(oly, ply),
                mape(oy, py),
                mae(oy, py)
            ])
    result_df = pd.DataFrame(
        np.array(all_temp),
        columns=['Max Percentage Diff', 'Min Percentage Diff',
                 'RMSLE', 'R^2', 'MAPE', 'MAE'],
        index=[
            'All Train',
            'Train with Price <= %d' % price_split,
            'Train with Price > %d' % price_split,
            'All Test',
            'Test with Price <= %d' % price_split,
            'Test with Price > %d' % price_split,
        ])
    return result_df
def trade_agent(lock, number_workers):
    # ENDLESS CYCLE for continuous exploration of window time, learning and trading
    while True:  # for _ in range(2):
        class_wtupdate = Window_time_update(lock, number_workers)
        rmse = lambda y_true, y_pred: np.sqrt(mse(y_true, y_pred))
        api_key = '......................................................'
        api_secret = '......................................................'
        client = Client(api_key, api_secret)
        risk = 0.2
        global_episodes = 10000
        num_episodes = 10
        h_size = 512
        with lock:
            max_time_window_sec = np.load(
                'weights_biases_numpy_arrays/max_time_window_sec.npy',
                allow_pickle=True)
        PATH_final_weights_LSTM = 'weights_biases_numpy_arrays/final_weights_biases/'
        state = (np.zeros([1, h_size]), np.zeros([1, h_size]))
        state_time = (np.zeros([1, h_size]), np.zeros([1, h_size]))
        commission = 0.001  # 0.00075
        list_predicts = []
        list_predicts_time = []
        list_date_time = []
        list_price_in_state = []
        total_steps = 0
        global_steps = 0
        quantity_true_action_predict = 0  # counter of executed market orders
        for _ in range(global_episodes):
            genetic_modification(lock)
            tf.reset_default_graph()
            weights = {
                ### mine ###
                # 5x5 filter_size, 3 channels, 32 num_filters
                'w_conv1': weight_variable(lock, shape=[5, 5, 3, 32],
                                           name='w_conv1', first_generation=False),
                # 4x4 filter_size, 32 channels, 64 num_filters
                'w_conv2': weight_variable(lock, shape=[4, 4, 32, 64],
                                           name='w_conv2', first_generation=False),
                # 3x3 filter_size, 64 channels, 64 num_filters
                'w_conv3': weight_variable(lock, shape=[3, 3, 64, 64],
                                           name='w_conv3', first_generation=False),
                # 5x5 filter_size, 64 channels, 512 num_filters
                'w_conv4': weight_variable(lock, shape=[5, 5, 64, 512],
                                           name='w_conv4', first_generation=False),
                # fully connected, 512 inputs, 1 output
                'w_predict': weight_variable(lock, shape=[512, 1],
                                             name='w_predict', first_generation=False),
                ### time ###
                # 5x5 filter_size, 3 channels, 32 num_filters
                'w_conv1_time': weight_variable(lock, shape=[5, 5, 3, 32],
                                                name='w_conv1_time', first_generation=False),
                # 4x4 filter_size, 32 channels, 64 num_filters
                'w_conv2_time': weight_variable(lock, shape=[4, 4, 32, 64],
                                                name='w_conv2_time', first_generation=False),
                # 3x3 filter_size, 64 channels, 64 num_filters
                'w_conv3_time': weight_variable(lock, shape=[3, 3, 64, 64],
                                                name='w_conv3_time', first_generation=False),
                # 5x5 filter_size, 64 channels, 512 num_filters
                'w_conv4_time': weight_variable(lock, shape=[5, 5, 64, 512],
                                                name='w_conv4_time', first_generation=False),
                # fully connected, 512 inputs, 1 output
                'w_predict_time': weight_variable(lock, shape=[512, 1],
                                                  name='w_predict_time', first_generation=False)
            }
            biases = {
                ### mine ###
                'b_conv1': bias_variable(lock, shape=[32], name='b_conv1', first_generation=False),
                'b_conv2': bias_variable(lock, shape=[64], name='b_conv2', first_generation=False),
                'b_conv3': bias_variable(lock, shape=[64], name='b_conv3', first_generation=False),
                'b_conv4': bias_variable(lock, shape=[512], name='b_conv4', first_generation=False),
                ### time ###
                'b_conv1_time': bias_variable(lock, shape=[32], name='b_conv1_time', first_generation=False),
                'b_conv2_time': bias_variable(lock, shape=[64], name='b_conv2_time', first_generation=False),
                'b_conv3_time': bias_variable(lock, shape=[64], name='b_conv3_time', first_generation=False),
                'b_conv4_time': bias_variable(lock, shape=[512], name='b_conv4_time', first_generation=False)
            }
            cell_mainN = tf.contrib.rnn.BasicLSTMCell(num_units=h_size, state_is_tuple=True)
            cell_timeN = tf.contrib.rnn.BasicLSTMCell(num_units=h_size, state_is_tuple=True)
            mainN = mainNetwork(h_size, cell_mainN, weights, biases)
            timeN = timeNetwork(h_size, cell_timeN, weights, biases)
            init = tf.global_variables_initializer()
            with tf.Session() as sess:
                sess.run(init)
                ### mainN ### ### timeN ###
                # load the latest LSTM weights for both networks
                with lock:
                    weights_np_LSTM_mainQN = np.load(
                        PATH_final_weights_LSTM + 'LSTM_weights_biases.npy',
                        allow_pickle=True)
                    weights_np_LSTM_timeQN = np.load(
                        PATH_final_weights_LSTM + 'LSTM_weights_biases_time.npy',
                        allow_pickle=True)
                cell_mainN.set_weights(weights_np_LSTM_mainQN)
                cell_timeN.set_weights(weights_np_LSTM_timeQN)
                for _ in range(num_episodes):
                    total_steps += 1
                    state_exchange, best_ask_bid_volume_price = state_environment_3D()
                    price_in_state = last_price()
                    Previous_price_global = price_in_state
                    # keep only the last num_episodes prices
                    if len(list_price_in_state) == num_episodes:
                        list_price_in_state[0:1] = []
                    list_price_in_state.append(price_in_state)
                    prediction, state1 = sess.run(
                        [mainN.predict, mainN.rnn_state],
                        feed_dict={
                            mainN.rawInput: [state_exchange],
                            mainN.trainLength: 1,
                            mainN.state_in: state,
                            mainN.batch_size: 1
                        })
                    state = state1
                    predict_change_price = float(prediction)
                    if len(list_predicts) == num_episodes:
                        list_predicts[0:1] = []
                    list_predicts.append(predict_change_price)
                    prediction_time, state1_time = sess.run(
                        [timeN.predict_time, timeN.rnn_state_time],
                        feed_dict={
                            timeN.rawInput_time: [state_exchange],
                            timeN.trainLength_time: 1,
                            timeN.state_in_time: state_time,
                            timeN.batch_size_time: 1
                        })
                    state_time = state1_time
                    pred_time = float(prediction_time)
                    if len(list_predicts_time) == num_episodes:
                        list_predicts_time[0:1] = []
                        list_date_time[0:1] = []
                    list_predicts_time.append(pred_time)
                    list_date_time.append(datetime.datetime.now())
                    if global_steps > 0:
                        signal_for_trade, error_online = final_metric(
                            list_price_in_state, list_predicts, risk, commission)
                        class_wtupdate.time_update(signal=signal_for_trade)
                        adjusted_predict_change_price = (
                            predict_change_price - error_online
                            if predict_change_price >= 1
                            else predict_change_price + error_online)
                        trend_matching = int(predict_change_price) == int(
                            adjusted_predict_change_price)
                        ### SIGNAL and TREND ###
                        if signal_for_trade and trend_matching:
                            predict_change_price = adjusted_predict_change_price
                            action_predict = ['sell', 'buy'][int(predict_change_price)]
                            money_predict = float(
                                client.get_asset_balance(asset='USDT').get('free'))
                            bitcoin_predict = float(
                                client.get_asset_balance(asset='BTC').get('free')) * price_in_state
                            ############# money_predict (buy side) ################
                            if money_predict >= 10 and action_predict == 'buy' \
                                    and predict_change_price - 1 > commission:
                                if best_ask_bid_volume_price.get('volume').get('buy')[0] \
                                        * price_in_state > money_predict:
                                    quantity_BTC = price_format_conversion(
                                        money_predict / price_in_state, 6)
                                    order = client.order_market_buy(
                                        symbol='BTCUSDT', quantity=quantity_BTC)
                                else:
                                    best_quantity_BTC = str(
                                        best_ask_bid_volume_price.get('volume').get('buy')[0])
                                    order = client.order_market_buy(
                                        symbol='BTCUSDT', quantity=best_quantity_BTC)
                                quantity_true_action_predict += 1
                                # walk deeper into the order book while it stays profitable
                                for i in range(1, 10):
                                    money_predict = float(
                                        client.get_asset_balance(asset='USDT').get('free'))
                                    if money_predict >= 10:
                                        predict_change_price_next = (
                                            (price_in_state * predict_change_price) /
                                            best_ask_bid_volume_price.get('price').get('buy')[i])
                                        if predict_change_price_next - 1 > commission:
                                            volume_action_money_predict = (
                                                best_ask_bid_volume_price.get('volume').get('buy')[i] *
                                                best_ask_bid_volume_price.get('price').get('buy')[i])
                                            if volume_action_money_predict >= money_predict:
                                                best_quantity_BTC = price_format_conversion(
                                                    money_predict / price_in_state, 6)
                                            else:
                                                best_quantity_BTC = str(
                                                    best_ask_bid_volume_price.get('volume').get('buy')[i])
                                            order = client.order_market_buy(
                                                symbol='BTCUSDT', quantity=best_quantity_BTC)
                                            quantity_true_action_predict += 1
                                    else:
                                        break
                            ############# bitcoin_predict (sell side) ################
                            if bitcoin_predict >= 10 and action_predict == 'sell' \
                                    and 1 - predict_change_price > commission:
                                if best_ask_bid_volume_price.get('volume').get('sell')[0] \
                                        * price_in_state > bitcoin_predict:
                                    quantity_BTC = price_format_conversion(
                                        bitcoin_predict / price_in_state, 6)
                                    order = client.order_market_sell(
                                        symbol='BTCUSDT', quantity=quantity_BTC)
                                else:
                                    best_quantity_BTC = str(
                                        best_ask_bid_volume_price.get('volume').get('sell')[0])
                                    order = client.order_market_sell(
                                        symbol='BTCUSDT', quantity=best_quantity_BTC)
                                quantity_true_action_predict += 1
                                for i in range(1, 10):
                                    bitcoin_predict = float(
                                        client.get_asset_balance(asset='BTC').get('free')) * price_in_state
                                    if bitcoin_predict >= 10:
                                        predict_change_price_next = (
                                            (price_in_state * predict_change_price) /
                                            best_ask_bid_volume_price.get('price').get('sell')[i])
                                        if 1 - predict_change_price_next > commission:
                                            volume_action_bitcoin_predict = (
                                                best_ask_bid_volume_price.get('volume').get('sell')[i] *
                                                best_ask_bid_volume_price.get('price').get('sell')[i])
                                            if volume_action_bitcoin_predict >= bitcoin_predict:
                                                best_quantity_BTC = price_format_conversion(
                                                    bitcoin_predict / price_in_state, 6)
                                            else:
                                                best_quantity_BTC = str(
                                                    best_ask_bid_volume_price.get('volume').get('sell')[i])
                                            order = client.order_market_sell(
                                                symbol='BTCUSDT', quantity=best_quantity_BTC)
                                            quantity_true_action_predict += 1
                                    else:
                                        break
                        # convert relative predictions to absolute prices and times
                        list_pred_abs_price = [
                            list_predicts[index] * price
                            for index, price in enumerate(list_price_in_state)
                        ]
                        list_pred_abs_time = [
                            max_time_window_sec * t for t in list_predicts_time
                        ]
                        list_date_time_for_pred = list_date_time[:]
                        list_date_time_for_pred.append(
                            list_date_time[-1] +
                            datetime.timedelta(seconds=list_pred_abs_time[-1]))
                        rmse_online = rmse(
                            list_price_in_state[1:],
                            list_pred_abs_price[:len(list_price_in_state[1:])])
                        # live plot of predicted vs. realized prices
                        plt.ion()
                        plt.gca().cla()
                        plt.subplots_adjust(bottom=0.2)
                        plt.xticks(rotation=25)
                        ax = plt.gca()
                        ax.set_xticks(list_date_time_for_pred[1:])
                        xfmt = md.DateFormatter('%H:%M:%S')
                        ax.xaxis.set_major_formatter(xfmt)
                        plt.plot(list_date_time_for_pred[1:], list_pred_abs_price,
                                 linewidth=3, linestyle="--", color="blue",
                                 marker='o', label=r"Predicted price")
                        plt.plot(list_date_time[1:], list_price_in_state[1:],
                                 linewidth=3, linestyle="-", color="red",
                                 marker='o', label=r"True price")
                        plt.xlabel(r"Agent Predicted Time (seconds)")
                        plt.ylabel(r"Predicted and true price (US$)")
                        plt.title(
                            f'Real time trading. RMSE online = {rmse_online.round(2)} US$. '
                            f'SIGNAL = {signal_for_trade}')
                        plt.legend(loc="upper left")
                        plt.pause(0.1)
                        plt.show()
                    # sleep until the predicted time window elapses
                    time.sleep(int(abs(pred_time)) * max_time_window_sec)
                    global_steps += 1
                with lock:
                    stop_train_trade_signal = np.load(
                        'weights_biases_numpy_arrays/stop_train_trade.npy',
                        allow_pickle=True)
                if stop_train_trade_signal:
                    break
        print('total_steps_trade =', total_steps)
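# A toy illustration of the one-step-ahead alignment behind rmse_online in
# trade_agent: each relative prediction is turned into an absolute forecast
# using the price it was issued at, then scored against the next realized
# price. Names below are illustrative only, not part of the agent.
import numpy as np

prices = [100.0, 101.0, 99.0, 102.0]   # realized prices, one per step
rel_preds = [1.02, 0.97, 1.04]         # predicted next/current price ratios
abs_preds = [p * r for p, r in zip(prices, rel_preds)]
rmse_online = np.sqrt(np.mean((np.array(prices[1:]) - np.array(abs_preds)) ** 2))
print(rmse_online)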
result1 = mlp1.predict(df1[['x', 'y']][8:10])
result2 = mlp2.predict(df2[['a', 'b']][8:10])

# hand-rolled MAPE; note abs() is applied to the predictions as well, so any
# negative prediction is folded onto the positive axis before the error is taken
dif1 = abs(df1['z'][8:10] - abs(result1)) / df1['z'][8:10]
mymape1 = 100 / len(result1) * dif1.sum()
dif2 = abs(df2['c'][8:10] - abs(result2)) / df2['c'][8:10]
mymape2 = 100 / len(result2) * dif2.sum()
print('mymape1', mymape1)
print('mymape2', mymape2)

mae1 = mae(df1['z'][8:10], result1)
mape1 = 100 * mae1  # NOTE: this is 100 * MAE, not a percentage error
mae2 = mae(df2['c'][8:10], result2)
mape2 = 100 * mae2  # NOTE: this is 100 * MAE, not a percentage error
print('mape1', mape1)
print('mape2', mape2)
print('mae1', mae1)
print('mae2', mae2)

mse1 = mse(df1['z'][8:10], result1)
mse2 = mse(df2['c'][8:10], result2)
print('mse1', mse1)
print('mse2', mse2)
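# 'mape1' above just rescales MAE; it only coincides with a true MAPE when all
# targets equal 1. A minimal standalone sketch of the standard formula
# (assumes the target slice contains no zeros):
import numpy as np

def mape(y_true, y_pred):
    """Mean absolute percentage error, in percent."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return 100.0 * np.mean(np.abs((y_true - y_pred) / y_true))

# e.g. mape(df1['z'][8:10], result1) would replace mymape1 without the extra abs().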
def main():
    df = pd.read_csv("dataBefore_5000.csv")
    print(df.head())
    X = df[[
        "calls_up", "calls_down", "starts_up", "starts_down", "availability",
        "alarms"
    ]]
    print(X)
    print(X.shape)
    TARGETS = df[["ave_callTime"]]
    print(TARGETS)

    # Display raw data: figures 1-3 each show two features against call time
    raw_panels = [('calls_up', 'b', 'call_up [#]'),
                  ('calls_down', 'r', 'call_down [#]'),
                  ('starts_up', 'c', 'starts_up [#]'),
                  ('starts_down', 'm', 'starts_down [#]'),
                  ('availability', 'g', 'availability [%]'),
                  ('alarms', 'k', 'alarms [#]')]
    for idx, (col, color, label) in enumerate(raw_panels):
        plt.figure(1 + idx // 2)
        plt.subplot(2, 1, 1 + idx % 2)
        plt.scatter(X[col], TARGETS, c=color, s=20, alpha=0.5, label=label)
        plt.ylabel("Call time [s]")
        plt.legend(loc="upper right", bbox_to_anchor=[1, 1], ncol=2,
                   shadow=True, fancybox=True)

    # Apply Linear Regression Model
    lm = LinearRegression()
    lm.fit(X, TARGETS.ave_callTime)
    plt.figure(4)
    plt.scatter(TARGETS, lm.predict(X), c='b', s=30, alpha=0.5)
    plt.xlabel("Measured call time [s]")
    plt.ylabel("Predicted call time [s]")
    plt.title("Predicted vs Measured (Linear Regression)")
    x = [0, 50]
    y = x
    lines = plt.plot(x, y)
    plt.setp(lines, color='k', linewidth=2.0)
    plt.xlim((0, 50))
    plt.ylim((0, 50))
    print("Linear regression model \n Before 5000 dataset")
    print(
        " Regression coefficients: \n [calls_up, calls_down, starts_up, starts_down, availability, alarms] \n =",
        lm.coef_)
    print(" Regression intercept =", lm.intercept_)
    print(" Mean squared error =", round(mse(TARGETS, lm.predict(X)), 5))
    print(" R2 score =", round(r2_score(TARGETS, lm.predict(X)), 5))

    sampleId = np.linspace(1, 5000, 5000)
    plt.figure(5)
    plt.subplot(2, 1, 1)
    plt.plot(sampleId, lm.predict(X), 'go', markersize=2, alpha=0.5,
             label='Predicted call time [s] vs Id \nLinear Regression')
    plt.legend(loc="upper right", bbox_to_anchor=[1, 1], ncol=2,
               shadow=True, fancybox=True)
    plt.xlim((0, 5000))
    plt.ylim((0, 50))
    plt.subplot(2, 1, 2)
    plt.plot(sampleId, TARGETS, 'bo', markersize=2, alpha=0.5,
             label='Measured call time [s] vs Id \nRaw data')
    plt.legend(loc="upper right", bbox_to_anchor=[1, 1], ncol=2,
               shadow=True, fancybox=True)
    plt.xlim((0, 5000))
    plt.ylim((0, 50))

    # Check the model with a single predictor at a time
    print(" *****")
    for feature in ["calls_up", "calls_down", "starts_up", "starts_down",
                    "availability", "alarms"]:
        lm = LinearRegression()
        lm.fit(X[[feature]], TARGETS)
        print(" " + feature)
        print(" Regression coefficient =", lm.coef_[0])
        print(" Regression intercept =", lm.intercept_)
        print(" Mean squared error =",
              round(mse(TARGETS, lm.predict(X[[feature]])), 3))
        print(" R2 score =",
              round(r2_score(TARGETS, lm.predict(X[[feature]])), 3))
    print(" *****")

    # Divide dataset randomly using train_test_split
    # ('cv' is presumably sklearn's legacy cross_validation module; newer
    # versions expose train_test_split from sklearn.model_selection instead)
    X_train, X_test, Y_train, Y_test = cv.train_test_split(
        X, TARGETS, test_size=0.4, random_state=5)
    print(" X_train", X_train.shape)
    print(" X_test", X_test.shape)
    print(" Y_train", Y_train.shape)
    print(" Y_test", Y_test.shape)
    lm = LinearRegression()
    lm.fit(X_train, Y_train)
    pred_train = lm.predict(X_train)
    pred_test = lm.predict(X_test)
    print(" Train and test dataset")
    print(
        " Regression coefficients: \n [calls_up, calls_down, starts_up, starts_down, availability, alarms] \n =",
        lm.coef_)
    print(" Regression intercept =", lm.intercept_)
    print(" Mean squared error with X_train and Y_train =",
          round(mse(Y_train, pred_train), 5))
    print(" R2 score with X_train and Y_train =",
          round(r2_score(Y_train, pred_train), 3))
    print(" Mean squared error with X_test and Y_test =",
          round(mse(Y_test, pred_test), 5))
    print(" R2 score with X_test and Y_test =",
          round(r2_score(Y_test, pred_test), 3))

    # Apply Ridge Regression Model
    rmodel = Ridge(alpha=0.1)
    rmodel.fit(X_train, Y_train)
    plt.figure(6)
    plt.scatter(Y_train, rmodel.predict(X_train), c='c', s=30, alpha=0.5)
    plt.xlabel("Measured call time [s]")
    plt.ylabel("Predicted call time [s]")
    plt.title("Predicted vs measured (Ridge Regression)")
    x = [0, 50]
    y = x
    lines = plt.plot(x, y)
    plt.setp(lines, color='k', linewidth=2.0)
    plt.xlim((0, 50))
    plt.ylim((0, 50))
    print("Ridge regression model \n Train dataset")
    print(" Mean squared error =",
          round(mse(Y_train, rmodel.predict(X_train)), 5))
    print(" R2 score =", round(r2_score(Y_train, rmodel.predict(X_train)), 5))
    sampleId = np.linspace(1, 3000, 3000)
    plt.figure(7)
    plt.subplot(2, 1, 1)
    plt.plot(sampleId, rmodel.predict(X_train), 'go', markersize=2, alpha=0.5,
             label='Predicted call time [s] vs Id \nRidge Regression')
    plt.legend(loc="upper right", bbox_to_anchor=[1, 1], ncol=2,
               shadow=True, fancybox=True)
    plt.xlim((0, 3000))
    plt.ylim((0, 50))
    plt.subplot(2, 1, 2)
    plt.plot(sampleId, Y_train, 'bo', markersize=2, alpha=0.5,
             label='Measured call time [s] vs Id \nRaw data')
    plt.legend(loc="upper right", bbox_to_anchor=[1, 1], ncol=2,
               shadow=True, fancybox=True)
    plt.xlim((0, 3000))
    plt.ylim((0, 50))

    # Apply Random Forest Regression Model
    rfmodel = RandomForestRegressor()
    rfmodel.fit(X_train, Y_train.ave_callTime)
    print(X_train)
    print(Y_train)
    plt.figure(8)
    plt.scatter(Y_train, rfmodel.predict(X_train), c='g', s=30, alpha=0.5)
    plt.xlabel("Measured call time [s]")
    plt.ylabel("Predicted call time [s]")
    plt.title("Predicted vs measured (Random Forest Regression)")
    x = [0, 50]
    y = x
    lines = plt.plot(x, y)
    plt.setp(lines, color='k', linewidth=2.0)
    plt.xlim((0, 50))
    plt.ylim((0, 50))
    print("Random Forest Regression model \n Train dataset")
    print(" Mean squared error =",
          round(mse(Y_train, rfmodel.predict(X_train)), 5))
    print(" R2 score =", round(r2_score(Y_train, rfmodel.predict(X_train)), 5))
    plt.figure(9)
    plt.subplot(2, 1, 1)
    plt.plot(sampleId, rfmodel.predict(X_train), 'go', markersize=2, alpha=0.5,
             label='Predicted call time [s] vs Id \nRandom Forest Regression')
    plt.legend(loc="upper right", bbox_to_anchor=[1, 1], ncol=2,
               shadow=True, fancybox=True)
    plt.xlim((0, 3000))
    plt.ylim((0, 50))
    plt.subplot(2, 1, 2)
    plt.plot(sampleId, Y_train, 'bo', markersize=2, alpha=0.5,
             label='Measured call time [s] vs Id \nRaw data')
    plt.legend(loc="upper right", bbox_to_anchor=[1, 1], ncol=2,
               shadow=True, fancybox=True)
    plt.xlim((0, 3000))
    plt.ylim((0, 50))
    plt.show()
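    # The Ridge and Random Forest fits above are scored only on their own
    # training data, where a random forest is typically near-perfect. A quick
    # generalization check (a sketch reusing the objects defined in main())
    # would score the held-out split instead:
    print(" RF test MSE =", round(mse(Y_test, rfmodel.predict(X_test)), 5))
    print(" RF test R2 =", round(r2_score(Y_test, rfmodel.predict(X_test)), 5))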
                           p=2, metric='minkowski', metric_params=None,
                           n_jobs=None)
reg.fit(Xtrn, ytrn)
preds = reg.predict(Xtst)
preds = np.transpose(preds)

print('Make and save evaluations')
# make the column names and initial df
col_names = ['GPL', 'Model', 'Metric', 'Value', 'GeneIdx']
df_eval = pd.DataFrame(columns=col_names)

# get metrics
mae_values = mae(ydata_aGPL, preds, multioutput='raw_values')
df_eval = add_eval_to_df(df_eval, mae_values, 'mae', 'SampleKNN', col_names, aGPL)
rmse_values = np.sqrt(
    mse(ydata_aGPL, preds, multioutput='raw_values'))  # correct to have ytst_GL
cvrmse_values = rmse_values / np.mean(ydata_aGPL, axis=0)  # correct to have ytst_GL
df_eval = add_eval_to_df(df_eval, cvrmse_values, 'cvrmse', 'SampleKNN', col_names, aGPL)

# save the dataframe
df_eval.to_csv(fp_save + '%s_SampleKNN_evals.tsv' % aGPL,
               sep='\t', header=True, index=False)

print('It took', int((time.time() - tic0) / 60), 'minutes for the script to run')
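# CV(RMSE) above scales each output's RMSE by that output's mean, making the
# errors comparable across genes with different expression levels. A minimal
# standalone sketch (toy arrays, not the real ydata_aGPL / preds):
import numpy as np
from sklearn.metrics import mean_squared_error

y_true = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
y_pred = np.array([[1.1, 11.0], [1.9, 19.0], [3.2, 29.0]])
rmse_per_col = np.sqrt(mean_squared_error(y_true, y_pred, multioutput='raw_values'))
cvrmse_per_col = rmse_per_col / y_true.mean(axis=0)  # one value per gene/column
print(cvrmse_per_col)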
def test_result(model, n_tests, test_data, test_labels, test_data_raw,
                library="scikit"):
    """
    params: library - default "scikit", other option: "torch"
    params: test_data - [pandas DF or torch] - input data
    params: test_labels - [pandas DF or torch] - labels data
    params: test_data_raw - [pandas DF] - input data before pipeline preprocessing
    """
    sum_errors = []
    if library == "scikit":
        prediction_all = model.predict(test_data)
        nn_mse = mse(test_labels, prediction_all)
        nn_rmse = np.sqrt(nn_mse)
        score = model.score(test_data, test_labels)
    elif library == "torch":
        prediction_all = model(test_data.float())
        nn_mse = mse(test_labels.numpy(), prediction_all.detach().numpy())
        nn_rmse = np.sqrt(nn_mse)
        score = 0
    else:
        print("error, choose scikit or torch library")
        return

    for sample in range(n_tests):
        if library == "scikit":
            # reuse the batch prediction computed above
            prediction = prediction_all[sample]
            y_real_value = test_labels.iloc[sample]
        else:
            prediction = model(test_data[sample].float()).item()
            y_real_value = test_labels[sample].item()
        country = test_data_raw.iloc[sample]['country_from']
        trans = test_data_raw.iloc[sample]['transmission']
        fuell = test_data_raw.iloc[sample]['fuell']
        milage = test_data_raw.iloc[sample]['milage']
        engine_power = test_data_raw.iloc[sample]['engine_power']
        year = test_data_raw.iloc[sample]['year']
        brand = test_data_raw.iloc[sample]['car_brand']
        car_model = test_data_raw.iloc[sample]['car_model']
        error_percentage = (-(y_real_value - prediction) / y_real_value) * 100
        sum_errors.append(np.absolute(error_percentage))
        print(
            "pred: {:7.0f}, real: {:7.0f}, err.rate: {:6.2f}%, country: {:16}, trans: {:13}, fuell: {:8}, br: {:15}, md: {:13}, year: {:4}, milage: {:6.0f}, pwr: {:.0f}"
            .format(prediction, y_real_value, error_percentage, country, trans,
                    fuell, brand, car_model, year, milage, engine_power))

    max_error = max(sum_errors)
    final_log = ('average error: {:7.2f}%, median error: {:7.2f}%, '
                 'rmse: {:7.0f}, score: {:7.3f}, max error: {:7.2f}%, '
                 'set size: {}, lib: {}').format(
                     np.mean(sum_errors), np.median(sum_errors), nn_rmse,
                     score, max_error, (test_data_raw.shape[0] / 2) * 10,
                     library)
    print(final_log)
    send_email(f"car prediction training completed, with results {final_log}")
    with open("learning_history.txt", "a") as text_file:
        today = datetime.datetime.now()
        text_file.write("\n{} {}".format(today, final_log))
print(h)
print("reshaping data...")
Xs = dict()
ys = dict()
for h in feats:
    # each sample is 384 features reshaped into a 6x8x8 board tensor
    samples = len(feats[h]) // 384
    Xs[h] = np.array(feats[h]).reshape((samples, 6, 8, 8))
    ys[h] = np.array(labels[h])

mse_scores = dict()
acc_scores = dict()
for h in feats:
    y_pred = mlp.predict(np.array(Xs[h]))
    mse_scores[h] = mse(ys[h], y_pred)
    acc_scores[h] = accuracy_score([[round(y[0])] for y in y_pred], ys[h])

acc_data = []
mse_data = []
h_data = []
for h in sorted(feats.keys()):
    h_data.append(h)
    mse_data.append(mse_scores[h])
    acc_data.append(1 - acc_scores[h])

# fig, ax1 = plt.subplots()
# fig.suptitle('Tree Height v. Prediction Error')
# plt1 = ax1.plot(h_data, mse_data, color='blue', label='mse')
# ax1.set_ylabel('mean squared regression error (MSE)')
# ax1.set_xlabel('game tree height, h')
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import mean_squared_error as mse
import warnings

warnings.simplefilter(action='ignore')

data = pd.read_csv('../../data/province-biweek-counts.csv')
with open('../../output/all_prov_no_prob/prov_10_no_prob/prov_10_for_2013.pkl', 'rb') as file:
    forecast = pickle.load(file)

province = 10
year = 2013
true_df = data.loc[(data['province'] == province) & (data['year'] == year)]
biweek_cases = true_df['cases'].tolist()
total = sum(biweek_cases)
peak = max(biweek_cases)
peak_biweek = biweek_cases.index(peak) + 1

biweek_rmse = np.sqrt(mse(biweek_cases, forecast['biweek_cases']))
print(biweek_rmse)
# for a single scalar, sqrt((a - b)**2) is simply abs(a - b), so this is the
# absolute error of the annual total
year_total_rmse = np.sqrt((total - forecast['year_total'])**2)
print(forecast['year_total'])
print(year_total_rmse)
def normalizedByMeanRMSE(real, predicted):
    # RMSE scaled by the mean level of the pooled real and predicted values;
    # note the more common convention normalizes by real.mean() alone
    return np.sqrt(mse(real, predicted)) / np.concatenate((real, predicted)).mean()
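# Illustrative use of normalizedByMeanRMSE (toy arrays; the imports below
# mirror the np/mse aliases the function assumes are in scope):
import numpy as np
from sklearn.metrics import mean_squared_error as mse

real = np.array([10.0, 12.0, 11.0])
predicted = np.array([9.0, 13.0, 11.5])
print(normalizedByMeanRMSE(real, predicted))  # ~0.08, i.e. RMSE is ~8% of the mean level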
Xtest = scaler.transform(xtest.reshape(-1, 1))

degs = np.arange(1, 21, 1)
ndegs = np.max(degs)
mse_train = np.empty(ndegs)
mse_test = np.empty(ndegs)
ytest_pred_stored = np.empty(ndegs, dtype=np.ndarray)
for deg in degs:
    model = LinearRegression()
    poly_features = PolynomialFeatures(degree=deg, include_bias=False)
    Xtrain_poly = poly_features.fit_transform(Xtrain)
    model.fit(Xtrain_poly, ytrain)
    ytrain_pred = model.predict(Xtrain_poly)
    # transform (not fit_transform) so the test set uses the training basis
    Xtest_poly = poly_features.transform(Xtest)
    ytest_pred = model.predict(Xtest_poly)
    mse_train[deg - 1] = mse(ytrain, ytrain_pred)
    mse_test[deg - 1] = mse(ytest, ytest_pred)
    ytest_pred_stored[deg - 1] = ytest_pred

# Plot MSE vs degree
fig, ax = plt.subplots()
mask = degs <= 15
ax.plot(degs[mask], mse_test[mask], color='r', marker='x', label='test')
ax.plot(degs[mask], mse_train[mask], color='b', marker='s', label='train')
ax.legend(loc='upper right', shadow=True)
plt.xlabel('degree')
plt.ylabel('mse')
save_fig('polyfitVsDegree.pdf')
plt.show()

# Plot fitted functions
def create_prophet_m(self, app_name, z1, delay=24):
    import pandas as pd
    import pymysql
    import warnings
    warnings.filterwarnings("ignore")
    from datetime import datetime, timedelta
    import logging
    from tqdm import tqdm
    from fbprophet import Prophet
    from sklearn.metrics import mean_squared_error as mse
    import math

    ### --- For realtime pred --- ###
    full_df = z1.bw.iloc[0:len(z1)]
    full_df = full_df.reset_index()
    full_df.columns = ['ds', 'y']

    # removing outliers: when the upper half of the distribution is very
    # spread out, blank everything above twice the third quartile
    q50 = full_df.y.median()
    q100 = full_df.y.quantile(1)
    q75 = full_df.y.quantile(.75)
    if (q100 - q50) >= (2 * q75):
        full_df.loc[full_df.y >= (2 * q75), 'y'] = None

    # -- Realtime prediction -- #
    # model
    model_r = Prophet(yearly_seasonality=False, changepoint_prior_scale=.2)
    model_r.fit(full_df)
    future_r = model_r.make_future_dataframe(periods=delay, freq='H')
    forecast_r = model_r.predict(future_r)
    forecast_r.index = forecast_r['ds']
    # forecast
    pred_r = pd.DataFrame(forecast_r['yhat'][len(z1):(len(z1) + delay)])
    pred_r = pred_r.reset_index()
    # --- completes realtime pred --- #

    train_end_index = len(z1.bw) - delay
    train_df = z1.bw.iloc[0:train_end_index]
    test_df = z1.bw.iloc[train_end_index:len(z1)]
    train_df = train_df.reset_index()
    test_df = test_df.reset_index()
    train_df.columns = ['ds', 'y']

    # --- removing outliers in trainset (same rule as above) --- #
    q50 = train_df.y.median()
    q100 = train_df.y.quantile(1)
    q75 = train_df.y.quantile(.75)
    if (q100 - q50) >= (2 * q75):
        train_df.loc[train_df.y >= (2 * q75), 'y'] = None

    test_df.columns = ['ds', 'y']

    # model
    model = Prophet(yearly_seasonality=False, changepoint_prior_scale=.2)
    model.fit(train_df)
    future = model.make_future_dataframe(periods=len(test_df), freq='H')
    forecast = model.predict(future)
    forecast.index = forecast['ds']
    # forecast
    pred = pd.DataFrame(forecast['yhat'][train_end_index:len(z1)])
    pred = pred.reset_index()
    pred_df = pd.merge(test_df, pred, on='ds', how='left')
    pred_df.dropna(inplace=True)

    df = pd.DataFrame()
    if len(pred_df) > 0:
        pred_df['error_test'] = pred_df.y - pred_df.yhat
        MSE = mse(pred_df.y, pred_df.yhat)
        RMSE = math.sqrt(MSE)
        pred_df['APE'] = abs(pred_df.error_test * 100 / pred_df.y)
        MAPE = pred_df.APE.mean()
        q98 = pred_df['APE'].quantile(0.98)
        # MAPE with the worst 2% of absolute percentage errors removed
        mape_q98 = pred_df['APE'][pred_df.APE < q98].mean()
        df = pd.DataFrame(
            {
                'length': len(z1),
                'test_rmse': RMSE,
                'test_mape': MAPE,
                'test_mape_98': mape_q98
            },
            index=[app_name])

    return df, model, forecast, pred_df, pred_r
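# A standalone sketch of the outlier rule used twice in create_prophet_m:
# when the gap between the maximum and the median is at least twice Q3,
# values above 2 * Q3 are blanked so Prophet treats them as missing.
# Toy series only, not the real z1.bw:
import pandas as pd

s = pd.Series([10.0, 12.0, 11.0, 13.0, 12.0, 500.0])  # one extreme spike
q50, q75, q100 = s.median(), s.quantile(.75), s.quantile(1)
if (q100 - q50) >= (2 * q75):
    s.loc[s >= (2 * q75)] = None  # the spike becomes a gap (NaN)
print(s)  # 500.0 -> NaN, the rest untouched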
def _fit_nonbayes(self):
    self.session = {"tf_session": None, "saver": None, "ensemble": None}

    # define tf variables
    self._init_MLPGaussianRegressor()

    self.session["tf_session"] = tf.Session()
    self.session["tf_session"].run(tf.global_variables_initializer())

    # don't want momentum/history of weights from optimization
    self.session["saver"] = tf.train.Saver(
        [_v for _v in tf.global_variables() if "RMSProp" not in _v.name])

    for model in self.session["ensemble"]:
        self.session["tf_session"].run(
            tf.assign(model.output_mean, self.train_data.target_mean))
        self.session["tf_session"].run(
            tf.assign(model.output_std, self.train_data.target_std))

    # keep value of minibatch loss so convergence can be checked at end
    self.loss = [[] for ii in range(self.Nensemble)]

    maxiter_per_minibatch = 10
    num_minibatch = max([1, int(self.maxiter / maxiter_per_minibatch)])

    for model_idx, model in enumerate(self.session["ensemble"]):
        cntr = 0
        for batch_itr in range(num_minibatch):
            # can train on distinct mini batches for each ensemble
            x, y = self.train_data.next_batch()
            feed = {model.input_data: x, model.target_data: y}
            for minibatch_iter in range(maxiter_per_minibatch):
                if self.method == "nonbayes_dropout":
                    feed.update({model.dr: self.method_args["keep_prob"]})
                if self.method == "nonbayes":
                    _, loss = self.session["tf_session"].run(
                        [model.train_op, model.loss_value], feed)
                if self.method == "nonbayes-mdn":
                    _ = self.session["tf_session"].run([model.train_op], feed)
                elif self.method == "nonbayes_dropout":
                    _, loss = self.session["tf_session"].run(
                        [model.train_op, model.nll], feed)
                if np.mod(cntr, 10) == 0:
                    self.loss[model_idx].append(loss)
                cntr += 1
                if np.mod(cntr, 100) == 0:
                    # decrease learning rate
                    self.session["tf_session"].run(tf.assign(
                        model.lr,
                        self.method_args["learning_rate"] *
                        (self.method_args["decay_rate"]**(cntr / 100))))

        if False:
            # do final local gradient descent on full data set
            model.set_optimizer(toy_argparse({
                "opt_method": "gradientdescent",
                "learning_rate": self.method_args["learning_rate"]}))
            feed = {
                model.input_data: self.train_data.xs_standardized,
                model.target_data: self.train_data.ys
            }
            loss_before = self.session["tf_session"].run([model.loss_value], feed)
            for cntr in range(self.maxiter):
                _ = self.session["tf_session"].run([model.train_op], feed)
            loss_after = self.session["tf_session"].run([model.loss_value], feed)
            print("loss before = {} loss after = {}".format(
                loss_before, loss_after))

    # for easy slicing upon analysis
    self.loss = np.asarray(self.loss)

    # pass in standardized data
    pred_mean, pred_std = self._predict_nonbayes(
        self.train_data.xs_standardized, self.Nensemble)
    rmse = np.sqrt(mse(self.train_data.ys, pred_mean))
    return rmse