Example #1
def best_split_lin_reg_dynamic(x, y):
    sort_i = np.argsort(x)

    n = len(y)

    xm = x[sort_i] - np.mean(x)
    ym = y[sort_i] - np.mean(y)

    xy_sum_false = 0
    x2_sum_false = 0
    xy_sum_true = np.sum(xm * ym)
    x2_sum_true = np.sum(xm ** 2)

    node_betta = xy_sum_true / x2_sum_true
    node_betta = 1 if np.isnan(node_betta) else node_betta
    node_score = mse(ym + np.mean(y), node_betta * xm)

    best_score = np.inf
    split_value = xm[0] + np.mean(x)
    split_ind = 0

    for i in range(1, n):
        xy_sum_false += xm[i] * ym[i]
        x2_sum_false += xm[i] ** 2

        xy_sum_true -= xm[i] * ym[i]
        x2_sum_true -= xm[i] ** 2

        # print(xm[i] + np.mean(x))  # debug: candidate split value

        false_betta = xy_sum_false / x2_sum_false
        # false_betta = 0 if np.isnan(false_betta) else false_betta
        false_ratio = i / float(n)
        false_score = mse(ym[:i] + np.mean(y), false_betta * xm[:i]) if len(ym[:i]) else 0

        true_betta = xy_sum_true / x2_sum_true
        # true_betta = 0 if np.isnan(true_betta) else true_betta
        true_ratio = (n - i) / float(n)
        true_score = mse(ym[i:] + np.mean(y), true_betta * xm[i:])

        # The weighted-gain value computed below is immediately overwritten;
        # the split is effectively chosen by the smaller of the two child MSEs.
        score = node_score - (false_ratio * false_score + true_ratio * true_score)
        scores = np.array([false_score, true_score])
        score = scores[np.argmin(scores)]

        if score < best_score:
            best_score = score
            split_value = x[sort_i[i]]
            split_ind = i

    return sort_i[:split_ind], sort_i[split_ind:], split_value, best_score
    def _calculate_mse(self, reconstruction_tof, original_tof, test_data):
        # reconstruction_tof = [i for j, i in enumerate(reconstruction_tof) if j % 5 == 0]
        # original_tof = [i for j, i in enumerate(original_tof) if j % 5 == 0]
        # test_data = [i for j, i in enumerate(test_data) if j % 5 == 0]
        # Dropping -1 in test_data together with corresponding tof
        test_data, reconstruction_tof, original_tof, _ = self._remove_unobserved_data(
            test_data, reconstruction_tof, original_tof
        )
        mse_recon = -1
        mse_origin = -1
        if len(reconstruction_tof) > 0 and len(original_tof) > 0:
            mse_recon = mse(test_data, reconstruction_tof)
            mse_origin = mse(test_data, original_tof)

        return mse_recon, mse_origin
Example #3
def best_split_lin_reg(x_vect, y):
    node_lg = LinearRegression(n_jobs=NUM_CORES).fit(x_vect[:, np.newaxis], y)
    node_score = mse(y, node_lg.predict(x_vect[:, np.newaxis]))

    best_score = -np.inf
    best_split_value = None
    best_true_inds = None
    best_false_inds = None

    for split_value in np.unique(x_vect):
        true_inds = x_vect > split_value
        true_ratio = np.sum(true_inds) / float(len(y))
        true_score = ling_reg_score(true_inds, x_vect, y)

        false_inds = np.invert(true_inds)
        false_ratio = 1 - true_ratio
        false_score = ling_reg_score(false_inds, x_vect, y)

        score = node_score - (true_ratio * true_score + false_ratio * false_score)

        if score > best_score:
            best_score = score
            best_split_value = split_value
            best_true_inds = true_inds
            best_false_inds = false_inds

    return best_false_inds, best_true_inds, best_split_value, best_score
Example #4
def fit_and_predict(clf, rgr, X, y_clf, y_rgr, train, test, out_folder, fold):
    clf_model = clf.fit(X[train], y_clf[train])
        
    y_clf_true = y_clf[test]
    y_rgr_true = y_rgr[test]
    y_clf_pred = clf_model.predict(X[test])
    
    class_scores = np.array(precision_recall_fscore_support(y_clf_true,
                                                            y_clf_pred))
    micro_f1 = f1_score(y_clf_true, y_clf_pred, average='micro')
    macro_f1 = f1_score(y_clf_true, y_clf_pred, average='macro')
    
    rgr_model = rgr.fit(X[train], y_rgr[train])
    y_rgr_pred = rgr_model.predict(X[test])
    
    general_r2 = r2_score(y_rgr_true, y_rgr_pred)
    mse_score  = mse(y_rgr_true, y_rgr_pred)
    mrse_score = mrse(y_rgr_true, y_rgr_pred)
    
    clf_pred_fpath = os.path.join(out_folder, '%d-clf.pred' % fold)
    clf_true_fpath = os.path.join(out_folder, '%d-clf.true' % fold)
    
    rgr_pred_fpath = os.path.join(out_folder, '%d-rgr.pred' % fold)
    rgr_true_fpath = os.path.join(out_folder, '%d-rgr.true' % fold)
    
    np.savetxt(clf_pred_fpath, y_clf_pred, fmt="%d")
    np.savetxt(clf_true_fpath, y_clf_true, fmt="%d")
    
    np.savetxt(rgr_pred_fpath, y_rgr_pred)
    np.savetxt(rgr_true_fpath, y_rgr_true)
    
    return class_scores, micro_f1, macro_f1, general_r2, mse_score,\
            mrse_score
Example #5
def random_forest(X_train, y_train, y_test, X_test, num_trees=100):
	model = RandomForestRegressor(n_estimators=num_trees, oob_score=True)
	model.fit(X_train, y_train)
	prediction = model.predict(X_test)
	mean_squared_error = mse(y_test, model.predict(X_test))
	r2 = model.score(X_test, y_test)
	return (mean_squared_error, r2)
Example #6
def eval_sts(ycat, y, name):
    """ Evaluate given STS regression-classification predictions and print results. """
    ypred = loader.sts_categorical2labels(ycat)
    pr = pearsonr(ypred, y)[0]
    print('%s Pearson: %f' % (name, pr,))
    print('%s Spearman: %f' % (name, spearmanr(ypred, y)[0],))
    print('%s MSE: %f' % (name, mse(ypred, y),))
    return pr
Example #7
def ling_reg_score(indices, x, y):
    if not np.any(indices):
        return 1
    leaf_x = x[indices][:, np.newaxis]
    leaf_y = y[indices]
    lg = LinearRegression(n_jobs=NUM_CORES).fit(leaf_x, leaf_y)
    score = mse(leaf_y, lg.predict(leaf_x))
    return score
Example #8
def linear_regression(model, features, target, y_test, X_test):
	model.fit(features, target)
	intercept = model.intercept_
	coef = np.hstack([intercept, model.coef_])
	prediction = model.predict(X_test)
	mean_squared_error = mse(y_test, model.predict(X_test))
	r2 = model.score(X_test, y_test)
	return (mean_squared_error, r2)
Example #9
def fit_select_best(X, y):
    """
    Selects the best fit of the estimators already implemented by choosing the
    model with the smallest mean square error metric for the trained values.
    """
    models = [fit(X,y) for fit in [fit_linear, fit_quadratic]]
    errors = map(lambda model: mse(y, model.predict(X)), models)

    return min(zip(models, errors), key=itemgetter(1))[0]
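The fit_linear and fit_quadratic helpers are not shown in this example; a minimal hypothetical sketch of what they might look like, assuming plain scikit-learn estimators:

# Hypothetical helpers assumed by fit_select_best above; not part of the original example.
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures


def fit_linear(X, y):
    # Ordinary least-squares fit on the raw features.
    return LinearRegression().fit(X, y)


def fit_quadratic(X, y):
    # Least-squares fit on degree-2 polynomial features.
    return make_pipeline(PolynomialFeatures(degree=2), LinearRegression()).fit(X, y)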
Example #10
File: vizDr.py  Project: dot2dotseurat/viz
def regress(attributes, targets, model):
    # Split data into 'test' and 'train' for cross validation
    splits = cv.train_test_split(attributes, targets, test_size=0.2)
    X_train, X_test, y_train, y_test = splits

    model.fit(X_train, y_train)
    y_true = y_test
    y_pred = model.predict(X_test)
    print("Mean squared error = {:0.3f}".format(mse(y_true, y_pred)))
    print("R2 score = {:0.3f}".format(r2_score(y_true, y_pred)))
def linear(model, features, target, y_test, X_test):
	model.fit(features, target)
	intercept = model.intercept_
	coef = np.hstack([intercept, model.coef_])
	if X_test is not None:
		prediction = model.predict(X_test)
		mean_squared_error = mse(y_test, model.predict(X_test))
		r2 = model.score(X_test, y_test)
		return mean_squared_error, r2
	else:
		return coef
def validate_k_fold(df, poly_degree, n_folds=5, trainer=train_model):
    features, prices = get_regression_ctx(df, poly_degree)
    mse_scores = []

    for train_idx, val_idx in KFold(len(df), n_folds=n_folds):
        features_train, features_val = features[train_idx], features[val_idx]
        prices_train, prices_val = prices[train_idx], prices[val_idx]
        model = trainer(features_train, prices_train)
        mse_scores.append(mse(prices_val, model.predict(features_val)))

    return pd.Series(mse_scores).mean()
Example #13
File: train.py  Project: aigamedev/nuclai15
def main():
    # get the processed data
    X,y = preprocess_data()

    # Get the dummy clf: very important, it creates a baseline!
    # (A hypothetical sketch of get_dummy_clf follows this example.)
    dummy_clf = get_dummy_clf()
    dummy_clf.fit(X, y)
    y_hat = dummy_clf.predict(y)

    # Get the baseline predictions for x and y
    print "Dummy MSE x", mse(y[:,0], y_hat[:,0])
    print "Dummy MSE y", mse(y[:,1], y_hat[:,1])

    # create 5 different crossvalidation folds
    ss = ShuffleSplit(len(y), n_iter=5, random_state=0)

    scores_x = []
    scores_y = []
    for i, (train_index, test_index) in enumerate(ss):
        # Choose a classifier
        #clf = get_linear_clf()
        clf = get_nn_clf()
        clf.fit(X[train_index], y[train_index])
        y_hat = clf.predict(X[test_index])

        # Save the score for each fold
        score_x = mse(y[test_index,0], y_hat[:,0])
        score_y = mse(y[test_index,1], y_hat[:,1])


        # You can print the coefficients/intercept for the linear classifier
        #print clf.steps[-1][1].coef_,clf.steps[-1][1].intercept_

        scores_x.append(score_x)
        scores_y.append(score_y)
        print scores_x,scores_y


    print "MSE CV x", np.array(scores_x).mean()
    print "MSE CV y", np.array(scores_y).mean()
    def calculate_mse(self, region, data, month, year):
        if self.spectrum_selection != 0:
            reconstruction_tof, original_tof = self.reconstruct_tof_from_spectrum(
                region, self.addition_technique, self.spectrum_selection
            )
        else:
            reconstruction_tof, original_tof = self.reconstruct_tof_from_spectrum(
                region, self.addition_technique
            )
        test_data = trajectories_full_dates_periodic(
            data, month, year, self.length_of_periodicity,
            self.window_interval, self.minute_interval
        )
        reconstruction_tof = [i for j, i in enumerate(reconstruction_tof) if j % 5 == 0]
        original_tof = [i for j, i in enumerate(original_tof) if j % 5 == 0]
        test_data = [i for j, i in enumerate(test_data) if j % 5 == 0]
        # Dropping -1 in test_data together with corresponding tof
        test_data, reconstruction_tof, original_tof, _ = self._remove_unobserved_data(
            test_data, reconstruction_tof, original_tof
        )
        mse_recon = -1
        mse_origin = -1
        if len(reconstruction_tof) > 0 and len(original_tof) > 0:
            mse_recon = mse(test_data, reconstruction_tof)
            mse_origin = mse(test_data, original_tof)
        rospy.loginfo("Calculated MSE for original tof Region %s: %.2f" % (region, mse_origin))
        rospy.loginfo("Calculated MSE for reconstruction tof Region %s: %.2f" % (region, mse_recon))

        # temp_recon = np.sqrt(mse_recon)
        # sum_recon = 0
        # for i in test_data:
        #     sum_recon += (i - temp_recon)**2
        # print "std_dev: %f" % (np.sqrt(sum_recon / float(len(test_data) - 1)))
        # temp_recon = np.sqrt(mse_origin)
        # sum_recon = 0
        # for i in test_data:
        #     sum_recon += (i - temp_recon)**2
        # print "std_dev: %f" % (np.sqrt(sum_recon / float(len(test_data) - 1)))

        return mse_recon, mse_origin
Example #15
def evaluate(model, seed=1234, evaltest=False):
    """
    Run experiment
    """
    print 'Preparing data...'
    train, dev, test, scores = load_data()
    train[0], train[1], scores[0] = shuffle(train[0], train[1], scores[0], random_state=seed)
    
    print 'Computing training skipthoughts...'
    trainA = skipthoughts.encode(model, train[0], verbose=False, use_eos=True)
    trainB = skipthoughts.encode(model, train[1], verbose=False, use_eos=True)
    
    print 'Computing development skipthoughts...'
    devA = skipthoughts.encode(model, dev[0], verbose=False, use_eos=True)
    devB = skipthoughts.encode(model, dev[1], verbose=False, use_eos=True)

    print 'Computing feature combinations...'
    trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
    devF = np.c_[np.abs(devA - devB), devA * devB]

    print 'Encoding labels...'
    trainY = encode_labels(scores[0])
    devY = encode_labels(scores[1])

    print 'Compiling model...'
    lrmodel = prepare_model(ninputs=trainF.shape[1])

    print 'Training...'
    bestlrmodel = train_model(lrmodel, trainF, trainY, devF, devY, scores[1])

    if evaltest:
        print 'Computing test skipthoughts...'
        testA = skipthoughts.encode(model, test[0], verbose=False, use_eos=True)
        testB = skipthoughts.encode(model, test[1], verbose=False, use_eos=True)

        print 'Computing feature combinations...'
        testF = np.c_[np.abs(testA - testB), testA * testB]

        print 'Evaluating...'
        r = np.arange(1,6)
        yhat = np.dot(bestlrmodel.predict_proba(testF, verbose=2), r)
        pr = pearsonr(yhat, scores[2])[0]
        sr = spearmanr(yhat, scores[2])[0]
        se = mse(yhat, scores[2])
        print 'Test Pearson: ' + str(pr)
        print 'Test Spearman: ' + str(sr)
        print 'Test MSE: ' + str(se)

        return yhat
Example #16
def gradient(X_train, y_train, y_test, X_test, file_loc, target):
	'''
    Runs a grid search to pick the best parameters for each gradient-boosted model,
    depending on the target variable we are trying to predict.
    (A hypothetical sketch of the grid_search helper follows this example.)
	'''
	grid = grid_search(file_loc, target)
	best_params = grid.best_params_
	learn_rate = best_params['learning_rate']
	n_estimators = best_params['n_estimators']
	max_feat = best_params['max_features']
	model = GradientBoostingRegressor(learning_rate=learn_rate,  n_estimators=n_estimators, max_features=max_feat)
	model.fit(X_train, y_train)
	prediction = model.predict(X_test)
	mean_squared_error = mse(y_test, model.predict(X_test))
	r2 = model.score(X_test, y_test)
	return (mean_squared_error, r2)
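The grid_search helper called above is not shown in this example; a minimal hypothetical sketch, assuming the data at file_loc is a CSV whose columns include the target:

# Hypothetical sketch of the grid_search helper assumed above; not part of the original example.
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV


def grid_search(file_loc, target):
    df = pd.read_csv(file_loc)
    X = df.drop(columns=[target])
    y = df[target]
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': [100, 300, 500],
        'max_features': ['sqrt', None],
    }
    grid = GridSearchCV(GradientBoostingRegressor(), param_grid, cv=3)
    grid.fit(X, y)
    return grid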
Example #17
def run_grid_search(m, parameters, params, name, Xtrain, Ytrain, Xtest, Ytest):
	print('=' * 80)
	print("Training %s Model" % name)
	print('=' * 80)
	t0 = time()

	clf = RandomizedSearchCV(m, parameters, cv=3, n_jobs=4, verbose=3, error_score=0)
	clf.fit(Xtrain, Ytrain)
	Yhat = clf.predict(Xtest)
	print("\tDone in %1.2f seconds" % float(time() - t0))
	print("\tScore: %1.2f\n" % mse(Yhat, Ytest))

	print("Best Parameters" + str(clf.best_params_))
	print("Writing Solution")
	submit = pd.DataFrame(data={'id': ids, 'quality': Yhat})
	submit.to_csv('./submissions/'+name+'.csv', index = False)
Example #18
def eval_sts(ycat, y, name, quiet=False):
    """ Evaluate given STS regression-classification predictions and print results. """
    if ycat.ndim == 1:
        ypred = ycat
    else:
        ypred = loader.sts_categorical2labels(ycat)
    if y.ndim == 1:
        ygold = y
    else:
        ygold = loader.sts_categorical2labels(y)
    pr = pearsonr(ypred, ygold)[0]
    if not quiet:
        print('%s Pearson: %f' % (name, pr,))
        print('%s Spearman: %f' % (name, spearmanr(ypred, ygold)[0],))
        print('%s MSE: %f' % (name, mse(ypred, ygold),))
    return pr
def _frame_indexing(image_idx, prob, name):
    image_matches = {}
    image_match = None

    if image_idx == -1:
        frame_prob = prob
        image_match = image_matches[name] = {}
    else:
        frame_prob = _map_frame_prob(vd_df['prob'][image_idx])
        image_match = image_matches[vd_df['frame'][image_idx]] = {}

    for i in range(0, len(vd_df) - 1):
        mse_prob = mse(frame_prob, _map_frame_prob(vd_df['prob'][i + 1]))
        image_match[mse_prob] = {}
        image_match[mse_prob]['img'] = vd_df['frame'][i + 1]

    return image_matches
    def prediction_accuracy(self, region, data, month, year, percentile=0.1, plot=False):
        if self.spectrum_selection != 0:
            reconstruction_tof, original_tof = self.reconstruct_tof_from_spectrum(
                region, self.addition_technique, self.spectrum_selection
            )
        else:
            reconstruction_tof, original_tof = self.reconstruct_tof_from_spectrum(
                region, self.addition_technique
                # region, False, 26
            )
        test_data = trajectories_full_dates_periodic(
            data, month, year, self.length_of_periodicity,
            self.window_interval, self.minute_interval
        )
        original_predict = self._get_prediction(original_tof, test_data, percentile)
        reconstr_predict = self._get_prediction(reconstruction_tof, test_data, percentile)
        _, clean_ori_pred, clean_recon_pred, indices = self._remove_unobserved_data(
            test_data, reconstr_predict, original_predict
        )
        if len(clean_ori_pred) and len(clean_recon_pred):
            mse_predict = mse(clean_ori_pred, clean_recon_pred)
        else:
            mse_predict = -1
        rospy.loginfo(
            "Calculated MSE for prediction between original and reconstruction for Region %s: %.2f" % (region, mse_predict)
        )
        if plot:
            for index in indices:
                original_predict[index] = -1
                reconstr_predict[index] = -1
            x = np.linspace(0, len(test_data), len(test_data))
            xticks = time_ticks(self.minute_interval, self.window_interval, self.periodic_type)
            plt.plot(
                x, original_predict, "-o", label="Prediction Original TOF"
            )
            plt.plot(
                x, reconstr_predict, "-^", label="Prediction Reconstruction TOF"
            )
            plt.title("Prediction for Region %s" % region)
            plt.xticks(x, xticks, rotation="vertical")
            plt.xlabel("One Week Period with %d minutes interval and %d window time" % (self.minute_interval, self.window_interval))
            plt.ylabel("Prediction (1=Anomalous, 0=Normal, -1=Unobserved)")
            plt.ylim(ymin=-2, ymax=2)

            plt.legend()
            plt.show()
Example #21
    def on_epoch_end(self, epoch, logs={}):
        self.train_losses.append(mse(self.y_train, self.model.predict(self.X_train)))
        self.val_losses.append(logs.get('val_loss'))        

        if self.score_func == 'accuracy':
            true_train = np_utils.probas_to_classes(self.y_train)
            pred_train = np_utils.probas_to_classes(self.model.predict(self.X_train))
            self.add_train_scores.append(accuracy_score(true_train, pred_train))

            true_test = np_utils.probas_to_classes(self.y_test)
            pred_test = np_utils.probas_to_classes(self.model.predict(self.X_test))
            val_score = accuracy_score(true_test, pred_test)
            self.add_val_scores.append(val_score)
        elif self.score_func == 'r2_score':
            val_score = r2_score(self.y_test, self.model.predict(self.X_test))
            self.add_val_scores.append(val_score)
            self.add_train_scores.append(r2_score(self.y_train, self.model.predict(self.X_train)))    
            
        self.best_score = max(self.best_score, val_score)
        self.printCurrentStage(epoch)
def Result_Evaluation (outputpath, accuracy, testing_Labels, predict_Labels):
    acc_rate = [0, 0, 0, 0, 0]
    testingSamples = len(testing_Labels)
    if os.path.isfile(outputpath):
        os.remove(outputpath)
    with io.open(outputpath, 'a', encoding='utf-8') as output_file:
        for i in xrange(0, testingSamples):
            rounded_result = int(round(predict_Labels[i]))
            if rounded_result == testing_Labels[i]:
                acc_rate[0] += 1
                result_item = str(i) + ": " + str(predict_Labels[i]) + " - " + str(testing_Labels[i]) + " --> spot on!\n"
                output_file.write(unicode(result_item))
            elif abs(rounded_result - testing_Labels[i])<=1:
                acc_rate[1] += 1
                result_item = str(i) + ": " + str(predict_Labels[i]) + " - " + str(testing_Labels[i]) + " --> off by 1 star\n"
                output_file.write(unicode(result_item))
            elif abs(rounded_result - testing_Labels[i])<=2:
                acc_rate[2] += 1
                result_item = str(i) + ": " + str(predict_Labels[i]) + " - " + str(testing_Labels[i]) + " --> off by 2 star\n"
                output_file.write(unicode(result_item))
            elif abs(rounded_result - testing_Labels[i])<=3:
                acc_rate[3] += 1
                result_item = str(i) + ": " + str(predict_Labels[i]) + " - " + str(testing_Labels[i]) + " --> off by 3 star\n"
                output_file.write(unicode(result_item))
            else:
                acc_rate[4] += 1
                result_item = str(i) + ": " + str(predict_Labels[i]) + " - " + str(testing_Labels[i]) + " --> off by 4 star\n"
                output_file.write(unicode(result_item))

        #output_file.write(unicode(additional_info))
        finalResult = " #spot on: " + str(acc_rate[0]) + '\n' + " #off by 1 star: " + str(acc_rate[1]) + '\n' + " #off by 2 star: " + str(acc_rate[2]) + '\n' + " #off by 3 star: " + str(acc_rate[3]) + '\n' + " #off by 4 star: " + str(acc_rate[4]) + '\n'
        output_file.write(unicode(finalResult))

        finalResultPercentage = " #spot on: " + str(acc_rate[0]*1.0/testingSamples) + '\n' + " #off by 1 star: " + str(acc_rate[1]*1.0/testingSamples) + '\n' + " #off by 2 star: " + str(acc_rate[2]*1.0/testingSamples) + '\n' + " #off by 3 star: " + str(acc_rate[3]*1.0/testingSamples) + '\n' + " #off by 4 star: " + str(acc_rate[4]*1.0/testingSamples) + '\n'
        output_file.write(unicode(finalResultPercentage))
        print(" #Right: " + str(acc_rate[0]*1.0/testingSamples) + '\n')
        print(" #Wrong: " + str((acc_rate[2]+acc_rate[3]+acc_rate[4])*1.0/testingSamples) + '\n')
        r2Score = r2_score(testing_Labels, predict_Labels)
        print(" #R2 score: " + str(r2Score))
        print (" #sqrt(mse): {:f}".format(np.sqrt(mse(testing_Labels, predict_Labels))))
        print("Look at the evaluation_file for details!")
Example #23
            def evaluate(dsA, dsB, _scores):
                tA, tB, idsA, idsB, lengthsA, lengthsB = None, None, None, None, None, None
                e_off = 0
                ps = np.zeros((len(dsA), 5))
                op_weights_monitor = {w.name[-11:]:[] for w in op_weights}

                while e_off < len(dsA):
                    tA, tB, idsA, idsB, lengthsA, lengthsB = batchify(dsA[e_off:e_off + batch_size],
                                                                      dsB[e_off:e_off + batch_size],
                                                                      vocab["<padding>"],
                                                                      tA, tB, idsA, idsB, lengthsA, lengthsB,
                                                                      max_length=max_l, max_batch_size=batch_size)
                    size = min(len(dsA) - e_off, batch_size)
                    allowed_conds = ["/cond_%d/" % (2*i) for i in range(min(np.min(lengthsA),np.min(lengthsB)))]
                    current_weights = [w for w in op_weights if any(c in w.name for c in allowed_conds)]
                    random.shuffle(current_weights)
                    result = sess.run([model["probs"]] + current_weights[:10],
                                    feed_dict={model["inpA"]: tA[:,:size],
                                               model["inpB"]: tB[:,:size],
                                               model["idsA"]: idsA[:,:size],
                                               model["idsB"]: idsB[:,:size],
                                               model["lengthsA"]: lengthsA[:size],
                                               model["lengthsB"]: lengthsB[:size]})
                    ps[e_off:e_off+batch_size] = result[0]
                    e_off += batch_size
                    for probs, w in zip(result[1:], current_weights):
                        op_weights_monitor[w.name[-11:]].extend(probs.tolist())

                for k,v in op_weights_monitor.items():
                    hist, _ = np.histogram(np.array(v), bins=5,range=(0.0,1.0))
                    hist = (hist * 1000) / np.sum(hist)
                    print(k, hist.tolist())

                r = np.arange(1,6)
                yhat = np.dot(ps, r)
                pr = pearsonr(yhat, _scores)[0]
                sr = spearmanr(yhat, _scores)[0]
                se = mse(yhat, _scores)
                return pr, sr, se
Example #24
def run_experiment(X, y_clf, y_rgr, feature_ids, out_foldpath, k=500):
    clf, rgr = create_learners()
    
    n = len(y_clf)
    train_index = np.ones(n, dtype=np.bool)
    train_index[-k:] = False
    test_index = np.logical_not(train_index)
    
    clf_model = clf.fit(X[train_index], y_clf[train_index]) 
    rgr_model = rgr.fit(X[train_index], y_rgr[train_index])
    
    clf_true = y_clf[test_index]
    clf_pred = clf_model.predict(X[test_index])
    
    rgr_true = y_rgr[test_index]
    rgr_pred = rgr_model.predict(X[test_index])
    
    clf_pred_fpath = os.path.join(out_foldpath, '%clf.pred')
    clf_true_fpath = os.path.join(out_foldpath, '%clf.true')
    
    rgr_pred_fpath = os.path.join(out_foldpath, '%rgr.pred')
    rgr_true_fpath = os.path.join(out_foldpath, '%rgr.true')
    
    np.savetxt(clf_pred_fpath, clf_pred, fmt="%d")
    np.savetxt(clf_true_fpath, clf_true, fmt="%d")
    
    np.savetxt(rgr_pred_fpath, rgr_pred)
    np.savetxt(rgr_true_fpath, rgr_true)
    
    print('Micro F1: ', f1_score(clf_true, clf_pred, average='micro'))
    print('Macro F1: ', f1_score(clf_true, clf_pred, average='macro'))
    print()
    print('R2: ', r2_score(rgr_true, rgr_pred))
    print('MAE: ', mae(rgr_true, rgr_pred))
    print('MSE: ', mse(rgr_true, rgr_pred))
    print()
    print_importance(feature_ids, 
                     clf_model.best_estimator_.feature_importances_,
                     rgr_model.best_estimator_.feature_importances_)
Example #25
def evaluate(seed=1234, evaltest=False):
    """
    Run experiment
    """
    print 'Preparing data...'
    X = np.genfromtxt(os.path.join(FLAGS.data_dir, FLAGS.relatedness_regression_factors))
    print(X.size)

    y = np.genfromtxt(os.path.join(FLAGS.data_dir, FLAGS.relatedness_regression_targets))
    print(y.size)

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    print 'Encoding labels...'
    trainY = encode_labels(y_train)
    devY = encode_labels(y_test)

    print 'Compiling model...'
    lrmodel = prepare_model(ninputs=len(X[0]))

    print 'Training...'
    bestlrmodel = train_model(lrmodel, X_train, trainY, X_test, devY, y_test)

    if evaltest:

        print 'Evaluating...'
        r = np.arange(1, 6)
        yhat = np.dot(bestlrmodel.predict_proba(X_test, verbose=2), r)
        pr = pearsonr(yhat, y_test)[0]
        sr = spearmanr(yhat, y_test)[0]
        se = mse(yhat, y_test)
        print 'Test Pearson: ' + str(pr)
        print 'Test Spearman: ' + str(sr)
        print 'Test MSE: ' + str(se)

        return yhat
Example #26
def gd(train_X,
       train_y,
       test_X,
       test_y,
       learning_rate=0.0001,
       iterations=1000,
       lr_dampening=1.0,
       reg=0):
    [m, n] = train_X.shape
    w = np.asarray(np.hstack((100, train_X.mean())))[:, np.newaxis]
    #w = np.random.randn(n + 1, 1)
    train_X = np.concatenate((np.ones((m, 1)), train_X), axis=1)

    train_y = train_y.as_matrix()

    if train_y.shape == (1, m):
        train_y = train_y.T
    if train_y.shape != (m, 1):
        print('train_y shape should be m rows, 1 column')

    loss_history = np.zeros(
        (iterations, 2))  # keeps track of MSE [train, test]
    residual_history = np.zeros(
        (iterations, 2))  # keeps track of MAE [train, test]
    #grad_history = np.ndarray(shape=(1,n+1)) # tracks the gradient itself (debugging)
    #w_history = w # tracks weights (debugging)

    for it in range(iterations):
        predictions = np.dot(train_X, w)
        loss = predictions - train_y
        loss_train = mse(train_y, predictions)
        residual_train = mae(train_y, predictions)

        predictions = w[0] + np.dot(test_X, w[1:])
        loss_test = mse(test_y, predictions)
        residual_test = mae(test_y, predictions)

        loss_history[it] = [loss_train, loss_test]
        residual_history[it] = [residual_train, residual_test]

        grad = np.dot(train_X.T, loss) / m
        #grad_history = np.append(grad_history, grad.T, axis=0)

        if ((it + 1) % 100) == 0:
            print('it=%4d, loss=%.3f, residual=%.3f 1000*lr=%.12f' %
                  (it + 1, loss_train, residual_train, 1000 * learning_rate))

        if np.isnan(np.sum(grad)) or np.isinf(np.sum(grad)):
            print('NaN or Inf detected, stopping at it=' + str(it))
            break

        #                learning term                 regularization term
        w = w - (learning_rate /
                 (1 + it)) * grad + reg * np.dot(w.ravel(), w.ravel())
        learning_rate = lr_dampening * learning_rate
        #w_history = np.append(w_history, w)

        if np.isnan(np.sum(w)) or np.isinf(np.sum(w)):
            print('NaN or Inf detected after w update, stopping at it=' +
                  str(it))
            break

    return w.flatten(
    ), loss_history, residual_history  #, grad_history #, w_history
data_dir_path = path.join('..', 'dataset', 'target', target)
X_train, y_train, X_test, y_test = \
    read_data_from_dataset(data_dir_path)

period = (len(y_train) + len(y_test)) // 30
RPG = ReccurentPredictingGenerator(X_test, batch_size=1, timesteps=period)
prediction = []

for path_model in tqdm(listdir(path.join(write_result_out_dir, 'model'))):
    file_path = path.join(write_result_out_dir, 'model', path_model)
    best_model = load_model(file_path)
    y_test_pred = best_model.predict_generator(RPG)
    prediction.append(y_test_pred)

prediction = np.array(prediction)
list_score = []
size_test = prediction.shape[1]
y_test = y_test[-size_test:]
for i_prediction in range(prediction.shape[0])[:1]:
    pred = np.mean(prediction[:i_prediction + 1], axis=0)
    accuracy = mse(y_test, pred.flatten())
    list_score.append(accuracy)

np.save('sru', prediction)

plt.rcParams['font.size'] = 25
plt.figure(figsize=(15, 7))
plt.plot(list_score)
plt.xlabel('the number of subsets / -')
plt.ylabel('MSE / -')
plt.savefig('bagging_sru')
Example #28
#ANN
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(
    Dense(30,
          activation='relu',
          input_dim=len(x_train.T),
          kernel_initializer='normal'))
model.add(Dense(30, activation='relu', kernel_initializer='normal'))
model.add(Dense(20, activation='relu', kernel_initializer='normal'))
model.add(Dense(10, activation='relu', kernel_initializer='normal'))
model.add(Dense(1, kernel_initializer='normal'))

model.compile(loss='mse', optimizer='adam')

model.fit(x_train, y_train, epochs=70, batch_size=30)

ypred = model.predict(x_test)
ypred = np.ravel(ypred)

#Inverse Transforming
ypred = sc1.inverse_transform(ypred)
y_test = sc1.inverse_transform(y_test)

ydiff = y_test - ypred

#Checking Data Validity.
rmse = np.sqrt(mse(y_test, ypred))
print('\nRoot Mean Square: ', rmse)
Example #29
# gbrt = GradientBoostingRegressor(max_depth = 2, n_estimators = 3, learning_rate = 1.0)
# gbrt.fit(X[:, None], y.ravel())

# One more example set, in addition to the one above
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import mean_squared_error as mse

X = X[:, None]
y = y.ravel()

X_train, X_val, y_train, y_val = tts(X, y)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

errors = [mse(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors)

gbrt_best = GradientBoostingRegressor(max_depth=2,
                                      n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)

y_pred = gbrt_best.predict(X_val)
# In fact, early stopping does not require training a large number of trees and then searching back for the best one; training can simply be stopped early (warm_start = True).
# The following code stops training once the validation error has failed to improve for 5 consecutive iterations
# (a completed sketch of this loop follows the commented lines below).
# gbrt_test = GradientBoostingRegressor(max_depth = 2, warm_start = True)

# min_val_error = float("inf")
# error_going_up = 0
# for n_estimators in range(1, 120):
#     gbrt_test.n_estimators = n_estimators
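A minimal completed sketch of that early-stopping loop (not part of the original example), assuming the X_train/X_val/y_train/y_val split created above:

# Sketch: incremental training with warm_start, stopping after 5 consecutive
# iterations without improvement in validation MSE.
gbrt_ws = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt_ws.n_estimators = n_estimators
    gbrt_ws.fit(X_train, y_train)  # warm_start keeps the trees fitted so far
    val_error = mse(y_val, gbrt_ws.predict(X_val))
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break  # stop: validation error has not improved for 5 iterations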
Example #30
        learning_rate='adaptive',
        learning_rate_init=0.001,
        power_t=0.5,
        max_iter=175,
        shuffle=True,
        random_state=None,
        tol=0.000011,
        verbose=False,
        warm_start=False,
        momentum=0.5,
        nesterovs_momentum=True,
        early_stopping=False,
        validation_fraction=0.1,
        #                       beta_1=0.9,
        #                       beta_2=0.999,
        #                       epsilon=1e-08
    )

    mlp.fit(Xt, Yt)
    YHat = mlp.predict(Xv)
    err = mse(YHat, Yv)
    print('size:', i, 'err', err)

Xte, _ = ml.transforms.rescale(Xte, param)
Yhat = mlp.predict(Xte)
fh = open('pred_nnet%d.csv' % i, 'w')
fh.write('ID,Prediction\n')
for i, yi in enumerate(Yhat):
    fh.write('{},{}\n'.format(i, yi))

fh.close()
Example #31
stack_trn = np.vstack([pred_trn_lgb['pred'], pred_trn_xgb['pred']]).transpose()
stack_tst = np.vstack([pred_lgb['pred'], pred_xgb['pred']]).transpose()

stack_folds = KFold(n_splits=4, random_state=19960101)
stack_oof = np.zeros(stack_trn.shape[0])
pred_tst['pred'] = np.zeros(stack_tst.shape[0])

for _fold, (trn_idx,
            val_idx) in enumerate(stack_folds.split(stack_trn, df_trn['收率'])):
    trn_x, trn_y = stack_trn[trn_idx], df_trn['收率'].iloc[trn_idx].values
    val_x, val_y = stack_trn[val_idx], df_trn['收率'].iloc[val_idx].values

    clf_3 = BayesianRidge()
    clf_3.fit(trn_x, trn_y)

    stack_oof[val_idx] = clf_3.predict(val_x)
    pred_tst['pred'] += clf_3.predict(stack_tst) / 4
print('\nThe Bagging Loss', mse(df_trn['收率'].values, stack_oof))
del val_x, val_y, trn_x, trn_y, trn_idx, val_idx, cat_features
del params_lgb, params_xgb, fit_params
del pred_trn_xgb, pred_trn_lgb
del _fold, clf_3, stack_oof, stack_folds, stack_trn, stack_tst
del KFold, RepeatedKFold, BayesianRidge, trn, tst
gc.collect()

# pred_tst1 = pred_tst.copy()
# pred_tst['pred'] = pred_tst1['pred']*0.5 + pred_tst2['pred']*0.5
pred_tst.to_csv(f'submit/submit_{datetime.now().strftime("%m%d%H%M")}.csv',
                index=False,
                header=None)
def normalizedRMSE(real,predicted):

    return np.sqrt(mse(real,predicted))/(np.concatenate((real,predicted)).max() - np.concatenate((real,predicted)).min())
Example #33
def create_prophet_m(app_name,z1,cpu_perc_list,delay=24):
    
    ### --- For realtime pred ---###
    
    full_df = z1.bw.iloc[0:len(z1)]
    full_df = full_df.reset_index()
    full_df.columns = ['ds','y']
    
    #removing outliers
    q50 = full_df.y.median()
    q100 = full_df.y.quantile(1)
    q75  = full_df.y.quantile(.75)
    
    if((q100-q50) >= (2*q50)):
        
        full_df.loc[full_df.y>=(2*q50),'y'] = None
    
    #-- Realtime prediction --##
    #model 
    model_r = Prophet(yearly_seasonality=False,changepoint_prior_scale=.1,seasonality_prior_scale=0.05)
    model_r.fit(full_df)

    cpu_perc_list.append(py.cpu_percent())
    cpu_perc_list = [max(cpu_perc_list)]

    future_r = model_r.make_future_dataframe(periods=delay,freq='D')
    forecast_r = model_r.predict(future_r)
    forecast_r.index = forecast_r['ds']
    #forecast 
    pred_r = pd.DataFrame(forecast_r['yhat'][len(z1):(len(z1)+delay)])
    pred_r=pred_r.reset_index()
    #--- completes realtime pred ---#
    
    train_end_index=len(z1.bw)-delay
    train_df=z1.bw.iloc[0:train_end_index]
    
    test_df=z1.bw.iloc[train_end_index:len(z1)]
    
    train_df=train_df.reset_index()
    test_df=test_df.reset_index()
    
    train_df.columns=['ds','y']
    
    #--- removing outliers in trainset  ---#
    
    q50 = train_df.y.median()
    q100 = train_df.y.quantile(1)
    q75  = train_df.y.quantile(.75)
    
    if((q100-q50) >= (2*q50)):
        
        train_df.loc[train_df.y>=(2*q50),'y'] = None
    
    test_df.columns=['ds','y']
    test_df['ds'] = pd.to_datetime(test_df['ds'])
   
    #model 
    model = Prophet(yearly_seasonality=False,changepoint_prior_scale=.1,seasonality_prior_scale=0.05)
    model.fit(train_df)

    cpu_perc_list.append(py.cpu_percent())
    cpu_perc_list = [max(cpu_perc_list)]


    future = model.make_future_dataframe(periods=len(test_df),freq='D')
    forecast = model.predict(future)
    forecast.index = forecast['ds']
    #forecast 
    pred = pd.DataFrame(forecast['yhat'][train_end_index:len(z1)])
    
    print('length forecasted non realtime=',len(pred))
    pred=pred.reset_index()
    pred_df=pd.merge(test_df,pred,on='ds',how='left')
    
    pred_df.dropna(inplace=True)
    
    
    df=pd.DataFrame()
    
    if(len(pred_df)>0):
        
        pred_df['error_test']=pred_df.y-pred_df.yhat
    
        
    
        MSE=mse(pred_df.y,pred_df.yhat)
        RMSE=math.sqrt(MSE)
        pred_df['APE']=abs(pred_df.error_test*100/pred_df.y)
        MAPE=pred_df.APE.mean()
        min_error_rate = pred_df['APE'].quantile(0)/100
        max_error_rate = pred_df['APE'].quantile(1)/100
        median_error_rate = pred_df['APE'].quantile(.50)/100
        print("App name:",app_name)
        #print("MSE  :",MSE)
        print("RMSE :",RMSE)
        print("MAPE :",MAPE)
        
       
        mape_q98=pred_df['APE'][pred_df.APE<pred_df['APE'].quantile(0.98)].mean()
        std_MAPE = math.sqrt(((pred_df.APE-MAPE)**2).mean())

        df = pd.DataFrame({'length':len(z1),
                             'test_rmse':RMSE,
                             'test_mape':MAPE,
                             'std_mape':std_MAPE, # standard deviation of MAPE
                             'min_error_rate':min_error_rate ,
                             'max_error_rate':max_error_rate ,
                             'median_error_rate':median_error_rate,
                 
                 'test_mape_98':mape_q98},
                   
                          index=[app_name])

    return(df,model,forecast,pred_df,pred_r)
Example #34
# Load the dataset
from sklearn.datasets import load_linnerud

linnerud_data = load_linnerud()
X = linnerud_data.data
y = linnerud_data.target

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression
from sklearn import cross_validation

# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.4, random_state=0)

reg = DecisionTreeRegressor()
reg.fit(X_train, y_train)
print "Decision Tree mean absolute error: {:.2f}".format(
    mse(reg.predict(X_test), y_test))

reg = LinearRegression()
reg.fit(X_train, y_train)
print "Linear regression mean absolute error: {:.2f}".format(
    mse(reg.predict(X_test), y_test))

results = {"Linear Regression": 0, "Decision Tree": 0}
Example #35
    XvCV.append(X[va_idx])
    YtCV.append(Y[tr_idx])
    YvCV.append(Y[va_idx])

errTD = []
errVD = []
D = list(range(5, 60, 5))
for d in D:
    errti = []
    errvi = []
    for i in range(5):
        rfr = RFR(n_estimators=50, max_depth=d)
        rfr.fit(XtCV[0], YtCV[0])
        YtHat = rfr.predict(XtCV[0])
        YvHat = rfr.predict(XvCV[0])
        errti.append(mse(YtCV[0], YtHat))
        errvi.append(mse(YvCV[0], YvHat))
    errti = np.array(errti)
    errvi = np.array(errvi)
    errTD.append(np.mean(errti))
    errVD.append(np.mean(errvi))

#%%
plt.plot(D, errTD, '*-', label='Train Err')
plt.plot(D, errVD, '*-', label='Valid Err')
plt.legend()
plt.title('RandomForest Err vs MaxDepth')
plt.xticks(D, D)
plt.xlabel('depth')
plt.ylabel('err')
plt.savefig('rf_depth', dpi=2000)
Example #36
def root_mean_squared_error(y_true, y_pred, sample_weight=None, multioutput='uniform_average'):
    rmse = np.sqrt(mse(y_true, y_pred,
                       sample_weight=sample_weight, multioutput=multioutput))
    return rmse
Example #37
def train(examples,
          labels,
          features=None,
          bucket_sizes=None,
          crosses=None,
          lr=1e-4,
          steps=100,
          batch_size=1,
          model=None):
    '''Create and train a linear regression model.

    Args:
      examples: pandas.DataFrame with examples
      labels: pandas.DataFrame with labels
      features: list of selected features from examples
      bucket_sizes: dict with size of buckets
      crosses: list of lists of features to be crossed
      lr: float, learning rate
      steps: int, number of steps to train
      batch_size: int, number of examples per batch
      model: tensorflow.estimator.LinearRegressor, previously trained model

    Returns:
      A trained tensorflow.estimator.LinearRegressor.
    '''

    # Create feature columns and dictionary mapping feature names to them.
    if not features:
        features = examples.columns
    fcdict = {
        feature: tf.feature_column.numeric_column(feature)
        for feature in features
    }
    fcs = fcdict.values()

    # Use buckets if bucket_sizes is specified.
    if bucket_sizes:
        if len(bucket_sizes) != len(features):
            raise ValueError(
                'The number of buckets must match the number of features.')

        fcdict = {
            feature: bucketize(examples[feature], fc, bucket_sizes[feature])
            if bucket_sizes[feature] else fc
            for feature, fc in fcdict.items()
        }

        fcs = fcdict.values()

    # Use crossed columns if crosses is specified.
    if crosses:
        for cross in crosses:
            cross_name = '_x_'.join(cross)
            cross_fc = [fcdict[feature] for feature in cross]
            fcdict[cross_name] = tf.feature_column.crossed_column(
                cross_fc, 1000)

        fcs = fcdict.values()

    ds = Ds.from_tensor_slices(
        ({feature: examples[feature]
          for feature in features}, labels))

    opt = tf.contrib.estimator.clip_gradients_by_norm(
        tf.train.FtrlOptimizer(learning_rate=lr), 5.0)

    if not model:
        model = tf.estimator.LinearRegressor(fcs, optimizer=opt)

    for _ in range(10):
        model.train(train_fn(ds, batch_size=batch_size), steps=steps // 10)
        predictions = get_predictions(model, ds)
        print("Mean squared error: ", mse(predictions, labels))

    return model
Example #38
def rmse(y_test,y_predict):
    return np.sqrt(mse(y_test,y_predict))
Example #39
def create_prophet_m(app_name, z1, delay=24):

    ### --- For realtime pred ---###

    full_df = z1.bw.iloc[0:len(z1)]
    full_df = full_df.reset_index()
    full_df.columns = ['ds', 'y']

    #removing outliers
    q50 = full_df.y.median()
    q100 = full_df.y.quantile(1)
    q75 = full_df.y.quantile(.75)

    if ((q100 - q50) >= (2 * q75)):

        full_df.loc[full_df.y >= (2 * q75), 'y'] = None

    #-- Realtime prediction --##
    #model
    model_r = Prophet(yearly_seasonality=False, changepoint_prior_scale=.2)
    model_r.fit(full_df)
    future_r = model_r.make_future_dataframe(periods=delay, freq='H')
    forecast_r = model_r.predict(future_r)
    forecast_r.index = forecast_r['ds']
    #forecast
    pred_r = pd.DataFrame(forecast_r['yhat'][len(z1):(len(z1) + delay)])
    pred_r = pred_r.reset_index()
    #--- completes realtime pred ---#

    train_end_index = len(z1.bw) - delay
    train_df = z1.bw.iloc[0:train_end_index]

    test_df = z1.bw.iloc[train_end_index:len(z1)]

    train_df = train_df.reset_index()
    test_df = test_df.reset_index()
    train_df.columns = ['ds', 'y']

    #--- removing outliers in trainset  ---#

    q50 = train_df.y.median()
    q100 = train_df.y.quantile(1)
    q75 = train_df.y.quantile(.75)

    if ((q100 - q50) >= (2 * q75)):

        train_df.loc[train_df.y >= (2 * q75), 'y'] = None

    test_df.columns = ['ds', 'y']

    #model
    model = Prophet(yearly_seasonality=False, changepoint_prior_scale=.2)
    model.fit(train_df)
    future = model.make_future_dataframe(periods=len(test_df), freq='H')
    forecast = model.predict(future)
    forecast.index = forecast['ds']
    #forecast
    pred = pd.DataFrame(forecast['yhat'][train_end_index:len(z1)])
    pred = pred.reset_index()
    pred_df = pd.merge(test_df, pred, on='ds', how='left')
    pred_df.dropna(inplace=True)

    df = pd.DataFrame()

    if (len(pred_df) > 0):

        pred_df['error_test'] = pred_df.y - pred_df.yhat

        MSE = mse(pred_df.y, pred_df.yhat)
        RMSE = math.sqrt(MSE)
        pred_df['APE'] = abs(pred_df.error_test * 100 / pred_df.y)
        MAPE = pred_df.APE.mean()
        min_error_rate = pred_df['APE'].quantile(0) / 100
        max_error_rate = pred_df['APE'].quantile(1) / 100
        median_error_rate = pred_df['APE'].quantile(.50) / 100
        print("App name:", app_name)
        #print("MSE  :",MSE)
        print("RMSE :", RMSE)
        print("MAPE :", MAPE)

        mape_q98 = pred_df['APE'][
            pred_df.APE < pred_df['APE'].quantile(0.98)].mean()
        std_MAPE = math.sqrt(((pred_df.APE - MAPE)**2).mean())

        df = pd.DataFrame(
            {
                'length': len(z1),
                'test_rmse': RMSE,
                'test_mape': MAPE,
                'std_mape': std_MAPE,  # standard deviation of MAPE
                'min_error_rate': min_error_rate,
                'max_error_rate': max_error_rate,
                'median_error_rate': median_error_rate,
                'test_mape_98': mape_q98
            },
            index=[app_name])

    return (df, model, forecast, pred_df, pred_r)
Example #40
    YvCV.append(Y0[va_idx])

#%% KNN method
K = [2**k for k in range(2, 13)]
errt = []
errv = []
for k in K:
    print("k=", k)
    errtk = 0
    errvk = 0
    knnL = neighbors.KNeighborsRegressor(k)
    for i in range(10):
        Xt, Yt = XtCV[i], YtCV[i]
        Xv, Yv = XvCV[i], YvCV[i]
        knnL.fit(Xt, Yt)
        errvi = mse(knnL.predict(Xv), Yv)
        errti = mse(knnL.predict(Xt), Yt)
        print("errt: %.5f\t" % errti, "errv: %.5f" % errvi)
        errtk += errti
        errvk += errvi
    errtk /= 10
    errvk /= 10
    errv.append(errvk)
    errt.append(errtk)

#%% KNN plot
plt.semilogx(K, np.array(errt) * 2, "*-", label="Train Err")
plt.semilogx(K, np.array(errv) * 2, "*-", label="Valid Err")
plt.xticks(K, K)
plt.title("KNN Err vs K")
plt.xlabel("k")
Example #41
    print("shuffling data...")
    examples = list(zip(X, y))
    X, y = list(zip(*examples))
    X = np.array(X)
    y = np.array(y)
    del examples

    kf = KFold(n=len(X), n_folds=5, shuffle=True, random_state=np.random)
    train_index, test_index = next(iter(kf))
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    del X
    del y

    print("fitting model...")
    mlp.fit(X_train, y_train)

    print("scoring model...")
    # print("predicted:", mlp.predict(X_test))
    # print("actual:", y_test)
    print("R^2 score =", mlp.score(X_test, y_test))
    y_pred = mlp.predict(X_test)
    print("MSE score =", mse(y_pred, y_test))
    print("MAE score =", mae(y_pred, y_test))
    print("accuracy_score =",
          accuracy_score([[round(y[0])] for y in y_pred], y_test))

    fn = os.path.join(settings['data-base'], 'nn_tanh3.pickle')
    pickle.dump(mlp, open(fn, 'wb'))
Example #42
def RMSE(y2_test, y_predict):
    return np.sqrt(mse(y2_test, y_predict))
data_diff = data.diff(periods = 1)
data_diff = data_diff[1:]

### ACF and PACF for differenced Time Series ###
plot_acf(data_diff)
plot_pacf(data_diff)

### Stationarity Conversion ###

second_diff = data.diff(periods = 2)
second_diff = second_diff[2:]

### ACF and PACF for second differenced Time Series ###
plot_acf(second_diff)
plot_pacf(second_diff)

# ARMA Model Selection (Second Differencing) #
for i in range(9):
    train = data.values
    test = test_data.values
    model_arima = ARIMA(train,order=(9-i,2,2))
    model_arima_fit = model_arima.fit()
    print(model_arima_fit.summary())
    print('#############################################################################\n')

chosen_model = ARIMA(train,order=(1,2,2))
chosen_model_fit = chosen_model.fit()
forecast = chosen_model_fit.forecast(steps = len(test))
rmse_forc = mse(forecast[0],test,squared = False)
print('RMSE: ',rmse_forc)
									
# Make predictions with the best weights
deeper=True
wider=False
dropout=0.5
learning_rate = 0.001
# Need to rebuild model in case it is different from the model that was trained most recently.
model = build_model()

model.load_weights('question_pairs_weights_deeper={}_wider={}_lr={}_dropout={}.h5'.format(
                    deeper,wider,learning_rate,dropout))
predictions = model.predict([x_test,x_test], verbose = True)


# Compare testing loss to training and validating loss
print("mse "+str(mse(y_test, predictions)))


max_price=82.7999880000001
min_price=-111.34997599999997
# In[314]:

rk=y_test.values
#print(len(rk))
normpreds=pd.read_csv("input/preds.csv")
unnorm_preds=normpreds.values


def unnormalize(price):
    price = price*(max_price-min_price)+min_price
    return(price)
Example #45
def lasso_pred(x, y):

    lasso = Lasso(normalize="True", tol=1e-35, max_iter=5000)
    w, q = x.shape

    coefs = []
    preds = []
    alpha_min = 0
    alpha_max = 0
    i = 0

    while i < q:

        start = timer()
        print("\n------------------------------------\n")
        name = "Fitting for country no.: %s" % (i + 1)
        print(name)

        alpha_min = 0.1
        alpha_max = 10e30
        alpha_avg = (alpha_min + alpha_max) / 2
        lasso.set_params(alpha=alpha_avg)
        population_vector = x.iloc[:, i]
        x.iloc[:, i] = np.zeros(w)
        y_true = y.iloc[:, i]
        lasso.fit(x, population_vector)
        prediction = lasso.predict(y)
        error = mse(y_true, prediction)
        x.iloc[:, i] = population_vector

        for j in range(100):

            #if(i == 60):
            #print(alpha_min, alpha_max, np.count_nonzero(lasso.coef_))

            if np.count_nonzero(lasso.coef_) > 5 or np.count_nonzero(
                    lasso.coef_) < 5:

                if np.count_nonzero(lasso.coef_) > 5:

                    alpha_min = alpha_avg
                    alpha_avg = (alpha_min + alpha_max) / 2
                    lasso.set_params(alpha=alpha_avg)
                    population_vector = x.iloc[:, i]
                    x.iloc[:, i] = np.zeros(w)
                    y_true = y.iloc[:, i]
                    lasso.fit(x, population_vector)
                    prediction = lasso.predict(y)
                    mean_error = mse(y_true, prediction)
                    x.iloc[:, i] = population_vector

                if np.count_nonzero(lasso.coef_) < 5:

                    alpha_max = alpha_avg
                    alpha_avg = (alpha_min + alpha_max) / 2
                    lasso.set_params(alpha=alpha_avg)
                    population_vector = x.iloc[:, i]
                    x.iloc[:, i] = np.zeros(w)
                    y_true = y.iloc[:, i]
                    lasso.fit(x, population_vector)
                    prediction = lasso.predict(y)
                    mean_error = mse(y_true, prediction)
                    x.iloc[:, i] = population_vector

            else:
                break

        if np.count_nonzero(lasso.coef_) > 5:

            alpha_min = 1
            alpha_max = 10e20
            alpha_avg = (alpha_min + alpha_max) / 2
            lasso.set_params(alpha=alpha_avg,
                             normalize="True",
                             tol=1e-20,
                             max_iter=50000)
            population_vector = x.iloc[:, i]
            x.iloc[:, i] = np.zeros(w)
            y_true = y.iloc[:, i]
            lasso.fit(x, population_vector)
            prediction = lasso.predict(y)
            error = mse(y_true, prediction)
            x.iloc[:, i] = population_vector

            for j in range(150):

                if np.count_nonzero(lasso.coef_) > 5 or np.count_nonzero(
                        lasso.coef_) < 5:

                    if np.count_nonzero(lasso.coef_) > 5:
                        alpha_min = alpha_avg
                        alpha_avg = (alpha_min + alpha_max) / 2
                        lasso.set_params(alpha=alpha_avg,
                                         normalize="True",
                                         tol=1e-20,
                                         max_iter=50000)
                        population_vector = x.iloc[:, i]
                        x.iloc[:, i] = np.zeros(w)
                        y_true = y.iloc[:, i]
                        lasso.fit(x, population_vector)
                        prediction = lasso.predict(y)
                        mean_error = mse(y_true, prediction)
                        x.iloc[:, i] = population_vector

                    if np.count_nonzero(lasso.coef_) < 5:
                        alpha_max = alpha_avg
                        alpha_avg = (alpha_min + alpha_max) / 2
                        lasso.set_params(alpha=alpha_avg,
                                         normalize="True",
                                         tol=1e-20,
                                         max_iter=50000)
                        population_vector = x.iloc[:, i]
                        x.iloc[:, i] = np.zeros(w)
                        y_true = y.iloc[:, i]
                        lasso.fit(x, population_vector)
                        prediction = lasso.predict(y)
                        mean_error = mse(y_true, prediction)
                        x.iloc[:, i] = population_vector
                else:
                    break

        assert np.count_nonzero(lasso.coef_) <= 5, "too many non-zero"

        if (mean_error < error):
            coefs_best = lasso.coef_
            pred_best = prediction
            error = mean_error
            alpha_value = alpha_avg
            country_number = np.count_nonzero(lasso.coef_)

        coefs.append(coefs_best)
        preds.append(pred_best)

        end_timer = timer() - start

        time_statement = "Fitting time for country no. %s: " % (i + 1)
        print(time_statement + str(round(end_timer, 3)) + " seconds.")
        print("Alpha value: " + str(alpha_value))
        print("Number of countries used for fitting: " + str(country_number))

        i = i + 1

    return coefs, preds
#plot y_test vs y_pred
plt.scatter(y_test,predictions)    
plt.xlim([3, 5])
plt.ylim([3, 5]) 
plt.xlabel("Test Data", fontsize=16)  
plt.ylabel("Predictions", fontsize=16)

np.subtract(predictions,y_test)

       
#check accuracy
print("Training set score: {:.3f}".format(lm.score(X_train, y_train)))
print("Test set score: {:.3f}".format(lm.score(X_test, y_test)))

#Get MSE
mse(y_test,predictions)


#DETERMINE OUR APP RATING BASED ON OUR CURRENT CONTENT RATING AND PRICE

#Calling googleplaystore sheet that has our app data -- will need to update this to where it's saved on your drive
xls = pd.ExcelFile(r'C:\Users\206581774\Documents\DataScienceClass\google_play_store.xlsx')
df_app = pd.read_excel(xls, 'app')

#assign x and y
x_test = df_app.iloc[:,[3,4]]
print(x_test)
y_test = df_app.iloc[:,2]
print(y_test)

#predict our app score
Example #47
def rmse(y_true, y_pred):
    return mse(y_true, y_pred)**0.5
Example #48
    w, losses, residuals = gd(train_X.iloc[train_idx],
                              train_y.iloc[train_idx],
                              train_X.iloc[test_idx],
                              train_y.iloc[test_idx],
                              iterations=its,
                              learning_rate=1e-3,
                              lr_dampening=0.999,
                              reg=1e-7)

    mean_losses = mean_losses + (1 / 5) * losses
    mean_residuals = mean_residuals + (1 / 5) * residuals

    xval_preds = w[0] + np.dot(test_X, w[1:])

    xval_losses_residuals[fold_it] = [
        mse(test_y, xval_preds),
        mae(test_y, xval_preds)
    ]

    fold_it = fold_it + 1

print('Cross-validation (a) loss: %.2f +- %.2f (b) residual: %.2f +- %.2f' % (
    np.mean(xval_losses_residuals[:, 0]),
    np.std(xval_losses_residuals[:, 0]),
    np.mean(xval_losses_residuals[:, 1]),
    np.std(xval_losses_residuals[:, 1]),
))

# Final model
w, final_losses, final_residuals = gd(train_X,
                                      train_y,
def reg_evaluation(
        ori_train_price,
        ori_test_price,
        pred_train_price,
        pred_test_price,  # origin price
        y_train,
        train_pred,
        y_test,
        test_pred,
        price_split,
        print_result=True):

    under_train = (ori_train_price <= price_split).nonzero()[0]
    above_train = (ori_train_price > price_split).nonzero()[0]
    under_test = (ori_test_price <= price_split).nonzero()[0]
    above_test = (ori_test_price > price_split).nonzero()[0]

    if print_result:
        print("-" * 50)
        print("For All Price")

        print("Train Result ----------")
        get_max_min_percentage_diff(ori_train_price, pred_train_price)
        print("RMSLE is ", mse(y_train, train_pred, squared=False))
        print("R^2  is ", r2(y_train, train_pred))
        print("Mean Absolute Percentage Error is ",
              mape(ori_train_price, pred_train_price))
        print("Mean Absolute Error is ", mae(ori_train_price,
                                             pred_train_price))

        print("\nTest Result ----------")
        get_max_min_percentage_diff(ori_test_price, pred_test_price)
        print("RMSLE is ", mse(y_test, test_pred, squared=False))
        print("R^2 is ", r2(y_test, test_pred))
        print("Mean Absolute Percentage Error is ",
              mape(ori_test_price, pred_test_price))
        print("Mean Absolute Error is ", mae(ori_test_price, pred_test_price))

        print("-" * 50)
        print("For price under $%d" % price_split)

        print("Train Result ----------")
        get_max_min_percentage_diff(ori_train_price[under_train],
                                    pred_train_price[under_train])
        knn_skb_select_train_msle = mse(y_train[under_train],
                                        train_pred[under_train],
                                        squared=False)
        print("RMSLE is ", knn_skb_select_train_msle)
        print("R^2  is ", r2(y_train[under_train], train_pred[under_train]))
        print(
            "Mean Absolute Percentage Error is ",
            mape(ori_train_price[under_train], pred_train_price[under_train]))
        print("Mean Absolute Error is ",
              mae(ori_train_price[under_train], pred_train_price[under_train]))

        print("\nTest Result ----------")
        get_max_min_percentage_diff(ori_test_price[under_test],
                                    pred_test_price[under_test])
        knn_skb_select_test_msle = mse(y_test[under_test],
                                       test_pred[under_test],
                                       squared=False)
        print("RMSLE is ", knn_skb_select_test_msle)
        print("R^2 is ", r2(y_test[under_test], test_pred[under_test]))
        print("Mean Absolute Percentage Error is ",
              mape(ori_test_price[under_test], pred_test_price[under_test]))
        print("Mean Absolute Error is ",
              mae(ori_test_price[under_test], pred_test_price[under_test]))

        print("-" * 50)
        print("For price above $%d" % price_split)

        print("Train Result ----------")
        get_max_min_percentage_diff(ori_train_price[above_train],
                                    pred_train_price[above_train])
        knn_skb_select_train_msle = mse(y_train[above_train],
                                        train_pred[above_train],
                                        squared=False)
        print("RMSLE is ", knn_skb_select_train_msle)
        print("R^2  is ", r2(y_train[above_train], train_pred[above_train]))
        print(
            "Mean Absolute Percentage Error is ",
            mape(ori_train_price[above_train], pred_train_price[above_train]))
        print("Mean Absolute Error is ",
              mae(ori_train_price[above_train], pred_train_price[above_train]))

        print("\nTest Result ----------")
        get_max_min_percentage_diff(ori_test_price[above_test],
                                    pred_test_price[above_test])
        knn_skb_select_test_msle = mse(y_test[above_test],
                                       test_pred[above_test],
                                       squared=False)
        print("RMSLE is ", knn_skb_select_test_msle)
        print("R^2 is ", r2(y_test[above_test], test_pred[above_test]))
        print("Mean Absolute Percentage Error is ",
              mape(ori_test_price[above_test], pred_test_price[above_test]))
        print("Mean Absolute Error is ",
              mae(ori_test_price[above_test], pred_test_price[above_test]))

    plot_prediction_price(
        ori_train_price[under_train],
        pred_train_price[under_train],
        title="Predict Price for Item in Train Set with Price <= %d" %
        price_split)
    plot_prediction_price(
        ori_train_price[above_train],
        pred_train_price[above_train],
        title="Predict Price for Item in Train Set with Price > %d" %
        price_split)

    plot_prediction_price(
        ori_test_price[under_test],
        pred_test_price[under_test],
        title="Predict Price for Item in Test Set with Price <= %d" %
        price_split)
    plot_prediction_price(
        ori_test_price[above_test],
        pred_test_price[above_test],
        title="Predict Price for Item in Test Set with Price > %d" %
        price_split)

    all_temp = []
    for indexes in [None, under_train, above_train]:
        if indexes is None:
            oy, py = ori_train_price, pred_train_price
            oly, ply = y_train, train_pred
        else:
            oy, py = ori_train_price[indexes], pred_train_price[indexes]
            oly, ply = y_train[indexes], train_pred[indexes]
        all_temp.append([
            *get_max_min_percentage_diff(oy, py),
            mse(oly, ply, squared=False),
            r2(oly, ply),
            mape(oy, py),
            mae(oy, py)
        ])

    for indexes in [None, under_test, above_test]:
        if indexes is None:
            oy, py = ori_test_price, pred_test_price
            oly, ply = y_test, test_pred
        else:
            oy, py = ori_test_price[indexes], pred_test_price[indexes]
            oly, ply = y_test[indexes], test_pred[indexes]
        all_temp.append([
            *get_max_min_percentage_diff(oy, py),
            mse(oly, ply, squared=False),
            r2(oly, ply),
            mape(oy, py),
            mae(oy, py)
        ])

    result_df = pd.DataFrame(np.array(all_temp),
                             columns=[
                                 'Max Percentage Diff', 'Min Percentage Diff',
                                 'RMSLE', 'R^2', 'MAPE', 'MAE'
                             ],
                             index=[
                                 'All Train',
                                 'Train with Price <= %d' % price_split,
                                 'Train with Price > %d' % price_split,
                                 'All Test',
                                 'Test with Price <= %d' % price_split,
                                 'Test with Price > %d' % price_split,
                             ])
    return result_df
예제 #50
0
def trade_agent(lock, number_workers):

    # ENDLESS CYCLE for continuous exploration of window time, learning and trading
    while True:
        # for _ in range(2):

        class_wtupdate = Window_time_update(lock, number_workers)
        rmse = lambda y_true, y_pred: np.sqrt(mse(y_true, y_pred))
        api_key = '......................................................'
        api_secret = '......................................................'
        client = Client(api_key, api_secret)
        risk = 0.2
        global_episodes = 10000
        num_episodes = 10
        h_size = 512

        with lock:
            max_time_window_sec = np.load(
                'weights_biases_numpy_arrays/max_time_window_sec.npy',
                allow_pickle=True)

        PATH_final_weights_LSTM = 'weights_biases_numpy_arrays/final_weights_biases/'

        state = (np.zeros([1, h_size]), np.zeros([1, h_size]))
        state_time = (np.zeros([1, h_size]), np.zeros([1, h_size]))

        commission = 0.001  # 0.00075

        list_predicts = []
        list_predicts_time = []
        list_date_time = []

        list_price_in_state = []

        total_steps = 0
        global_steps = 0
        # counter of executed orders; initialised here because it is incremented below
        quantity_true_action_predict = 0

        for _ in range(global_episodes):

            genetic_modification(lock)

            tf.reset_default_graph()

            weights = {  ### main network ###
                # 5x5 filter_size, 3 channel, 32 num_filters
                'w_conv1':
                weight_variable(lock,
                                shape=[5, 5, 3, 32],
                                name='w_conv1',
                                first_generation=False),
                # 4x4 filter_size, 32 channel, 64 num_filters
                'w_conv2':
                weight_variable(lock,
                                shape=[4, 4, 32, 64],
                                name='w_conv2',
                                first_generation=False),
                # 3x3 filter_size, 64 channel, 64 num_filters
                'w_conv3':
                weight_variable(lock,
                                shape=[3, 3, 64, 64],
                                name='w_conv3',
                                first_generation=False),
                # 5x5 filter_size, 64 channel, 512 num_filters
                'w_conv4':
                weight_variable(lock,
                                shape=[5, 5, 64, 512],
                                name='w_conv4',
                                first_generation=False),
                # fully connected, 512 inputs, 1 output
                'w_predict':
                weight_variable(lock,
                                shape=[512, 1],
                                name='w_predict',
                                first_generation=False),

                ### time ###

                # 5x5 filter_size, 3 channel, 32 num_filters
                'w_conv1_time':
                weight_variable(lock,
                                shape=[5, 5, 3, 32],
                                name='w_conv1_time',
                                first_generation=False),
                # 4x4 filter_size, 32 channel, 64 num_filters
                'w_conv2_time':
                weight_variable(lock,
                                shape=[4, 4, 32, 64],
                                name='w_conv2_time',
                                first_generation=False),
                # 3x3 filter_size, 64 channel, 64 num_filters
                'w_conv3_time':
                weight_variable(lock,
                                shape=[3, 3, 64, 64],
                                name='w_conv3_time',
                                first_generation=False),
                # 5x5 filter_size, 64 channel, 512 num_filters
                'w_conv4_time':
                weight_variable(lock,
                                shape=[5, 5, 64, 512],
                                name='w_conv4_time',
                                first_generation=False),
                # fully connected, 512 inputs, 1 output
                'w_predict_time':
                weight_variable(lock,
                                shape=[512, 1],
                                name='w_predict_time',
                                first_generation=False)
            }

            biases = {  ### main network ###
                'b_conv1':
                bias_variable(lock,
                              shape=[32],
                              name='b_conv1',
                              first_generation=False),
                'b_conv2':
                bias_variable(lock,
                              shape=[64],
                              name='b_conv2',
                              first_generation=False),
                'b_conv3':
                bias_variable(lock,
                              shape=[64],
                              name='b_conv3',
                              first_generation=False),
                'b_conv4':
                bias_variable(lock,
                              shape=[512],
                              name='b_conv4',
                              first_generation=False),

                ### time ###
                'b_conv1_time':
                bias_variable(lock,
                              shape=[32],
                              name='b_conv1_time',
                              first_generation=False),
                'b_conv2_time':
                bias_variable(lock,
                              shape=[64],
                              name='b_conv2_time',
                              first_generation=False),
                'b_conv3_time':
                bias_variable(lock,
                              shape=[64],
                              name='b_conv3_time',
                              first_generation=False),
                'b_conv4_time':
                bias_variable(lock,
                              shape=[512],
                              name='b_conv4_time',
                              first_generation=False)
            }

            cell_mainN = tf.contrib.rnn.BasicLSTMCell(num_units=h_size,
                                                      state_is_tuple=True)

            cell_timeN = tf.contrib.rnn.BasicLSTMCell(num_units=h_size,
                                                      state_is_tuple=True)

            mainN = mainNetwork(h_size, cell_mainN, weights, biases)

            timeN = timeNetwork(h_size, cell_timeN, weights, biases)

            init = tf.global_variables_initializer()

            with tf.Session() as sess:

                sess.run(init)

                ### mainN ###
                ### timeN ###
                with lock:
                    weights_np_LSTM_mainQN = np.load(PATH_final_weights_LSTM +
                                                     'LSTM_weights_biases.npy',
                                                     allow_pickle=True)
                    weights_np_LSTM_timeQN = np.load(
                        PATH_final_weights_LSTM +
                        'LSTM_weights_biases_time.npy',
                        allow_pickle=True)

                ### mainN ###
                ### timeN ###
                cell_mainN.set_weights(weights_np_LSTM_mainQN)
                cell_timeN.set_weights(weights_np_LSTM_timeQN)

                for _ in range(num_episodes):

                    total_steps += 1

                    state_exchange, best_ask_bid_volume_price = state_environment_3D(
                    )
                    price_in_state = last_price()

                    Previous_price_global = price_in_state

                    if len(list_price_in_state) == num_episodes:
                        list_price_in_state[0:1] = []
                        list_price_in_state.append(price_in_state)

                    else:
                        list_price_in_state.append(price_in_state)

                    prediction, state1 = sess.run(
                        [mainN.predict, mainN.rnn_state],
                        feed_dict={
                            mainN.rawInput: [state_exchange],
                            mainN.trainLength: 1,
                            mainN.state_in: state,
                            mainN.batch_size: 1
                        })

                    state = state1

                    predict_change_price = float(prediction)

                    if len(list_predicts) == num_episodes:
                        list_predicts[0:1] = []
                        list_predicts.append(predict_change_price)

                    else:
                        list_predicts.append(predict_change_price)

                    prediction_time, state1_time = sess.run(
                        [timeN.predict_time, timeN.rnn_state_time],
                        feed_dict={
                            timeN.rawInput_time: [state_exchange],
                            timeN.trainLength_time: 1,
                            timeN.state_in_time: state_time,
                            timeN.batch_size_time: 1
                        })

                    state_time = state1_time

                    pred_time = float(prediction_time)

                    if len(list_predicts_time) == num_episodes:
                        list_predicts_time[0:1] = []
                        list_date_time[0:1] = []
                        list_predicts_time.append(pred_time)
                        list_date_time.append(datetime.datetime.now())

                    else:
                        list_predicts_time.append(pred_time)
                        list_date_time.append(datetime.datetime.now())

                    if global_steps > 0:

                        signal_for_trade, error_online = final_metric(
                            list_price_in_state, list_predicts, risk,
                            commission)

                        class_wtupdate.time_update(signal=signal_for_trade)

                        # shrink the predicted price ratio towards 1 by the online error
                        adjusted_predict_change_price = predict_change_price - error_online if \
                            predict_change_price >= 1 else predict_change_price + error_online

                        # trade only if the adjustment does not flip the predicted
                        # direction (both ratios stay on the same side of 1)
                        trend_matching = int(predict_change_price) == int(
                            adjusted_predict_change_price)

                        ### SIGNAL and TREND ###
                        if signal_for_trade and trend_matching:

                            predict_change_price = adjusted_predict_change_price

                            action_predict = ['sell',
                                              'buy'][int(predict_change_price)]

                            money_predict = float(
                                client.get_asset_balance(
                                    asset='USDT').get('free'))

                            bitcoin_predict = float(
                                client.get_asset_balance(
                                    asset='BTC').get('free')) * price_in_state

                            if money_predict >= 10 and action_predict == 'buy' and predict_change_price - 1 > commission:

                                if best_ask_bid_volume_price.get('volume').get(
                                        'buy'
                                )[0] * price_in_state > money_predict:

                                    quantity_BTC = price_format_conversion(
                                        money_predict / price_in_state, 6)

                                    order = client.order_market_buy(
                                        symbol='BTCUSDT',
                                        quantity=quantity_BTC)

                                    quantity_true_action_predict += 1

                                else:

                                    best_quantity_BTC = str(
                                        best_ask_bid_volume_price.get(
                                            'volume').get('buy')[0])

                                    order = client.order_market_buy(
                                        symbol='BTCUSDT',
                                        quantity=best_quantity_BTC)

                                    quantity_true_action_predict += 1

                                    for i in range(1, 10):

                                        money_predict = float(
                                            client.get_asset_balance(
                                                asset='USDT').get('free'))

                                        if money_predict >= 10:

                                            predict_change_price_next = (
                                                (price_in_state *
                                                 predict_change_price) /
                                                best_ask_bid_volume_price.get(
                                                    'price').get('buy')[i])

                                            if predict_change_price_next - 1 > commission:

                                                volume_action_money_predict = (
                                                    best_ask_bid_volume_price.
                                                    get('volume').get('buy')[i]
                                                    *
                                                    best_ask_bid_volume_price.
                                                    get('price').get('buy')[i])

                                                if volume_action_money_predict >= money_predict:

                                                    best_quantity_BTC = price_format_conversion(
                                                        money_predict /
                                                        price_in_state, 6)

                                                    order = client.order_market_buy(
                                                        symbol='BTCUSDT',
                                                        quantity=
                                                        best_quantity_BTC)

                                                    quantity_true_action_predict += 1

                                                else:

                                                    best_quantity_BTC = str(
                                                        best_ask_bid_volume_price
                                                        .get('volume').get(
                                                            'buy')[i])

                                                    order = client.order_market_buy(
                                                        symbol='BTCUSDT',
                                                        quantity=
                                                        best_quantity_BTC)

                                                    quantity_true_action_predict += 1

                                            else:

                                                break

                            ############# bitcoin_predict ################

                            if bitcoin_predict >= 10 and action_predict == 'sell' and 1 - predict_change_price > commission:

                                if best_ask_bid_volume_price.get('volume').get(
                                        'sell'
                                )[0] * price_in_state > bitcoin_predict:

                                    quantity_BTC = price_format_conversion(
                                        bitcoin_predict / price_in_state, 6)

                                    order = client.order_market_sell(
                                        symbol='BTCUSDT',
                                        quantity=quantity_BTC)

                                    quantity_true_action_predict += 1

                                else:

                                    best_quantity_BTC = str(
                                        best_ask_bid_volume_price.get(
                                            'volume').get('sell')[0])

                                    order = client.order_market_sell(
                                        symbol='BTCUSDT',
                                        quantity=best_quantity_BTC)

                                    quantity_true_action_predict += 1

                                    for i in range(1, 10):

                                        bitcoin_predict = float(
                                            client.get_asset_balance(
                                                asset='BTC').get(
                                                    'free')) * price_in_state

                                        if bitcoin_predict >= 10:

                                            predict_change_price_next = (
                                                (price_in_state *
                                                 predict_change_price) /
                                                best_ask_bid_volume_price.get(
                                                    'price').get('sell')[i])

                                            if 1 - predict_change_price_next > commission:

                                                volume_action_bitcoin_predict = (
                                                    best_ask_bid_volume_price.
                                                    get('volume').get(
                                                        'sell')[i] *
                                                    best_ask_bid_volume_price.
                                                    get('price').get('sell')[i]
                                                )

                                                if volume_action_bitcoin_predict >= bitcoin_predict:

                                                    best_quantity_BTC = price_format_conversion(
                                                        bitcoin_predict /
                                                        price_in_state, 6)

                                                    order = client.order_market_sell(
                                                        symbol='BTCUSDT',
                                                        quantity=
                                                        best_quantity_BTC)

                                                    quantity_true_action_predict += 1

                                                else:

                                                    best_quantity_BTC = str(
                                                        best_ask_bid_volume_price
                                                        .get('volume').get(
                                                            'sell')[i])

                                                    order = client.order_market_sell(
                                                        symbol='BTCUSDT',
                                                        quantity=
                                                        best_quantity_BTC)

                                                    quantity_true_action_predict += 1

                                            else:

                                                break

                        list_pred_abs_price = [
                            list_predicts[index] * price
                            for index, price in enumerate(list_price_in_state)
                        ]

                        list_pred_abs_time = [
                            max_time_window_sec * t for t in list_predicts_time
                        ]

                        list_date_time_for_pred = list_date_time[:]

                        list_date_time_for_pred.append(
                            list_date_time[-1] + datetime.timedelta(
                                seconds=(list_pred_abs_time[-1])))

                        rmse_online = rmse(
                            list_price_in_state[1:],
                            list_pred_abs_price[:len(list_price_in_state[1:])])

                        plt.ion()
                        plt.gca().cla()

                        plt.subplots_adjust(bottom=0.2)
                        plt.xticks(rotation=25)
                        ax = plt.gca()
                        ax.set_xticks(list_date_time_for_pred[1:])

                        xfmt = md.DateFormatter('%H:%M:%S')
                        ax.xaxis.set_major_formatter(xfmt)

                        plt.plot(list_date_time_for_pred[1:],
                                 list_pred_abs_price,
                                 linewidth=3,
                                 linestyle="--",
                                 color="blue",
                                 marker='o',
                                 label=r"Predicted price")
                        plt.plot(list_date_time[1:],
                                 list_price_in_state[1:],
                                 linewidth=3,
                                 linestyle="-",
                                 color="red",
                                 marker='o',
                                 label=r"True price")
                        plt.xlabel(r"Agent Predicted Time (seconds)")
                        plt.ylabel(r"Predicted and true price (US$)")
                        plt.title(
                            f'Real time trading. RMSE online = {rmse_online.round(2)} US$. SIGNAL = {signal_for_trade}'
                        )

                        plt.legend(loc="upper left")
                        plt.pause(0.1)
                        plt.show()

                    time.sleep(int(abs(pred_time)) * max_time_window_sec)

            global_steps += 1

            with lock:
                stop_train_trade_signal = np.load(
                    'weights_biases_numpy_arrays/stop_train_trade.npy',
                    allow_pickle=True)
            if stop_train_trade_signal:
                break

        print('total_steps_trade =', total_steps)
예제 #51
0
result1 = mlp1.predict(df1[['x','y']][8:10])
result2 = mlp2.predict(df2[['a','b']][8:10])

dif1 = abs(df1['z'][8:10] - abs(result1))/df1['z'][8:10]
mymape1 = 100/len(result1) * dif1.sum()

dif2 = abs(df2['c'][8:10] - abs(result2))/df2['c'][8:10]
mymape2 = 100/len(result2) * dif2.sum()


print('mymape1', mymape1)
print('mymape2', mymape2)

mae1 = mae(df1['z'][8:10], result1)
mape1 = 100 * mae1  # note: this is 100*MAE, not a percentage error like mymape1 above

mae2 = mae(df2['c'][8:10], result2)
mape2 = 100 * mae2  # same caveat as mape1

print('mape1', mape1)
print('mape2', mape2)

print('mae1', mae1)
print('mae2', mae2)

mse1 = mse(df1['z'][8:10], result1)
mse2 = mse(df2['c'][8:10], result2)

print('mse1', mse1)
print('mse2', mse2)
예제 #52
0
def main():

    df = pd.read_csv("dataBefore_5000.csv")

    print(df.head())

    X = df[[
        "calls_up", "calls_down", "starts_up", "starts_down", "availability",
        "alarms"
    ]]
    print(X)
    print(X.shape)
    TARGETS = df[["ave_callTime"]]
    print(TARGETS)

    # Display raw data
    plt.figure(1)
    plt.subplot(2, 1, 1)
    plt.scatter(X.calls_up,
                TARGETS,
                c='b',
                s=20,
                alpha=0.5,
                label='call_up [#]')
    #plt.xlabel("[#]")
    plt.ylabel("Call time [s]")
    #plt.title("Call time vs calls up")
    plt.legend(loc="upper right",
               bbox_to_anchor=[1, 1],
               ncol=2,
               shadow=True,
               fancybox=True)
    plt.subplot(2, 1, 2)
    plt.scatter(X.calls_down,
                TARGETS,
                c='r',
                s=20,
                alpha=0.5,
                label='call_down [#]')
    #plt.xlabel("[#]")
    plt.ylabel("Call time [s]")
    #plt.title("Call time vs calls down")
    plt.legend(loc="upper right",
               bbox_to_anchor=[1, 1],
               ncol=2,
               shadow=True,
               fancybox=True)

    plt.figure(2)
    plt.subplot(2, 1, 1)
    plt.scatter(X.starts_up,
                TARGETS,
                c='c',
                s=20,
                alpha=0.5,
                label='starts_up [#]')
    #plt.xlabel("[#]")
    plt.ylabel("Call time [s]")
    #plt.title("Call time vs starts up")
    plt.legend(loc="upper right",
               bbox_to_anchor=[1, 1],
               ncol=2,
               shadow=True,
               fancybox=True)
    plt.subplot(2, 1, 2)
    plt.scatter(X.starts_down,
                TARGETS,
                c='m',
                s=20,
                alpha=0.5,
                label='starts_down [#]')
    #plt.xlabel("[#]")
    plt.ylabel("Call time [s]")
    #plt.title("Call time vs starts down")
    plt.legend(loc="upper right",
               bbox_to_anchor=[1, 1],
               ncol=2,
               shadow=True,
               fancybox=True)

    plt.figure(3)
    plt.subplot(2, 1, 1)
    plt.scatter(X.availability,
                TARGETS,
                c='g',
                s=20,
                alpha=0.5,
                label='availability [%]')
    #plt.xlabel("[%]")
    plt.ylabel("Call time [s]")
    #plt.title("Call time vs availability")
    plt.legend(loc="upper right",
               bbox_to_anchor=[1, 1],
               ncol=2,
               shadow=True,
               fancybox=True)
    plt.subplot(2, 1, 2)
    plt.scatter(X.alarms, TARGETS, c='k', s=20, alpha=0.5, label='alarms [#]')
    #plt.xlabel("[#]")
    plt.ylabel("Call time [s]")
    #plt.title("Call time vs alarms")
    plt.legend(loc="upper right",
               bbox_to_anchor=[1, 1],
               ncol=2,
               shadow=True,
               fancybox=True)

    # Apply Linear Regression Model
    lm = LinearRegression()
    lm.fit(X, TARGETS.ave_callTime)

    plt.figure(4)
    plt.scatter(TARGETS, lm.predict(X), c='b', s=30, alpha=0.5)
    plt.xlabel("Measured call time [s]")
    plt.ylabel("Predicted call time [s]")
    plt.title("Predicted vs Measured (Linear Regression)")
    x = [0, 50]
    y = x
    lines = plt.plot(x, y)
    plt.setp(lines, color='k', linewidth=2.0)
    plt.xlim((0, 50))
    plt.ylim((0, 50))

    print("Linear regression model \n  Before 5000 dataset")
    print(
        "  Regresion coefficients: \n  [calls_up, calls_down, starts_up, starts_down, availability, alarms] \n    =",
        lm.coef_)
    print("  Regression intercept =", lm.intercept_)
    print("  Mean squared error =", round(mse(TARGETS, lm.predict(X)), 5))
    print("  R2 score =", round(r2_score(TARGETS, lm.predict(X)), 5))

    sampleId = np.linspace(1, 5000, 5000)
    #print(sampleId)
    #print(sampleId.shape)

    plt.figure(5)
    plt.subplot(2, 1, 1)
    plt.plot(sampleId,
             lm.predict(X),
             'go',
             markersize=2,
             alpha=0.5,
             label='Predicted call time [s] vs Id \nLinear Regression')
    plt.legend(loc="upper right",
               bbox_to_anchor=[1, 1],
               ncol=2,
               shadow=True,
               fancybox=True)
    plt.xlim((0, 5000))
    plt.ylim((0, 50))
    plt.subplot(2, 1, 2)
    plt.plot(sampleId,
             TARGETS,
             'bo',
             markersize=2,
             alpha=0.5,
             label='Measured call time [s] vs Id \nRaw data')
    plt.legend(loc="upper right",
               bbox_to_anchor=[1, 1],
               ncol=2,
               shadow=True,
               fancybox=True)
    plt.xlim((0, 5000))
    plt.ylim((0, 50))

    # Check the model with a single parameter
    lm = LinearRegression()
    lm.fit(X[['calls_up']], TARGETS)
    print("  *****")
    print("  calls_up")
    print("  Regresion coefficients =", lm.coef_[0])
    print("  Regression intercept =", lm.intercept_)
    print("  Mean squared error =",
          round(mse(TARGETS, lm.predict(X[['calls_up']])), 3))
    print("  R2 score =",
          round(r2_score(TARGETS, lm.predict(X[['calls_up']])), 3))
    lm = LinearRegression()
    lm.fit(X[['calls_down']], TARGETS)
    print("  calls_down")
    print("  Regresion coefficients =", lm.coef_[0])
    print("  Regression intercept =", lm.intercept_)
    print("  Mean squared error =",
          round(mse(TARGETS, lm.predict(X[['calls_down']])), 3))
    print("  R2 score =",
          round(r2_score(TARGETS, lm.predict(X[['calls_down']])), 3))
    lm = LinearRegression()
    lm.fit(X[['starts_up']], TARGETS)
    print("  starts_up")
    print("  Regresion coefficients =", lm.coef_[0])
    print("  Regression intercept =", lm.intercept_)
    print("  Mean squared error =",
          round(mse(TARGETS, lm.predict(X[['starts_up']])), 3))
    print("  R2 score =",
          round(r2_score(TARGETS, lm.predict(X[['starts_up']])), 3))
    lm = LinearRegression()
    lm.fit(X[['starts_down']], TARGETS)
    print("  starts_down")
    print("  Regresion coefficients =", lm.coef_[0])
    print("  Regression intercept =", lm.intercept_)
    print("  Mean squared error =",
          round(mse(TARGETS, lm.predict(X[['starts_down']])), 3))
    print("  R2 score =",
          round(r2_score(TARGETS, lm.predict(X[['starts_down']])), 3))
    lm = LinearRegression()
    lm.fit(X[['availability']], TARGETS)
    print("  availability")
    print("  Regresion coefficients =", lm.coef_[0])
    print("  Regression intercept =", lm.intercept_)
    print("  Mean squared error =",
          round(mse(TARGETS, lm.predict(X[['availability']])), 3))
    print("  R2 score =",
          round(r2_score(TARGETS, lm.predict(X[['availability']])), 3))
    lm = LinearRegression()
    lm.fit(X[['alarms']], TARGETS)
    print("  alarms")
    print("  Regresion coefficients =", lm.coef_[0])
    print("  Regression intercept =", lm.intercept_)
    print("  Mean squared error =",
          round(mse(TARGETS, lm.predict(X[['alarms']])), 3))
    print("  R2 score =", round(r2_score(TARGETS, lm.predict(X[['alarms']])),
                                3))
    print("  *****")

    # Divide dataset randomly. Use train_test_split
    X_train, X_test, Y_train, Y_test = cv.train_test_split(X,
                                                           TARGETS,
                                                           test_size=0.4,
                                                           random_state=5)
    print("  X_train", X_train.shape)
    print("  X_test", X_test.shape)
    print("  Y_train", Y_train.shape)
    print("  Y_test", Y_test.shape)

    lm = LinearRegression()
    lm.fit(X_train, Y_train)
    pred_train = lm.predict(X_train)
    pred_test = lm.predict(X_test)

    print("  Train and test dataset")
    print(
        "  Regression coefficients: \n  [calls_up, calls_down, starts_up, starts_down, availability, alarms] \n    =",
        lm.coef_)
    print("  Regression intercept =", lm.intercept_)
    print("  Mean squared error with X_train and Y_train =",
          round(mse(Y_train, lm.predict(X_train)), 5))
    print("  R2 score with X_train and Y_train =",
          round(r2_score(Y_train, lm.predict(X_train)), 3))
    print("  Mean squared error with X_test and Y_test =",
          round(mse(Y_test, lm.predict(X_test)), 5))
    print("  R2 score with X_test and Y_test =",
          round(r2_score(Y_test, lm.predict(X_test)), 3))

    # Apply Ridge Regression Model
    rmodel = Ridge(alpha=0.1)
    rmodel.fit(X_train, Y_train)

    plt.figure(6)
    plt.scatter(Y_train, rmodel.predict(X_train), c='c', s=30, alpha=0.5)
    plt.xlabel("Measured call time [s]")
    plt.ylabel("Predicted call time [s]")
    plt.title("Predicted vs measured (Ridge Regression)")
    x = [0, 50]
    y = x
    lines = plt.plot(x, y)
    plt.setp(lines, color='k', linewidth=2.0)
    plt.xlim((0, 50))
    plt.ylim((0, 50))

    print("Ridge regression model \n  Train dataset")
    print("  Mean squared error =",
          round(mse(Y_train, rmodel.predict(X_train)), 5))
    print("  R2 score =", round(r2_score(Y_train, rmodel.predict(X_train)), 5))

    sampleId = np.linspace(1, 3000, 3000)

    plt.figure(7)
    plt.subplot(2, 1, 1)
    plt.plot(sampleId,
             rmodel.predict(X_train),
             'go',
             markersize=2,
             alpha=0.5,
             label='Predicted call time [s] vs Id \nRidge Regression')
    plt.legend(loc="upper right",
               bbox_to_anchor=[1, 1],
               ncol=2,
               shadow=True,
               fancybox=True)
    plt.xlim((0, 3000))
    plt.ylim((0, 50))
    plt.subplot(2, 1, 2)
    plt.plot(sampleId,
             Y_train,
             'bo',
             markersize=2,
             alpha=0.5,
             label='Measured call time [s] vs Id \nRaw data')
    plt.legend(loc="upper right",
               bbox_to_anchor=[1, 1],
               ncol=2,
               shadow=True,
               fancybox=True)
    plt.xlim((0, 3000))
    plt.ylim((0, 50))

    # Apply Random Forest Regression Model
    rfmodel = RandomForestRegressor()
    rfmodel.fit(X_train, Y_train.ave_callTime)

    print(X_train)

    print(Y_train)

    plt.figure(8)
    plt.scatter(Y_train, rfmodel.predict(X_train), c='g', s=30, alpha=0.5)
    plt.xlabel("Measured call time [s]")
    plt.ylabel("Predicted call time [s]")
    plt.title("Predicted vs measured (Random Forest Regression)")
    x = [0, 50]
    y = x
    lines = plt.plot(x, y)
    plt.setp(lines, color='k', linewidth=2.0)
    plt.xlim((0, 50))
    plt.ylim((0, 50))

    print("Random Forest Regression model \n  Train dataset")
    print("  Mean squared error =",
          round(mse(Y_train, rfmodel.predict(X_train)), 5))
    print("  R2 score =", round(r2_score(Y_train, rfmodel.predict(X_train)),
                                5))

    plt.figure(9)
    plt.subplot(2, 1, 1)
    plt.plot(sampleId,
             rfmodel.predict(X_train),
             'go',
             markersize=2,
             alpha=0.5,
             label='Predicted call time [s] vs Id \nRandom Forest Regression')
    plt.legend(loc="upper right",
               bbox_to_anchor=[1, 1],
               ncol=2,
               shadow=True,
               fancybox=True)
    plt.xlim((0, 3000))
    plt.ylim((0, 50))
    plt.subplot(2, 1, 2)
    plt.plot(sampleId,
             Y_train,
             'bo',
             markersize=2,
             alpha=0.5,
             label='Measured call time [s] vs Id \nRaw data')
    plt.legend(loc="upper right",
               bbox_to_anchor=[1, 1],
               ncol=2,
               shadow=True,
               fancybox=True)
    plt.xlim((0, 3000))
    plt.ylim((0, 50))

    plt.show()
예제 #53
0
                              p=2,
                              metric='minkowski',
                              metric_params=None,
                              n_jobs=None)
    reg.fit(Xtrn, ytrn)
    preds = reg.predict(Xtst)
    preds = np.transpose(preds)

    print('Make and save evaluations')
    # make the column names and initial df
    col_names = ['GPL', 'Model', 'Metric', 'Value', 'GeneIdx']
    df_eval = pd.DataFrame(columns=col_names)
    # get metrics
    mae_values = mae(ydata_aGPL, preds, multioutput='raw_values')
    df_eval = add_eval_to_df(df_eval, mae_values, 'mae', 'SampleKNN',
                             col_names, aGPL)
    rmse_values = np.sqrt(
        mse(ydata_aGPL, preds,
            multioutput='raw_values'))  # correct to have ytst_GL
    cvrmse_values = rmse_values / np.mean(ydata_aGPL,
                                          axis=0)  # correct to have ytst_GL
    df_eval = add_eval_to_df(df_eval, cvrmse_values, 'cvrmse', 'SampleKNN',
                             col_names, aGPL)
    # save the dataframe
    df_eval.to_csv(fp_save + '%s_SampleKNN_evals.tsv' % aGPL,
                   sep='\t',
                   header=True,
                   index=False)

print('It took', int((time.time() - tic0) / 60),
      'minutes for the script to run')
def test_result(model,
                n_tests,
                test_data,
                test_labels,
                test_data_raw,
                library="scikit"):
    """
    params: library - default "scikit", other option: "torch"
    params: test_data - [pandas DF or torch] - input data
    params: test_labels - [pandas DF or torch] - labels data
    params: test_data_raw - [pandas DF] - input data before pipeline preprocessing
    """
    sum_errors = []
    if library == "scikit":
        prediction_all = model.predict(test_data)
        nn_mse = mse(test_labels, prediction_all)
        nn_rmse = np.sqrt(nn_mse)
        score = model.score(test_data, test_labels)
    elif library == "torch":
        prediction_all = model(test_data.float())
        nn_mse = mse(test_labels.numpy(), prediction_all.detach().numpy())
        nn_rmse = np.sqrt(nn_mse)
        score = 0
    else:
        print("error, choose scikit or torch library")

    for sample in range(n_tests):
        if library == "scikit":
            prediction = model.predict(test_data)[sample]
            y_real_value = test_labels.iloc[sample]
        else:
            prediction = model(test_data[sample].float()).item()
            y_real_value = test_labels[sample].item()

        country = test_data_raw.iloc[sample]['country_from']
        trans = test_data_raw.iloc[sample]['transmission']
        fuell = test_data_raw.iloc[sample]['fuell']
        milage = test_data_raw.iloc[sample]['milage']
        engine_power = test_data_raw.iloc[sample]['engine_power']
        year = test_data_raw.iloc[sample]['year']
        brand = test_data_raw.iloc[sample]['car_brand']
        car_model = test_data_raw.iloc[sample]['car_model']

        error_percentage = ((-(y_real_value - prediction) / y_real_value) *
                            100)
        sum_errors.append(np.absolute(error_percentage))
        max_error = max(sum_errors)
        print(
            "pred: {:7.0f}, real: {:7.0f}, err.rate: {:6.2f}%, country: {:16}, trans: {:13}, fuell: {:8}, br: {:15}, md: {:13}, year: {:4}, milage: {:6.0f}, pwr: {:.0f}"
            .format(prediction, y_real_value, error_percentage, country, trans,
                    fuell, brand, car_model, year, milage, engine_power))

    final_log = 'average error: {:7.2f}%, median error: {:7.2f}%, rmse: {:7.0f}, score: {:7.3f}, max error: {:7.2f}%, set size: {}, lib: {}'.format(
        np.mean(sum_errors), np.median(sum_errors), nn_rmse, score, max_error,
        (test_data_raw.shape[0] / 2) * 10, library)
    print(final_log)

    send_email(f"car prediction training completed, with results {final_log}")

    with open("learning_history.txt", "a") as text_file:
        today = datetime.datetime.now()
        text_file.write("\n{}  {}".format(today, final_log))
예제 #55
0
            print(h)

    print("reshaping data...")
    Xs = dict()
    ys = dict()
    for h in feats:
        samples = len(feats[h]) // 384
        Xs[h] = np.array(feats[h]).reshape((samples, 6, 8, 8))
        ys[h] = np.array(labels[h])

    mse_scores = dict()
    acc_scores = dict()

    for h in feats:
        y_pred = mlp.predict(np.array(Xs[h]))
        mse_scores[h] = mse(y_pred, ys[h])
        acc_scores[h] = accuracy_score([[round(y[0])] for y in y_pred], ys[h])

    acc_data = []
    mse_data = []
    h_data = []
    for h in sorted(feats.keys()):
        h_data.append(h)
        mse_data.append(mse_scores[h])
        acc_data.append(1 - acc_scores[h])

    # fig, ax1 = plt.subplots()
    # fig.suptitle('Tree Height v. Prediction Error')
    # plt1 = ax1.plot(h_data, mse_data, color='blue', label='mse')
    # ax1.set_ylabel('mean squared regression error (MSE)')
    # ax1.set_xlabel('game tree height, h')
예제 #56
0
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import mean_squared_error as mse
import warnings

warnings.simplefilter(action='ignore')

data = pd.read_csv('../../data/province-biweek-counts.csv')

with open('../../output/all_prov_no_prob/prov_10_no_prob/prov_10_for_2013.pkl',
          'rb') as file:
    forecast = pickle.load(file)

province = 10
year = 2013

true_df = data.loc[(data['province'] == province) & (data['year'] == year)]

biweek_cases = true_df['cases'].tolist()
total = sum(biweek_cases)
peak = max(biweek_cases)
peak_biweek = biweek_cases.index(peak) + 1

biweek_rmse = np.sqrt(mse(biweek_cases, forecast['biweek_cases']))
print(biweek_rmse)

year_total_rmse = np.sqrt((total - forecast['year_total'])**2)  # sqrt of a squared difference is just the absolute error
print(forecast['year_total'])
print(year_total_rmse)
def normalizedByMeanRMSE(real, predicted):
    return np.sqrt(mse(real, predicted)) / np.concatenate((real, predicted)).mean()
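# Hedged usage sketch (hypothetical arrays; assumes numpy as np and sklearn's
# mean_squared_error as mse, as elsewhere in these snippets): the RMSE is normalised
# by the mean of the pooled real and predicted values, so identical arrays give 0.0.
real = np.array([1.0, 2.0, 3.0])
predicted = np.array([1.5, 2.0, 2.5])
print(normalizedByMeanRMSE(real, predicted))  # sqrt(1/6) / 2.0 ~= 0.204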
예제 #58
0
Xtest = scaler.transform(xtest.reshape(-1, 1))

degs = np.arange(1, 21, 1)
ndegs = np.max(degs)
mse_train = np.empty(ndegs)
mse_test = np.empty(ndegs)
ytest_pred_stored = np.empty(ndegs, dtype=np.ndarray)
for deg in degs:
    model = LinearRegression()
    poly_features = PolynomialFeatures(degree=deg, include_bias=False)
    Xtrain_poly = poly_features.fit_transform(Xtrain)
    model.fit(Xtrain_poly, ytrain)
    ytrain_pred = model.predict(Xtrain_poly)
    Xtest_poly = poly_features.transform(Xtest)
    ytest_pred = model.predict(Xtest_poly)
    mse_train[deg - 1] = mse(ytrain_pred, ytrain)
    mse_test[deg - 1] = mse(ytest_pred, ytest)
    ytest_pred_stored[deg - 1] = ytest_pred

# Plot MSE vs degree
fig, ax = plt.subplots()
mask = degs <= 15
ax.plot(degs[mask], mse_test[mask], color='r', marker='x', label='test')
ax.plot(degs[mask], mse_train[mask], color='b', marker='s', label='train')
ax.legend(loc='upper right', shadow=True)
plt.xlabel('degree')
plt.ylabel('mse')
save_fig('polyfitVsDegree.pdf')
plt.show()

# Plot fitted functions
예제 #59
0
    def create_prophet_m(self,app_name,z1,delay=24):

        import pandas as pd
        import pymysql
        import warnings
        warnings.filterwarnings("ignore")
        from datetime import datetime, timedelta
        import logging
        from tqdm import tqdm
        from fbprophet import Prophet
        from sklearn.metrics import mean_squared_error as mse
        import math

        ### --- For realtime pred ---###

        full_df = z1.bw.iloc[0:len(z1)]
        full_df = full_df.reset_index()
        full_df.columns = ['ds','y']

        # removing outliers: if the maximum sits far above the median
        # (gap >= 2 * q75), blank out values above 2 * q75 so Prophet ignores them
        q50 = full_df.y.median()
        q100 = full_df.y.quantile(1)
        q75 = full_df.y.quantile(.75)
        if (q100 - q50) >= (2 * q75):
            full_df.loc[full_df.y >= (2 * q75), 'y'] = None

        #-- Realtime prediction --##
        #model 
        model_r = Prophet(yearly_seasonality=False,changepoint_prior_scale=.2)
        model_r.fit(full_df)
        future_r = model_r.make_future_dataframe(periods=delay,freq='H')
        forecast_r = model_r.predict(future_r)
        forecast_r.index = forecast_r['ds']
        #forecast 
        pred_r = pd.DataFrame(forecast_r['yhat'][len(z1):(len(z1)+delay)])
        pred_r=pred_r.reset_index()
        #--- completes realtime pred ---#

        train_end_index=len(z1.bw)-delay
        train_df=z1.bw.iloc[0:train_end_index]
        #train_df= train_df[train_df<cutter]


        test_df=z1.bw.iloc[train_end_index:len(z1)]



        train_df=train_df.reset_index()
        test_df=test_df.reset_index()
        train_df.columns=['ds','y']

        #--- removing outliers in the train set (same rule as above) ---#

        q50 = train_df.y.median()
        q100 = train_df.y.quantile(1)
        q75 = train_df.y.quantile(.75)
        if (q100 - q50) >= (2 * q75):
            train_df.loc[train_df.y >= (2 * q75), 'y'] = None

        test_df.columns=['ds','y']
        #print('len of testdf = ',len(test_df))
        #model 
        model = Prophet(yearly_seasonality=False,changepoint_prior_scale=.2)
        model.fit(train_df)
        future = model.make_future_dataframe(periods=len(test_df),freq='H')
        forecast = model.predict(future)
        forecast.index = forecast['ds']
        #forecast 
        pred = pd.DataFrame(forecast['yhat'][train_end_index:len(z1)])
        pred=pred.reset_index()
        pred_df=pd.merge(test_df,pred,on='ds',how='left')
        pred_df.dropna(inplace=True)

        df=pd.DataFrame()

        if(len(pred_df)>0):

            pred_df['error_test']=pred_df.y-pred_df.yhat



            MSE=mse(pred_df.y,pred_df.yhat)
            RMSE=math.sqrt(MSE)
            pred_df['APE']=abs(pred_df.error_test*100/pred_df.y)
            MAPE=pred_df.APE.mean()
            #print("App name:",app_name)
            #print("MSE  :",MSE)
            #print("RMSE :",RMSE)
            #print("MAPE :",MAPE)

            q98=pred_df['APE'].quantile(0.98)
            mape_q98=pred_df['APE'][pred_df.APE<pred_df['APE'].quantile(0.98)].mean()

            df = pd.DataFrame({'length':len(z1),#'predicted_t':[forcast_lag],
                                 'test_rmse':RMSE,
                                 'test_mape':MAPE,
                     #'test_ape_98':q98,
                     'test_mape_98':mape_q98},

                              index=[app_name])

        return(df,model,forecast,pred_df,pred_r)
예제 #60
0
    def _fit_nonbayes(self):
        self.session = {"tf_session": None, "saver": None, "ensemble": None}

        # define tf variables
        self._init_MLPGaussianRegressor()

        self.session["tf_session"] = tf.Session()
        self.session["tf_session"].run(tf.global_variables_initializer())

        # don't want momentum/history of weights from optimization
        self.session["saver"] = tf.train.Saver(
            [_v for _v in tf.global_variables() if "RMSProp" not in _v.name])

        for model in self.session["ensemble"]:
            self.session["tf_session"].run(
                tf.assign(model.output_mean, self.train_data.target_mean))
            self.session["tf_session"].run(
                tf.assign(model.output_std, self.train_data.target_std))

        # keep value of minibatch loss so convergence can be checked at end
        self.loss = [[] for ii in range(self.Nensemble)]

        maxiter_per_minibatch = 10
        num_minibatch = max([1, int(self.maxiter / maxiter_per_minibatch)])

        #for itr in range(self.maxiter):
        #    for model in self.session["ensemble"]:
        for model_idx, model in enumerate(self.session["ensemble"]):
            cntr = 0
            for batch_itr in range(num_minibatch):
                # can train on distinct mini batches for each ensemble
                x, y = self.train_data.next_batch()
                feed = {model.input_data: x, model.target_data: y}

                for minibatch_iter in range(maxiter_per_minibatch):

                    if self.method == "nonbayes_dropout":
                        feed.update({model.dr: self.method_args["keep_prob"]})

                    if self.method == "nonbayes":
                        _, loss = self.session["tf_session"].run(
                            [model.train_op, model.loss_value], feed)
                    if self.method == "nonbayes-mdn":
                        _ = self.session["tf_session"].run([model.train_op],
                                                           feed)
                    elif self.method == "nonbayes_dropout":
                        _, loss = self.session["tf_session"].run(
                            [model.train_op, model.nll], feed)

                    if np.mod(cntr, 10) == 0:
                        self.loss[model_idx].append(loss)

                    cntr += 1
                    if np.mod(cntr, 100) == 0:
                        # decrease learning rate
                        self.session["tf_session"].run(tf.assign(model.lr,\
                                self.method_args["learning_rate"]*(self.method_args["decay_rate"]**(cntr/100))))

            if False:
                # do final local gradient descent on full data set
                model.set_optimizer(toy_argparse({"opt_method":"gradientdescent",\
                        "learning_rate":self.method_args["learning_rate"]}))

                feed = {
                    model.input_data: self.train_data.xs_standardized,
                    model.target_data: self.train_data.ys
                }

                loss_before = self.session["tf_session"].run(
                    [model.loss_value], feed)
                for cntr in range(self.maxiter):
                    _ = self.session["tf_session"].run([model.train_op], feed)

                loss_after = self.session["tf_session"].run([model.loss_value],
                                                            feed)

                print("loss before = {} loss after = {}".format(
                    loss_before, loss_after))

        # for easy slicing upon analysis
        self.loss = np.asarray(self.loss)

        # pass in standardized data
        pred_mean, pred_std = self._predict_nonbayes(
            self.train_data.xs_standardized, self.Nensemble)

        rmse = np.sqrt(mse(self.train_data.ys, pred_mean))
        return rmse