Пример #1
0
def plot_production_rate_and_injection_rate():
    """Overlay a producer's rate history with each injector's water volume.

    For every producer in the (currently single-element) list, the producer
    series and one curve per entry of ``injector_names`` are drawn against
    their ``'Date'`` columns. A vertical line is drawn at the date found 40%
    of the way through the producer's records, and that date is printed.
    The figure is shown interactively; nothing is returned.
    """
    injector_columns = ['Water Vol', 'Date']
    for producer_name in ['PA12']:
        producer = get_real_producer_data(producers_df, producer_name)
        injectors = [
            injectors_df.loc[injectors_df['Name'] == injector_name,
                             injector_columns]
            for injector_name in injector_names
        ]

        plt.plot(producer['Date'], producer[producer_name], alpha=0.5)
        # Injectors are numbered from 1 in the legend, in list order.
        for number, injector in enumerate(injectors, start=1):
            plt.plot(injector['Date'],
                     injector['Water Vol'],
                     alpha=0.5,
                     label='Injector {}'.format(number))

        # Mark the date located 40% into the producer's list of dates.
        dates = producer['Date'].tolist()
        marker_date = dates[int(len(dates) * 0.40)]
        plt.vlines(marker_date, 0, 50000, linewidth=1, alpha=0.8)
        print(marker_date)

        figure = plt.gcf()
        figure.autofmt_xdate()
        plt.title(producer_name)
        plt.xlabel('Dates')
        plt.ylabel('Production Rate [bbls/day]')
        plt.legend()
        plt.show()
Пример #2
0
def determine_train_test_split():
    """Show CRMP-BHP forecasts for a sweep of chronological train sizes.

    For the selected producer, the dataset is split without shuffling at each
    of 81 train fractions between 0.1 and 0.9. A fresh ``CrmpBHP`` model is
    fitted on the training part, its ``q0`` is set to the last training
    target, and the forecast for the first test samples (at most 30) is
    plotted against the true values. Each figure is shown, not saved.
    """
    horizon = 30  # maximum number of test samples forecast per split
    # producer_names = ['PA01', 'PA02', 'PA03', 'PA09', 'PA10', 'PA12']
    train_sizes = np.linspace(0.1, 0.9, 81)
    for i in [4]:
        # Constructing dataset
        name = producer_names[i]
        print(name)
        producer = get_real_producer_data(producers_df, name, bhp=True)
        injectors = injectors_df[['Name', 'Date', 'Water Vol']]
        X, y = construct_real_production_rate_dataset(producer, injectors)
        for train_size in train_sizes:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, train_size=train_size, shuffle=False)
            X_train = X_train.to_numpy()
            X_test = X_test.to_numpy()
            y_train = y_train.to_numpy()
            y_test = y_test.to_numpy()
            train_length = len(X_train)

            # With large train fractions the test part can hold fewer than
            # `horizon` samples; size the time axis to what is actually
            # available so x and y always have matching lengths.
            n_steps = min(horizon, len(y_test))
            t_test = np.linspace(train_length,
                                 train_length + n_steps - 1,
                                 n_steps)

            model = CrmpBHP().fit(X_train, y_train)
            # Start the forecast from the last observed training target.
            model.q0 = y_train[-1]
            # The first feature column is not passed to predict().
            y_hat = model.predict(X_test[:n_steps, 1:])

            plt.plot(t_test, y_test[:n_steps], color='k', label='True Value')
            plt.plot(t_test, y_hat, color='r', label='Prediction')
            # Two decimals avoid float noise such as 0.11000000000000001
            # in the title; the sweep step is exactly 0.01.
            plot_helper(FIG_DIR,
                        title='{}: {:.2f} Train Size'.format(name, train_size),
                        xlabel='Days',
                        ylabel='Production Rate [bbls/day]',
                        legend=True,
                        save=False)
            plt.show()
Пример #3
0
def plot_delta_bhp():
    """Plot each producer's ``'delta_p'`` column against a 1-based index.

    One figure per entry of ``producer_names`` is handed to ``plot_helper``
    with ``save=True``; the x axis is labelled in days.
    """
    axis_labels = {
        'xlabel': 'Time [days]',
        'ylabel': 'Change in Bottom Hole Pressure [psi]',
    }
    for name in producer_names:
        producer = get_real_producer_data(producers_df, name, bhp=True)
        pressure_change = producer['delta_p']
        n_samples = len(pressure_change)
        # Sample positions 1, 2, ..., n_samples.
        sample_index = np.linspace(1, n_samples, n_samples)
        plt.plot(sample_index, pressure_change)
        plot_helper(FIG_DIR, title=name, save=True, **axis_labels)
Пример #4
0
def plot_imputed_and_original_production_rate():
    """Plot each producer's rate column before and after imputation.

    For every entry of ``producer_names`` a deep copy of the producer's rate
    column is taken, ``impute_training_data`` is invoked on the producer
    frame, and both the copy and the frame's current column are drawn on the
    same axes before the figure is handed to ``plot_helper`` with
    ``save=True``.
    """
    for name in producer_names:
        producer = get_real_producer_data(producers_df, name)
        rate_before = deepcopy(producer[name])
        n_samples = len(producer)
        placeholder_target = np.zeros(n_samples)

        # NOTE(review): the returned value is indexed and then discarded, so
        # this presumably relies on impute_training_data modifying `producer`
        # in place -- confirm against its implementation.
        impute_training_data(producer, placeholder_target, name)[0]

        sample_index = np.linspace(1, n_samples, n_samples)
        plt.plot(sample_index, rate_before)
        plt.plot(sample_index, producer[name])
        plot_helper(FIG_DIR,
                    title='{}: Imputed Production Data'.format(name),
                    xlabel='Time [days]',
                    ylabel='Producer Rate [bbls/day]',
                    save=True)
Пример #5
0
def train_bagging_regressor_with_crmp():
    """Grid-search the bagging block size of a CRMP-BHP ensemble per producer.

    Each producer's dataset is split chronologically using the train fraction
    at the same position in ``train_sizes``. A ``GridSearchCV`` over
    ``block_size`` is run on the training part with time-series
    cross-validation splits, and the best parameters and estimator are
    printed.
    """
    # producer_names = ['PA01', 'PA02', 'PA03', 'PA09', 'PA10', 'PA12']
    train_sizes = [0.33, 0.735, 0.49, 0.56, 0.80, 0.45, 0.54]
    block_size_grid = {'block_size': [7, 14, 21, 28, 90]}

    for idx, name in enumerate(producer_names):
        # Build the dataset for this producer.
        print(name)
        producer = get_real_producer_data(producers_df, name, bhp=True)
        injectors = injectors_df[['Name', 'Date', 'Water Vol']]
        X, y = construct_real_production_rate_dataset(producer, injectors)
        X_train, X_test, y_train, y_test = (
            part.to_numpy() for part in train_test_split(
                X, y, train_size=train_sizes[idx], shuffle=False))

        # One cross-validation split per 30 training samples.
        n_splits = len(X_train) // 30
        splits = TimeSeriesSplit(n_splits).split(X_train)

        # Configure the bagged estimator and the search around it.
        ensemble = MBBaggingRegressor(base_estimator=CrmpBHP(),
                                      n_estimators=100,
                                      bootstrap=True,
                                      random_state=0)
        search = GridSearchCV(ensemble,
                              param_grid=block_size_grid,
                              scoring=scorer_for_crmp,
                              cv=splits)

        # Run the search and report the outcome.
        search.fit(X_train, y_train)

        print(search.best_params_)
        print(search.best_estimator_)
        print()
        print()
Пример #6
0
def train_bagging_regressor_with_crmp():
    """Fit a bagged CRMP-BHP model per producer and plot a 30-sample forecast.

    For each selected producer the dataset is split chronologically, an
    ``MBBaggingRegressor`` of ``CrmpBHP`` estimators is fitted on the training
    part, and one figure is handed to ``plot_helper`` with ``save=True``
    showing:

    * the last 100 training targets and the ensemble-mean one-step fit,
    * the true values of the first 30 test samples,
    * every estimator's forecast together with their mean, P10 and P90,
    * the r-squared of the ensemble-mean fit against the training targets.
    """
    train_sizes = [0.33, 0.735, 0.49, 0.45, 0.52, 0.66, 0.54]
    n_estimators = 100
    delta_t = 1
    horizon = 30  # test samples forecast and plotted (titles say "30 Days")
    tail = 100  # trailing training samples shown in the figure

    # Producer index 5 is not processed.
    for i in [0, 1, 2, 3, 4, 6]:
        # Constructing dataset
        name = producer_names[i]
        print(name)
        producer = get_real_producer_data(producers_df, name, bhp=True)
        injectors = injectors_df[['Name', 'Date', 'Water Vol']]
        X, y = construct_real_production_rate_dataset(producer,
                                                      injectors,
                                                      delta_t=delta_t)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=train_sizes[i], shuffle=False)
        X_train = X_train.to_numpy()
        X_test = X_test.to_numpy()
        y_train = y_train.to_numpy()
        y_test = y_test.to_numpy()
        train_length = len(X_train)
        t_fit = np.linspace(0, train_length - 1, train_length)
        t_test = np.linspace(train_length,
                             train_length + horizon - 1,
                             horizon)

        # Setting up estimator
        bgr = MBBaggingRegressor(base_estimator=CrmpBHP(delta_t=delta_t),
                                 n_estimators=n_estimators,
                                 block_size=7,
                                 bootstrap=True,
                                 n_jobs=-1,
                                 random_state=0)
        bgr.fit(X_train, y_train)

        # One-step fit on the training data: before each single-row
        # prediction the estimator's q0 is set to that row's first column,
        # and the remaining columns are passed to predict().
        y_fits = []
        for estimator in bgr.estimators_:
            y_fit = []
            for row in X_train:
                estimator.q0 = row[0]
                y_fit.append(estimator.predict(np.array([row[1:]])))
            y_fits.append(y_fit)
        # Rearranged to one row per training sample, one column per estimator.
        y_fits_by_time = np.asarray(y_fits).T.reshape(-1, n_estimators)
        y_fits_average = y_fits_by_time.mean(axis=1)

        r2 = fit_statistics(y_fits_average, y_train)[0]

        # Getting all bootstrapped predictions, each started from the last
        # training target.
        y_hats = []
        for estimator in bgr.estimators_:
            estimator.q0 = y_train[-1]
            y_hats.append(estimator.predict(X_test[:horizon, 1:]))
        # One row per forecast step, one column per estimator.
        y_hats_by_time = np.asarray(y_hats).T
        averages = y_hats_by_time.mean(axis=1)
        p10s, p90s = np.percentile(y_hats_by_time, [10, 90], axis=1)

        # Tallest value on the figure; used for the annotation position and
        # the height of the train/test separator.
        max_train = np.amax(y_train[-tail:])
        max_fit = np.amax(y_fits_average[-tail:])
        max_realization = np.amax(y_hats)
        height = max(max_train, max_fit, max_realization)

        # Plotting
        plt.plot(t_fit[-tail:], y_train[-tail:], color='k')
        plt.plot(t_fit[-tail:],
                 y_fits_average[-tail:],
                 color='g',
                 label='Fitting')
        plt.plot(t_test, y_test[:horizon], color='k', label='True Value')
        plt.plot(t_test, averages, color='b', label='Average')
        plt.plot(t_test, p10s, color='r', alpha=0.5, label='P10 & P90')
        plt.plot(t_test, p90s, color='r', alpha=0.5)
        # Individual realizations are drawn almost transparent.
        for hat in y_hats:
            plt.plot(t_test, hat, color='k', alpha=0.02)
        plt.annotate('r-squared = {:.4f}'.format(r2),
                     xy=(train_length - 60, height))
        # Dashed separator at the last training sample.
        plt.vlines(train_length - 1,
                   0,
                   height,
                   linewidth=2,
                   colors='k',
                   linestyles='dashed',
                   alpha=0.8)
        plot_helper(FIG_DIR,
                    title='{}: 30 Days Prediction'.format(name),
                    xlabel='Days',
                    ylabel='Production Rate [bbls/day]',
                    legend=True,
                    save=True)
Пример #7
0
def _recursive_forecast(estimator, X_future, q_start, horizon, log):
    """Roll ``estimator`` forward ``horizon`` steps, feeding back predictions.

    At every step the first column of the current row of ``X_future`` is
    replaced by the previous prediction (``q_start`` for the first step).
    The row is copied first, so ``X_future`` itself is never modified.

    When ``log`` is true the fed-back value stays as predicted, while the
    returned values are mapped through ``exp(value) - 1``.

    Returns a list of ``horizon`` scalar forecasts.
    """
    previous = q_start
    forecast = []
    for k in range(horizon):
        row = X_future[k, :].copy()
        row[0] = previous
        # A single-row predict yields a size-1 array; take the scalar
        # explicitly rather than relying on implicit array-to-scalar
        # conversion when it is written back into the next row.
        previous = np.asarray(estimator.predict(row.reshape(1, -1))).item()
        forecast.append(np.exp(previous) - 1 if log else previous)
    return forecast


def best_worse():
    """Plot 30-sample forecasts of CRMP-BHP and two regression baselines.

    For the selected producer, three bagged ensembles are fitted on the
    chronological training split: ``CrmpBHP``, ``HuberRegressor`` (fitted on
    data passed through ``log_transformation``) and ``LinearRegression``.
    The ensemble-mean forecast of each is drawn over the true values of the
    first 30 test samples, and the figure is handed to ``plot_helper`` with
    ``save=True``.
    """
    train_sizes = [0.33, 0.735, 0.49, 0.45, 0.52, 0.66, 0.54]
    n_estimators = 100
    delta_t = 1
    horizon = 30  # forecast length; the figure title says "30 Days"
    # (legend label, base estimator, fit on transformed data?)
    candidates = [
        ('CRMP-BHP', CrmpBHP(), False),
        ('Huber Regression (Best)',
         HuberRegressor(alpha=0.5, epsilon=100, fit_intercept=False), True),
        ('Linear Regression (Worst)',
         LinearRegression(fit_intercept=False, positive=True), False),
    ]

    # Only producer index 1 is processed.
    for i in [1]:
        # Constructing dataset
        name = producer_names[i]
        print(name)
        producer = get_real_producer_data(producers_df, name, bhp=True)
        injectors = injectors_df[['Name', 'Date', 'Water Vol']]

        X, y = construct_real_production_rate_dataset(producer,
                                                      injectors,
                                                      delta_t=delta_t)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=train_sizes[i], shuffle=False)

        train_length = len(X_train)
        t_test = np.linspace(train_length,
                             train_length + horizon - 1,
                             horizon)

        plt.plot(t_test,
                 y_test[:horizon],
                 color='k',
                 label='True Value',
                 linewidth=2)

        # Transformed copies: only the producer's own column of X and the
        # target go through log_transformation.
        X_train_scaled = X_train.copy(deep=True)
        X_train_scaled[name] = log_transformation(X_train[name])
        X_test_scaled = X_test.copy(deep=True)
        X_test_scaled[name] = log_transformation(X_test[name])
        y_train_scaled = log_transformation(y_train)

        X_train = X_train.to_numpy()
        X_test = X_test.to_numpy()
        y_train = y_train.to_numpy()

        X_train_scaled = X_train_scaled.to_numpy()
        X_test_scaled = X_test_scaled.to_numpy()
        y_train_scaled = y_train_scaled.to_numpy()

        for j, (label, model, log) in enumerate(candidates):
            print(label)
            bgr = MBBaggingRegressor(base_estimator=model,
                                     n_estimators=n_estimators,
                                     block_size=7,
                                     bootstrap=True,
                                     n_jobs=-1,
                                     random_state=1)

            if log:
                bgr.fit(X_train_scaled, y_train_scaled)
            else:
                bgr.fit(X_train, y_train)

            if j == 0:
                # The CRMP-BHP entry forecasts the whole horizon in one
                # predict() call, started from the last training target.
                y_hats = []
                for estimator in bgr.estimators_:
                    estimator.q0 = y_train[-1]
                    y_hats.append(estimator.predict(X_test[:horizon, 1:]))
                averages = np.asarray(y_hats).mean(axis=0)
            else:
                # The regression baselines are rolled forward one step at a
                # time, feeding each prediction back as the first feature.
                if log:
                    X_future, q_start = X_test_scaled, y_train_scaled[-1]
                else:
                    X_future, q_start = X_test, y_train[-1]
                y_hats = [
                    _recursive_forecast(estimator, X_future, q_start, horizon,
                                        log)
                    for estimator in bgr.estimators_
                ]
                # Negative ensemble means are clipped to zero for these
                # models only.
                averages = np.asarray(y_hats).mean(axis=0).clip(min=0)

            # Plotting
            plt.plot(t_test, averages, label=label, alpha=0.5, linewidth=2)

        plt.tight_layout()
        plot_helper(
            FIG_DIR,
            title=
            '{}: 30 Days Prediction for CRMP-BHP and the Best and Worst Performing ML Estimators'
            .format(name),
            xlabel='Days',
            ylabel='Production Rate [bbls/day]',
            legend=True,
            save=True)
        print()
Пример #8
0
def evaluate_crmp_bhp_model():
    """Score CRMP-BHP forecasts for every producer and initial guess.

    For each producer the dataset is split chronologically with 40% used for
    training. A ``CrmpBHP`` model is fitted once per entry of ``p0s`` and its
    prediction for the first 30 test samples is scored with
    ``fit_statistics``. The initial guess, the fitted ``tau_`` and
    ``gains_``, and the scores are appended to ``predict_data``.

    Finally ``fit_data`` and ``predict_data`` are written as CSV to
    ``fit_output_file`` and ``predict_output_file``.

    NOTE: collection of training-fit statistics is disabled here; this
    function does not add anything to ``fit_data`` before writing it out.
    """
    # Column pairs filled from p0[1:5] and crmpbhp.gains_[0:4], in order.
    gain_columns = [
        ('f1_initial', 'f1_final'),
        ('f2_initial', 'f2_final'),
        ('f3_initial', 'f3_final'),
        ('f4_initial', 'f4_final'),
    ]

    iteration = 0
    for name in producer_names:
        print('Producer Name: ', name)
        producer = get_real_producer_data(producers_df, name, bhp=True)
        injectors = injectors_df[['Name', 'Date', 'Water Vol']]
        X, y = construct_real_production_rate_dataset(producer[['Date', name]],
                                                      injectors,
                                                      producer['delta_p'])
        X_train, X_test, y_train, y_test = (
            part.to_numpy() for part in train_test_split(
                X, y, train_size=0.40, shuffle=False))

        for p0 in p0s:
            iteration += 1
            print('Iteration: {}'.format(iteration))
            # The model receives its own copy of the initial guess.
            crmpbhp = CrmpBHP(p0=deepcopy(p0)).fit(X_train, y_train)

            # Prediction on the first 30 test samples; the first feature
            # column is not passed to predict().
            y_hat = crmpbhp.predict(X_test[:30, 1:])
            r2, mse = fit_statistics(y_hat, y_test[:30], shutin=True)

            predict_data['Producer'].append(name)
            predict_data['Model'].append(model_namer(crmpbhp))
            predict_data['tau_initial'].append(p0[0])
            predict_data['tau_final'].append(crmpbhp.tau_)
            for k, (initial_key, final_key) in enumerate(gain_columns):
                predict_data[initial_key].append(p0[k + 1])
                predict_data[final_key].append(crmpbhp.gains_[k])
            predict_data['r2'].append(r2)
            predict_data['MSE'].append(mse)

    # Fitting
    pd.DataFrame(fit_data).to_csv(fit_output_file)

    # Prediction
    pd.DataFrame(predict_data).to_csv(predict_output_file)