Example #1
import gpflow
import numpy as np
import pandas as pd

# Assumes csv_path, in_csv_names, iternum, the R125 molecule object, and the
# project helpers prepare_df_vle, shuffle_and_split, and run_gpflow_scipy are
# defined or imported earlier in the original script.
out_csv_name = "r125-vle-iter" + str(iternum + 1) + "-params.csv"

# Read files
df_csvs = [
    pd.read_csv(csv_path + in_csv_name, index_col=0)
    for in_csv_name in in_csv_names
]
df_csv = pd.concat(df_csvs)
df_all = prepare_df_vle(df_csv, R125)

### Fit GP Model to liquid density
param_names = list(R125.param_names) + ["temperature"]
property_name = "sim_liq_density"
x_train, y_train, x_test, y_test = shuffle_and_split(
    df_all,
    param_names,
    property_name,
    shuffle_seed=gp_shuffle_seed,
    fraction_train=0.8)

# Fit model
models = {}
models["RBF"] = run_gpflow_scipy(
    x_train,
    y_train,
    gpflow.kernels.RBF(lengthscales=np.ones(R125.n_params + 1)),
)
models["Matern32"] = run_gpflow_scipy(
    x_train,
    y_train,
    gpflow.kernels.Matern32(lengthscales=np.ones(R125.n_params + 1)),
)
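
A minimal sketch, assuming GPflow's predict_f API and the x_test/y_test split produced by shuffle_and_split above, for scoring the two fitted kernels on held-out data:

# Compare the RBF and Matern32 surrogates on the held-out test set
for kernel_name, model in models.items():
    gp_mu, gp_var = model.predict_f(x_test)
    mse = np.mean((np.asarray(y_test).reshape(-1, 1) - gp_mu.numpy()) ** 2)
    print(f"{kernel_name}: test MSE = {mse:.3e}")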
Example #2
# Read files
df_csvs = [
    pd.read_csv(csv_path + in_csv_name, index_col=0)
    for in_csv_name in in_csv_names
]
df_csv = pd.concat(df_csvs)
df_all, df_liquid, df_vapor = prepare_df_density(df_csv, R125,
                                                 liquid_density_threshold)

### Fit GP models to VLE data
# Note: df_vle (the dataframe of VLE results used below) is assumed to be
# prepared earlier in the original script; only the density dataframes are
# built in this excerpt.
# Create training/test set
param_names = list(R125.param_names) + ["temperature"]
property_names = ["sim_liq_density", "sim_vap_density", "sim_Pvap", "sim_Hvap"]

vle_models = {}
for property_name in property_names:
    # Get train/test
    x_train, y_train, x_test, y_test = shuffle_and_split(
        df_vle, param_names, property_name, shuffle_seed=gp_shuffle_seed)

    # Fit model
    vle_models[property_name] = run_gpflow_scipy(
        x_train,
        y_train,
        gpflow.kernels.RBF(lengthscales=np.ones(R125.n_params + 1)),
    )

# For the vapor density, replace the RBF with a Matern52 kernel
property_name = "sim_vap_density"
# Get train/test
x_train, y_train, x_test, y_test = shuffle_and_split(
    df_vle, param_names, property_name, shuffle_seed=gp_shuffle_seed)
# Fit model
vle_models[property_name] = run_gpflow_scipy(
    x_train,
    y_train,
    gpflow.kernels.Matern52(lengthscales=np.ones(R125.n_params + 1)),
)
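
The helper shuffle_and_split used throughout these examples is not shown. A minimal, hypothetical sketch of such a helper (shuffle the rows with a fixed seed, then split the parameter columns from the target column into train/test arrays) might look like the following; the project's actual implementation may differ.

def shuffle_and_split(df, param_names, property_name,
                      shuffle_seed=None, fraction_train=0.8):
    """Hypothetical sketch: shuffle a dataframe and split into train/test arrays."""
    shuffled = df.sample(frac=1.0, random_state=shuffle_seed)
    n_train = int(fraction_train * len(shuffled))
    x = shuffled[param_names].to_numpy()
    y = shuffled[property_name].to_numpy()
    return x[:n_train], y[:n_train], x[n_train:], y[n_train:]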
Example #3
# Assumes numpy (np), pandas (pd), os, seaborn, gpflow, and sklearn's svm and
# metrics modules are imported, and that df_all, AP, the thresholds, seeds,
# temperatures, csv_path, and out_csv_name are defined at module level.
def main():

    # Make sure we have a folder for figures
    try:
        os.mkdir("figs")
    except FileExistsError:
        pass

    ###########################################################
    ####################   Fit GP models    ###################
    ###########################################################
    ## UCMD Model
    param_names = list(AP.param_names)
    property_name = "uc_mean_distance"
    # Only train on 10 K data that meets ucmd clf threshold
    df_ucmd = df_all.loc[(df_all["temperature"] == 10)
                         & (df_all["uc_mean_distance"] < ucmd_clf_threshold)]
    # Get train/test
    x_train, y_train, x_test, y_test = shuffle_and_split(
        df_ucmd, param_names, property_name, shuffle_seed=gp_shuffle_seed)
    # Fit model
    ucmd_gp = run_gpflow_scipy(
        x_train,
        y_train,
        gpflow.kernels.Matern32(lengthscales=np.ones(AP.n_params)),
    )

    ## Lattice APE Model
    param_names = list(AP.param_names) + ["scaled_temperature"]
    property_name = "lattice_ape"
    # Get train/test
    x_train, y_train, x_test, y_test = shuffle_and_split(
        df_all, param_names, property_name, shuffle_seed=gp_shuffle_seed)
    # Fit model
    lattice_gp = run_gpflow_scipy(
        x_train,
        y_train,
        gpflow.kernels.Matern32(lengthscales=np.ones(AP.n_params + 1)),
    )

    ###########################################################
    ##################   Train classifiers   ##################
    ###########################################################

    x_train, y_train, x_test, y_test = shuffle_and_split(
        df_all.loc[df_all["temperature"] == 10],
        list(AP.param_names),
        "uc_mean_distance",
        shuffle_seed=clf_shuffle_seed,
    )

    y_train = np.array(y_train < ucmd_clf_threshold, dtype=np.int32)
    y_test = np.array(y_test < ucmd_clf_threshold, dtype=np.int32)
    ucmd_clf = svm.SVC(kernel="rbf")
    ucmd_clf.fit(x_train, y_train)
    y_pred = ucmd_clf.predict(x_train)
    print("Training accuracy:", metrics.accuracy_score(y_train, y_pred))
    y_pred = ucmd_clf.predict(x_test)
    print("Testing accuracy:", metrics.accuracy_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))

    ###########################################################
    ###################   Find new points   ###################
    ###########################################################

    # Load large hypercube
    latin_hypercube = np.loadtxt("LHS_1e6x8.csv", delimiter=",")

    # Apply UCMD classifier
    ucmd_pred = ucmd_clf.predict(latin_hypercube)

    # Predict UCMD with GP model
    gp_means_ucmd, gp_vars_ucmd = ucmd_gp.predict_f(latin_hypercube)
    # Predict Lattice APE with GP model at each temperature
    all_errs = np.empty(shape=(latin_hypercube.shape[0], len(temperatures)))
    col_idx = 0
    for temperature in temperatures:
        scaled_temperature = values_real_to_scaled(temperature,
                                                   AP.temperature_bounds)
        xx = np.hstack((latin_hypercube,
                        np.tile(scaled_temperature,
                                (latin_hypercube.shape[0], 1))))
        gp_means_ape, gp_vars_ape = lattice_gp.predict_f(xx)
        all_errs[:, col_idx] = gp_means_ape[:, 0]
        col_idx += 1
    # Compute MAPE across all three temperatures
    mean_errs = np.mean(all_errs, axis=1)

    ## Save all the results to a dataframe
    LH_results = pd.DataFrame(latin_hypercube, columns=AP.param_names)
    LH_results["ucmd_clf"] = ucmd_pred.astype(np.bool_)
    LH_results["ucmd"] = gp_means_ucmd.numpy()
    LH_results["lattice_mape"] = mean_errs

    # Only take points where the structure classifier is satisfied
    LH_results_pass_ucmd_clf = LH_results.loc[LH_results.ucmd_clf == True].copy()
    costs = LH_results_pass_ucmd_clf[["ucmd", "lattice_mape"]].to_numpy()

    # Find pareto efficient points
    result, pareto_points, dominated_points = find_pareto_set(
        costs, is_pareto_efficient)
    LH_results_pass_ucmd_clf["is_pareto"] = result

    # Plot pareto points vs. costs
    g = seaborn.pairplot(
        LH_results_pass_ucmd_clf,
        vars=["ucmd", "lattice_mape"],
        hue="is_pareto",
    )
    g.savefig("figs/pareto-mses.png", dpi=300)

    # Plot pareto points vs. params
    g = seaborn.pairplot(LH_results_pass_ucmd_clf,
                         vars=list(AP.param_names),
                         hue="is_pareto")
    g.set(xlim=(-0.1, 1.1), ylim=(-0.1, 1.1))
    g.savefig("figs/pareto-params.png", dpi=300)

    # For next iteration: 1. All non-dominated points that meet the thresholds
    #                     2. "Separated" dominated points that meet the thresholds

    next_iteration_points = LH_results_pass_ucmd_clf.loc[
        (LH_results_pass_ucmd_clf.is_pareto == True)
        & (LH_results_pass_ucmd_clf.ucmd < ucmd_next_itr_threshold) &
        (LH_results_pass_ucmd_clf.lattice_mape <
         lattice_mape_next_itr_threshold)]
    dominated_points = LH_results_pass_ucmd_clf.loc[
        (LH_results_pass_ucmd_clf.is_pareto == False)
        & (LH_results_pass_ucmd_clf.ucmd < ucmd_next_itr_threshold) &
        (LH_results_pass_ucmd_clf.lattice_mape <
         lattice_mape_next_itr_threshold)]

    print(f"{len(LH_results_pass_ucmd_clf)} points meet the ucmd classifier.")
    print(
        f"{len(LH_results_pass_ucmd_clf[LH_results_pass_ucmd_clf.is_pareto == True])} are non-dominated."
    )
    print(
        f"{len(dominated_points)} are dominated with ucmd < {ucmd_next_itr_threshold} and lattice_mape < {lattice_mape_next_itr_threshold}"
    )

    removal_distance = 1.034495
    np.random.seed(distance_seed)
    discarded_points = pd.DataFrame(columns=dominated_points.columns)

    while len(dominated_points) > 0:
        # Shuffle the top parameter sets
        dominated_points = dominated_points.sample(frac=1)

        # Select one off the top
        # Note: here we use a random one rather than the "best" one; we don't have
        # confidence that the GP models are more accurate than our thresholds
        next_iteration_points = pd.concat(
            [next_iteration_points, dominated_points.iloc[[0]]])

        # Remove anything within given distance
        l1_norm = np.sum(
            np.abs(
                dominated_points[list(AP.param_names)].values -
                next_iteration_points[list(AP.param_names)].iloc[[-1]].values),
            axis=1,
        )

        points_to_remove = np.where(l1_norm < removal_distance)[0]
        discarded_points = pd.concat(
            [discarded_points, dominated_points.iloc[points_to_remove]])
        dominated_points.drop(index=dominated_points.index[points_to_remove],
                              inplace=True)

    print(
        f"After removing similar points, we are left with {len(next_iteration_points)} final top points."
    )

    next_iteration_points = next_iteration_points[:250]
    next_iteration_points.drop(columns=["ucmd", "lattice_mape", "is_pareto"],
                               inplace=True)

    # Plot new points
    g = seaborn.pairplot(next_iteration_points, vars=list(AP.param_names))
    g.set(xlim=(-0.1, 1.1), ylim=(-0.1, 1.1))
    g.savefig("figs/new-points-params.png", dpi=300)

    # Save the final new parameters
    next_iteration_points.to_csv(csv_path + out_csv_name)
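
Example #3 relies on find_pareto_set and is_pareto_efficient, which are not shown here. A minimal sketch of a Pareto-efficiency mask over a cost matrix (assuming lower is better in every column), for illustration only:

def is_pareto_efficient(costs):
    """Hypothetical sketch: boolean mask of non-dominated rows of a cost matrix.

    A row is Pareto-efficient if no other row is at least as good in every
    column and strictly better in at least one (lower cost = better).
    """
    efficient = np.ones(costs.shape[0], dtype=bool)
    for i, cost in enumerate(costs):
        if efficient[i]:
            # Any point weakly worse in every column and strictly worse in at
            # least one is dominated by `cost` and dropped from the efficient set
            dominated = np.all(costs >= cost, axis=1) & np.any(costs > cost, axis=1)
            efficient[dominated] = False
    return efficient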
Example #4
# Assumes matplotlib.pyplot (plt), numpy (np), seaborn, and gpflow are imported,
# and that df_all, R32, gp_shuffle_seed, and the project helpers are defined at
# module level.
def main():

    seaborn.set_palette("Paired")

    # Liquid density first
    param_names = list(R32.param_names) + ["temperature"]
    property_name = "sim_liq_density"
    property_bounds = R32.liq_density_bounds

    # Extract train/test data
    x_train, y_train, x_test, y_test = shuffle_and_split(
        df_all,
        param_names,
        property_name,
        shuffle_seed=gp_shuffle_seed,
        fraction_train=0.8,
    )

    # Fit model
    model = run_gpflow_scipy(
        x_train,
        y_train,
        gpflow.kernels.RBF(lengthscales=np.ones(R32.n_params + 1)),
    )

    # Use model to predict results
    gp_mu_train, gp_var_train = model.predict_f(x_train)
    gp_mu_test, gp_var_test = model.predict_f(x_test)

    # Convert results to physical values
    y_train_physical = values_scaled_to_real(y_train, property_bounds)
    y_test_physical = values_scaled_to_real(y_test, property_bounds)
    gp_mu_train_physical = values_scaled_to_real(gp_mu_train, property_bounds)
    gp_mu_test_physical = values_scaled_to_real(gp_mu_test, property_bounds)

    # Plot
    fig, ax = plt.subplots()

    ax.scatter(
        y_train_physical,
        gp_mu_train_physical,
        label="Train",
        alpha=0.4,
        s=130,
        c="C1",
    )
    ax.scatter(
        y_test_physical,
        gp_mu_test_physical,
        marker="+",
        label="Test",
        alpha=0.7,
        s=170,
        c="C5",
    )

    xylim = [750, 1250]

    ax.plot(
        np.arange(xylim[0], xylim[1] + 100, 100),
        np.arange(xylim[0], xylim[1] + 100, 100),
        color="black",
        linewidth=3,
        alpha=0.6,
    )

    ax.set_xlim(xylim[0], xylim[1])
    ax.set_ylim(xylim[0], xylim[1])

    ax.set_xticks([800, 1000, 1200])
    ax.set_yticks([800, 1000, 1200])
    ax.set_xticks([850, 900, 950, 1050, 1100, 1150], minor=True)
    ax.set_yticks([850, 900, 950, 1050, 1100, 1150], minor=True)

    ax.tick_params("both",
                   direction="in",
                   which="both",
                   length=4,
                   labelsize=26,
                   pad=10)
    ax.tick_params("both", which="major", length=8)
    ax.xaxis.set_ticks_position("both")
    ax.yaxis.set_ticks_position("both")

    ax.set_xlabel(r"$\mathregular{\rho_{liq}\ sim. (kg/m^3)}$",
                  fontsize=28,
                  labelpad=20)
    ax.set_ylabel(r"$\mathregular{\rho_{liq}\ sur. (kg/m^3)}$",
                  fontsize=28,
                  labelpad=10)
    ax.legend(fontsize=24,
              handletextpad=0.00,
              loc="lower right",
              bbox_to_anchor=(1.01, -0.01))

    ax.set_aspect("equal", "box")
    fig.tight_layout()
    fig.savefig("pdfs/fig1-surrogate-liquiddensity.pdf")

    # Vapor density next
    param_names = list(R32.param_names) + ["temperature"]
    property_name = "sim_vap_density"
    property_bounds = R32.vap_density_bounds

    # Extract train/test data
    x_train, y_train, x_test, y_test = shuffle_and_split(
        df_all,
        param_names,
        property_name,
        shuffle_seed=gp_shuffle_seed,
        fraction_train=0.8,
    )

    # Fit model
    model = run_gpflow_scipy(
        x_train,
        y_train,
        gpflow.kernels.RBF(lengthscales=np.ones(R32.n_params + 1)),
    )

    # Use model to predict results
    gp_mu_train, gp_var_train = model.predict_f(x_train)
    gp_mu_test, gp_var_test = model.predict_f(x_test)

    # Convert results to physical values
    y_train_physical = values_scaled_to_real(y_train, property_bounds)
    y_test_physical = values_scaled_to_real(y_test, property_bounds)
    gp_mu_train_physical = values_scaled_to_real(gp_mu_train, property_bounds)
    gp_mu_test_physical = values_scaled_to_real(gp_mu_test, property_bounds)

    # Plot
    fig, ax = plt.subplots()

    ax.scatter(
        y_train_physical,
        gp_mu_train_physical,
        label="Train",
        alpha=0.6,
        s=130,
        c="C1",
    )
    ax.scatter(
        y_test_physical,
        gp_mu_test_physical,
        marker="+",
        label="Test",
        alpha=0.8,
        s=170,
        c="C5",
    )

    xylim = [0, 125]

    ax.plot(np.arange(xylim[0], xylim[1] + 100, 100),
            np.arange(xylim[0], xylim[1] + 100, 100),
            color="black",
            linewidth=3,
            alpha=0.6)

    ax.set_xlim(xylim[0], xylim[1])
    ax.set_ylim(xylim[0], xylim[1])

    ax.set_xticks([0, 50, 100])
    ax.set_yticks([0, 50, 100])
    ax.set_xticks([25, 75, 125], minor=True)
    ax.set_yticks([25, 75, 125], minor=True)

    ax.tick_params("both",
                   direction="in",
                   which="both",
                   length=4,
                   labelsize=26,
                   pad=10)
    ax.tick_params("both", which="major", length=8)
    ax.xaxis.set_ticks_position("both")
    ax.yaxis.set_ticks_position("both")

    ax.set_xlabel(r"$\mathregular{\rho_{vap}\ sim. (kg/m^3)}$",
                  fontsize=28,
                  labelpad=20)
    ax.set_ylabel(r"$\mathregular{\rho_{vap}\ sur. (kg/m^3)}$",
                  fontsize=28,
                  labelpad=10)
    ax.legend(fontsize=24, handletextpad=0.00)

    ax.set_aspect("equal", "box")
    fig.tight_layout()
    fig.savefig("pdfs/fig1-surrogate-vapordensity.pdf")