Example #1
    def load_dataset(input_filename,
                     target_filename,
                     matching_key='relative_path',
                     target_key='mean_slope',
                     latent_name_prefix='latent_'):
        Console.info("load_dataset called for: ", input_filename)

        df = pd.read_csv(
            input_filename, index_col=0
        )  # use 1st column as ID, the 2nd (relative_path) can be used as part of UUID
        # 1) Data validation, remove invalid entries (e.g. NaN)
        print(df.head())
        df = df.dropna()
        Console.info("Total valid entries: ", len(df))
        # df.reset_index(drop=True)  # not sure if we prefer to reset the index, as the column index was externally defined

        # 2) Let's determine the number of latent-space dimensions
        # The number of 'features' is given by the columns named '<latent_name_prefix>xx', where xx is the 0-based index into the h-latent space vector
        # Example: (8 dimensions: h0, h1, ... , h7)
        # relative_path northing [m] easting [m] ... latitude [deg] longitude [deg] recon_loss h0 h1 h2 h3 h4 h5 h6 h7
        n_latents = len(df.filter(regex=latent_name_prefix).columns)
        Console.info("Latent dimensions: ", n_latents)

        # 3) Key matching
        # each 'relative_path' entry has the format  slo/20181121_depthmap_1050_0251_no_slo.tif
        # where the filename is composed by [date_type_tilex_tiley_mod_type]. input and target tables differ only in 'type' field
        # let's use regex
        df['filename_base'] = df[matching_key].str.extract(
            r'/(.*_)')  # I think it is possible to do it in a single regex (see the sketch after this example)
        df['filename_base'] = df['filename_base'].str.rstrip('_')
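        # e.g. 'slo/20181121_depthmap_1050_0251_no_slo.tif' -> extract captures '20181121_depthmap_1050_0251_no_'
        #      -> rstrip('_') leaves '20181121_depthmap_1050_0251_no' (the base shared with the target table)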

        tdf = pd.read_csv(
            target_filename
        )  # expected header: relative_path	mean_slope [ ... ] mean_rugosity
        tdf = tdf.dropna()
        # target_key='mean_rugosity'
        tdf['filename_base'] = tdf[matching_key].str.extract(
            r'/(.*_)')  # I think it is possible to do it in a single regex
        tdf['filename_base'] = tdf['filename_base'].str.rstrip('_r002')
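        # NOTE: rstrip('_r002') removes any trailing run of the characters {'_', 'r', '0', '2'},
        # not the literal suffix '_r002'; if stripping that exact suffix is the intent,
        # .str.replace(r'_r002$', '', regex=True) would do it instead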

        # print (tdf.head())
        Console.info("Target entries: ", len(tdf))
        merged_df = pd.merge(df, tdf, how='right', on='filename_base')
        merged_df = merged_df.dropna()

        latent_df = merged_df.filter(regex=latent_name_prefix)
        Console.info("Latent size: ", latent_df.shape)
        target_df = merged_df[target_key]

        np_latent = latent_df.to_numpy(dtype='float')
        np_target = target_df.to_numpy(dtype='float')
        # input-output datasets are linked using the key provided by matching_key
        return np_latent, np_target, merged_df['filename_base']
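
A minimal sketch of the key-matching step above, using the sample path from the comments plus made-up values (the numbers and the target path are illustrative only). It demonstrates the single-regex variant hinted at in the code: both tables derive a shared filename_base from relative_path and are then merged on it.

    import pandas as pd

    # Toy tables; the target path and all values are made up for illustration
    df = pd.DataFrame({'relative_path': ['slo/20181121_depthmap_1050_0251_no_slo.tif'],
                       'latent_0': [0.1]})
    tdf = pd.DataFrame({'relative_path': ['rug/20181121_depthmap_1050_0251_no_rug.tif'],
                        'mean_slope': [12.3]})

    # Single regex: drop the directory prefix and the trailing '<type>' token in one pass
    for t in (df, tdf):
        t['filename_base'] = t['relative_path'].str.extract(r'/(.*)_[^_]*$', expand=False)

    merged = pd.merge(df, tdf, on='filename_base')
    print(merged['filename_base'].iloc[0])  # -> 20181121_depthmap_1050_0251_no
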
Example #2
    def load_toydataset(input_filename,
                        target_key='mean_slope',
                        input_prefix='latent_',
                        matching_key='relative_path'):
        Console.info("load_toydataset called for: ", input_filename)

        df = pd.read_csv(
            input_filename, index_col=0
        )  # use 1st column as ID, the 2nd (relative_path) can be used as part of UUID
        # 1) Data validation, remove invalid entries (e.g. NaN)
        print(df.head())
        df = df.dropna()
        Console.info("Total valid entries: ", len(df))
        # df.reset_index(drop=True)  # not sure if we prefer to reset the index, as the column index was externally defined

        # 2) Let's determine the number of latent-space dimensions
        # The number of 'features' is given by the columns named '<input_prefix>xx', where xx is the 0-based index into the h-latent space vector
        # Example: (8 dimensions: h0, h1, ... , h7)
        # relative_path northing [m] easting [m] ... latitude [deg] longitude [deg] recon_loss h0 h1 h2 h3 h4 h5 h6 h7
        n_latents = len(df.filter(regex=input_prefix).columns)
        Console.info("Latent dimensions: ", n_latents)

        latent_df = df.filter(regex=input_prefix)
        target_df = df[target_key]
        Console.info("Latent size: ", latent_df.shape)

        np_latent = latent_df.to_numpy(dtype='float')
        np_target = target_df.to_numpy(dtype='float')
        np_uuid = df[matching_key].to_numpy()
        # input-output datasets are linked using the key provided by matching_key
        return np_latent, np_target, np_uuid
Example #3
File: launch.py  Project: Erwyn/pa-poc1
"""This temporary script contains a light demonstration of the POC ability.

It needs the 'datas' subdirectory, created in this directory
and the user.yml file, included in the repository.

"""

from model import *
from dc.sqlite3 import Sqlite3Connector
from tools.console import Console


class User(Model):
    """A user model."""

    username = String()

    def __repr__(self):
        return "<user id={}, username={}>".format(self.id, repr(self.username))


# Load the stored data
connector = Sqlite3Connector()
Model.data_connector = connector
connector.setup("data.db")
connector.record_tables([User])
console = Console({"Model": Model, "User": User})
console.launch()
Example #4
File: console.py  Project: v-legoff/pa-poc3
    def execute(self, namespace):
        """Execute the command."""
        console = ConsoleTool({"server": self.server})
        console.launch()
        self.server.data_connector.loop()
Example #5
File: launch.py  Project: v-legoff/pa-poc1

"""This temporary script contains a light demonstration of the POC ability.

It needs the 'datas' subdirectory, created in this directory
and the user.yml file, included in the repository.

"""

from dc import connectors
from model import *
from tests.model import User
from tools.console import Console

# Load the stored data
connector = connectors["mongo"]()
Model.data_connector = connector
connector.setup_test()
connector.record_models([User])
console = Console({"Model": Model, "User": User})
console.launch()
Example #6
def main(args=None):
    parser = argparse.ArgumentParser()
    add_arguments(parser)
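    # add_arguments (defined elsewhere) is expected to register at least -t/--target and -l/--latent,
    # plus options mapped to args.network, args.output, args.epochs, args.samples, args.key and
    # args.xinput (a hedged sketch of such a helper is given after this listing)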

    if len(sys.argv) == 1 and args is None: # no arguments passed? show help and exit, since parameters are expected
        # Show help if no args provided
        parser.print_help(sys.stderr)
        sys.exit(2)
    args = parser.parse_args(args)  # retrieve parsed arguments

    Console.info("Bayesian Neural Network for hi-res inference from low res acoustic priors (LGA-Bathymetry)")
    # let's check if input files exist
    if os.path.isfile(args.target):
        Console.info("Target input file: ", args.target)
    else:
        Console.error("Target input file [" + args.target + "] not found. Please check the provided input path (-t, --target)")

    if os.path.isfile(args.latent):
        Console.info("Latent input file: ", args.latent)
    else:
        Console.error("Latent input file [" + args.latent + "] not found. Please check the provided input path (-l, --latent)")
    # check for pre-trained network
    # if output file exists, warn user
    if os.path.isfile(args.network):
        Console.warn("Destination trained network file [", args.network, "] already exists. It will be overwritten (default action)")
    else:
        Console.info("Destination trained network: ", args.network)

    if os.path.isfile(args.output):
        Console.warn("Output file [", args.output, "] already exists. It will be overwritten (default action)")
    else:
        Console.info("Output file: ", args.output)
    # it can be "none"

    if args.epochs:
        num_epochs = args.epochs
    else:
        num_epochs = 150

    if args.samples:
        n_samples = args.samples
    else:
        n_samples = 20

    if args.key:
        col_key = args.key
    else:
        col_key = 'mean_slope'

    if args.xinput:
        input_key = args.xinput
    else:
        input_key = 'latent_'

    # TODO: extend the arg parser to accept an input file (dataset), config file, validation dataset file, and mode (train, validate, predict)
    Console.info("Geotech landability/measurability predictor from low-res acoustics. Uses Bayesian Neural Networks as predictive engine")
    dataset_filename = args.latent # dataset containing the predictive input. e.g. the latent vector
    target_filename = args.target  # output variable to be predicted, e.g. mean_slope
    # dataset_filename = "data/output-201811-merged-h14.xls"     # dataset containing the predictive input
    # target_filename = "data/target/koyo20181121-stat-r002-slo.csv"  # output variable to be predicted
    Console.info("Loading dataset: " + dataset_filename)

    X, y, index_df = CustomDataloader.load_dataset(dataset_filename, target_filename, matching_key='relative_path', target_key = col_key)    # relative_path is the common key in both tables
    # X, y, index_df = CustomDataloader.load_toydataset(dataset_filename, target_key = col_key, input_prefix= input_key, matching_key='uuid')    # relative_path is the common key in both tables

    Console.info("Data loaded...")
    # y = y/10    #some rescale    WARNING

    #X = X/10.0
    # n_sample = X.shape[0]
    n_latents = X.shape[1]
    # X = StandardScaler().fit_transform(X)
    # y = StandardScaler().fit_transform(np.expand_dims(y, -1)) # this is resizing the array so it can match Size (D,1) expected by pytorch

    # norm = MinMaxScaler().fit(y)
    # y_norm = norm.transform(y)      # min max normalization of our input data
    # y_norm = (y - 5.0)/30.0
    y_norm = y

    norm = MinMaxScaler().fit(X)
    X_norm = norm.transform(X)      # min max normalization of our input data
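    # MinMaxScaler (default feature range [0, 1]) rescales each latent dimension by its min/max over the full dataset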

    print ("X [min,max]", np.amin(X),"/", np.amax(X))
    print ("X_norm [min,max]", np.amin(X_norm),"/", np.amax(X_norm))
    print ("Y [min,max]", np.amin(y),"/", np.amax(y))

    X_train, X_test, y_train, y_test = train_test_split(X_norm,
                                                        y_norm,
                                                        test_size=.25, # 3:1 ratio
                                                        shuffle = True) 

    X_train, y_train = torch.tensor(X_train).float(), torch.tensor(y_train).float()
    X_test, y_test = torch.tensor(X_test).float(), torch.tensor(y_test).float()

    y_train = torch.unsqueeze(y_train, -1)  # PyTorch will complain if we feed the (N) tensor rather than a (NX1) tensor
    y_test = torch.unsqueeze(y_test, -1)    # we add an additional dummy dimension
    # sys.exit(1)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    regressor = BayesianRegressor(n_latents, 1).to(device)  # Single output being predicted
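    # BayesianRegressor maps the n_latents-dimensional input to one scalar; its layers are Bayesian,
    # so weights are re-sampled on every forward pass, which is why losses and predictions below are
    # averaged over n_samples stochastic passes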
    # regressor.init
    optimizer = optim.Adam(regressor.parameters(), lr=0.002) # learning rate
    criterion = torch.nn.MSELoss()

    # print("Model's state_dict:")
    # for param_tensor in regressor.state_dict():
    #     print(param_tensor, "\t", regressor .state_dict()[param_tensor].size())

    ds_train = torch.utils.data.TensorDataset(X_train, y_train)
    dataloader_train = torch.utils.data.DataLoader(ds_train, batch_size=16, shuffle=True)

    ds_test = torch.utils.data.TensorDataset(X_test, y_test)
    dataloader_test = torch.utils.data.DataLoader(ds_test, batch_size=16, shuffle=True)

    iteration = 0
    # Training time
    test_hist = []
    uncert_hist = []
    train_hist = []
    fit_hist = []
    ufit_hist = []

    elbo_kld = 1.0

    print ("ELBO KLD factor: ", elbo_kld/X_train.shape[0]);

    for epoch in range(num_epochs):
        train_loss = []
        for i, (datapoints, labels) in enumerate(dataloader_train):
            optimizer.zero_grad()
            
            loss = regressor.sample_elbo(inputs=datapoints.to(device),
                            labels=labels.to(device),
                            criterion=criterion,    # MSELoss
                            sample_nbr=n_samples,
                            complexity_cost_weight=elbo_kld/X_train.shape[0])  # normalize the complexity cost by the number of input points
            loss.backward() # the returned loss combines the fit loss (MSELoss) with the complexity cost (KL divergence of the weight posterior against the prior)
            optimizer.step()
            train_loss.append(loss.item())
            
        test_loss = []
        fit_loss = []

        for k, (test_datapoints, test_labels) in enumerate(dataloader_test):
            sample_loss = regressor.sample_elbo(inputs=test_datapoints.to(device),
                                labels=test_labels.to(device),
                                criterion=criterion,
                                sample_nbr=n_samples,
                                complexity_cost_weight=elbo_kld/X_test.shape[0])

            fit_loss_sample = regressor.sample_elbo(inputs=test_datapoints.to(device),
                                labels=test_labels.to(device),
                                criterion=criterion,
                                sample_nbr=n_samples,
                                complexity_cost_weight=0)   # we are interested in the reconstruction/prediction loss only (no KL cost)

            test_loss.append(sample_loss.item())
            fit_loss.append(fit_loss_sample.item())

        mean_test_loss = statistics.mean(test_loss)
        stdv_test_loss = statistics.stdev(test_loss)

        mean_train_loss = statistics.mean(train_loss)

        mean_fit_loss = statistics.mean(fit_loss)
        stdv_fit_loss = statistics.stdev(fit_loss)

        Console.info("Epoch [" + str(epoch) + "] Train loss: {:.4f}".format(mean_train_loss) + " Valid. loss: {:.4f}".format(mean_test_loss) + " Fit loss: {:.4f}  ***".format(mean_fit_loss) )
        Console.progress(epoch, num_epochs)

        test_hist.append(mean_test_loss)
        uncert_hist.append(stdv_test_loss)
        train_hist.append(mean_train_loss)

        fit_hist.append(mean_fit_loss)
        ufit_hist.append(stdv_fit_loss)

        # train_hist.append(statistics.mean(train_loss))

        # if (epoch % 50) == 0:   # every 50 epochs, we save a network snapshot
        #     temp_name = "bnn_model_" + str(epoch) + ".pth"
        #     torch.save(regressor.state_dict(), temp_name)

    Console.info("Training completed!")
    # torch.save(regressor.state_dict(), "bnn_model_N" + str (num_epochs) + ".pth")
    torch.save(regressor.state_dict(), args.network)

    export_df = pd.DataFrame([train_hist, test_hist, uncert_hist, fit_hist, ufit_hist]).transpose()
    export_df.columns = ['train_error', 'test_error', 'test_error_stdev', 'test_loss', 'test_loss_stdev']

    print ("head", export_df.head())
    output_name = "bnn_training_S" + str(n_samples) + "_E" + str(num_epochs) + "_H" + str(n_latents) + ".csv"
    export_df.to_csv(output_name)
    # export_df.to_csv("bnn_train_report.csv")
    # df = pd.read_csv(input_filename, index_col=0) # use 1st column as ID, the 2nd (relative_path) can be used as part of UUID

    # Once trained, we start inferring
    expected = []
    uncertainty = []
    predicted = [] # == y

    Console.info("testing predictions...")
    idx = 0 
    # for x in X_test:
    Xp_ = torch.tensor(X_norm).float()
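    # Monte Carlo prediction: each forward pass samples a new set of weights, so repeating it
    # n_samples times per input yields a predictive distribution; its mean is reported as the
    # prediction and its standard deviation as the uncertainty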

    for x in Xp_:
        predictions = []
        for n in range(n_samples):
            p = regressor(x.to(device)).item()
            # print ("p.type", type(p)) ----> float
            # print ("p.len", len(p))
            predictions.append(p) # 1-D output, retrieve the single item

        # print ("pred.type", type(predictions))
        # print ("pred.len", len(predictions))    ---> 10 (n_samples)

        p_mean = statistics.mean(predictions)
        p_stdv = statistics.stdev(predictions)
        idx = idx + 1

        # print ("p_mean", type(p_mean))  --> float

        predicted.append(p_mean)
        uncertainty.append(p_stdv)


        Console.progress(idx, len(Xp_))

    # print ("predicted:" , predicted)
    # print ("predicted.type", type(predicted))
    # print ("predicted.len", len(predicted))
    # print ("X.len:" , len(X_test))
    # y_list = y_train.squeeze().tolist()
    y_list = y_norm.squeeze().tolist()
    # y_list = y_test.squeeze().tolist()

    # y_list = [element.item() for element in y_test.flatten()]

    xl = np.squeeze(X_norm).tolist()

    # print ("y_list.len", len(y_list))
    # predicted.len = X.len (as desired)
    # pred_df  = pd.DataFrame ([xl, y_list, predicted, uncertainty, index_df]).transpose()
    pred_df  = pd.DataFrame ([y_list, predicted, uncertainty, index_df]).transpose()
    # pred_df  = pd.DataFrame ([y_list, predicted, uncertainty, index_df.values.tolist() ]).transpose()
    # pred_df.columns = ['Xp_', 'y', 'predicted', 'uncertainty', 'index']
    pred_df.columns = ['y', 'predicted', 'uncertainty', 'index']

    output_name = "bnn_predictions_S" + str(n_samples) + "_E" + str(num_epochs) + "_H" + str(n_latents) + ".csv"
    # output_name = args.output
    pred_df.to_csv(output_name)
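
For completeness, here is a hedged sketch of the add_arguments helper that main() relies on. Only -t/--target and -l/--latent are confirmed by the error messages above; every other flag spelling, short option, and help text is an assumption inferred from how args is used.

    import argparse

    def add_arguments(parser):
        # Sketch only: flag spellings beyond -t/--target and -l/--latent are assumptions
        parser.add_argument('-t', '--target', help='CSV with the target variable (e.g. mean_slope)')
        parser.add_argument('-l', '--latent', help='CSV with the latent-vector inputs')
        parser.add_argument('-n', '--network', help='path where the trained network (.pth) is saved')
        parser.add_argument('-o', '--output', help='path of the prediction/report output file')
        parser.add_argument('-e', '--epochs', type=int, help='number of training epochs (default: 150)')
        parser.add_argument('-s', '--samples', type=int, help='Monte Carlo samples per ELBO/prediction (default: 20)')
        parser.add_argument('-k', '--key', help="target column to predict (default: 'mean_slope')")
        parser.add_argument('-x', '--xinput', help="prefix of the input (latent) columns (default: 'latent_')")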