Exemplos de Datasets em Python, exemplos de Modules.Datasets em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: Sanitization.py Projeto: dr-dahou-adrar/DySan

 def shaping(path):
     """
     Load the dataset found at ``path`` and return its inverse-transformed form.

     Original and generated data are both routed through this same loading path so that they share
     the same computation graph.

     :param path: source handed to ``D.MotionSenseDataset``
     :return: whatever ``__inverse_transform_conv__`` returns for the dataset's own tensors
     """
     dataset = D.MotionSenseDataset(path, window_overlap=P.Window_overlap)
     # Feed the dataset's own tensors back into the inverse transform.
     tensors = {
         "sensor_tensor": dataset.sensor,
         "phy": dataset.phy_data,
         "act_tensor": dataset.activities,
         "sens_tensor": dataset.sensitive,
         "user_id_tensor": dataset.users_id,
         "trials": dataset.trials,
     }
     return dataset.__inverse_transform_conv__(**tensors, cpu_device=CPU_DEVICE)

Exemplo n.º 2

0

Exibir arquivo

    def fit(self, train_data, target="sens", sens="sens", phys_clm="phy", epoch=200, batch_size=256, learning_rate=5e-4,
            weight_decay=0, loss_fn=Cl.BalancedErrorRateLoss(1 / 2), verbose=False):
        """
        Train ``self.predictor`` on the given data and leave it in evaluation mode.

        :param train_data: data to use for training. Must be an instance of pandas DataFrame
        :param target: the name of the target column
        :param sens: key of the sensitive attribute, forwarded to ``self.__fit__``
        :param phys_clm: the name of the physical columns
        :param epoch: number of passes over the training loader
        :param batch_size: batch size of the training loader
        :param learning_rate: forwarded to ``M.get_optimizer`` as ``lr``
        :param weight_decay: forwarded to ``M.get_optimizer`` as ``wd``
        :param loss_fn: loss object forwarded to ``self.__fit__``. If it exposes a ``device`` attribute, that
            attribute is overwritten with ``self.device``. Note that the default instance is created once, at
            definition time, and is therefore shared between calls.
        :param verbose: when true, print a header and show a tqdm progress bar over the epochs
        """
        assert isinstance(train_data, pd.DataFrame), "The given data must be an instance of pandas DataFrame"
        assert isinstance(target, str), "Target must be the column name"
        assert isinstance(phys_clm, str), "phys_clm must be a string"

        # Preprocess the frame, wrap it in the configured dataset class and build the shuffled loader.
        prep = D.Preprocessing(train_data, prep_excluded=self.prep_excluded, scale=self.scale,
                               prep_included=self.prep_included)
        prep.set_features_ordering(self.features_ordering)
        prep.fit_transform()
        dataset = self.d_class(prep, **self.d_class_kwargs)
        loader = data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4)

        optim = M.get_optimizer(self.predictor, lr=learning_rate, wd=weight_decay)
        if hasattr(loss_fn, "device"):
            loss_fn.device = self.device

        # Same training loop in both modes; verbose only adds the header and the progress bar.
        epochs = range(epoch)
        if verbose:
            print("Training predictor")
            epochs = tqdm.tqdm(epochs)
        for _ in epochs:
            self.__fit__(tr_data=loader, sens=sens, target=target, phys_clm=phys_clm, optim=optim, loss_fn=loss_fn)
        self.predictor.train(False)

Exemplo n.º 3

0

Exibir arquivo

 def predict(self, test_data, target="sens", sens="sens", phys_clm="phy"):
     """
     Predict on ``test_data`` and return the predictions together with the matching ground truths.

     The ground truths are returned as well because the data goes through preprocessing and dataset
     construction before prediction, so they are read back from the processed batch.

     :param test_data: data to predict on. Must be an instance of pandas DataFrame
     :param target: batch key of the target ground truth
     :param sens: batch key of the sensitive ground truth
     :param phys_clm: batch key of the physical columns fed to the predictor
     :return: a 3-tuple ``(predictions, sensitive, target)`` where ``predictions`` is a dict mapping this
         class name to the numpy array of predicted labels (argmax over dimension 1), and ``sensitive`` and
         ``target`` are the numpy arrays read from the batch under ``sens`` and ``target``
     """
     assert isinstance(test_data, pd.DataFrame), "The given data must be an instance of pandas DataFrame"
     assert isinstance(target, str), "Target must be the column name"
     assert isinstance(phys_clm, str), "phys_clm must be a string"
     # NOTE(review): the preprocessing is fitted again on the test data here (fit_transform) — confirm that this
     # is intended rather than reusing the fit obtained on the training data.
     ts_data = D.Preprocessing(test_data, prep_excluded=self.prep_excluded, scale=self.scale,
                               prep_included=self.prep_included)
     ts_data.set_features_ordering(self.features_ordering)
     ts_data.fit_transform()
     ts_data = self.d_class(ts_data, **self.d_class_kwargs)
     ts_data = data.DataLoader(ts_data, batch_size=ts_data.sensor.shape[0], shuffle=False, num_workers=4)
     # Single loop since the batch size corresponds to the test set size
     for sample in ts_data:
         # Only the predictor inputs are moved to the device; the ground truths stay where the loader put them.
         x = sample['sensor'].to(self.device)
         p = sample[phys_clm].to(self.device)
         s = sample[sens]
         t = sample[target]
         yp = self.predictor(x, p).argmax(1)
     # Always bring the tensors back to host memory before converting them: .cpu() returns the tensor itself
     # when it already lives on the CPU, so no exception-driven fallback is needed for CUDA tensors.
     return {self.__class__.__name__: yp.cpu().data.numpy()}, s.cpu().data.numpy(), t.cpu().data.numpy()

Exemplo n.º 4

0

Exibir arquivo

Arquivo: Sanitization.py Projeto: dr-dahou-adrar/DySan

def sanitization_generation_metrics(feature_order=None, alpha_=P.Alpha, lambda_=P.Lambda, san_loss=P.SanLoss, pred_loss=P.PredLoss,
                                    disc_loss=P.DiscLoss, max_epoch=P.Epoch, k_pred=P.KPred, k_disc=P.KDisc, scale=P.Scale):

    # Return models and datasets

    # Take the first 70% timestep as training.
    train_prep = D.Preprocessing(P.TrainPath, prep_excluded=P.PreprocessingExcluded, scale=P.Scale,
                                 prep_included=P.PreprocessingIncluded)
    train_prep.set_features_ordering(feature_order)
    train_prep.fit_transform()
    test_prep = D.Preprocessing(P.TestPath, prep_excluded=P.PreprocessingExcluded, scale=P.Scale,
                                prep_included=P.PreprocessingIncluded)
    test_prep.set_features_ordering(feature_order)
    test_prep.fit_transform()
    train_ds = D.MotionSenseDataset(train_prep, window_overlap=P.Window_overlap)
    test_ds = D.MotionSenseDataset(test_prep, window_overlap=P.Window_overlap)

    # Shape of unique values
    uniq_act = np.unique(train_ds.activities)
    uniq_sens = np.unique(train_ds.sensitive)
    uniq_uid = np.unique(train_ds.users_id)
    phys_cols = train_ds.phy_data.shape[1]
    try:
        act_cols = train_ds.activities.shape[1]
    except IndexError:
        act_cols = 1

    # Discriminator target
    disc_target_values = uniq_sens
    pred_target_values = uniq_act

    # Load dataset
    # Create dataloader
    # build data loaders
    batch_size = P.BatchSize
    s_train_dl = data.DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=4)
    d_train_dl = data.DataLoader(train_ds.copy(True), batch_size=batch_size, shuffle=True, num_workers=4)
    d_dl_iter = iter(d_train_dl)
    p_train_dl = data.DataLoader(train_ds.copy(True), batch_size=batch_size, shuffle=True, num_workers=4)
    p_dl_iter = iter(p_train_dl)

    # Create models:

    sanitizer = M.SanitizerConv(input_channels=train_ds.input_channels, seq_len=train_ds.seq_len, kernel_sizes=[5, 5],
                                strides=[1, 1], conv_paddings=[0, 0], phyNodes=phys_cols, noiseNodes=P.NoiseNodes,
                                actNodes=act_cols)

    # Feeding the physiological data to the discriminator can keep the sensor information from depending on those
    # attributes: the discriminator must be unable to predict the sensitive value even when the height, weight and
    # other attributes are given. Yet we know that if an attribute is strongly correlated with the target, the model
    # will find that correlation. Example: train a model to predict something while also giving the target as an
    # input feature in the training set; the model will learn to disregard the other columns.
    # The predictor output must have the shape required by NLLLoss (the model output is a matrix while the target
    # is a vector).
    def get_models(input_channels=train_ds.input_channels, seq_len=train_ds.seq_len,
                   pred_out_size=pred_target_values.shape[0], disc_out_size=disc_target_values.shape[0],
                   phys_cols=phys_cols, act_cols=act_cols):
        """
        Build the predictor and the discriminator on DEVICE, each paired with its own optimizer.

        :param input_channels: forwarded to both models
        :param seq_len: forwarded to both models
        :param pred_out_size: output size of the predictor
        :param disc_out_size: output size of the discriminator
        :param phys_cols: number of physical nodes of the predictor
        :param act_cols: added to ``phys_cols`` to size the discriminator's physical nodes
        :return: ``(predictor, predictor optimizer, discriminator, discriminator optimizer)``
        """
        # Predictor: sized with the physical columns only.
        pred_model = M.PredictorConv(input_channels=input_channels, seq_len=seq_len, output_size=pred_out_size,
                                     physNodes=phys_cols)
        pred_model.to(DEVICE)
        pred_opt = M.get_optimizer(pred_model)

        # Discriminator: its physical nodes also account for the activity columns.
        disc_model = M.DiscriminatorConv(input_channels=input_channels, seq_len=seq_len, output_size=disc_out_size,
                                         physNodes=phys_cols + act_cols)
        disc_model.to(DEVICE)
        disc_opt = M.get_optimizer(disc_model)

        return pred_model, pred_opt, disc_model, disc_opt

    def reset_weights(m):
        """
        Call ``m.reset_parameters()``; an AttributeError raised by that call is silently ignored.

        :param m: object to reset (passed to ``Module.apply`` in the training loop)
        :return: None
        """
        try:
            m.reset_parameters()
        except AttributeError:
            # Nothing to reset for this object: leave it as it is.
            return None
        return None

    predictor, pred_optim, discriminator, disc_optim = get_models()
    # Send models on GPU or CPU
    sanitizer.to(DEVICE)

    # Check the latest Epoch to start sanitization
    start_epoch = M.get_latest_states(P.ModelsDir(), sanitizer, discriminator, predictor, otherParamFn=P.ParamFunction)

    # Initialise losses
    san_loss = Cl.SanitizerBerLoss(alpha_=alpha_, lambda_=lambda_, recOn=P.RecOn, optim_type=P.OptimType, device=DEVICE)
    # pred_loss = Cl.AccuracyLoss(device=DEVICE)
    pred_loss = Cl.BalancedErrorRateLoss(targetBer=0, device=DEVICE)
    disc_loss = Cl.BalancedErrorRateLoss(targetBer=0, device=DEVICE)

    # Optimizers
    san_optim = M.get_optimizer(sanitizer,)

    losses_frame_path = "{}/{}.csv".format(P.ModelsDir(), P.ParamFunction("losses"))
    san_losses = [
        [], [], []
    ]
    disc_losses = []
    pred_losses = []
    if (start_epoch > 1) and tryReading(losses_frame_path):
        losses_frame = pd.read_csv(losses_frame_path)
        disc_losses = losses_frame["disc"].values.tolist()
        pred_losses = losses_frame["pred"].values.tolist()
        san_losses = losses_frame.drop(["pred", "disc"], axis=1).T.values.tolist()

    # Function to differentiate and integrate the activities. (Ignore for the predictor, integrate for the sanitizer)
    act_fn_disc = lambda ps, act: torch.cat((ps, act*P.DecorrelateActAndSens), 1)
    act_fn_pred = lambda ps, act: ps

    # Init figure
    fig = "asdfoijbnad"
    plt.figure(fig, figsize=(14, 14))

    # Sanitize
    print("Starting Sanitizing ......>")
    for epoch in tqdm.tqdm(range(start_epoch, max_epoch+1)):
        print("Current Epoch: {}".format(epoch))
        if P.TrainingResetModelsStates:
            predictor.apply(reset_weights)
            discriminator.apply(reset_weights)

            # del predictor
            # del discriminator
            # del disc_optim
            # del pred_optim
            # predictor, pred_optim, discriminator, disc_optim = get_models()

        for sample in s_train_dl:

            # Train the sanitizer
            l = train_sanitizer(sample, sanitizer, discriminator, predictor, san_loss,
                                           san_optim, act_fn=act_fn_disc, act_select=P.ActivitySelection,
                                           phys_select=P.PhysiologSelection, phys=P.PhysInput,
                                           san_acts=P.SanitizeActivities,)
            san_losses[0].append(l[0].mean().to(CPU_DEVICE).data.numpy().reshape(-1)[0])
            san_losses[1].append(l[1].to(CPU_DEVICE).data.numpy().reshape(-1)[0])
            san_losses[2].append(l[2].to(CPU_DEVICE).data.numpy().reshape(-1)[0])

            # Train the predictor
            l, p_dl_iter = train_predictor(pred_losses,k_pred, sanitizer, predictor, p_train_dl, p_dl_iter, pred_loss, pred_optim,
                                           act_fn=act_fn_pred, act_select=P.ActivitySelection,
                                           phys_select=P.PhysiologSelection, target_key="act",
                                           sens_key="sens", phys=P.PhysInput, san_acts=P.SanitizeActivities,)
            pred_losses.append(l.to(CPU_DEVICE).data.numpy().reshape(-1)[0])
            #pred_losses.append(1)
            # Train the discriminator
            l, d_dl_iter = train_predictor(disc_losses,k_pred, sanitizer, discriminator, d_train_dl, d_dl_iter, disc_loss, disc_optim,
                                           act_fn=act_fn_disc, act_select=P.ActivitySelection,
                                           phys_select=P.PhysiologSelection, target_key="sens",
                                           sens_key="sens", phys=P.PhysInput, san_acts=P.SanitizeActivities,)
            disc_losses.append(l.to(CPU_DEVICE).data.numpy().reshape(-1)[0])
            #disc_losses.append(1)

        print("***")
        # Save losses, and models states.
        # Saving models States.
        M.save_classifier_states(sanitizer, epoch, P.ModelsDir(), otherParamFn=P.ParamFunction, ext="S")
        M.save_classifier_states(discriminator, epoch, P.ModelsDir(), otherParamFn=P.ParamFunction, ext="D")
        M.save_classifier_states(predictor, epoch, P.ModelsDir(), otherParamFn=P.ParamFunction, ext="P")
        # Saving and plotting losses
        losses_frame = pd.DataFrame.from_dict({
            "san_rec": san_losses[0], "san_act": san_losses[1], "san_sens": san_losses[2],
            "disc": disc_losses, "pred": pred_losses,
        })
        losses_frame.to_csv(losses_frame_path, index=False)
        losses_frame["san_sens"] = san_loss.disc_loss.get_true_value(losses_frame["san_sens"].values)
        if epoch % P.PlotRate == 0:
            plt.subplot(5, 1, 1)
            sns.lineplot(x="index", y="san_rec", data=losses_frame.reset_index())
            plt.subplot(5, 1, 2)
            sns.lineplot(x="index", y="san_act", data=losses_frame.reset_index())
            plt.subplot(5, 1, 3)
            sns.lineplot(x="index", y="san_sens", data=losses_frame.reset_index())
            plt.subplot(5, 1, 4)
            sns.lineplot(x="index", y="disc", data=losses_frame.reset_index())
            plt.subplot(5, 1, 5)
            sns.lineplot(x="index", y="pred", data=losses_frame.reset_index())
            plt.savefig("{}/{}.png".format(P.FiguresDir(), P.ParamFunction("losses")))
            plt.clf()
        

    # Check datasets and generate
# def generate_dataset(san, train_prep, train_ds, train_dl, test_prep, test_ds, test_dl, gen_path, train_id="train",
#                      test_id="test", max_epoch=0, addParamFn=None, phys=1, san_acts=1, san_phys=1):
    
    print("Generating Sanitized Datasets")
    generate_dataset(sanitizer, train_prep, train_ds, test_prep, test_ds, P.GenDataDir(), max_epoch=P.Epoch,
                     addParamFn=P.ParamFunction, phys=P.PhysInput, san_acts=P.SanitizeActivities,
                     san_phys=P.SanitizePhysio)
    # Check if everything has been correctly generated

    #print("Computing Metrics")
    # If device == cpu_device, then we are not supposed to use gpu as there might not be anyone
    """metrics_computation(input_channels=train_ds.input_channels, seq_len=train_ds.seq_len,

Exemplo n.º 5

0

Exibir arquivo

# Store all hyperparameters in a dictionary
pars = Parameters.getParameters()

# Print a recap of the selected hyperparameters, and pause briefly before continuing
print("========== Chosen parameters ==========")
print(pars)
time.sleep(1.5)

# Compute the size of the minibatches
sizeMB = int(pars['sizeTrain'] / pars['nMB'])
# Total number of samples requested: the training size scaled by (1 + ratioTest), rounded up
sizeTot = int(np.ceil((1.0 + pars['ratioTest']) * pars['sizeTrain']))

############### Build or read the dataset
if pars['dataset'] == 'MNIST':
    dataset = dt.readMNIST(sizeTot)
elif pars['dataset'] == 'CAL':
    X_train, X_test = dt.readCAL()
    dataset = np.vstack((X_train, X_test))
elif pars['dataset'] == 'GEP':
    dataset = dt.buildGEP(pars['N'], pars['l'], sizeTot, pars['seedTr'],
                          pars['p_01'], pars['p_10'], pars['invert'])
elif pars['dataset'] == 'SB':
    dataset = dt.buildSB(pars['N'], pars['l'], sizeTot, pars['seedTr'])
elif pars['dataset'] == 'BES':
    dataset = dt.buildBES(pars['N'], sizeTot, pars['seedTr'])
else:
    # Fail with an explicit message instead of a NameError on `dataset` further down.
    raise ValueError("Unknown dataset: {!r}".format(pars['dataset']))

# Add a dimension to handle biases: insert a column of ones at index 0
dataset = np.insert(dataset, 0, 1, axis=1)

# Use hold-out technique to avoid overfitting

Exemplo n.º 6

0

Exibir arquivo

Arquivo: Analysis.py Projeto: dr-dahou-adrar/DySan

    neural_network.MLPClassifier(random_state=seed),
    tree.DecisionTreeClassifier(),
    ensemble.RandomForestClassifier(n_estimators=100,
                                    max_depth=10,
                                    random_state=seed),
    linear_model.LogisticRegression(class_weight='balanced',
                                    solver='liblinear')
]
# Length of `tc`, which is defined outside this excerpt (presumably the number of target classes — verify).
numberClass = len(tc)
# Display names of the classifiers (presumably in the same order as the classifier list above — verify).
names = [
    'GradientBoostingClassifier', 'MLPClassifier', 'DecisionTreeClassifier',
    'RandomForestClassifier', 'LogisticRegression'
]

# Wrap the original train and test sets in Preprocessing objects configured from the parameters module P.
train_prep = D.Preprocessing(originaleTrain,
                             prep_excluded=P.PreprocessingExcluded,
                             scale=P.Scale,
                             prep_included=P.PreprocessingIncluded)
train_prep.set_features_ordering(None)
# NOTE(review): fit_transform() is called on test_prep below but never on train_prep in this excerpt —
# confirm that leaving the training data untransformed is intentional.
test_prep = D.Preprocessing(originaleTest,
                            prep_excluded=P.PreprocessingExcluded,
                            scale=P.Scale,
                            prep_included=P.PreprocessingIncluded)
test_prep.set_features_ordering(None)
test_prep.fit_transform()
# Build the datasets from the preprocessing objects.
train_ds = D.MotionSenseDataset(train_prep)
test_ds = D.MotionSenseDataset(test_prep)
train = train_ds.__inverse_transform_conv__(sensor_tensor=train_ds.sensor,
                                            phy=train_ds.phy_data,
                                            act_tensor=train_ds.activities,
                                            sens_tensor=train_ds.sensitive,
                                            user_id_tensor=train_ds.users_id,

Exemplo n.º 7

0

Exibir arquivo

import torch
import numpy as np
from torch.utils import data
from Modules import Datasets
from Modules import Models

# TODO: add timing
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
torch.cuda.empty_cache()

# Take the first 70% timestep as training.
# The train and test sets are read from two separate CSV files.
train_ds = Datasets.MotionSenseDataset("../../Data/Csv/motion-sense-train.csv")
test_ds = Datasets.MotionSenseDataset("../../Data/Csv/motion-sense-test.csv")

# build data loaders
batch_size = 256
# TODO: try RandomSampler to check that it only shuffles the indices and not the content
# Training batches are shuffled; test batches keep the dataset order.
train_dl = data.DataLoader(train_ds,
                           batch_size=batch_size,
                           shuffle=True,
                           num_workers=4)
test_dl = data.DataLoader(test_ds,
                          batch_size=batch_size,
                          shuffle=False,
                          num_workers=4)

# Defining model Predicting activities.
# Distinct activity values found in the training set.
activities = np.unique(train_ds.activities)
# Size of the second dimension of the physiological data.
phys_shape = train_ds.phy_data.shape[1]
model = Models.SanitizerConv(input_channels=train_ds.input_channels,

Exemplo n.º 8

0

Exibir arquivo

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils import data
from Modules import ModelsFinal as M
from Modules import Datasets as D
from Modules import Parameters as P
from Modules import CustomLosses as Cl

# TODO: add timing
# NOTE(review): `torch` is used below but is not among the imports visible in this excerpt — confirm it is
# imported elsewhere in the file.
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
torch.cuda.empty_cache()

# Take the first 70% timestep as training.
# Both paths are taken from the parameters module P, relative to the current directory.
train_prep = D.Preprocessing("./" + P.TrainPath, prep_excluded=P.PreprocessingExcluded, scale=P.Scale,
                             prep_included=P.PreprocessingIncluded)
train_prep.set_features_ordering(None)
#train_prep.fit_transform()  # when this line is commented out, the prediction is made on the raw data
test_prep = D.Preprocessing("./"+ P.TestPath, prep_excluded=P.PreprocessingExcluded, scale=P.Scale,
                            prep_included=P.PreprocessingIncluded)
test_prep.set_features_ordering(None)
# NOTE(review): with the line above commented out, only the test data goes through fit_transform() —
# confirm the train/test asymmetry is intended.
test_prep.fit_transform()
# Build the datasets from the preprocessing objects.
train_ds = D.MotionSenseDataset(train_prep)
test_ds = D.MotionSenseDataset(test_prep)
# train_ds = Datasets.MotionSenseDataset("../../Data/Csv/motion-sense-train.csv")
# test_ds = Datasets.MotionSenseDataset("../../Data/Csv/motion-sense-test.csv")


# build data loaders
batch_size = 256
# TODO: try RandomSampler to check that it only shuffles the indices and not the content