# Imports used by the functions below (gathered from their bodies). Project helpers
# (get_project_root, create_folder_if_not_existing, prepare_*) come from the
# repository's own modules.
import itertools
import logging

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import scale


def create_path_and_folders(nb_devices: int, dataset: str, iid: str, algos: str,
                            fraction_sampled_workers: int = 1, model_name: str = None):
    if model_name is not None:
        foldername = "{0}-{1}-N{2}/{3}".format(dataset, iid, nb_devices, model_name)
    else:
        foldername = "{0}-{1}-N{2}".format(dataset, iid, nb_devices)
    picture_path = "{0}/pictures/{1}/{2}".format(get_project_root(), foldername, algos)
    if fraction_sampled_workers != 1:
        picture_path += "/pp-{0}".format(fraction_sampled_workers)
    # Contains the pickle of the dataset.
    data_path = "{0}/pickle".format(get_path_to_pickle())
    # Contains the pickle of the minimum objective.
    pickle_path = "{0}/{1}".format(data_path, foldername)
    # Contains the pickle of the gradient descent for each kind of algorithm.
    algos_pickle_path = "{0}/{1}".format(pickle_path, algos)
    if fraction_sampled_workers != 1:
        algos_pickle_path += "/pp-{0}".format(fraction_sampled_workers)
    # Create the folders for pictures and pickle files.
    create_folder_if_not_existing(algos_pickle_path)
    create_folder_if_not_existing(picture_path)
    return data_path, pickle_path, algos_pickle_path, picture_path
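# Usage sketch with hypothetical argument values: build the folder layout for a
# non-iid quantum run on 20 devices, assuming get_project_root and get_path_to_pickle
# resolve as defined at the end of this section.
#
#     data_path, pickle_path, algos_pickle_path, picture_path = create_path_and_folders(
#         nb_devices=20, dataset="quantum", iid="non-iid", algos="all")
#     # pickle_path then ends with ".../pickle/quantum-non-iid-N20", matching the
#     # paths hard-coded in the dataset classes below.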
def prepare_quantum(nb_devices: int, data_path: str, pickle_path: str,
                    iid: bool = True, double_check: bool = False):
    data = pd.read_csv('{0}/dataset/quantum/phy_train.csv'.format(get_project_root()),
                       sep="\t", header=None)

    # Looking for missing values (encoded as 999 or 9999).
    columns_with_missing_values = []
    for col in range(1, len(data.columns)):
        if (not data[data[col] == 999].empty) or (not data[data[col] == 9999].empty):
            columns_with_missing_values.append(col)
    logging.debug("The following columns have missing values: {0}"
                  .format(columns_with_missing_values))
    data.drop(data.columns[columns_with_missing_values], axis=1, inplace=True)
    logging.debug("The columns with missing values have been removed.")

    data = data.rename(columns={0: "ID", 1: "state", 80: "nothing"})
    data = data.drop(['ID', 'nothing'], axis=1)

    # Looking for near-constant columns (std close to zero).
    small_std = []
    std_data = data.std()
    for i in range(len(data.columns)):
        if std_data.iloc[i] < 1e-5:
            small_std.append(i)
    logging.debug("These columns are near-constant: {0}".format(small_std))
    logging.debug(data.iloc[:, small_std].describe())

    # Removing columns with (almost) null std.
    data = data.loc[:, (data.std() > 1e-6)]
    dim = len(data.columns) - 1  # The dataset still contains the label.
    logging.debug("Now, there are " + str(dim) + " dimensions.")

    data = data.replace({'state': {0: -1}})
    logging.debug("Head of the dataset (columns have not been re-indexed).")
    logging.debug(data.head())
    logging.debug("Label distribution:")
    logging.debug(data['state'].value_counts())

    logging.debug("Scaling data.")
    scaled_data = scale(data.loc[:, data.columns != "state"])
    X_data = pd.DataFrame(data=scaled_data,
                          columns=data.loc[:, data.columns != "state"].columns)
    Y_data = data.loc[:, data.columns == "state"]  # We do not scale the labels (+/-1).

    # Merging features and labels back into one dataframe.
    data = pd.concat([X_data, Y_data], axis=1, sort=False)

    if iid:
        # Transforming into torch tensors.
        X_merged = torch.tensor(X_data.to_numpy(), dtype=torch.float64)
        Y_merged = torch.tensor(Y_data.values, dtype=torch.float64)
        X, Y = prepare_dataset_by_device(X_merged, Y_merged, nb_devices)
    else:
        X, Y = prepare_noniid_dataset(data, "state", data_path + "/quantum",
                                      pickle_path, nb_devices, double_check)

    return X, Y, dim + 1  # Because one column is added for the bias.
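# The helper prepare_dataset_by_device is not shown in this excerpt; below is a
# minimal sketch of the iid per-device split it is expected to perform. This is an
# assumption: the real helper may shuffle or balance the points differently.
def prepare_dataset_by_device_sketch(X: torch.Tensor, Y: torch.Tensor, nb_devices: int):
    # Split the merged tensors into nb_devices (nearly) equal chunks, one per device.
    return list(torch.chunk(X, nb_devices)), list(torch.chunk(Y, nb_devices))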
def __init__(self, train=True, iid: str = "iid"):
    root = get_project_root()
    bool_iid = (iid == "iid")
    create_folder_if_not_existing("{0}/pickle/a9a-{1}-N20".format(root, iid))
    X_train, Y_train, dim_notebook = prepare_a9a(
        20, data_path="{0}/pickle/".format(root),
        pickle_path="{0}/pickle/a9a-{1}-N20".format(root, iid),
        iid=bool_iid, test=False)

    # Remember which indices belong to which device.
    self.split = []
    last_idx = 0
    for y in Y_train:
        self.split.append(np.array(range(last_idx, last_idx + len(y))))
        last_idx += len(y)  # Advance the offset for the next device.

    X_train = torch.cat([x for x in X_train])
    Y_train = torch.cat([y.reshape(len(y), 1) for y in Y_train])
    # Relabel -1 as 0 for binary classification.
    for i in range(len(Y_train)):
        if Y_train[i] == -1:
            Y_train[i] = 0

    X_test, Y_test, dim_notebook = prepare_a9a(
        20, data_path="{0}/pickle/".format(root),
        pickle_path="{0}/pickle/a9a-{1}-N20".format(root, iid),
        iid=True, test=True)
    X_test = torch.cat([x for x in X_test])
    # Reshape each device's labels by its own length.
    Y_test = torch.cat([y.reshape(len(y), 1) for y in Y_test])
    for i in range(len(Y_test)):
        if Y_test[i] == -1:
            Y_test[i] = 0

    self.train = train
    if self.train:
        print('Total number of points:', len(X_train))
        self.data = X_train
        self.targets = Y_train
    else:
        self.data = X_test
        self.targets = Y_test
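# The element-wise relabeling loops above could equivalently be written with boolean
# masking; a vectorized sketch:
#
#     Y_train[Y_train == -1] = 0
#     Y_test[Y_test == -1] = 0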
def __init__(self, train=True):
    root = get_project_root()
    create_folder_if_not_existing("{0}/pickle/quantum-non-iid-N20".format(root))
    X, Y, dim_notebook = prepare_quantum(
        20, data_path="{0}/pickle/".format(root),
        pickle_path="{0}/pickle/quantum-non-iid-N20".format(root),
        iid=False)

    # Relabel -1/+1 as 0/1 for binary classification.
    for y in Y:
        for i in range(len(y)):
            if y[i].item() == -1:
                y[i] = 0
            else:
                y[i] = 1

    # Per device, hold out the first 10% of points as a test set and the next 10%
    # for evaluation (the evaluation points also remain in the training data).
    test_data, test_labels = [], []
    eval_data, eval_labels = [], []
    last_index = 0
    split = []
    for i in range(len(X)):
        x, y = X[i], Y[i]
        n = int(len(x) * 10 / 100)
        test_data += x[:n]
        test_labels += y[:n]
        eval_data += x[n:2 * n]
        eval_labels += y[n:2 * n]
        X[i], Y[i] = X[i][n:], Y[i][n:]
        split.append(list(range(last_index, last_index + len(X[i]))))
        last_index += len(X[i])

    self.train = train
    if self.train:
        self.data = eval_data + list(itertools.chain.from_iterable(X[:20]))
        self.labels = eval_labels + list(itertools.chain.from_iterable(Y[:20]))
        self.ind_val = len(eval_data)
        # Shift the split indices to account for the evaluation points prepended above.
        self.split = [[s[i] + len(eval_data) for i in range(len(s))] for s in split]
    else:
        self.data = test_data
        self.labels = test_labels
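# Usage sketch: the class name QuantumDatasetNonIID is hypothetical (the excerpt only
# shows __init__), and iterating the dataset assumes the usual __len__/__getitem__
# protocol is implemented elsewhere.
#
#     train_set = QuantumDatasetNonIID(train=True)
#     test_set = QuantumDatasetNonIID(train=False)
#     # train_set.split holds, per device, the indices of its points in train_set.data;
#     # the first train_set.ind_val points form the validation block.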
def __init__(self, train=True, iid: str = "iid"):
    root = get_project_root()
    bool_iid = (iid == "iid")
    create_folder_if_not_existing("{0}/pickle/mushroom-{1}-N20".format(root, iid))
    X_train, Y_train, dim_notebook = prepare_mushroom(
        20, data_path="{0}/pickle/".format(root),
        pickle_path="{0}/pickle/mushroom-{1}-N20".format(root, iid),
        iid=bool_iid)

    # Remember which indices belong to which device.
    self.split = []
    last_idx = 0
    for y in Y_train:
        self.split.append(np.array(range(last_idx, last_idx + len(y))))
        last_idx += len(y)  # Advance the offset for the next device.

    X_train = torch.cat([x for x in X_train])
    Y_train = torch.cat([y.reshape(len(y), 1) for y in Y_train])
    # Relabel -1 as 0 for binary classification.
    for i in range(len(Y_train)):
        if Y_train[i] == -1:
            Y_train[i] = 0

    n = int(len(X_train) * 10 / 100)
    # Warning: the goal here is to obtain the same result as without a neural network.
    # Hence the train set contains the whole dataset, and the test set is a subset of
    # the train set.
    test_data = X_train[:n]
    test_labels = Y_train[:n]

    self.train = train
    if self.train:
        print('Total number of points:', len(X_train))
        self.data = X_train
        self.targets = Y_train
    else:
        self.data = test_data
        self.targets = test_labels
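# Usage sketch mirroring the constructor signature; the class name MushroomDataset is
# an assumption, as it is not shown in the excerpt.
#
#     train_set = MushroomDataset(train=True, iid="non-iid")
#     test_set = MushroomDataset(train=False, iid="non-iid")  # 10% subset of the train set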
def prepare_superconduct(nb_devices: int, data_path: str, pickle_path: str,
                         iid: bool = True, double_check: bool = False):
    data = pd.read_csv('{0}/dataset/superconduct/train.csv'.format(get_project_root()),
                       sep=",")

    if data.isnull().values.any():
        logging.warning("There are missing values.")
    else:
        logging.debug("No missing values. Great!")

    logging.debug("Scaling data.")
    scaled_data = scale(data)
    data = pd.DataFrame(data=scaled_data, columns=data.columns)

    X_data = data.loc[:, data.columns != "critical_temp"]
    Y_data = data.loc[:, data.columns == "critical_temp"]
    dim = len(X_data.columns)
    logging.debug("There are " + str(dim) + " dimensions.")
    logging.debug("Head of the dataset:")
    logging.debug(data.head())

    if iid:
        X_merged = torch.tensor(X_data.to_numpy(), dtype=torch.float64)
        Y_merged = torch.tensor(Y_data.values, dtype=torch.float64)
        X, Y = prepare_dataset_by_device(X_merged, Y_merged, nb_devices)
    else:
        X, Y = prepare_noniid_dataset(data, "critical_temp", data_path + "/superconduct",
                                      pickle_path, nb_devices, double_check)

    return X, Y, dim + 1  # Because one column is added for the bias.
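# Call sketch following the naming pattern used for the other datasets above; the
# "superconduct-iid-N20" pickle path is illustrative, not taken from the source.
#
#     root = get_project_root()
#     X, Y, dim = prepare_superconduct(
#         20, data_path="{0}/pickle/".format(root),
#         pickle_path="{0}/pickle/superconduct-iid-N20".format(root),
#         iid=True)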
def get_path_to_datasets() -> str:
    """Return the path to the datasets.

    For the sake of anonymization, the path to the datasets on clusters is not kept
    on GitHub and must be customized locally."""
    return get_project_root()
def get_path_to_pickle() -> str:
    """Return the path to the pickle folder."""
    return get_project_root()
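# Both helpers currently fall back to the project root. A deployment that stores the
# data elsewhere would edit them locally, e.g. (the cluster path below is purely
# hypothetical):
#
#     def get_path_to_datasets() -> str:
#         """Return the path to the datasets (local cluster copy)."""
#         return "/scratch/my_user/datasets"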