Example #1
def create_path_and_folders(nb_devices: int,
                            dataset: str,
                            iid: str,
                            algos: str,
                            fraction_sampled_workers: float = 1,
                            model_name: str = None):
    if model_name is not None:
        foldername = "{0}-{1}-N{2}/{3}".format(dataset, iid, nb_devices,
                                               model_name)
    else:
        foldername = "{0}-{1}-N{2}".format(dataset, iid, nb_devices)
    picture_path = "{0}/pictures/{1}/{2}".format(get_project_root(),
                                                 foldername, algos)
    if fraction_sampled_workers != 1:
        picture_path += "/pp-{0}".format(fraction_sampled_workers)
    # Contains the pickle of the dataset
    data_path = "{0}/pickle".format(get_path_to_pickle(), foldername)
    # Contains the pickle of the minimum objective.
    pickle_path = "{0}/{1}".format(data_path, foldername)
    # Contains the pickle of the gradient descent for each kind of algorithms.
    algos_pickle_path = "{0}/{1}".format(pickle_path, algos)
    if fraction_sampled_workers != 1:
        algos_pickle_path += "/pp-{0}".format(fraction_sampled_workers)

    # Create folders for pictures and pickle files
    create_folder_if_not_existing(algos_pickle_path)
    create_folder_if_not_existing(picture_path)
    return data_path, pickle_path, algos_pickle_path, picture_path
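A minimal usage sketch (argument values are hypothetical; it assumes the helpers above are importable):

data_path, pickle_path, algos_pickle_path, picture_path = create_path_and_folders(
    nb_devices=20, dataset="quantum", iid="non-iid", algos="all-algos",
    fraction_sampled_workers=0.5)
# pickle_path now ends with "pickle/quantum-non-iid-N20"; picture_path and
# algos_pickle_path gain a "/pp-0.5" suffix because of the partial worker sampling.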
Example #2
def prepare_quantum(nb_devices: int, data_path: str, pickle_path: str, iid: bool = True, double_check: bool = False):
    data = pd.read_csv('{0}/dataset/quantum/phy_train.csv'.format(get_project_root()), sep="\t", header=None)

    # Looking for missing values.
    columns_with_missing_values = []
    for col in range(1, len(data.columns)):
        if (not data[data[col] == 999].empty) or (not data[data[col] == 9999].empty):
            columns_with_missing_values.append(col)
    logging.debug("Following columns has missing values:", columns_with_missing_values)
    data.drop(data.columns[columns_with_missing_values], axis=1, inplace=True)
    logging.debug("The columns with empty values have been removed.")
    data = data.rename(columns={0: "ID", 1: "state", 80: "nothing"})
    data = data.drop(['ID', 'nothing'], axis=1)

    # Looking for empty columns (with null std).
    small_std = []
    std_data = data.std()
    for i in range(len(data.columns)):
        if std_data.iloc[i] < 1e-5:
            small_std.append(i)
    logging.debug("This columns are empty: {0}".format(small_std))
    logging.debug(data.iloc[:, small_std].describe())

    # Removing columns with null std
    data = data.loc[:, (data.std() > 1e-6)]
    dim = len(data.columns) - 1 # The dataset still contains the label
    logging.debug("Now, there is " + str(dim) + " dimensions.")

    data = data.replace({'state': {0: -1}})

    logging.debug("Head of the dataset (columns has not been re-indexed).")
    logging.debug(data.head())

    logging.debug("Labels repartition:")
    logging.debug(data['state'].value_counts())

    logging.debug("Scaling data.")
    scaled_data = scale(data.loc[:, data.columns != "state"])
    X_data = pd.DataFrame(data=scaled_data, columns=data.loc[:, data.columns != "state"].columns)
    Y_data = data.loc[:, data.columns == "state"] # We do not scale labels (+/-1).
    # Merging dataset in one :
    data = pd.concat([X_data, Y_data], axis=1, sort=False)

    if iid:
        # Transforming into torch.FloatTensor
        X_merged = torch.tensor(X_data.to_numpy(), dtype=torch.float64)
        Y_merged = torch.tensor(Y_data.values, dtype=torch.float64)
        X, Y = prepare_dataset_by_device(X_merged, Y_merged, nb_devices)
    else:
        X, Y = prepare_noniid_dataset(data, "state", data_path + "/quantum", pickle_path, nb_devices, double_check)

    return X, Y, dim + 1 # Because we added one column for the bias
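A hedged usage sketch, assuming dataset/quantum/phy_train.csv exists under the project root as the function expects:

X, Y, dim = prepare_quantum(nb_devices=20,
                            data_path="{0}/pickle/".format(get_project_root()),
                            pickle_path="{0}/pickle/quantum-iid-N20".format(get_project_root()),
                            iid=True)
# X and Y hold one torch tensor per device; dim counts the features plus the bias column.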
Example #3
    def __init__(self, train=True, iid: str = "iid"):
        root = get_project_root()
        bool_iid = (iid == "iid")

        create_folder_if_not_existing("{0}/pickle/a9a-{1}-N20".format(
            root, iid))
        X_train, Y_train, dim_notebook = prepare_a9a(
            20,
            data_path="{0}/pickle/".format(root),
            pickle_path="{0}/pickle/a9a-{1}-N20".format(root, iid),
            iid=bool_iid,
            test=False)

        self.split = []
        last_idx = 0
        for y in Y_train:
            self.split.append(np.array(range(last_idx, last_idx + len(y))))
            last_idx += len(y)

        X_train = torch.cat([x for x in X_train])
        Y_train = torch.cat([y.reshape(len(y), 1) for y in Y_train])

        # Map labels from {-1, 1} to {0, 1}.
        Y_train[Y_train == -1] = 0

        X_test, Y_test, dim_notebook = prepare_a9a(
            20,
            data_path="{0}/pickle/".format(root),
            pickle_path="{0}/pickle/a9a-{1}-N20".format(root, iid),
            iid=True,
            test=True)

        X_test = torch.cat([x for x in X_test])
        Y_test = torch.cat([y.reshape(len(y), 1) for y in Y_test])

        # Map labels from {-1, 1} to {0, 1}.
        Y_test[Y_test == -1] = 0

        self.train = train
        if self.train:
            print('Total number of points:', len(X_train))
            self.data = X_train
            self.targets = Y_train
        else:
            self.data = X_test
            self.targets = Y_test
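Assuming the enclosing class (called A9ADataset below, a hypothetical name) also implements the usual __len__/__getitem__ pair, it would typically be consumed through a DataLoader:

from torch.utils.data import DataLoader

train_set = A9ADataset(train=True, iid="iid")  # hypothetical class name
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)
for x_batch, y_batch in train_loader:
    pass  # feed each mini-batch to the model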
Example #4
    def __init__(self, train=True):
        root = get_project_root()
        create_folder_if_not_existing(
            "{0}/pickle/quantum-non-iid-N20".format(root))
        X, Y, dim_notebook = prepare_quantum(
            20,
            data_path="{0}/pickle/".format(root),
            pickle_path="{0}/pickle/quantum-non-iid-N20".format(root),
            iid=False)
        for y in Y:
            for i in range(len(y)):
                if y[i].item() == -1:
                    y[i] = 0
                else:
                    y[i] = 1

        test_data, test_labels = [], []
        eval_data, eval_labels = [], []
        last_index = 0
        split = []
        for i in range(len(X)):
            x, y = X[i], Y[i]
            n = int(len(x) * 10 / 100)  # 10% of each device for test, the next 10% for eval
            test_data += x[:n]
            test_labels += y[:n]
            eval_data += x[n:2 * n]
            eval_labels += y[n:2 * n]
            X[i], Y[i] = X[i][n:], Y[i][n:]
            split.append(list(range(last_index, last_index + len(X[i]))))
            last_index += len(X[i])

        self.train = train
        if self.train:
            self.data = eval_data + list(itertools.chain.from_iterable(X[:20]))
            self.labels = eval_labels + list(
                itertools.chain.from_iterable(Y[:20]))
            self.ind_val = len(eval_data)
            self.split = [[s[i] + len(eval_data) for i in range(len(s))]
                          for s in split]
        else:
            self.data = test_data
            self.labels = test_labels
Example #5
    def __init__(self, train=True, iid: str = "iid"):
        root = get_project_root()
        bool_iid = (iid == "iid")

        create_folder_if_not_existing("{0}/pickle/mushroom-{1}-N20".format(
            root, iid))
        X_train, Y_train, dim_notebook = prepare_mushroom(
            20,
            data_path="{0}/pickle/".format(root),
            pickle_path="{0}/pickle/mushroom-{1}-N20".format(root, iid),
            iid=bool_iid)

        self.split = []
        last_idx = 0
        for y in Y_train:
            self.split.append(np.array(range(last_idx, last_idx + len(y))))
            last_idx += len(y)

        X_train = torch.cat([x for x in X_train])
        Y_train = torch.cat([y.reshape(len(y), 1) for y in Y_train])

        # Map labels from {-1, 1} to {0, 1}.
        Y_train[Y_train == -1] = 0

        n = int(len(X_train) * 10 / 100)

        # Warning: here the goal is to obtain the same result as without a neural network.
        # Thus the train set contains the whole dataset, and the test set is included in the train set.
        test_data = X_train[:n]
        test_labels = Y_train[:n]

        self.train = train
        if self.train:
            print('Total number of points:', len(X_train))
            self.data = X_train
            self.targets = Y_train
        else:
            self.data = test_data
            self.targets = test_labels
Example #6
def prepare_superconduct(nb_devices: int, data_path: str, pickle_path: str, iid: bool = True, double_check: bool = False):
    data = pd.read_csv('{0}/dataset/superconduct/train.csv'.format(get_project_root()), sep=",")
    if data.isnull().values.any():
        logging.warning("There is missing value.")
    else:
        logging.debug("No missing value. Great !")
    logging.debug("Scaling data.")
    scaled_data = scale(data)
    data = pd.DataFrame(data=scaled_data, columns=data.columns)
    X_data = data.loc[:, data.columns != "critical_temp"]
    Y_data = data.loc[:, data.columns == "critical_temp"]
    dim = len(X_data.columns)
    logging.debug("There is " + str(dim) + " dimensions.")

    logging.debug("Head of the dataset:")
    logging.debug(data.head())

    if iid:
        X_merged = torch.tensor(X_data.to_numpy(), dtype=torch.float64)
        Y_merged = torch.tensor(Y_data.values, dtype=torch.float64)
        X, Y = prepare_dataset_by_device(X_merged, Y_merged, nb_devices)
    else:
        X, Y = prepare_noniid_dataset(data, "critical_temp", data_path + "/superconduct", pickle_path, nb_devices, double_check)
    return X, Y, dim + 1 # Because we added one column for the bias
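A hedged sketch of the non-IID call path, reusing the folders produced by create_path_and_folders from Example #1 (argument values are illustrative):

data_path, pickle_path, _, _ = create_path_and_folders(
    nb_devices=20, dataset="superconduct", iid="non-iid", algos="all-algos")
X, Y, dim = prepare_superconduct(nb_devices=20, data_path=data_path,
                                 pickle_path=pickle_path, iid=False)
# The per-device split is delegated to prepare_noniid_dataset with these paths.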
Example #7
def get_path_to_datasets() -> str:
    """Return the path to the datasets. For sake of anonymization, the path to datasets on clusters is not keep on
    GitHub and must be personalized locally"""
    return get_project_root()
Example #8
def get_path_to_pickle() -> str:
    """"Return the path to the pickle folder. """
    return get_project_root()