Пример #1
0
def preprocess_sachs(folder_path,
                     save_path,
                     num_envs,
                     normalization='standard'):
  """Clusters the Sachs data into environments and saves one CSV per cluster.

  The raw data is loaded with `read_sachs_all`, clustered with KMeans, then
  passed through `utils.preprocess_labeled_data` and `utils.over_sampling`
  before being regrouped by label with `utils.classify_x`.

  Args:
    folder_path: Path the Sachs data is read from.
    save_path: Directory the per-environment CSV files are written to. It is
      created when missing.
    num_envs: The number of KMeans clusters (environments).
    normalization: Normalization option forwarded to
      `utils.preprocess_labeled_data`.

  Returns:
    The result of `utils.distribute_to_envs` on the regrouped data; per the
    original documentation this is shaped [num_envs, number of samples in
    each env, number of features].
  """
  # NOTE: this changes numpy's print precision for the whole process.
  np.set_printoptions(precision=3)

  raw_data = read_sachs_all(folder_path)

  # fit_predict fits the model and labels the samples in a single pass; a
  # separate fit() beforehand would only repeat the clustering.
  cluster_labels = KMeans(
      n_clusters=num_envs, max_iter=1000).fit_predict(raw_data)

  features, labels = utils.preprocess_labeled_data(raw_data, cluster_labels,
                                                   normalization)
  features_res, labels_res = utils.over_sampling(features, labels)
  X_envs = utils.classify_x(features_res, labels_res)

  os.makedirs(save_path, exist_ok=True)
  for i, env_data in X_envs.items():
    # Environments holding a single sample (or none) are not written out.
    if env_data.shape[0] > 1:
      # os.path.join keeps the file inside save_path even when the caller
      # omits the trailing separator.
      file_name = os.path.join(save_path, f'sachs_env_{i+1}_{num_envs}.csv')
      np.savetxt(file_name, env_data, delimiter=',')

  return utils.distribute_to_envs(X_envs)
Пример #2
0
def preprocess_BH(save_path, cluster):
  """Clusters the BH data into environments and preprocesses it.

  The data returned by `utils.load_BH` is split 90/10 into train and test
  parts. The train part is clustered with KMeans, passed through
  `utils.over_sampling`, regrouped by label, and each group is standardized
  on its own and saved. The full train and test parts are additionally
  standardized with the statistics of the train part and saved.

  Args:
    save_path: Directory the preprocessed CSV files are written to. It is
      created when missing.
    cluster: The number of KMeans clusters (environments).

  Returns:
    A tuple `(envs, test_path)`: `envs` is the `np.stack` of the standardized
    per-environment arrays (per the original documentation shaped [num_envs,
    number of samples in each env, number of features]) and `test_path` is
    the path of the saved standardized test CSV.
  """
  # NOTE: this changes numpy's print precision for the whole process.
  np.set_printoptions(precision=3)

  x_raw = utils.load_BH()
  label = np.ones(len(x_raw))

  # Only a train/test split is made; the dummy labels are discarded.
  x, x_test, _, _ = train_test_split(
      x_raw, label, test_size=0.1, random_state=42)

  # fit_predict fits the model and labels the samples in a single pass; a
  # separate fit() beforehand would only repeat the clustering.
  labeled_x = KMeans(n_clusters=cluster, max_iter=1000).fit_predict(x)

  os.makedirs(save_path, exist_ok=True)

  x_upsampled, y_upsampled = utils.over_sampling(x, labeled_x)
  x_envs = utils.classify_x(x_upsampled, y_upsampled)
  standard_scaler = preprocessing.StandardScaler()

  # Standardize every environment with its own statistics and save it. The
  # files are numbered from 1 in the iteration order of x_envs.
  x_train = []
  for i, env_data in enumerate(x_envs.values(), start=1):
    standardx = standard_scaler.fit_transform(env_data)
    exp = os.path.join(save_path, f'standard_BH_env_{i}_{cluster}.csv')
    np.savetxt(exp, standardx, fmt='%.3f', delimiter=',')
    x_train.append(standardx)

  # Standardize the train and test sets using the mean and std of the train
  # set; the scaler is refitted here, replacing the per-environment fit.
  standard_scaler.fit(x)
  standardxtrain = standard_scaler.transform(x)
  x_trainexp = os.path.join(save_path, 'standard_BH_train.csv')
  np.savetxt(x_trainexp, standardxtrain, fmt='%.3f', delimiter=',')

  standardxtest = standard_scaler.transform(x_test)
  x_testexp = os.path.join(save_path, 'standard_BH_test.csv')
  np.savetxt(x_testexp, standardxtest, fmt='%.3f', delimiter=',')

  return np.stack(x_train), x_testexp
Пример #3
0
def cluster_Insurance(x, num_cluster):
  """Clusters the data and regroups it into environments.

  The samples are labelled with KMeans, passed through `utils.over_sampling`
  together with their cluster labels, and regrouped by label with
  `utils.classify_x`.

  Args:
    x: The data to be clustered.
    num_cluster: The number of KMeans clusters.

  Returns:
    The result of `utils.classify_x` on the over-sampled data, i.e. the data
    grouped into environments. NOTE(review): the original docstring said
    `np.array` while an inline comment called this value a dict - confirm
    against `utils.classify_x`.
  """
  # fit_predict fits the model and labels the samples in a single pass; a
  # separate fit() beforehand would only repeat the clustering.
  labeled_x = KMeans(n_clusters=num_cluster, max_iter=1000).fit_predict(x)
  x_upsampled, y_upsampled = utils.over_sampling(x, labeled_x)
  return utils.classify_x(x_upsampled, y_upsampled)
Пример #4
0
def _sachs_file_index(path):
  """Returns the leading integer of a Sachs file name like '3. name.xls'."""
  # Parsing the base name keeps this independent of the folder name and of
  # spaces elsewhere in the path.
  return int(os.path.basename(path).split('.', 1)[0])


def _balance_into_envs(features, labels):
  """Runs over_sampling, classify_x and distribute_to_envs in sequence."""
  x_res, y_res = utils.over_sampling(features, labels)
  return utils.distribute_to_envs(utils.classify_x(x_res, y_res))


def _read_sachs_grouped(folder_path, num_envs, normalization, label_of):
  """Groups the .xls files into envs by file index and preprocesses each env.

  `label_of` maps the integer file index to an environment label in
  [0, num_envs).
  """
  rows_per_env = [[] for _ in range(num_envs)]
  for file in glob.glob(f'{folder_path}*.xls'):
    label = label_of(_sachs_file_index(file))
    rows_per_env[label].append(pd.read_excel(file).to_numpy())

  missing = [label for label, rows in enumerate(rows_per_env) if not rows]
  if missing:
    raise ValueError(
        f'No Sachs .xls files found in {folder_path!r} for environment(s) '
        f'{missing}.')

  features = []
  labels = []
  for label, rows in enumerate(rows_per_env):
    env_x = np.concatenate(rows, axis=0)
    env_y = np.ones(env_x.shape[0]) * label
    env_x, env_y = utils.preprocess_labeled_data(env_x, env_y, normalization)
    features.append(env_x)
    labels.append(env_y)

  return _balance_into_envs(np.vstack(features), np.hstack(labels))


def _load_or_build_clusters(folder_path, num_envs, normalization,
                            renormalize):
  """Loads cached cluster CSVs, or builds them with preprocess_sachs."""
  cluster_dir = './data/cluster/'
  # os.path.exists does not expand wildcards, so the cache has to be probed
  # with glob. Sorting makes the environment order reproducible.
  cached_files = sorted(glob.glob(f'{cluster_dir}*{num_envs}.csv'))
  if not cached_files:
    return preprocess_sachs(folder_path, cluster_dir, num_envs, normalization)

  envs = []
  for file in cached_files:
    env_data = np.loadtxt(file, delimiter=',')
    if renormalize:
      env_data = utils.preprocess(env_data, normalization)
    envs.append(env_data)
  # Stack once after the loop instead of on every iteration.
  return np.stack(envs)


def read_sachs_to_envs(folder_path, num_envs, normalization):
  """Loads Sachs data and returns it divided into different environments.

  Supported values of `num_envs`:
    * 14: every `.xls` file in `folder_path` becomes its own environment.
    * 2: files with index <= 9 form environment 0, the rest environment 1.
    * 7: a file with index k goes to environment k % 7.
    * 3 and 6: KMeans clusters. CSV files matching
      `./data/cluster/*<num_envs>.csv` are loaded when present; otherwise
      they are produced by `preprocess_sachs`. Cached data is passed through
      `utils.preprocess` again only for 6, as in the original code.

  The file index is the integer prefix of the file name ('3. name.xls').

  Args:
    folder_path: Prefix used to glob the Sachs `.xls` files
      (`f'{folder_path}*.xls'`), so it should end with a path separator.
    num_envs: The number of envs to read the data into.
    normalization: Normalization type forwarded to the `utils` helpers.

  Returns:
    Per the original documentation, a tensor with shape [num_envs, number of
    samples in each env, number of features].

  Raises:
    ValueError: If `num_envs` is not one of 2, 3, 6, 7, 14, or if an
      environment receives no file for `num_envs` 2 or 7.
  """
  if num_envs == 14:
    features = []
    labels = []
    for i, file in enumerate(glob.glob(f'{folder_path}*.xls')):
      env_data = utils.preprocess(pd.read_excel(file).to_numpy(),
                                  normalization)
      features.append(env_data)
      labels.append(np.ones(env_data.shape[0]) * i)
    return _balance_into_envs(np.vstack(features), np.hstack(labels))

  if num_envs == 2:
    return _read_sachs_grouped(folder_path, num_envs, normalization,
                               lambda index: 0 if index <= 9 else 1)

  if num_envs == 7:
    return _read_sachs_grouped(folder_path, num_envs, normalization,
                               lambda index: index % num_envs)

  if num_envs in (3, 6):
    return _load_or_build_clusters(folder_path, num_envs, normalization,
                                   renormalize=(num_envs == 6))

  raise ValueError(
      f'Unsupported num_envs={num_envs!r}; expected one of 2, 3, 6, 7, 14.')