Example #1
def data_loader(data_name, miss_rate, target_column=None):
    """Loads datasets and introduce missingness.

    Args:
      - data_name: letter, spam, or mnist
      - miss_rate: the probability of missing components

    Returns:
      data_x: original data
      miss_data_x: data with missing values
      data_m: indicator matrix for missing components
    """
    file_name = 'data/' + data_name + '.csv'
    print(file_name)
    data_x = pd.read_csv(file_name, delimiter=',')
    data_x.fillna(0, inplace=True)
    try:
        _ = data_x.pop('datetime')
    except KeyError:
        pass
    # Stratify only when a target column is available
    stratify = data_x[target_column].values if target_column is not None else None
    train_x, test_x = train_test_split(data_x,
                                       test_size=0.35,
                                       random_state=666,
                                       shuffle=True,
                                       stratify=stratify)
    if target_column is not None:
        train_y = train_x.pop(target_column)
        test_y = test_x.pop(target_column)
    else:
        train_y = None
        test_y = None

    # Parameters
    no_train, dim_train = train_x.shape
    no_test, dim_test = test_x.shape

    # Introduce missing data
    data_m_train = binary_sampler(1 - miss_rate, no_train, dim_train)
    data_m_test = binary_sampler(1 - miss_rate, no_test, dim_test)
    miss_train_x = train_x.astype('float32').copy()
    ori_train_x = train_x.astype('float32').copy()
    miss_test_x = test_x.astype('float32').copy()
    ori_test_x = test_x.astype('float32').copy()
    miss_train_x = miss_train_x.values
    miss_test_x = miss_test_x.values
    miss_train_x[data_m_train == 0] = np.nan
    miss_test_x[data_m_test == 0] = np.nan
    miss_train_x = pd.DataFrame(data=miss_train_x, columns=train_x.columns)
    miss_test_x = pd.DataFrame(data=miss_test_x, columns=test_x.columns)
    return (ori_train_x, train_x, train_y, miss_train_x), (ori_test_x, test_x,
                                                           test_y, miss_test_x)
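A minimal usage sketch for this loader; the file name ('spam') and the target column name ('label') are assumptions, and the binary_sampler helper must be importable (see the sketch after Example #2).

# Hedged usage sketch: assumes data/spam.csv exists and contains a 'label' column.
(ori_train_x, train_x, train_y, miss_train_x), \
    (ori_test_x, test_x, test_y, miss_test_x) = data_loader('spam',
                                                            miss_rate=0.2,
                                                            target_column='label')
print('train shape:', miss_train_x.shape)
print('observed fraction:', 1.0 - miss_train_x.isna().mean().mean())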
Example #2
def prepare_train_pipeline(norm_train_x, data_m):
    """
    Prepares training Pipeline into a TF Dataset Format
    :param norm_train_x:
    :param data_m:
    :return:
    """
    # Perform all the data augmentation BEFORE the training Loop (ffs)
    rows, columns = norm_train_x.shape
    X_mb = norm_train_x.values
    M_mb = data_m.values

    # Sample random vectors
    Z_mb = uniform_sampler(0, 0.01, rows, columns)
    # Sample hint vectors
    H_mb_temp = binary_sampler(0.9, rows, columns)

    H_mb = M_mb * H_mb_temp

    # Combine random vectors with observed vectors
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

    tf_data = tf.data.Dataset.from_tensor_slices(
        (X_mb.astype('float32'), M_mb.astype('float32'),
         H_mb.astype('float32'))).shuffle(100000).batch(256)

    return tf_data
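The uniform_sampler, binary_sampler and sample_batch_index helpers used throughout these examples are never shown. Below is a minimal sketch matching the behaviour of the utilities shipped with the original GAIN reference code, included only so the snippets above are self-contained.

import numpy as np

def binary_sampler(p, rows, cols):
    """Sample a rows x cols binary matrix with P(entry == 1) = p."""
    unif = np.random.uniform(0.0, 1.0, size=[rows, cols])
    return 1 * (unif < p)

def uniform_sampler(low, high, rows, cols):
    """Sample a rows x cols matrix uniformly from [low, high)."""
    return np.random.uniform(low, high, size=[rows, cols])

def sample_batch_index(total, batch_size):
    """Sample batch_size row indices without replacement from range(total)."""
    return np.random.permutation(total)[:batch_size]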
Example #3
def data_loader(data_name, miss_rate):
    '''Loads datasets and introduces missingness.

    Args:
      - data_name: letter, spam, breast, credit, news, or mnist
      - miss_rate: the probability of missing components

    Returns:
      data_x: original data
      miss_data_x: data with missing values
      data_m: indicator matrix for missing components
    '''

    # Load data
    if data_name in ['letter', 'spam', 'breast', 'credit', 'news']:
        file_name = 'data/' + data_name + '.csv'
        data_x = np.loadtxt(file_name, delimiter=",", skiprows=1)
    elif data_name == 'mnist':
        (data_x, _), _ = mnist.load_data()
        data_x = np.reshape(np.asarray(data_x), [60000, 28 * 28]).astype(float)
    else:
        raise ValueError('Unknown dataset: ' + data_name)

    # Parameters
    no, dim = data_x.shape

    # Introduce missing data
    data_m = binary_sampler(1 - miss_rate, no, dim)
    miss_data_x = data_x.copy()
    miss_data_x[data_m == 0] = np.nan

    return data_x, miss_data_x, data_m
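A minimal usage sketch, assuming one of the expected CSV files (here 'spam') is present under data/:

data_x, miss_data_x, data_m = data_loader('spam', miss_rate=0.2)
print('data shape:', data_x.shape)
print('realized missing rate:', np.isnan(miss_data_x).mean())  # close to miss_rate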
Example #4
def data_loader(data_name, miss_rate, onehot, predict):
  '''Loads a dataset and introduces missingness.

  Args:
    - data_name: the file name of the dataset
    - miss_rate: the probability of missing components
    - onehot: the number of features to one-hot encode (starting from the first feature)
    - predict: whether prediction mode is enabled

  Returns:
    data_x: original data
    miss_data_x: data with missing values
    data_m: indicator matrix for missing components
    feature_name: feature name list of the original data
    onehotencoder: fitted OneHotEncoder for this data
    ori_data_dim: number of columns in the original data
  '''
  
  # Load data
  file_name = 'data/'+data_name+'.csv'
  data = pd.read_csv(file_name)
  feature_name = list(data.columns)
  data = np.array(data)

  # One-hot encoding; skip it if the selected columns already contain missing values
  onehotencoder = OneHotEncoder()
  if np.sum(np.isnan(data[:,:onehot])) == 0 and onehot > 0:
    data_x = data[:,:onehot]
    onehotencoder.fit(data_x)
    data_x = onehotencoder.transform(data_x).toarray()
    data_x = np.concatenate((data_x, data[:,onehot:]),axis=1)
  elif onehot == 0:
    data_x = np.array(data)
  else:
    print("Missing values exist, skipping one-hot encoding")
    data_x = np.array(data)

  # Parameters
  ori_data_dim = data.shape[1]
  no, dim = data_x.shape
  
  # Introduce missing data
  if predict is False:
    data_m = binary_sampler(1-miss_rate, no, dim)
  else:
    data_m = 1-np.isnan(data_x)
  miss_data_x = data_x.copy()
  miss_data_x[data_m == 0] = np.nan
      
  return data_x, miss_data_x, data_m, feature_name, onehotencoder, ori_data_dim
Example #5
def data_loader(data_name, miss_rate, mechanism):
    '''Loads datasets and introduces missingness.

    Args:
      - data_name: letter, spam, mnist, breast, news, or credit
      - miss_rate: the probability of missing components
      - mechanism: missingness mechanism ('mcar' for MCAR, anything else for MAR)

    Returns:
      data_x: original data
      miss_data_x: data with missing values
      data_m: indicator matrix for missing components
    '''

    # Load data
    if data_name in ['letter', 'spam']:
        file_name = 'data/' + data_name + '.csv'
        data_x = np.loadtxt(file_name, delimiter=",", skiprows=1)
    elif data_name == 'mnist':
        (data_x, _), _ = mnist.load_data()
        data_x = np.reshape(np.asarray(data_x), [60000, 28 * 28]).astype(float)
    elif data_name == 'breast':
        data_x = load_breast_cancer()['data']
    elif data_name == 'news':
        data_x = np.loadtxt('data/OnlineNewsPopularity1.csv',
                            delimiter=",",
                            skiprows=1)
    elif data_name == 'credit':
        data_x = np.loadtxt('data/default_of_credit_cards_clients.csv',
                            delimiter=",",
                            skiprows=2)
    else:
        raise Exception('Unknown dataset.')

    # Parameters
    no, dim = data_x.shape

    if mechanism == 'mcar':
        # Introduce missing data
        data_m = binary_sampler(1 - miss_rate, no, dim)
    else:
        data_m = 1 - MAR_mask(
            np.array(data_x, dtype=np.float32), p=miss_rate, p_obs=miss_rate)
    miss_data_x = data_x.copy()
    miss_data_x[data_m == 0] = np.nan

    return data_x, miss_data_x, data_m
Example #6
def data_loader(data_name, miss_rate):
    file_name = 'datasets/' + data_name + '.csv'
    complete_data_x = np.loadtxt(file_name, delimiter=",", skiprows=1)
    no, dim = complete_data_x.shape
    np.random.shuffle(complete_data_x)

    # Limit the amount of data and drop the last column
    if no > 10000:
        complete_data_x = complete_data_x[0:10000, 0:dim - 1]
        no = 10000
        dim = dim - 1
    else:
        complete_data_x = complete_data_x[0:no, 0:dim - 1]
        dim = dim - 1

    data_m = binary_sampler(1 - miss_rate, no, dim)
    incomplete_data_x = complete_data_x.copy()
    incomplete_data_x[data_m == 0] = np.nan
    return complete_data_x, incomplete_data_x, data_m
Example #7
def data_loader(data_name, miss_rate):
    if data_name in ['spam', 'letter']:
        file_name = 'data/' + data_name + '.csv'
        x = np.loadtxt(file_name, delimiter=",", skiprows=1)
        y = []

    elif data_name in ['spam_full', 'breast_full']:
        file_name = 'data/' + data_name + '.csv'
        data = np.loadtxt(file_name, delimiter=",", skiprows=1)
        print("begin ", data.shape)
        # The last column holds the label; split it off from the features
        y = data[:, -1:]
        x = data[:, :-1]
        print("x.shape & y.shape:", x.shape, y.shape)

    else:
        file_name = 'data/' + data_name + '.arff'
        data, _ = arff.loadarff(file_name)
        data = data.tolist()
        x = np.array([item[:-1] for item in data])
        y = np.array([item[-1] for item in data])
        le = LabelEncoder()
        y = le.fit_transform(y)
        print('Num of classes: ', len(le.classes_))
    # Parameters
    no, dim = x.shape
    print('Num of samples:', no)
    print('Num of features: ', dim)
    print(type(x[0][0]))
    # Introduce missing data
    m = binary_sampler(1 - miss_rate, no, dim)
    miss_x = x.copy()
    miss_x[m == 0] = np.nan

    return x, y, miss_x, m
Example #8
def data_loader(data_name, miss_rate):
    '''Loads datasets and introduces missingness.

  Args:
    - data_name: letter, spam, mnist, breast_original, Wine_original, or one
      of the vals_test_df variants
    - miss_rate: the probability of missing components

  Returns:
    data_x: original data
    miss_data_x_new: corrupted data (prepended with the companion training data
      for the vals_test_df variants)
    data_m: indicator matrix for missing components
    data_y: labels of the original data
  '''
    data_y = None
    # Companion training CSV; only set for the vals_test_df variants
    train_data_name = None
    usecols = 0
    # Load data
    if data_name in ['letter', 'spam']:
        file_name = 'data/' + data_name + '.csv'
        data_x = np.loadtxt(file_name, delimiter=",", skiprows=1)
    elif data_name == 'breast_original':
        file_name = 'data/' + data_name + '.csv'
        data_x = np.loadtxt(file_name,
                            delimiter=",",
                            usecols=(range(30)),
                            skiprows=1)
        data_y = np.loadtxt(file_name, delimiter=",", usecols=(30), skiprows=1)
        usecols = range(30)
        #print(data_x.shape)
        #print(data_y.shape)
    elif data_name == 'Wine_original':
        file_name = 'data/' + data_name + '.csv'
        data_x = np.loadtxt(file_name,
                            delimiter=",",
                            usecols=(range(12)),
                            skiprows=1)
        data_y = np.loadtxt(file_name, delimiter=",", usecols=(13), skiprows=1)
        usecols = range(12)
    elif data_name == 'mnist':
        (data_x, data_y), _ = mnist.load_data()
        data_x = np.reshape(np.asarray(data_x), [60000, 28 * 28]).astype(float)
        data_y = np.reshape(np.asarray(data_y), [60000, 1]).astype(float)
    elif data_name == 'vals_test_df':
        train_data_name = "vals_train_df.csv"
        file_name = data_name + '.csv'
        data_x = np.loadtxt(file_name,
                            delimiter=",",
                            usecols=(range(1, 10)),
                            skiprows=1)
        data_y = np.loadtxt(file_name, delimiter=",", usecols=(0), skiprows=1)
    elif data_name == 'vals_test_df_test_type1':
        train_data_name = "vals_train_df_test_type1.csv"
        file_name = data_name + '.csv'
        data_x = np.loadtxt(file_name,
                            delimiter=",",
                            usecols=(range(1, 10)),
                            skiprows=1)
        data_y = np.loadtxt(file_name, delimiter=",", usecols=(0), skiprows=1)
    elif data_name == 'vals_test_df_test_type2':
        train_data_name = "vals_train_df_test_type2.csv"
        file_name = data_name + '.csv'
        data_x = np.loadtxt(file_name,
                            delimiter=",",
                            usecols=(range(1, 10)),
                            skiprows=1)
        data_y = np.loadtxt(file_name, delimiter=",", usecols=(0), skiprows=1)
    # Parameters
    no, dim = data_x.shape
    print(data_x.shape)

    # Introduce missing data
    #create missing file name
    filename = "{dname}_Missing_rate_{value}_Index.csv".format(dname=data_name,
                                                               value=miss_rate)
    missing_file_exist = path.exists(filename)
    if missing_file_exist:
        # Reuse a previously generated mask (assumes the 9-feature vals_test_df layout)
        data_m = np.loadtxt(filename,
                            delimiter=",",
                            usecols=(range(9)),
                            skiprows=1)
        #print(data_m.shape)
    else:
        data_m = binary_sampler(1 - miss_rate, no, dim)
        #print(data_m.shape)
    #print("datax", data_x.shape)
    miss_data_x = data_x.copy()
    miss_data_x[data_m == 0] = np.nan
    if train_data_name is not None:
        # Stack the companion training data on top of the corrupted data
        data_train_x = np.loadtxt(train_data_name,
                                  delimiter=",",
                                  usecols=(range(1, 10)),
                                  skiprows=1)
        miss_data_x_new = np.concatenate([data_train_x, miss_data_x])
    else:
        miss_data_x_new = miss_data_x

    ### Saving the indexs
    missing_index = pd.DataFrame(data_m)
    missing_index.to_csv(filename, index=False)

    data_x_x = pd.DataFrame(miss_data_x)
    data_y_y = pd.DataFrame(data_y)
    data_x_s = pd.concat([data_y_y, data_x_x], ignore_index=True, axis=1)
    data_x_s.to_csv('{dbname}_generated.csv'.format(dbname=data_name),
                    index=False)

    # print("data_x", data_x.shape)
    # print("miss_data_x_new", miss_data_x_new.shape)
    # print("data_m", data_m.shape)
    # print("data_y", data_y.shape)

    return data_x, miss_data_x_new, data_m, data_y
Example #9
def cph(data_x, cph_parameters, data_image):
    '''Impute missing values in data_x

  Args:
    - data_x: original data with missing values
    - data_image: image-shaped view of the data fed to the convolutional front end
    - cph_parameters: CPH network parameters:
      - batch_size: Batch size
      - hint_rate: Hint rate
      - alpha: Hyperparameter
      - iterations: Iterations

  Returns:
    - imputed_data: imputed data
  '''
    # Fix random seeds for reproducibility
    seed = 25
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)
    # Define mask matrix
    data_m = 1 - np.isnan(data_x)

    # System parameters
    batch_size = cph_parameters['batch_size']
    hint_rate = cph_parameters['hint_rate']
    alpha = cph_parameters['alpha']
    iterations = cph_parameters['iterations']

    # Other parameters
    no, dim = data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)
    #print(h_dim)

    # Normalization (this example keeps the raw values; only NaNs are replaced with 0)
    norm_data, norm_parameters = normalization(data_x)
    # norm_data_x = np.nan_to_num(norm_data, nan=0.0)
    norm_data_x = np.nan_to_num(data_x, nan=0.0)

    ## CPH architecture
    # Input placeholders
    # Image-shaped input; 483 is the hard-coded number of rows of this particular dataset
    X_pre = tf.placeholder(tf.float32, shape=[1, 483, dim, 3])
    # Data vector
    #X = tf.placeholder(tf.float32, shape = [None, dim])
    # Mask vector
    M = tf.placeholder(tf.float32, shape=[None, dim])
    # Hint vector
    H = tf.placeholder(tf.float32, shape=[None, dim])

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape=[h_dim]))

    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf.Variable(tf.zeros(shape=[h_dim]))

    D_W3 = tf.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf.Variable(tf.zeros(shape=[dim]))  # Multi-variate outputs

    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    #Generator variables
    conv_filter_w1 = tf.Variable(tf.random_normal([1, 4, 3, 3]))
    conv_filter_b1 = tf.Variable(tf.random_normal([3]))

    conv_filter_w2 = tf.Variable(tf.random_normal([1, 4, 3, 1]))
    conv_filter_b2 = tf.Variable(tf.random_normal([1]))
    # Data + Mask as inputs (Random noise is in missing components)
    G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]))

    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]))

    G_W3 = tf.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf.Variable(tf.zeros(shape=[dim]))

    theta_G = [
        G_W1, G_W2, G_W3, G_b1, G_b2, G_b3, conv_filter_w1, conv_filter_b1,
        conv_filter_w2, conv_filter_b2
    ]

    ## CPH functions
    # CNN + Generator
    def generator(x, m):
        relu_feature_maps1 = tf.nn.relu( \
          tf.nn.conv2d(x, conv_filter_w1, strides=[1, 1, 1, 1], padding='SAME') + conv_filter_b1)
        max_pool1 = tf.nn.max_pool(relu_feature_maps1,
                                   ksize=[1, 1, 4, 1],
                                   strides=[1, 1, 1, 1],
                                   padding='SAME')

        relu_feature_maps2 = tf.nn.relu( \
          tf.nn.conv2d(max_pool1, conv_filter_w2, strides=[1, 1, 1, 1], padding='SAME') + conv_filter_b2)
        max_pool2 = tf.nn.max_pool(relu_feature_maps2,
                                   ksize=[1, 1, 4, 1],
                                   strides=[1, 1, 1, 1],
                                   padding='SAME')

        x2 = tf.reshape(max_pool2, [483, dim])

        # Concatenate Mask and Data
        inputs = tf.concat(values=[x2, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    ## CPH structure
    # Generator
    G_sample = generator(X_pre, M)
    X2 = X_pre[0, :, :, 0]
    # Combine with observed data
    Hat_X = X2 * M + G_sample * (1 - M)

    # Discriminator
    D_prob = discriminator(Hat_X, H)

    ## CPH loss
    D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) \
                                  + (1-M) * tf.log(1. - D_prob + 1e-8))

    G_loss_temp = -tf.reduce_mean((1 - M) * tf.log(D_prob + 1e-8))

    MSE_loss = \
    tf.reduce_mean((M * X2 - M * G_sample)**2) / tf.reduce_mean(M)

    D_loss = D_loss_temp
    G_loss = G_loss_temp + alpha * MSE_loss

    ## CPH solver
    D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Start Iterations
    for it in tqdm(range(iterations)):

        # Sample batch
        batch_idx = sample_batch_index(no, batch_size)
        #print(len(batch_idx))
        image_mb = data_image[:, batch_idx, :, :]
        X_mb = norm_data_x[batch_idx, :]
        M_mb = data_m[batch_idx, :]
        # Sample random vectors
        Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
        # Sample hint vectors
        H_mb_temp = binary_sampler(hint_rate, batch_size, dim)

        H_mb = M_mb * H_mb_temp
        # Combine random vectors with observed vectors
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
        image_mb[0, :, :, 0] = X_mb

        _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                                  feed_dict={
                                      M: M_mb,
                                      X_pre: image_mb,
                                      H: H_mb
                                  })
        _, G_loss_curr, MSE_loss_curr = \
        sess.run([G_solver, G_loss_temp, MSE_loss],
                 feed_dict = {X_pre: image_mb, M: M_mb, H: H_mb})

    ## Return imputed data
    Z_mb = uniform_sampler(0, 0.01, no, dim)
    M_mb = data_m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
    image_mb = data_image
    image_mb[0, :, :, 0] = X_mb

    imputed_data = sess.run([G_sample], feed_dict={
        X_pre: image_mb,
        M: M_mb
    })[0]

    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data

    # Renormalization
    #imputed_data = renormalization(imputed_data, norm_parameters)

    # Rounding
    imputed_data = rounding(imputed_data, data_x)

    return imputed_data
Example #10
def gain(data_x, feature_name, onehotencoder, ori_data_dim, gain_parameters):
    '''Impute missing values in data_x
  
  Args:
    - data_x: original data with missing values
    - feature_name: feature namelist of original data
    - onehotencoder: onehotencoder of this data
    - ori_data_dim: dimensions of original data    
    - gain_parameters: GAIN network parameters:
      - data_name: the file name of dataset
      - batch_size: Batch size
      - hint_rate: Hint rate
      - alpha: Hyperparameter
      - iterations: Iterations
      - onehot: the number of features to one-hot encode (starting from the first feature)
      - predict: whether prediction mode is enabled
      
  Returns:
    - imputed_data: imputed data
  '''
    # Define mask matrix
    data_m = 1 - np.isnan(data_x)

    # System parameters
    data_name = gain_parameters['data_name']
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']
    onehot = gain_parameters['onehot']
    predict = gain_parameters['predict']

    # Model Path
    model_path = 'model/' + data_name

    # Other parameters
    no, dim = data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(data_x)
    norm_data_x = np.nan_to_num(norm_data, nan=0.0)

    ## GAIN architecture
    # Input placeholders
    # Data vector
    X = tf.placeholder(tf.float32, shape=[None, dim], name='X')
    # Mask vector
    M = tf.placeholder(tf.float32, shape=[None, dim], name='M')
    # Hint vector
    H = tf.placeholder(tf.float32, shape=[None, dim], name='H')

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]),
                       name='D_W1')  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape=[h_dim]), name='D_b1')

    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]), name='D_W2')
    D_b2 = tf.Variable(tf.zeros(shape=[h_dim]), name='D_b2')

    D_W3 = tf.Variable(xavier_init([h_dim, dim]), name='D_W3')
    D_b3 = tf.Variable(tf.zeros(shape=[dim]),
                       name='D_b3')  # Multi-variate outputs

    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    #Generator variables
    # Data + Mask as inputs (Random noise is in missing components)
    G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]), name='G_W1')
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]), name='G_b1')

    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]), name='G_W2')
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]), name='G_b2')

    G_W3 = tf.Variable(xavier_init([h_dim, dim]), name='G_W3')
    G_b3 = tf.Variable(tf.zeros(shape=[dim]), name='G_b3')

    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    ## GAIN functions
    # Generator
    def generator(x, m):
        # Concatenate Mask and Data
        inputs = tf.concat(values=[x, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    ## GAIN structure
    # Generator
    G_sample = generator(X, M)

    # Combine with observed data
    Hat_X = X * M + G_sample * (1 - M)

    # Discriminator
    D_prob = discriminator(Hat_X, H)

    ## GAIN loss
    D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) \
                                  + (1-M) * tf.log(1. - D_prob + 1e-8))

    G_loss_temp = -tf.reduce_mean((1 - M) * tf.log(D_prob + 1e-8))

    MSE_loss = \
    tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)

    D_loss = D_loss_temp
    G_loss = G_loss_temp + alpha * MSE_loss

    ## GAIN solver
    D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    sess = tf.Session()
    saver = tf.train.Saver()
    if predict is True and os.path.exists(model_path + '.ckpt.meta'):
        print("Model Restore")
        saver.restore(sess, model_path + '.ckpt')
    else:
        sess.run(tf.global_variables_initializer())

    # Start Iterations
    for it in tqdm(range(iterations)):

        # Sample batch
        batch_idx = sample_batch_index(no, batch_size)
        X_mb = norm_data_x[batch_idx, :]
        M_mb = data_m[batch_idx, :]
        # Sample random vectors
        Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
        # Sample hint vectors
        H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
        H_mb = M_mb * H_mb_temp

        # Combine random vectors with observed vectors
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

        _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                                  feed_dict={
                                      M: M_mb,
                                      X: X_mb,
                                      H: H_mb
                                  })
        _, G_loss_curr, MSE_loss_curr = \
        sess.run([G_solver, G_loss_temp, MSE_loss],
                 feed_dict = {X: X_mb, M: M_mb, H: H_mb})
    if predict is False:
        save_path = saver.save(sess, model_path + '.ckpt')

    ## Return imputed data
    Z_mb = uniform_sampler(0, 0.01, no, dim)
    M_mb = data_m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

    imputed_data = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]

    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data

    # Renormalization
    imputed_data = renormalization(imputed_data, norm_parameters)

    # Rounding
    imputed_data = rounding(imputed_data, data_x)

    # Reverse encoding
    if onehot > 0:
        imputed_data = reverse_encoding(imputed_data, feature_name,
                                        onehotencoder, onehot, ori_data_dim)

    return imputed_data
Пример #11
0
def gain(data_x, gain_parameters):
    '''Impute missing values in data_x
  
  Args:
    - data_x: original data with missing values
    - gain_parameters: GAIN network parameters:
      - batch_size: Batch size
      - hint_rate: Hint rate
      - alpha: Hyperparameter
      - iterations: Iterations
      
  Returns:
    - imputed_data: imputed data
  '''
    # Define mask matrix
    data_m = 1 - np.isnan(data_x)

    # System parameters
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']

    # Other parameters
    no, dim = data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(data_x)
    norm_data_x = np.nan_to_num(norm_data, nan=0.0)

    ## GAIN architecture
    # Input placeholders
    # Data vector
    X = tf.placeholder(tf.float32, shape=[None, dim])
    # Mask vector
    M = tf.placeholder(tf.float32, shape=[None, dim])
    # Hint vector
    H = tf.placeholder(tf.float32, shape=[None, dim])

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape=[h_dim]))

    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf.Variable(tf.zeros(shape=[h_dim]))

    D_W3 = tf.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf.Variable(tf.zeros(shape=[dim]))  # Multi-variate outputs

    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    #Generator variables
    # Data + Mask as inputs (Random noise is in missing components)
    G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]))

    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]))

    G_W3 = tf.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf.Variable(tf.zeros(shape=[dim]))

    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    ## GAIN functions
    # Generator
    def generator(x, m):
        # Concatenate Mask and Data
        inputs = tf.concat(values=[x, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    ## GAIN structure
    # Generator
    G_sample = generator(X, M)

    # Combine with observed data
    Hat_X = X * M + G_sample * (1 - M)

    # Discriminator
    D_prob = discriminator(Hat_X, H)

    ## GAIN loss
    D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) \
                                  + (1-M) * tf.log(1. - D_prob + 1e-8))

    G_loss_temp = -tf.reduce_mean((1 - M) * tf.log(D_prob + 1e-8))

    MSE_loss = \
    tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)

    D_loss = D_loss_temp
    G_loss = G_loss_temp + alpha * MSE_loss

    ## GAIN solver
    D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Start Iterations
    for it in tqdm(range(iterations)):

        # Sample batch
        batch_idx = sample_batch_index(no, batch_size)
        X_mb = norm_data_x[batch_idx, :]
        M_mb = data_m[batch_idx, :]
        # Sample random vectors
        Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
        # Sample hint vectors
        H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
        H_mb = M_mb * H_mb_temp

        # Combine random vectors with observed vectors
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

        _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                                  feed_dict={
                                      M: M_mb,
                                      X: X_mb,
                                      H: H_mb
                                  })
        _, G_loss_curr, MSE_loss_curr = \
        sess.run([G_solver, G_loss_temp, MSE_loss],
                 feed_dict = {X: X_mb, M: M_mb, H: H_mb})

    ## Return imputed data
    Z_mb = uniform_sampler(0, 0.01, no, dim)
    M_mb = data_m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

    imputed_data = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]

    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data

    # Renormalization
    imputed_data = renormalization(imputed_data, norm_parameters)

    # Rounding
    imputed_data = rounding(imputed_data, data_x)

    return imputed_data
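A minimal usage sketch for this gain() routine; the parameter values follow the defaults of the original GAIN paper and reference code, and data_loader / rmse_loss are assumed to be importable (e.g. the loader from Example #3 and the RMSE helper from the GAIN utilities).

gain_parameters = {'batch_size': 128,
                   'hint_rate': 0.9,
                   'alpha': 100,
                   'iterations': 10000}

data_x, miss_data_x, data_m = data_loader('spam', 0.2)
imputed_data_x = gain(miss_data_x, gain_parameters)
print('imputation RMSE:', rmse_loss(data_x, imputed_data_x, data_m))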
Example #12
def PC_GAIN(incomplete_data_x, gain_parameters, data_m):
    '''Impute missing values in incomplete_data_x

    Args:
    - incomplete_data_x: original data with missing values
    - data_m: indicator matrix for missing components
    - gain_parameters: PC_GAIN network parameters:
        - batch_size: Batch size, e.g. 64
        - hint_rate: Hint rate, e.g. 0.9
        - alpha: Hyperparameter, e.g. 200
        - beta: Hyperparameter, e.g. 20
        - lambda_: Hyperparameter, e.g. 0.2
        - k: Number of clusters / pseudo-classes, e.g. 4
        - iterations: Iterations, e.g. 10000
        - cluster_species: clustering method, one of 'KM', 'SC', 'AC', 'KMPP'

    Returns:
    - imputed_data: imputed data
    '''
    # System parameters
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    beta = gain_parameters['beta']
    lambda_ = gain_parameters['lambda_']
    k = gain_parameters['k']
    iterations = gain_parameters['iterations']
    cluster_species = gain_parameters['cluster_species']
    
    # Other parameters
    no, dim = incomplete_data_x.shape
    # Hidden state dimensions
    h_dim = int(dim)
    # Normalization
    norm_data , norm_parameters = normalization(incomplete_data_x)
    norm_data_x = np.nan_to_num(norm_data, nan=0.0)


    ## PC_GAIN architecture   
    X = tf.placeholder(tf.float32, shape = [None, dim])
    M = tf.placeholder(tf.float32, shape = [None, dim])
    H = tf.placeholder(tf.float32, shape = [None, dim])

    Z = tf.placeholder(tf.float32, shape = [None, dim])
    Y = tf.placeholder(tf.float32, shape = [None, k])
    
    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim*2, h_dim])) # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape = [h_dim]))
    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf.Variable(tf.zeros(shape = [h_dim]))
    D_W3 = tf.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf.Variable(tf.zeros(shape = [dim]))  # Multi-variate outputs
    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    #Generator variables
    # Data + Mask as inputs (Random noise is in missing components)
    G_W1 = tf.Variable(xavier_init([dim*2, h_dim]))  
    G_b1 = tf.Variable(tf.zeros(shape = [h_dim]))
    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf.Variable(tf.zeros(shape = [h_dim]))
    G_W3 = tf.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf.Variable(tf.zeros(shape = [dim]))
    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    C_W1 = tf.Variable(xavier_init([dim, h_dim]))
    C_b1 = tf.Variable(tf.zeros(shape = [h_dim]))
    C_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    C_b2 = tf.Variable(tf.zeros(shape = [h_dim]))
    C_W3 = tf.Variable(xavier_init([h_dim, k]))
    C_b3 = tf.Variable(tf.zeros(shape = [k]))  # Classifier output layer
    theta_C = [C_W1, C_b1, C_W2, C_b2, C_W3, C_b3]
  
    ## PC_GAIN functions
    # Generator
    def generator(x,m):
        # Concatenate Mask and Data
        inputs = tf.concat(values = [x, m], axis = 1) 
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)   
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3) 
        return G_prob
      
    # Discriminator
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values = [x, h], axis = 1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob, D_logit
    
    # Classifier (the neural-network classifier mentioned in the paper)
    def classer(feature):
        C_h1 = tf.nn.relu(tf.matmul(feature, C_W1) + C_b1)
        C_h2 = tf.nn.relu(tf.matmul(C_h1, C_W2) + C_b2)
        C_h3 = tf.matmul(C_h2, C_W3) + C_b3
        C_prob = tf.nn.softmax(C_h3)
        return C_prob  
  
    ## PC_GAIN structure
    # Generator
    G_sample = generator(X, M)

    # Combine with observed data
    Hat_X = X * M + G_sample * (1-M)

    # Discriminator
    D_prob, D_logit = discriminator(Hat_X, H)
    
    ## PC_GAIN loss
    D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) + (1-M) * tf.log(1. - D_prob + 1e-8))
    G_loss_temp = -tf.reduce_mean((1-M) * tf.log(D_prob + 1e-8))
    G_loss_with_C = -tf.reduce_mean(Y * tf.log(Y + 1e-8))
    MSE_loss = tf.reduce_mean((M * X - M * G_sample) * (M * X - M * G_sample)) / tf.reduce_mean(M)

    D_loss = D_loss_temp
    G_loss_pre = G_loss_temp + alpha * MSE_loss
    G_loss = G_loss_temp + alpha * MSE_loss + beta * G_loss_with_C
    
    ## PC_GAIN solver
    D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver_pre = tf.train.AdamOptimizer().minimize(G_loss_pre, var_list=theta_G)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)
    
    ## Iterations
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        
        ## Select pre-training data: the rows with the fewest missing entries
        loss_rate = []
        for i in range(no):
            index = 0
            for j in range(dim):
                if data_m[i,j] == 0:
                    index = index + 1
            loss_rate.append([index , i])
        loss_rate = sorted(loss_rate,key=(lambda x:x[0]))
        no_x_L = int(no * lambda_) 
        index_x_L = []
        for i in range(no_x_L):
            index_x_L.append(loss_rate[i][1])
        norm_data_x_L = norm_data_x[index_x_L, :]
        data_m_L = data_m[index_x_L, :]
        
        ##Pre-training
        print('...Pre-training')
        for it in tqdm(range(int(iterations * 0.7))):
            batch_idx = sample_batch_index(no_x_L, batch_size)
            X_mb = norm_data_x_L[batch_idx, :]
            M_mb = data_m_L[batch_idx, :]
            Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
            H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
            H_mb = M_mb * H_mb_temp
            X_mb = M_mb * X_mb + (1-M_mb) * Z_mb
            _, D_loss_curr, D_logit_curr, D_prob_curr = sess.run([D_solver, D_loss_temp, D_logit, D_prob], feed_dict = {M: M_mb, X: X_mb, H:H_mb})
            _, G_loss_curr, MSE_loss_curr = sess.run([G_solver_pre, G_loss_temp, MSE_loss], feed_dict = {X: X_mb, M: M_mb, H:H_mb})
            
        Z_mb = uniform_sampler(0, 0.01, no_x_L, dim) 
        M_mb = data_m_L
        X_mb = norm_data_x_L
        X_mb = M_mb * X_mb + (1-M_mb) * Z_mb 
        imputed_data_L = sess.run([G_sample], feed_dict = {X: X_mb, M: M_mb})[0]
        imputed_data_L = data_m_L * norm_data_x_L + (1 - data_m_L) * imputed_data_L
        
        ## Select different clustering methods
        if cluster_species == 'KM':
            data_c , data_class = KM(imputed_data_L, k)
        elif cluster_species == 'SC':
            data_c , data_class = SC(imputed_data_L, k)
        elif cluster_species == 'AC':
            data_c , data_class = AC(imputed_data_L, k)
        elif cluster_species == 'KMPP': 
            data_c , data_class = KMPP(imputed_data_L, k)         
        else:
            exit('Unknown clustering method: ' + cluster_species)
        
        ## Pseudo-label training multi-classification SVM
        ## You can also choose other classifiers, 
        ## such as the neural network classifier mentioned in the paper
        coder = preprocessing.OneHotEncoder()
        model = svm.SVC(kernel="linear", decision_function_shape="ovo")   
        coder.fit(data_class.reshape(-1,1))
        model.fit(imputed_data_L, data_class)
        
        ## Update the generator G and the discriminator D
        ## To avoid the effects of pre-training, 
        ## you can also choose to reinitialize the generator parameters
        for it in tqdm(range(iterations)):
            batch_idx = sample_batch_index(no, batch_size)
            X_mb = norm_data_x[batch_idx, :]
            M_mb = data_m[batch_idx, :]
            Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
            H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
            H_mb = M_mb * H_mb_temp
            X_mb = M_mb * X_mb + (1-M_mb) * Z_mb
            _, D_loss_curr, D_logit_curr, D_prob_curr = sess.run([D_solver, D_loss_temp, D_logit, D_prob], feed_dict = {M: M_mb, X: X_mb, H:H_mb})
            
            ## Introducing pseudo label supervision
            Hat_X_curr = sess.run(Hat_X, feed_dict = {X: X_mb, M: M_mb, H:H_mb})
            y_pred = model.predict(Hat_X_curr)
            sample_prob = coder.transform(y_pred.reshape(-1,1)).toarray()  
            
            _, G_loss_curr, MSE_loss_curr , G_loss_with_C_curr = sess.run([G_solver, G_loss_temp, MSE_loss, G_loss_with_C], feed_dict = {X: X_mb, M: M_mb, H:H_mb , Y:sample_prob})
            
        ## Return imputed data 
        Z_mb = uniform_sampler(0, 0.01, no, dim) 
        M_mb = data_m
        X_mb = norm_data_x          
        X_mb = M_mb * X_mb + (1-M_mb) * Z_mb 
        imputed_data = sess.run([G_sample], feed_dict = {X: X_mb, M: M_mb})[0]
        imputed_data = data_m * norm_data_x + (1-data_m) * imputed_data
        imputed_data = renormalization(imputed_data, norm_parameters)
    return imputed_data
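The clustering helpers KM, SC, AC and KMPP used above are not shown. Below is a minimal sketch of one plausible KM implementation, assuming it returns the cluster centres and the per-sample labels (only the labels are used by the pseudo-label step above).

from sklearn.cluster import KMeans

def KM(data, k):
    """k-means clustering; returns (cluster centres, per-sample cluster labels)."""
    km = KMeans(n_clusters=k, n_init=10).fit(data)
    return km.cluster_centers_, km.labels_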

    
    
      
Example #13
def gain(data_x, gain_parameters, ori_data_x, train_index, test_index, mechanism):
    '''Impute missing values in data_x (PyTorch variant with two discriminators)

    Args:
      - data_x: original data with missing values
      - gain_parameters: GAIN network parameters:
        - batch_size: Batch size
        - hint_rate: Hint rate
        - alpha: Hyper-parameter
        - iterations: Iterations
      - ori_data_x: complete data used for evaluation
      - train_index, test_index: row indices of the train/test split
      - mechanism: missingness mechanism ('mar' switches to MAR-style hints)

    Note: instead of returning the imputed data, this variant evaluates the
    generator with test() on the held-out rows after training.
    '''
    # Define mask matrix
    data_m = 1 - np.isnan(data_x)

    # System parameters
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']

    # Other parameters
    no, dim = data_x.shape

    no_train = len(train_index)

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(data_x)
    norm_data_x = np.nan_to_num(norm_data, nan=0)

    # PyTorch networks: one generator and two discriminators
    generator = Generator(dim, h_dim)
    discriminator = Discriminator(dim, h_dim)
    discriminator2 = Discriminator(dim, h_dim)

    # Optimizers
    generator_optimizer = torch.optim.Adam(generator.parameters(), lr=0.0001, betas=(0.5, 0.999))
    discriminator_optimizer = torch.optim.SGD(discriminator.parameters(), lr=0.0001)
    discriminator2_optimizer = torch.optim.SGD(discriminator2.parameters(), lr=0.0001)
    for i in tqdm(range(iterations), desc='pytorch'):

        # if i % 2000 == 0 and i != 0:
        #     test(data_m, data_x, dim, generator, no, norm_data_x, norm_parameters, ori_data_x, test_index)

        # Sample batch
        batch_idx = sample_batch_index(no_train, batch_size)
        X_mb = norm_data_x[train_index][batch_idx, :]
        M_mb = data_m[train_index][batch_idx, :]
        # Sample random vectors
        Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
        # Sample hint vectors

        if mechanism == 'mar':
            H_mb = hint_for_mar(hint_rate, M_mb)  # np.logical_or(1 - h, 1 - mask)
            H_mb_2 = hint_for_mar(hint_rate, M_mb)  # np.logical_or(1 - h, 1 - mask)
        else:
            H_mb = M_mb * binary_sampler(hint_rate, batch_size, dim)
            H_mb_2 = M_mb * binary_sampler(hint_rate, batch_size, dim)

        # Combine random vectors with observed vectors
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

        X_mb = torch.Tensor(X_mb)
        M_mb = torch.Tensor(M_mb)
        H_mb = torch.Tensor(H_mb)
        H_mb_2 = torch.Tensor(H_mb_2)

        G_sample = generator(X_mb, M_mb)
        Hat_X = X_mb * M_mb + G_sample * (1 - M_mb)
        D_prob = discriminator(Hat_X, H_mb)
        D_prob2 = discriminator2(Hat_X, H_mb_2)

        d_loss_value = d_loss(M_mb, D_prob)
        d_loss_value2 = d_loss(M_mb, D_prob2)
        # y = torch.rand(1)
        g_loss_value = g_loss(M_mb, 0.5 * D_prob + 0.5 * D_prob2, alpha, X_mb, G_sample)

        discriminator2_optimizer.zero_grad()
        discriminator_optimizer.zero_grad()
        generator_optimizer.zero_grad()

        g_loss_value.backward(retain_graph=True)
        d_loss_value.backward(retain_graph=True)
        d_loss_value2.backward(retain_graph=True)

        generator_optimizer.step()
        discriminator_optimizer.step()
        discriminator2_optimizer.step()

    test(data_m, data_x, dim, generator, no, norm_data_x, norm_parameters, ori_data_x, test_index)
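This example relies on Generator, Discriminator, d_loss, g_loss, hint_for_mar and test helpers that are defined elsewhere. Below is a minimal sketch of what the networks and losses might look like, following the standard GAIN formulation; the hint_for_mar and test helpers are left out.

import torch
import torch.nn as nn

class Generator(nn.Module):
    """One plausible GAIN generator: takes (data, mask) and outputs values in [0, 1]."""
    def __init__(self, dim, h_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim * 2, h_dim), nn.ReLU(),
            nn.Linear(h_dim, h_dim), nn.ReLU(),
            nn.Linear(h_dim, dim), nn.Sigmoid())

    def forward(self, x, m):
        return self.net(torch.cat([x, m], dim=1))

class Discriminator(nn.Module):
    """One plausible GAIN discriminator: takes (data, hint) and outputs per-entry probabilities."""
    def __init__(self, dim, h_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim * 2, h_dim), nn.ReLU(),
            nn.Linear(h_dim, h_dim), nn.ReLU(),
            nn.Linear(h_dim, dim), nn.Sigmoid())

    def forward(self, x, h):
        return self.net(torch.cat([x, h], dim=1))

def d_loss(m, d_prob):
    # Standard GAIN discriminator loss (cross-entropy against the mask)
    return -torch.mean(m * torch.log(d_prob + 1e-8)
                       + (1 - m) * torch.log(1. - d_prob + 1e-8))

def g_loss(m, d_prob, alpha, x, g_sample):
    # Adversarial term on missing entries plus reconstruction MSE on observed entries
    adv = -torch.mean((1 - m) * torch.log(d_prob + 1e-8))
    mse = torch.mean((m * x - m * g_sample) ** 2) / torch.mean(m)
    return adv + alpha * mse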
Example #14
def gain(data_x, gain_parameters):
    '''Impute missing values in data_x

    Args:
    - data_x: original data with missing values
    - gain_parameters: GAIN network parameters:
        - batch_size: Batch size
        - hint_rate: Hint rate
        - alpha: Hyperparameter
        - iterations: Iterations
        
    Returns:
    - imputed_data: imputed data
    '''
    # Define mask matrix
    data_m = (1 - np.isnan(data_x)).astype(float)

    # System parameters
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']

    # Other parameters
    no, dim = data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(data_x)
    norm_data_x = np.nan_to_num(norm_data, nan=0.0)

    # parameter initialization
    X = tf.convert_to_tensor(norm_data_x)
    X = tf.dtypes.cast(X, tf.float32)
    M = tf.convert_to_tensor(data_m)
    M = tf.dtypes.cast(M, tf.float32)
    X_input = tf.concat(values=[X, M], axis=1)

    ## GAIN architecture
    # Generator
    class Generator(tf.keras.Model):
        def __init__(self):
            super().__init__()
            self.flatten = layers.Flatten(input_shape=[dim * 2])
            self.dense1 = layers.Dense(h_dim, activation='relu')
            self.dense2 = layers.Dense(h_dim, activation='relu')
            self.dense_output = layers.Dense(dim, activation='sigmoid')
            return

        def call(self, inputs, training=None):
            x = self.flatten(inputs)
            x = self.dense1(x)
            x = self.dense2(x)
            x = self.dense_output(x)
            return x

    # Discriminator
    class Discriminator(tf.keras.Model):
        def __init__(self):
            super().__init__()
            self.flatten = layers.Flatten(input_shape=[dim * 2])
            self.dense1 = layers.Dense(h_dim, activation='relu')
            self.dense2 = layers.Dense(h_dim, activation='relu')
            self.dense_output = layers.Dense(dim, activation='sigmoid')
            return

        def call(self, inputs, training=None):
            x = self.flatten(inputs)
            x = self.dense1(x)
            x = self.dense2(x)
            x = self.dense_output(x)
            return x

    ## GAIN loss
    # Generator
    def generator_loss(generator, discriminator, x, m):
        generator.trainable = True
        discriminator.trainable = False
        G_input = tf.concat(values=[x, m], axis=1)
        G_sample = generator(G_input)
        MSE_loss = tf.reduce_mean(
            (m * x - m * G_sample)**2) / tf.reduce_mean(m)
        D_input = tf.concat(values=[G_sample, m], axis=1)
        D_prob = discriminator(D_input)
        G_loss_tmp = -tf.reduce_mean((1 - m) * tf.math.log(D_prob + 1e-8))
        return G_loss_tmp + alpha * MSE_loss

    # Discriminator
    def discriminator_loss(generator, discriminator, x, m, h):
        generator.trainable = False
        discriminator.trainable = True
        G_input = tf.concat(values=[x, m], axis=1)
        G_sample = generator(G_input)
        x_hat = x * m + G_sample * (1 - m)
        D_input = tf.concat(values=[x_hat, h], axis=1)
        D_prob = discriminator(D_input)
        return -tf.reduce_mean(m * tf.math.log(D_prob + 1e-8) \
                + (1-m) * tf.math.log(1. - D_prob + 1e-8))

    # Build
    generator = Generator()
    generator.build(input_shape=(None, 2 * dim))
    g_optimizer = tf.keras.optimizers.Adam()
    discriminator = Discriminator()
    discriminator.build(input_shape=(None, 2 * dim))
    d_optimizer = tf.keras.optimizers.Adam()

    # Training
    one_tensor = tf.constant(1., shape=(batch_size, dim), dtype=float)

    for _ in tqdm(range(iterations)):
        # Sample batch
        batch_idx = sample_batch_index(no, batch_size)
        X_mb = tf.gather(X, batch_idx)
        M_mb = tf.gather(M, batch_idx)
        Z_mb = tf.convert_to_tensor(uniform_sampler(0, 0.01, batch_size, dim),
                                    dtype=float)
        H_mb_tmp = tf.convert_to_tensor(binary_sampler(hint_rate, batch_size,
                                                       dim),
                                        dtype=float)
        H_mb = tf.math.multiply(M_mb, H_mb_tmp)

        # Combine random vectors with observed vectors
        # X_mb = M_mb * X_mb + (1-M_mb) * Z_mb
        X_mb = tf.math.add(tf.math.multiply(M_mb, X_mb), \
                tf.math.multiply(tf.math.subtract(one_tensor, M_mb), Z_mb))

        # training Discriminator
        with tf.GradientTape() as tape:
            d_loss = discriminator_loss(generator, discriminator, X_mb, M_mb,
                                        H_mb)
        grads = tape.gradient(d_loss, discriminator.trainable_variables)
        d_optimizer.apply_gradients(
            zip(grads, discriminator.trainable_variables))

        # training Generator
        with tf.GradientTape() as tape:
            g_loss = generator_loss(generator, discriminator, X_mb, M_mb)
        grads = tape.gradient(g_loss, generator.trainable_variables)
        g_optimizer.apply_gradients(zip(grads, generator.trainable_variables))

    ## Return imputed data
    imputed_data = np.array([]).reshape(0, dim)
    train_data = tf.data.Dataset.from_tensor_slices(X_input).batch(batch_size)
    train_data_iter = iter(train_data)
    while True:
        try:
            batch = next(train_data_iter)
        except StopIteration:
            break
        X_tmp = generator(batch).numpy()
        imputed_data = np.vstack([imputed_data, X_tmp])

    # Renormalization
    imputed_data = renormalization(imputed_data, norm_parameters)

    # Recovery
    imputed_data = data_m * np.nan_to_num(data_x) + (1 - data_m) * imputed_data

    # Rounding
    imputed_data = rounding(imputed_data, data_x)

    return imputed_data
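The normalization / renormalization helpers assumed by most of these examples are not shown either; a minimal sketch of one plausible column-wise min-max implementation:

import numpy as np

def normalization(data, parameters=None):
    """Column-wise min-max scaling to roughly [0, 1]; NaNs are ignored when computing ranges."""
    norm_data = data.copy().astype(float)
    if parameters is None:
        min_val = np.nanmin(norm_data, axis=0)
        max_val = np.nanmax(norm_data, axis=0)
        parameters = {'min_val': min_val, 'max_val': max_val}
    else:
        min_val, max_val = parameters['min_val'], parameters['max_val']
    norm_data = (norm_data - min_val) / (max_val - min_val + 1e-6)
    return norm_data, parameters

def renormalization(norm_data, parameters):
    """Invert the min-max scaling applied by normalization()."""
    min_val, max_val = parameters['min_val'], parameters['max_val']
    return norm_data * (max_val - min_val + 1e-6) + min_val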
Example #15
import os

import numpy as np
import pandas as pd

from utils import sample_batch_index, binary_sampler
from tqdm import trange

if __name__ == '__main__':
    # Load data
    file_name = 'data/house.csv'
    house_df = pd.read_csv(file_name)
    no, dim = house_df.shape

    data_x = house_df.values.astype(np.float32)
    num_samples = 200

    miss_rate = 0.3
    # Make sure the output directories exist before writing the samples
    os.makedirs('./samples/complete', exist_ok=True)
    os.makedirs('./samples/MCAR', exist_ok=True)
    for i in trange(num_samples):
        # random samples
        sample_idx = sample_batch_index(no, 10000)
        data_x_i = data_x[sample_idx, :]
        no_i, dim_i = data_x_i.shape
        np.savetxt("./samples/complete/sample_{}.csv".format(i),
                   data_x_i,
                   delimiter=",")

        # Introduce missing data
        data_m = binary_sampler(1 - miss_rate, no_i, dim_i)
        miss_data_x = data_x_i.copy()
        miss_data_x[data_m == 0] = np.nan
        np.savetxt("./samples/MCAR/sample_{}.csv".format(i),
                   miss_data_x,
                   delimiter=",")